author    Linus Torvalds <torvalds@linux-foundation.org>    2017-09-04 15:21:28 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2017-09-04 15:21:28 -0400
commit    b1b6f83ac938d176742c85757960dec2cf10e468 (patch)
tree      f99e605318232a9327500896b9187b5ec9cad0c1
parent    5f82e71a001d14824a7728ad9e49f6aea420f161 (diff)
parent    9e52fc2b50de3a1c08b44f94c610fbe998c0031a (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm changes from Ingo Molnar:
 "PCID support, 5-level paging support, Secure Memory Encryption support

  The main changes in this cycle are support for three new, complex
  hardware features of x86 CPUs:

   - Add 5-level paging support, which is a new hardware feature on
     upcoming Intel CPUs allowing up to 128 PB of virtual address space
     and 4 PB of physical RAM space - a 512-fold increase over the old
     limits. (Supercomputers of the future forecasting hurricanes on an
     ever warming planet can certainly make good use of more RAM.)

     Many of the necessary changes went upstream in previous cycles,
     v4.14 is the first kernel that can enable 5-level paging.

     This feature is activated via CONFIG_X86_5LEVEL=y - disabled by
     default.

     (By Kirill A. Shutemov)

   - Add 'encrypted memory' support, which is a new hardware feature on
     upcoming AMD CPUs ('Secure Memory Encryption', SME) allowing system
     RAM to be encrypted and decrypted (mostly) transparently by the
     CPU, with a little help from the kernel to transition to/from
     encrypted RAM. Such RAM should be more secure against various
     attacks like RAM access via the memory bus and should make the
     radio signature of memory bus traffic harder to intercept (and
     decrypt) as well.

     This feature is activated via CONFIG_AMD_MEM_ENCRYPT=y - disabled
     by default.

     (By Tom Lendacky)

   - Enable PCID optimized TLB flushing on newer Intel CPUs: PCID is a
     hardware feature that attaches an address space tag to TLB entries
     and thus allows to skip TLB flushing in many cases, even if we
     switch mm's.

     (By Andy Lutomirski)

  All three of these features were in the works for a long time, and
  it's coincidence of the three independent development paths that they
  are all enabled in v4.14 at once"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (65 commits)
  x86/mm: Enable RCU based page table freeing (CONFIG_HAVE_RCU_TABLE_FREE=y)
  x86/mm: Use pr_cont() in dump_pagetable()
  x86/mm: Fix SME encryption stack ptr handling
  kvm/x86: Avoid clearing the C-bit in rsvd_bits()
  x86/CPU: Align CR3 defines
  x86/mm, mm/hwpoison: Clear PRESENT bit for kernel 1:1 mappings of poison pages
  acpi, x86/mm: Remove encryption mask from ACPI page protection type
  x86/mm, kexec: Fix memory corruption with SME on successive kexecs
  x86/mm/pkeys: Fix typo in Documentation/x86/protection-keys.txt
  x86/mm/dump_pagetables: Speed up page tables dump for CONFIG_KASAN=y
  x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
  x86: Enable 5-level paging support via CONFIG_X86_5LEVEL=y
  x86/mm: Allow userspace have mappings above 47-bit
  x86/mm: Prepare to expose larger address space to userspace
  x86/mpx: Do not allow MPX if we have mappings above 47-bit
  x86/mm: Rename tasksize_32bit/64bit to task_size_32bit/64bit()
  x86/xen: Redefine XEN_ELFNOTE_INIT_P2M using PUD_SIZE * PTRS_PER_PUD
  x86/mm/dump_pagetables: Fix printout of p4d level
  x86/mm/dump_pagetables: Generalize address normalization
  x86/boot: Fix memremap() related build failure
  ...
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 13
-rw-r--r--  Documentation/x86/amd-memory-encryption.txt | 68
-rw-r--r--  Documentation/x86/protection-keys.txt | 6
-rw-r--r--  Documentation/x86/x86_64/5level-paging.txt | 64
-rw-r--r--  arch/ia64/include/asm/acpi.h | 2
-rw-r--r--  arch/ia64/kernel/efi.c | 4
-rw-r--r--  arch/x86/Kconfig | 49
-rw-r--r--  arch/x86/boot/compressed/pagetable.c | 7
-rw-r--r--  arch/x86/include/asm/acpi.h | 13
-rw-r--r--  arch/x86/include/asm/cmdline.h | 2
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 1
-rw-r--r--  arch/x86/include/asm/disabled-features.h | 4
-rw-r--r--  arch/x86/include/asm/dma-mapping.h | 5
-rw-r--r--  arch/x86/include/asm/dmi.h | 8
-rw-r--r--  arch/x86/include/asm/e820/api.h | 2
-rw-r--r--  arch/x86/include/asm/elf.h | 4
-rw-r--r--  arch/x86/include/asm/fixmap.h | 20
-rw-r--r--  arch/x86/include/asm/init.h | 1
-rw-r--r--  arch/x86/include/asm/io.h | 8
-rw-r--r--  arch/x86/include/asm/kexec.h | 11
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 2
-rw-r--r--  arch/x86/include/asm/mem_encrypt.h | 80
-rw-r--r--  arch/x86/include/asm/mmu.h | 25
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 15
-rw-r--r--  arch/x86/include/asm/mpx.h | 9
-rw-r--r--  arch/x86/include/asm/msr-index.h | 2
-rw-r--r--  arch/x86/include/asm/page_64.h | 4
-rw-r--r--  arch/x86/include/asm/page_types.h | 3
-rw-r--r--  arch/x86/include/asm/pgtable.h | 28
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 58
-rw-r--r--  arch/x86/include/asm/processor-flags.h | 13
-rw-r--r--  arch/x86/include/asm/processor.h | 20
-rw-r--r--  arch/x86/include/asm/realmode.h | 12
-rw-r--r--  arch/x86/include/asm/set_memory.h | 3
-rw-r--r--  arch/x86/include/asm/tlb.h | 14
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 87
-rw-r--r--  arch/x86/include/asm/vga.h | 14
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 6
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 29
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 8
-rw-r--r--  arch/x86/kernel/cpu/common.c | 40
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 43
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 1
-rw-r--r--  arch/x86/kernel/e820.c | 26
-rw-r--r--  arch/x86/kernel/espfix_64.c | 2
-rw-r--r--  arch/x86/kernel/head64.c | 95
-rw-r--r--  arch/x86/kernel/head_64.S | 40
-rw-r--r--  arch/x86/kernel/kdebugfs.c | 34
-rw-r--r--  arch/x86/kernel/ksysfs.c | 28
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 25
-rw-r--r--  arch/x86/kernel/mpparse.c | 108
-rw-r--r--  arch/x86/kernel/pci-dma.c | 11
-rw-r--r--  arch/x86/kernel/pci-nommu.c | 2
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 15
-rw-r--r--  arch/x86/kernel/process.c | 17
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S | 14
-rw-r--r--  arch/x86/kernel/setup.c | 9
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 30
-rw-r--r--  arch/x86/kvm/mmu.c | 41
-rw-r--r--  arch/x86/kvm/svm.c | 35
-rw-r--r--  arch/x86/kvm/vmx.c | 2
-rw-r--r--  arch/x86/kvm/x86.c | 3
-rw-r--r--  arch/x86/lib/cmdline.c | 105
-rw-r--r--  arch/x86/mm/Makefile | 2
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 93
-rw-r--r--  arch/x86/mm/fault.c | 26
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 27
-rw-r--r--  arch/x86/mm/ident_map.c | 12
-rw-r--r--  arch/x86/mm/init.c | 2
-rw-r--r--  arch/x86/mm/ioremap.c | 287
-rw-r--r--  arch/x86/mm/kasan_init_64.c | 6
-rw-r--r--  arch/x86/mm/mem_encrypt.c | 593
-rw-r--r--  arch/x86/mm/mem_encrypt_boot.S | 149
-rw-r--r--  arch/x86/mm/mmap.c | 12
-rw-r--r--  arch/x86/mm/mpx.c | 33
-rw-r--r--  arch/x86/mm/pageattr.c | 67
-rw-r--r--  arch/x86/mm/pat.c | 9
-rw-r--r--  arch/x86/mm/pgtable.c | 8
-rw-r--r--  arch/x86/mm/tlb.c | 331
-rw-r--r--  arch/x86/pci/common.c | 4
-rw-r--r--  arch/x86/platform/efi/efi.c | 6
-rw-r--r--  arch/x86/platform/efi/efi_64.c | 15
-rw-r--r--  arch/x86/realmode/init.c | 12
-rw-r--r--  arch/x86/realmode/rm/trampoline_64.S | 24
-rw-r--r--  arch/x86/xen/Kconfig | 5
-rw-r--r--  arch/x86/xen/enlighten_pv.c | 7
-rw-r--r--  arch/x86/xen/mmu_pv.c | 5
-rw-r--r--  arch/x86/xen/xen-head.S | 2
-rw-r--r--  drivers/acpi/processor_idle.c | 2
-rw-r--r--  drivers/firmware/dmi-sysfs.c | 5
-rw-r--r--  drivers/firmware/efi/efi.c | 33
-rw-r--r--  drivers/firmware/pcdp.c | 4
-rw-r--r--  drivers/gpu/drm/drm_gem.c | 2
-rw-r--r--  drivers/gpu/drm/drm_vm.c | 4
-rw-r--r--  drivers/gpu/drm/ttm/ttm_bo_vm.c | 7
-rw-r--r--  drivers/gpu/drm/udl/udl_fb.c | 4
-rw-r--r--  drivers/idle/intel_idle.c | 9
-rw-r--r--  drivers/iommu/amd_iommu.c | 30
-rw-r--r--  drivers/iommu/amd_iommu_init.c | 34
-rw-r--r--  drivers/iommu/amd_iommu_proto.h | 10
-rw-r--r--  drivers/iommu/amd_iommu_types.h | 2
-rw-r--r--  drivers/sfi/sfi_core.c | 23
-rw-r--r--  drivers/video/fbdev/core/fbmem.c | 12
-rw-r--r--  include/asm-generic/early_ioremap.h | 2
-rw-r--r--  include/asm-generic/pgtable.h | 12
-rw-r--r--  include/linux/compiler-gcc.h | 2
-rw-r--r--  include/linux/compiler.h | 4
-rw-r--r--  include/linux/dma-mapping.h | 13
-rw-r--r--  include/linux/efi.h | 9
-rw-r--r--  include/linux/io.h | 2
-rw-r--r--  include/linux/kexec.h | 8
-rw-r--r--  include/linux/mem_encrypt.h | 48
-rw-r--r--  include/linux/mm_inline.h | 6
-rw-r--r--  include/linux/swiotlb.h | 1
-rw-r--r--  init/main.c | 10
-rw-r--r--  kernel/kexec_core.c | 12
-rw-r--r--  kernel/memremap.c | 20
-rw-r--r--  lib/swiotlb.c | 57
-rw-r--r--  mm/early_ioremap.c | 28
-rw-r--r--  mm/memory-failure.c | 2
120 files changed, 3134 insertions, 470 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3a99cc96b6b1..dad6fa01af95 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2233,6 +2233,17 @@
 			memory contents and reserves bad memory
 			regions that are detected.
 
+	mem_encrypt=	[X86-64] AMD Secure Memory Encryption (SME) control
+			Valid arguments: on, off
+			Default (depends on kernel configuration option):
+			  on  (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y)
+			  off (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=n)
+			mem_encrypt=on:  Activate SME
+			mem_encrypt=off: Do not activate SME
+
+			Refer to Documentation/x86/amd-memory-encryption.txt
+			for details on when memory encryption can be activated.
+
 	mem_sleep_default=	[SUSPEND] Default system suspend mode:
 			s2idle  - Suspend-To-Idle
 			shallow - Power-On Suspend or equivalent (if supported)
@@ -2697,6 +2708,8 @@
 	nopat		[X86] Disable PAT (page attribute table extension of
 			pagetables) support.
 
+	nopcid		[X86-64] Disable the PCID cpu feature.
+
 	norandmaps	Don't use address space randomization. Equivalent to
 			echo 0 > /proc/sys/kernel/randomize_va_space
 
diff --git a/Documentation/x86/amd-memory-encryption.txt b/Documentation/x86/amd-memory-encryption.txt
new file mode 100644
index 000000000000..f512ab718541
--- /dev/null
+++ b/Documentation/x86/amd-memory-encryption.txt
@@ -0,0 +1,68 @@
+Secure Memory Encryption (SME) is a feature found on AMD processors.
+
+SME provides the ability to mark individual pages of memory as encrypted using
+the standard x86 page tables. A page that is marked encrypted will be
+automatically decrypted when read from DRAM and encrypted when written to
+DRAM. SME can therefore be used to protect the contents of DRAM from physical
+attacks on the system.
+
+A page is encrypted when a page table entry has the encryption bit set (see
+below on how to determine its position). The encryption bit can also be
+specified in the cr3 register, allowing the PGD table to be encrypted. Each
+successive level of page tables can also be encrypted by setting the encryption
+bit in the page table entry that points to the next table. This allows the full
+page table hierarchy to be encrypted. Note, this means that just because the
+encryption bit is set in cr3, doesn't imply the full hierarchy is encrypted.
+Each page table entry in the hierarchy needs to have the encryption bit set to
+achieve that. So, theoretically, you could have the encryption bit set in cr3
+so that the PGD is encrypted, but not set the encryption bit in the PGD entry
+for a PUD, which results in the PUD pointed to by that entry not being
+encrypted.
+
+Support for SME can be determined through the CPUID instruction. The CPUID
+function 0x8000001f reports information related to SME:
+
+	0x8000001f[eax]:
+		Bit[0] indicates support for SME
+	0x8000001f[ebx]:
+		Bits[5:0]  pagetable bit number used to activate memory
+			   encryption
+		Bits[11:6] reduction in physical address space, in bits, when
+			   memory encryption is enabled (this only affects
+			   system physical addresses, not guest physical
+			   addresses)
+
+If support for SME is present, MSR 0xc0010010 (MSR_K8_SYSCFG) can be used to
+determine if SME is enabled and/or to enable memory encryption:
+
+	0xc0010010:
+		Bit[23]   0 = memory encryption features are disabled
+			  1 = memory encryption features are enabled
+
+Linux relies on BIOS to set this bit if BIOS has determined that the reduction
+in the physical address space as a result of enabling memory encryption (see
+CPUID information above) will not conflict with the address space resource
+requirements for the system. If this bit is not set upon Linux startup then
+Linux itself will not set it and memory encryption will not be possible.
+
+The state of SME in the Linux kernel can be described as follows:
+ - Supported:
+   The CPU supports SME (determined through CPUID instruction).
+
+ - Enabled:
+   Supported and bit 23 of MSR_K8_SYSCFG is set.
+
+ - Active:
+   Supported, Enabled and the Linux kernel is actively applying
+   the encryption bit to page table entries (the SME mask in the
+   kernel is non-zero).
+
+SME can also be enabled and activated in the BIOS. If SME is enabled and
+activated in the BIOS, then all memory accesses will be encrypted and it will
+not be necessary to activate the Linux memory encryption support. If the BIOS
+merely enables SME (sets bit 23 of MSR_K8_SYSCFG), then Linux can activate
+memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or
+by supplying mem_encrypt=on on the kernel command line. However, if BIOS does
+not enable SME, then Linux will not be able to activate memory encryption, even
+if configured to do so by default or if the mem_encrypt=on command line
+parameter is specified.
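
As a quick aside, the CPUID check described in the new document above can be done from user space. A minimal sketch follows (not part of this patch set; it assumes a GCC or Clang toolchain that provides <cpuid.h>). It only covers the "Supported" state; checking "Enabled" additionally requires reading bit 23 of MSR 0xc0010010, for example via the msr driver, which needs root.

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* __get_cpuid() returns 0 if leaf 0x8000001f does not exist. */
            if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx)) {
                    puts("CPUID leaf 0x8000001f not available");
                    return 1;
            }

            if (eax & 1) {                  /* EAX Bit[0]: SME supported */
                    unsigned int c_bit = ebx & 0x3f;          /* EBX Bits[5:0]  */
                    unsigned int pa_loss = (ebx >> 6) & 0x3f; /* EBX Bits[11:6] */

                    printf("SME supported: encryption bit %u, physical address space reduced by %u bit(s)\n",
                           c_bit, pa_loss);
            } else {
                    puts("SME not supported");
            }
            return 0;
    }
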
diff --git a/Documentation/x86/protection-keys.txt b/Documentation/x86/protection-keys.txt
index b64304540821..fa46dcb347bc 100644
--- a/Documentation/x86/protection-keys.txt
+++ b/Documentation/x86/protection-keys.txt
@@ -34,7 +34,7 @@ with a key. In this example WRPKRU is wrapped by a C function
 called pkey_set().
 
 	int real_prot = PROT_READ|PROT_WRITE;
-	pkey = pkey_alloc(0, PKEY_DENY_WRITE);
+	pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
 	ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
 	ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);
 	... application runs here
@@ -42,9 +42,9 @@ called pkey_set().
 Now, if the application needs to update the data at 'ptr', it can
 gain access, do the update, then remove its write access:
 
-	pkey_set(pkey, 0); // clear PKEY_DENY_WRITE
+	pkey_set(pkey, 0); // clear PKEY_DISABLE_WRITE
 	*ptr = foo; // assign something
-	pkey_set(pkey, PKEY_DENY_WRITE); // set PKEY_DENY_WRITE again
+	pkey_set(pkey, PKEY_DISABLE_WRITE); // set PKEY_DISABLE_WRITE again
 
 Now when it frees the memory, it will also free the pkey since it
 is no longer in use:
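
For reference, a self-contained variant of the snippet being corrected above, spelled with the PKEY_DISABLE_WRITE name, could look like the sketch below. This is an illustration only, assuming glibc 2.27 or later (which provides pkey_alloc(), pkey_mprotect() and a pkey_set() with the same shape as the documentation's WRPKRU wrapper) and a CPU with protection keys.

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/mman.h>

    int main(void)
    {
            long page_size = sysconf(_SC_PAGESIZE);
            int real_prot = PROT_READ | PROT_WRITE;

            /* Allocate a key whose initial rights deny writes. */
            int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
            if (pkey < 0) {
                    perror("pkey_alloc");   /* no pkey support or keys exhausted */
                    return 1;
            }

            void *ptr = mmap(NULL, page_size, PROT_NONE,
                             MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
            if (ptr == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }

            /* Associate the page with the key: reads work, writes fault. */
            if (pkey_mprotect(ptr, page_size, real_prot, pkey) < 0) {
                    perror("pkey_mprotect");
                    return 1;
            }

            /* Temporarily clear PKEY_DISABLE_WRITE, update, then set it again. */
            if (pkey_set(pkey, 0) == 0) {
                    *(volatile int *)ptr = 42;
                    pkey_set(pkey, PKEY_DISABLE_WRITE);
            }

            munmap(ptr, page_size);
            pkey_free(pkey);
            return 0;
    }
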
diff --git a/Documentation/x86/x86_64/5level-paging.txt b/Documentation/x86/x86_64/5level-paging.txt
new file mode 100644
index 000000000000..087251a0d99c
--- /dev/null
+++ b/Documentation/x86/x86_64/5level-paging.txt
@@ -0,0 +1,64 @@
+== Overview ==
+
+Original x86-64 was limited by 4-level paging to 256 TiB of virtual address
+space and 64 TiB of physical address space. We are already bumping into
+this limit: some vendors offer servers with 64 TiB of memory today.
+
+To overcome the limitation, upcoming hardware will introduce support for
+5-level paging. It is a straightforward extension of the current page
+table structure adding one more layer of translation.
+
+It bumps the limits to 128 PiB of virtual address space and 4 PiB of
+physical address space. This "ought to be enough for anybody" ©.
+
+QEMU 2.9 and later support 5-level paging.
+
+Virtual memory layout for 5-level paging is described in
+Documentation/x86/x86_64/mm.txt
+
+== Enabling 5-level paging ==
+
+CONFIG_X86_5LEVEL=y enables the feature.
+
+So far, a kernel compiled with the option enabled will be able to boot
+only on machines that support the feature -- see the 'la57' flag in
+/proc/cpuinfo.
+
+The plan is to implement boot-time switching between 4- and 5-level paging
+in the future.
+
+== User-space and large virtual address space ==
+
+On x86, 5-level paging enables 56-bit userspace virtual address space.
+Not all user space is ready to handle wide addresses. It's known that
+at least some JIT compilers use higher bits in pointers to encode their
+information. With 5-level paging this collides with valid pointers and
+leads to crashes.
+
+To mitigate this, we are not going to allocate virtual address space
+above 47-bit by default.
+
+But userspace can ask for allocation from the full address space by
+specifying a hint address (with or without MAP_FIXED) above 47-bits.
+
+If the hint address is above 47-bit but MAP_FIXED is not specified, we try
+to look for an unmapped area at the specified address. If it's already
+occupied, we look for an unmapped area in the *full* address space, rather
+than from the 47-bit window.
+
+A high hint address would only affect the allocation in question, but not
+any future mmap()s.
+
+Specifying a high hint address on an older kernel or on a machine without
+5-level paging support is safe. The hint will be ignored and the kernel will
+fall back to allocation from the 47-bit address space.
+
+This approach makes it easy to teach an application's memory allocator
+about the large address space without manually tracking allocated virtual
+address space.
+
+One important case we need to handle here is interaction with MPX.
+MPX (without the MAWA extension) cannot handle addresses above 47-bit, so we
+need to make sure that MPX cannot be enabled if we already have a VMA above
+the boundary, and forbid creating such VMAs once MPX is enabled.
+
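
A minimal user-space illustration of the hint-address rule described in the new document above (not part of the patch; the 2 MiB size and the 1UL << 48 hint are arbitrary choices for the example). On older kernels or hardware without 'la57' the hint is simply ignored and the mapping comes from the regular 47-bit window.

    #include <stddef.h>
    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
            size_t len = 1UL << 21;                 /* 2 MiB, arbitrary */
            void *hint = (void *)(1UL << 48);       /* above the 47-bit boundary */

            /*
             * Without MAP_FIXED the kernel may pick a different address, but a
             * hint above 47-bit opens the full address space for this one call.
             */
            void *p = mmap(hint, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }

            printf("mapped at %p\n", p);
            munmap(p, len);
            return 0;
    }
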
diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h
index a3d0211970e9..c86a947f5368 100644
--- a/arch/ia64/include/asm/acpi.h
+++ b/arch/ia64/include/asm/acpi.h
@@ -112,8 +112,6 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
112 buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP; 112 buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP;
113} 113}
114 114
115#define acpi_unlazy_tlb(x)
116
117#ifdef CONFIG_ACPI_NUMA 115#ifdef CONFIG_ACPI_NUMA
118extern cpumask_t early_cpu_possible_map; 116extern cpumask_t early_cpu_possible_map;
119#define for_each_possible_early_cpu(cpu) \ 117#define for_each_possible_early_cpu(cpu) \
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 121295637d0d..81416000c5e0 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -757,14 +757,14 @@ efi_memmap_intersects (unsigned long phys_addr, unsigned long size)
757 return 0; 757 return 0;
758} 758}
759 759
760u32 760int
761efi_mem_type (unsigned long phys_addr) 761efi_mem_type (unsigned long phys_addr)
762{ 762{
763 efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); 763 efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
764 764
765 if (md) 765 if (md)
766 return md->type; 766 return md->type;
767 return 0; 767 return -EINVAL;
768} 768}
769 769
770u64 770u64
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cce15191e9e9..b4b27ab016f6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -169,6 +169,7 @@ config X86
169 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI 169 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
170 select HAVE_PERF_REGS 170 select HAVE_PERF_REGS
171 select HAVE_PERF_USER_STACK_DUMP 171 select HAVE_PERF_USER_STACK_DUMP
172 select HAVE_RCU_TABLE_FREE
172 select HAVE_REGS_AND_STACK_ACCESS_API 173 select HAVE_REGS_AND_STACK_ACCESS_API
173 select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION 174 select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION
174 select HAVE_STACK_VALIDATION if X86_64 175 select HAVE_STACK_VALIDATION if X86_64
@@ -329,6 +330,7 @@ config FIX_EARLYCON_MEM
329 330
330config PGTABLE_LEVELS 331config PGTABLE_LEVELS
331 int 332 int
333 default 5 if X86_5LEVEL
332 default 4 if X86_64 334 default 4 if X86_64
333 default 3 if X86_PAE 335 default 3 if X86_PAE
334 default 2 336 default 2
@@ -1399,6 +1401,24 @@ config X86_PAE
1399 has the cost of more pagetable lookup overhead, and also 1401 has the cost of more pagetable lookup overhead, and also
1400 consumes more pagetable space per process. 1402 consumes more pagetable space per process.
1401 1403
1404config X86_5LEVEL
1405 bool "Enable 5-level page tables support"
1406 depends on X86_64
1407 ---help---
1408 5-level paging enables access to larger address space:
 1409 up to 128 PiB of virtual address space and 4 PiB of
1410 physical address space.
1411
1412 It will be supported by future Intel CPUs.
1413
1414 Note: a kernel with this option enabled can only be booted
1415 on machines that support the feature.
1416
1417 See Documentation/x86/x86_64/5level-paging.txt for more
1418 information.
1419
1420 Say N if unsure.
1421
1402config ARCH_PHYS_ADDR_T_64BIT 1422config ARCH_PHYS_ADDR_T_64BIT
1403 def_bool y 1423 def_bool y
1404 depends on X86_64 || X86_PAE 1424 depends on X86_64 || X86_PAE
@@ -1416,6 +1436,35 @@ config X86_DIRECT_GBPAGES
1416 supports them), so don't confuse the user by printing 1436 supports them), so don't confuse the user by printing
1417 that we have them enabled. 1437 that we have them enabled.
1418 1438
1439config ARCH_HAS_MEM_ENCRYPT
1440 def_bool y
1441
1442config AMD_MEM_ENCRYPT
1443 bool "AMD Secure Memory Encryption (SME) support"
1444 depends on X86_64 && CPU_SUP_AMD
1445 ---help---
1446 Say yes to enable support for the encryption of system memory.
1447 This requires an AMD processor that supports Secure Memory
1448 Encryption (SME).
1449
1450config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
1451 bool "Activate AMD Secure Memory Encryption (SME) by default"
1452 default y
1453 depends on AMD_MEM_ENCRYPT
1454 ---help---
1455 Say yes to have system memory encrypted by default if running on
1456 an AMD processor that supports Secure Memory Encryption (SME).
1457
1458 If set to Y, then the encryption of system memory can be
1459 deactivated with the mem_encrypt=off command line option.
1460
1461 If set to N, then the encryption of system memory can be
1462 activated with the mem_encrypt=on command line option.
1463
1464config ARCH_USE_MEMREMAP_PROT
1465 def_bool y
1466 depends on AMD_MEM_ENCRYPT
1467
1419# Common NUMA Features 1468# Common NUMA Features
1420config NUMA 1469config NUMA
1421 bool "Numa Memory Allocation and Scheduler Support" 1470 bool "Numa Memory Allocation and Scheduler Support"
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 28029be47fbb..f1aa43854bed 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -15,6 +15,13 @@
15#define __pa(x) ((unsigned long)(x)) 15#define __pa(x) ((unsigned long)(x))
16#define __va(x) ((void *)((unsigned long)(x))) 16#define __va(x) ((void *)((unsigned long)(x)))
17 17
18/*
19 * The pgtable.h and mm/ident_map.c includes make use of the SME related
20 * information which is not used in the compressed image support. Un-define
21 * the SME support to avoid any compile and link errors.
22 */
23#undef CONFIG_AMD_MEM_ENCRYPT
24
18#include "misc.h" 25#include "misc.h"
19 26
20/* These actually do the work of building the kernel identity maps. */ 27/* These actually do the work of building the kernel identity maps. */
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 2efc768e4362..72d867f6b518 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -150,8 +150,6 @@ static inline void disable_acpi(void) { }
150extern int x86_acpi_numa_init(void); 150extern int x86_acpi_numa_init(void);
151#endif /* CONFIG_ACPI_NUMA */ 151#endif /* CONFIG_ACPI_NUMA */
152 152
153#define acpi_unlazy_tlb(x) leave_mm(x)
154
155#ifdef CONFIG_ACPI_APEI 153#ifdef CONFIG_ACPI_APEI
156static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) 154static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
157{ 155{
@@ -162,12 +160,13 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
162 * you call efi_mem_attributes() during boot and at runtime, 160 * you call efi_mem_attributes() during boot and at runtime,
163 * you could theoretically see different attributes. 161 * you could theoretically see different attributes.
164 * 162 *
165 * Since we are yet to see any x86 platforms that require 163 * We are yet to see any x86 platforms that require anything
166 * anything other than PAGE_KERNEL (some arm64 platforms 164 * other than PAGE_KERNEL (some ARM64 platforms require the
167 * require the equivalent of PAGE_KERNEL_NOCACHE), return that 165 * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME
168 * until we know differently. 166 * is active, the ACPI information will not be encrypted,
167 * so return PAGE_KERNEL_NOENC until we know differently.
169 */ 168 */
170 return PAGE_KERNEL; 169 return PAGE_KERNEL_NOENC;
171} 170}
172#endif 171#endif
173 172
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
index e01f7f7ccb0c..84ae170bc3d0 100644
--- a/arch/x86/include/asm/cmdline.h
+++ b/arch/x86/include/asm/cmdline.h
@@ -2,5 +2,7 @@
2#define _ASM_X86_CMDLINE_H 2#define _ASM_X86_CMDLINE_H
3 3
4int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); 4int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
5int cmdline_find_option(const char *cmdline_ptr, const char *option,
6 char *buffer, int bufsize);
5 7
6#endif /* _ASM_X86_CMDLINE_H */ 8#endif /* _ASM_X86_CMDLINE_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 8ea315a11fe0..42bbbf0f173d 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -196,6 +196,7 @@
196 196
197#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ 197#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
198#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ 198#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
199#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
199 200
200#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ 201#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
201#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ 202#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 5dff775af7cd..c10c9128f54e 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -21,11 +21,13 @@
21# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) 21# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31))
22# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) 22# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31))
23# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) 23# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31))
24# define DISABLE_PCID 0
24#else 25#else
25# define DISABLE_VME 0 26# define DISABLE_VME 0
26# define DISABLE_K6_MTRR 0 27# define DISABLE_K6_MTRR 0
27# define DISABLE_CYRIX_ARR 0 28# define DISABLE_CYRIX_ARR 0
28# define DISABLE_CENTAUR_MCR 0 29# define DISABLE_CENTAUR_MCR 0
30# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31))
29#endif /* CONFIG_X86_64 */ 31#endif /* CONFIG_X86_64 */
30 32
31#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS 33#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
@@ -49,7 +51,7 @@
49#define DISABLED_MASK1 0 51#define DISABLED_MASK1 0
50#define DISABLED_MASK2 0 52#define DISABLED_MASK2 0
51#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) 53#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR)
52#define DISABLED_MASK4 0 54#define DISABLED_MASK4 (DISABLE_PCID)
53#define DISABLED_MASK5 0 55#define DISABLED_MASK5 0
54#define DISABLED_MASK6 0 56#define DISABLED_MASK6 0
55#define DISABLED_MASK7 0 57#define DISABLED_MASK7 0
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 398c79889f5c..1387dafdba2d 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -12,6 +12,7 @@
12#include <asm/io.h> 12#include <asm/io.h>
13#include <asm/swiotlb.h> 13#include <asm/swiotlb.h>
14#include <linux/dma-contiguous.h> 14#include <linux/dma-contiguous.h>
15#include <linux/mem_encrypt.h>
15 16
16#ifdef CONFIG_ISA 17#ifdef CONFIG_ISA
17# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) 18# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
@@ -57,12 +58,12 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
57 58
58static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) 59static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
59{ 60{
60 return paddr; 61 return __sme_set(paddr);
61} 62}
62 63
63static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) 64static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
64{ 65{
65 return daddr; 66 return __sme_clr(daddr);
66} 67}
67#endif /* CONFIG_X86_DMA_REMAP */ 68#endif /* CONFIG_X86_DMA_REMAP */
68 69
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index 3c69fed215c5..a8e15b04565b 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -13,9 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len)
13} 13}
14 14
15/* Use early IO mappings for DMI because it's initialized early */ 15/* Use early IO mappings for DMI because it's initialized early */
16#define dmi_early_remap early_ioremap 16#define dmi_early_remap early_memremap
17#define dmi_early_unmap early_iounmap 17#define dmi_early_unmap early_memunmap
18#define dmi_remap ioremap_cache 18#define dmi_remap(_x, _l) memremap(_x, _l, MEMREMAP_WB)
19#define dmi_unmap iounmap 19#define dmi_unmap(_x) memunmap(_x)
20 20
21#endif /* _ASM_X86_DMI_H */ 21#endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index a504adc661a4..cd266d830e49 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -39,6 +39,8 @@ extern void e820__setup_pci_gap(void);
39extern void e820__reallocate_tables(void); 39extern void e820__reallocate_tables(void);
40extern void e820__register_nosave_regions(unsigned long limit_pfn); 40extern void e820__register_nosave_regions(unsigned long limit_pfn);
41 41
42extern int e820__get_entry_type(u64 start, u64 end);
43
42/* 44/*
43 * Returns true iff the specified range [start,end) is completely contained inside 45 * Returns true iff the specified range [start,end) is completely contained inside
44 * the ISA region. 46 * the ISA region.
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index bda9f94bcb10..04330c8d9af9 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -305,8 +305,8 @@ static inline int mmap_is_ia32(void)
305 test_thread_flag(TIF_ADDR32)); 305 test_thread_flag(TIF_ADDR32));
306} 306}
307 307
308extern unsigned long tasksize_32bit(void); 308extern unsigned long task_size_32bit(void);
309extern unsigned long tasksize_64bit(void); 309extern unsigned long task_size_64bit(int full_addr_space);
310extern unsigned long get_mmap_base(int is_legacy); 310extern unsigned long get_mmap_base(int is_legacy);
311 311
312#ifdef CONFIG_X86_32 312#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index b65155cc3760..dcd9fb55e679 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -157,6 +157,26 @@ static inline void __set_fixmap(enum fixed_addresses idx,
157} 157}
158#endif 158#endif
159 159
160/*
161 * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not
162 * supported for MMIO addresses, so make sure that the memory encryption
163 * mask is not part of the page attributes.
164 */
165#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE
166
167/*
168 * Early memremap routines used for in-place encryption. The mappings created
169 * by these routines are intended to be used as temporary mappings.
170 */
171void __init *early_memremap_encrypted(resource_size_t phys_addr,
172 unsigned long size);
173void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
174 unsigned long size);
175void __init *early_memremap_decrypted(resource_size_t phys_addr,
176 unsigned long size);
177void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
178 unsigned long size);
179
160#include <asm-generic/fixmap.h> 180#include <asm-generic/fixmap.h>
161 181
162#define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) 182#define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags)
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 474eb8c66fee..05c4aa00cc86 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -7,6 +7,7 @@ struct x86_mapping_info {
7 unsigned long page_flag; /* page flag for PMD or PUD entry */ 7 unsigned long page_flag; /* page flag for PMD or PUD entry */
8 unsigned long offset; /* ident mapping offset */ 8 unsigned long offset; /* ident mapping offset */
9 bool direct_gbpages; /* PUD level 1GB page support */ 9 bool direct_gbpages; /* PUD level 1GB page support */
10 unsigned long kernpg_flag; /* kernel pagetable flag override */
10}; 11};
11 12
12int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, 13int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 1310e1f1cd65..c40a95c33bb8 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -377,4 +377,12 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
377#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc 377#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc
378#endif 378#endif
379 379
380extern bool arch_memremap_can_ram_remap(resource_size_t offset,
381 unsigned long size,
382 unsigned long flags);
383#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap
384
385extern bool phys_mem_access_encrypted(unsigned long phys_addr,
386 unsigned long size);
387
380#endif /* _ASM_X86_IO_H */ 388#endif /* _ASM_X86_IO_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 70ef205489f0..942c1f444da8 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -147,7 +147,8 @@ unsigned long
147relocate_kernel(unsigned long indirection_page, 147relocate_kernel(unsigned long indirection_page,
148 unsigned long page_list, 148 unsigned long page_list,
149 unsigned long start_address, 149 unsigned long start_address,
150 unsigned int preserve_context); 150 unsigned int preserve_context,
151 unsigned int sme_active);
151#endif 152#endif
152 153
153#define ARCH_HAS_KIMAGE_ARCH 154#define ARCH_HAS_KIMAGE_ARCH
@@ -207,6 +208,14 @@ struct kexec_entry64_regs {
207 uint64_t r15; 208 uint64_t r15;
208 uint64_t rip; 209 uint64_t rip;
209}; 210};
211
212extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
213 gfp_t gfp);
214#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages
215
216extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
217#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
218
210#endif 219#endif
211 220
212typedef void crash_vmclear_fn(void); 221typedef void crash_vmclear_fn(void);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 92c9032502d8..369e41c23f07 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1079,7 +1079,7 @@ void kvm_mmu_init_vm(struct kvm *kvm);
1079void kvm_mmu_uninit_vm(struct kvm *kvm); 1079void kvm_mmu_uninit_vm(struct kvm *kvm);
1080void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 1080void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
1081 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, 1081 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
1082 u64 acc_track_mask); 1082 u64 acc_track_mask, u64 me_mask);
1083 1083
1084void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 1084void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
1085void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 1085void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
new file mode 100644
index 000000000000..8e618fcf1f7c
--- /dev/null
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -0,0 +1,80 @@
1/*
2 * AMD Memory Encryption Support
3 *
4 * Copyright (C) 2016 Advanced Micro Devices, Inc.
5 *
6 * Author: Tom Lendacky <thomas.lendacky@amd.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#ifndef __X86_MEM_ENCRYPT_H__
14#define __X86_MEM_ENCRYPT_H__
15
16#ifndef __ASSEMBLY__
17
18#include <linux/init.h>
19
20#include <asm/bootparam.h>
21
22#ifdef CONFIG_AMD_MEM_ENCRYPT
23
24extern unsigned long sme_me_mask;
25
26void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
27 unsigned long decrypted_kernel_vaddr,
28 unsigned long kernel_len,
29 unsigned long encryption_wa,
30 unsigned long encryption_pgd);
31
32void __init sme_early_encrypt(resource_size_t paddr,
33 unsigned long size);
34void __init sme_early_decrypt(resource_size_t paddr,
35 unsigned long size);
36
37void __init sme_map_bootdata(char *real_mode_data);
38void __init sme_unmap_bootdata(char *real_mode_data);
39
40void __init sme_early_init(void);
41
42void __init sme_encrypt_kernel(void);
43void __init sme_enable(struct boot_params *bp);
44
45/* Architecture __weak replacement functions */
46void __init mem_encrypt_init(void);
47
48void swiotlb_set_mem_attributes(void *vaddr, unsigned long size);
49
50#else /* !CONFIG_AMD_MEM_ENCRYPT */
51
52#define sme_me_mask 0UL
53
54static inline void __init sme_early_encrypt(resource_size_t paddr,
55 unsigned long size) { }
56static inline void __init sme_early_decrypt(resource_size_t paddr,
57 unsigned long size) { }
58
59static inline void __init sme_map_bootdata(char *real_mode_data) { }
60static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
61
62static inline void __init sme_early_init(void) { }
63
64static inline void __init sme_encrypt_kernel(void) { }
65static inline void __init sme_enable(struct boot_params *bp) { }
66
67#endif /* CONFIG_AMD_MEM_ENCRYPT */
68
69/*
70 * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when
71 * writing to or comparing values from the cr3 register. Having the
72 * encryption mask set in cr3 enables the PGD entry to be encrypted and
73 * avoid special case handling of PGD allocations.
74 */
75#define __sme_pa(x) (__pa(x) | sme_me_mask)
76#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask)
77
78#endif /* __ASSEMBLY__ */
79
80#endif /* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 79b647a7ebd0..bb8c597c2248 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -3,12 +3,28 @@
3 3
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/mutex.h> 5#include <linux/mutex.h>
6#include <linux/atomic.h>
6 7
7/* 8/*
8 * The x86 doesn't have a mmu context, but 9 * x86 has arch-specific MMU state beyond what lives in mm_struct.
9 * we put the segment information here.
10 */ 10 */
11typedef struct { 11typedef struct {
12 /*
13 * ctx_id uniquely identifies this mm_struct. A ctx_id will never
14 * be reused, and zero is not a valid ctx_id.
15 */
16 u64 ctx_id;
17
18 /*
19 * Any code that needs to do any sort of TLB flushing for this
20 * mm will first make its changes to the page tables, then
21 * increment tlb_gen, then flush. This lets the low-level
22 * flushing code keep track of what needs flushing.
23 *
24 * This is not used on Xen PV.
25 */
26 atomic64_t tlb_gen;
27
12#ifdef CONFIG_MODIFY_LDT_SYSCALL 28#ifdef CONFIG_MODIFY_LDT_SYSCALL
13 struct ldt_struct *ldt; 29 struct ldt_struct *ldt;
14#endif 30#endif
@@ -37,6 +53,11 @@ typedef struct {
37#endif 53#endif
38} mm_context_t; 54} mm_context_t;
39 55
56#define INIT_MM_CONTEXT(mm) \
57 .context = { \
58 .ctx_id = 1, \
59 }
60
40void leave_mm(int cpu); 61void leave_mm(int cpu);
41 62
42#endif /* _ASM_X86_MMU_H */ 63#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 7a234be7e298..7ae318c340d9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -12,6 +12,9 @@
12#include <asm/tlbflush.h> 12#include <asm/tlbflush.h>
13#include <asm/paravirt.h> 13#include <asm/paravirt.h>
14#include <asm/mpx.h> 14#include <asm/mpx.h>
15
16extern atomic64_t last_mm_ctx_id;
17
15#ifndef CONFIG_PARAVIRT 18#ifndef CONFIG_PARAVIRT
16static inline void paravirt_activate_mm(struct mm_struct *prev, 19static inline void paravirt_activate_mm(struct mm_struct *prev,
17 struct mm_struct *next) 20 struct mm_struct *next)
@@ -125,13 +128,18 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
125 128
126static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 129static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
127{ 130{
128 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) 131 int cpu = smp_processor_id();
129 this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); 132
133 if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
134 cpumask_clear_cpu(cpu, mm_cpumask(mm));
130} 135}
131 136
132static inline int init_new_context(struct task_struct *tsk, 137static inline int init_new_context(struct task_struct *tsk,
133 struct mm_struct *mm) 138 struct mm_struct *mm)
134{ 139{
140 mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
141 atomic64_set(&mm->context.tlb_gen, 0);
142
135 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS 143 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
136 if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { 144 if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
137 /* pkey 0 is the default and always allocated */ 145 /* pkey 0 is the default and always allocated */
@@ -290,6 +298,9 @@ static inline unsigned long __get_current_cr3_fast(void)
290{ 298{
291 unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); 299 unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
292 300
301 if (static_cpu_has(X86_FEATURE_PCID))
302 cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
303
293 /* For now, be very restrictive about when this can be called. */ 304 /* For now, be very restrictive about when this can be called. */
294 VM_WARN_ON(in_nmi() || preemptible()); 305 VM_WARN_ON(in_nmi() || preemptible());
295 306
diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h
index a0d662be4c5b..7d7404756bb4 100644
--- a/arch/x86/include/asm/mpx.h
+++ b/arch/x86/include/asm/mpx.h
@@ -73,6 +73,9 @@ static inline void mpx_mm_init(struct mm_struct *mm)
73} 73}
74void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, 74void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
75 unsigned long start, unsigned long end); 75 unsigned long start, unsigned long end);
76
77unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
78 unsigned long flags);
76#else 79#else
77static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) 80static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
78{ 81{
@@ -94,6 +97,12 @@ static inline void mpx_notify_unmap(struct mm_struct *mm,
94 unsigned long start, unsigned long end) 97 unsigned long start, unsigned long end)
95{ 98{
96} 99}
100
101static inline unsigned long mpx_unmapped_area_check(unsigned long addr,
102 unsigned long len, unsigned long flags)
103{
104 return addr;
105}
97#endif /* CONFIG_X86_INTEL_MPX */ 106#endif /* CONFIG_X86_INTEL_MPX */
98 107
99#endif /* _ASM_X86_MPX_H */ 108#endif /* _ASM_X86_MPX_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 5573c75f8e4c..17f5c12e1afd 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -356,6 +356,8 @@
356#define MSR_K8_TOP_MEM1 0xc001001a 356#define MSR_K8_TOP_MEM1 0xc001001a
357#define MSR_K8_TOP_MEM2 0xc001001d 357#define MSR_K8_TOP_MEM2 0xc001001d
358#define MSR_K8_SYSCFG 0xc0010010 358#define MSR_K8_SYSCFG 0xc0010010
359#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23
360#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT)
359#define MSR_K8_INT_PENDING_MSG 0xc0010055 361#define MSR_K8_INT_PENDING_MSG 0xc0010055
360/* C1E active bits in int pending message */ 362/* C1E active bits in int pending message */
361#define K8_INTP_C1E_ACTIVE_MASK 0x18000000 363#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index b4a0d43248cf..b50df06ad251 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -51,6 +51,10 @@ static inline void clear_page(void *page)
51 51
52void copy_page(void *to, void *from); 52void copy_page(void *to, void *from);
53 53
54#ifdef CONFIG_X86_MCE
55#define arch_unmap_kpfn arch_unmap_kpfn
56#endif
57
54#endif /* !__ASSEMBLY__ */ 58#endif /* !__ASSEMBLY__ */
55 59
56#ifdef CONFIG_X86_VSYSCALL_EMULATION 60#ifdef CONFIG_X86_VSYSCALL_EMULATION
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 7bd0099384ca..b98ed9d14630 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -3,6 +3,7 @@
3 3
4#include <linux/const.h> 4#include <linux/const.h>
5#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/mem_encrypt.h>
6 7
7/* PAGE_SHIFT determines the page size */ 8/* PAGE_SHIFT determines the page size */
8#define PAGE_SHIFT 12 9#define PAGE_SHIFT 12
@@ -15,7 +16,7 @@
15#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) 16#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
16#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) 17#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
17 18
18#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) 19#define __PHYSICAL_MASK ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1)))
19#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) 20#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
20 21
21/* Cast *PAGE_MASK to a signed type so that it is sign-extended if 22/* Cast *PAGE_MASK to a signed type so that it is sign-extended if
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 77037b6f1caa..bbeae4a2bd01 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_X86_PGTABLE_H 1#ifndef _ASM_X86_PGTABLE_H
2#define _ASM_X86_PGTABLE_H 2#define _ASM_X86_PGTABLE_H
3 3
4#include <linux/mem_encrypt.h>
4#include <asm/page.h> 5#include <asm/page.h>
5#include <asm/pgtable_types.h> 6#include <asm/pgtable_types.h>
6 7
@@ -13,9 +14,18 @@
13 cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ 14 cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \
14 : (prot)) 15 : (prot))
15 16
17/*
18 * Macros to add or remove encryption attribute
19 */
20#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot)))
21#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot)))
22
16#ifndef __ASSEMBLY__ 23#ifndef __ASSEMBLY__
17#include <asm/x86_init.h> 24#include <asm/x86_init.h>
18 25
26extern pgd_t early_top_pgt[PTRS_PER_PGD];
27int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
28
19void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); 29void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
20void ptdump_walk_pgd_level_checkwx(void); 30void ptdump_walk_pgd_level_checkwx(void);
21 31
@@ -38,6 +48,8 @@ extern struct list_head pgd_list;
38 48
39extern struct mm_struct *pgd_page_get_mm(struct page *page); 49extern struct mm_struct *pgd_page_get_mm(struct page *page);
40 50
51extern pmdval_t early_pmd_flags;
52
41#ifdef CONFIG_PARAVIRT 53#ifdef CONFIG_PARAVIRT
42#include <asm/paravirt.h> 54#include <asm/paravirt.h>
43#else /* !CONFIG_PARAVIRT */ 55#else /* !CONFIG_PARAVIRT */
@@ -195,6 +207,11 @@ static inline unsigned long p4d_pfn(p4d_t p4d)
195 return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; 207 return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
196} 208}
197 209
210static inline unsigned long pgd_pfn(pgd_t pgd)
211{
212 return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;
213}
214
198static inline int p4d_large(p4d_t p4d) 215static inline int p4d_large(p4d_t p4d)
199{ 216{
200 /* No 512 GiB pages yet */ 217 /* No 512 GiB pages yet */
@@ -704,8 +721,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
704 * Currently stuck as a macro due to indirect forward reference to 721 * Currently stuck as a macro due to indirect forward reference to
705 * linux/mmzone.h's __section_mem_map_addr() definition: 722 * linux/mmzone.h's __section_mem_map_addr() definition:
706 */ 723 */
707#define pmd_page(pmd) \ 724#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
708 pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT)
709 725
710/* 726/*
711 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] 727 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -773,8 +789,7 @@ static inline unsigned long pud_page_vaddr(pud_t pud)
773 * Currently stuck as a macro due to indirect forward reference to 789 * Currently stuck as a macro due to indirect forward reference to
774 * linux/mmzone.h's __section_mem_map_addr() definition: 790 * linux/mmzone.h's __section_mem_map_addr() definition:
775 */ 791 */
776#define pud_page(pud) \ 792#define pud_page(pud) pfn_to_page(pud_pfn(pud))
777 pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT)
778 793
779/* Find an entry in the second-level page table.. */ 794/* Find an entry in the second-level page table.. */
780static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) 795static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
@@ -824,8 +839,7 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d)
824 * Currently stuck as a macro due to indirect forward reference to 839 * Currently stuck as a macro due to indirect forward reference to
825 * linux/mmzone.h's __section_mem_map_addr() definition: 840 * linux/mmzone.h's __section_mem_map_addr() definition:
826 */ 841 */
827#define p4d_page(p4d) \ 842#define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d))
828 pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT)
829 843
830/* Find an entry in the third-level page table.. */ 844/* Find an entry in the third-level page table.. */
831static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) 845static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
@@ -859,7 +873,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
859 * Currently stuck as a macro due to indirect forward reference to 873 * Currently stuck as a macro due to indirect forward reference to
860 * linux/mmzone.h's __section_mem_map_addr() definition: 874 * linux/mmzone.h's __section_mem_map_addr() definition:
861 */ 875 */
862#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) 876#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd))
863 877
864/* to find an entry in a page-table-directory. */ 878/* to find an entry in a page-table-directory. */
865static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) 879static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index bf9638e1ee42..399261ce904c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -2,6 +2,8 @@
2#define _ASM_X86_PGTABLE_DEFS_H 2#define _ASM_X86_PGTABLE_DEFS_H
3 3
4#include <linux/const.h> 4#include <linux/const.h>
5#include <linux/mem_encrypt.h>
6
5#include <asm/page_types.h> 7#include <asm/page_types.h>
6 8
7#define FIRST_USER_ADDRESS 0UL 9#define FIRST_USER_ADDRESS 0UL
@@ -121,10 +123,10 @@
121 123
122#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) 124#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
123 125
124#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ 126#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
125 _PAGE_ACCESSED | _PAGE_DIRTY) 127 _PAGE_ACCESSED | _PAGE_DIRTY)
126#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ 128#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \
127 _PAGE_DIRTY) 129 _PAGE_ACCESSED | _PAGE_DIRTY)
128 130
129/* 131/*
130 * Set of bits not changed in pte_modify. The pte's 132 * Set of bits not changed in pte_modify. The pte's
@@ -159,6 +161,7 @@ enum page_cache_mode {
159 161
160#define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) 162#define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
161#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) 163#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC))
164#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP))
162 165
163#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) 166#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
164#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ 167#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
@@ -187,22 +190,42 @@ enum page_cache_mode {
187#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) 190#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
188#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) 191#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
189#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) 192#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
193#define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP)
190 194
191#define __PAGE_KERNEL_IO (__PAGE_KERNEL) 195#define __PAGE_KERNEL_IO (__PAGE_KERNEL)
192#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) 196#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE)
193 197
194#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) 198#ifndef __ASSEMBLY__
195#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) 199
196#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) 200#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
197#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) 201
198#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) 202#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
199#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) 203 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC)
200#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) 204#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
201#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) 205 _PAGE_DIRTY | _PAGE_ENC)
202#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) 206
207#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC)
208#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC)
209
210#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL)
211#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP)
212
213#define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC)
214#define PAGE_KERNEL_NOENC __pgprot(__PAGE_KERNEL)
215#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
216#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
217#define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC)
218#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
219#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
220#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
221#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC)
222#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC)
223#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC)
224
225#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
226#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
203 227
204#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) 228#endif /* __ASSEMBLY__ */
205#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
206 229
207/* xwr */ 230/* xwr */
208#define __P000 PAGE_NONE 231#define __P000 PAGE_NONE
@@ -287,6 +310,11 @@ static inline p4dval_t native_p4d_val(p4d_t p4d)
287#else 310#else
288#include <asm-generic/pgtable-nop4d.h> 311#include <asm-generic/pgtable-nop4d.h>
289 312
313static inline p4d_t native_make_p4d(pudval_t val)
314{
315 return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) };
316}
317
290static inline p4dval_t native_p4d_val(p4d_t p4d) 318static inline p4dval_t native_p4d_val(p4d_t p4d)
291{ 319{
292 return native_pgd_val(p4d.pgd); 320 return native_pgd_val(p4d.pgd);
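The _PAGE_ENC plumbing above is easiest to see in isolation: each "encrypted" protection is simply the plain protection with the SME mask OR'd in, and the *_NOENC variants leave it out. Below is a minimal standalone sketch (ordinary userspace C, not kernel code); the C-bit position of 47 is an assumption for illustration only, real hardware reports it in CPUID Fn8000_001F EBX[5:0].

#include <stdio.h>
#include <stdint.h>

#define _PAGE_PRESENT  (1ULL << 0)
#define _PAGE_RW       (1ULL << 1)
#define _PAGE_ACCESSED (1ULL << 5)
#define _PAGE_DIRTY    (1ULL << 6)

int main(void)
{
	/* Assumed C-bit position; this mask is zero when SME is inactive. */
	uint64_t sme_me_mask = 1ULL << 47;

	uint64_t kernpg_noenc = _PAGE_PRESENT | _PAGE_RW |
				_PAGE_ACCESSED | _PAGE_DIRTY;
	uint64_t kernpg = kernpg_noenc | sme_me_mask;

	printf("_KERNPG_TABLE_NOENC = %#018llx\n",
	       (unsigned long long)kernpg_noenc);
	printf("_KERNPG_TABLE       = %#018llx\n",
	       (unsigned long long)kernpg);
	return 0;
}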
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 79aa2f98398d..dc723b64acf0 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_PROCESSOR_FLAGS_H 2#define _ASM_X86_PROCESSOR_FLAGS_H
3 3
4#include <uapi/asm/processor-flags.h> 4#include <uapi/asm/processor-flags.h>
5#include <linux/mem_encrypt.h>
5 6
6#ifdef CONFIG_VM86 7#ifdef CONFIG_VM86
7#define X86_VM_MASK X86_EFLAGS_VM 8#define X86_VM_MASK X86_EFLAGS_VM
@@ -32,16 +33,18 @@
32 * CR3_ADDR_MASK is the mask used by read_cr3_pa(). 33 * CR3_ADDR_MASK is the mask used by read_cr3_pa().
33 */ 34 */
34#ifdef CONFIG_X86_64 35#ifdef CONFIG_X86_64
35/* Mask off the address space ID bits. */ 36/* Mask off the address space ID and SME encryption bits. */
36#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull 37#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
37#define CR3_PCID_MASK 0xFFFull 38#define CR3_PCID_MASK 0xFFFull
39#define CR3_NOFLUSH BIT_ULL(63)
38#else 40#else
39/* 41/*
40 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save 42 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
41 * a tiny bit of code size by setting all the bits. 43 * a tiny bit of code size by setting all the bits.
42 */ 44 */
43#define CR3_ADDR_MASK 0xFFFFFFFFull 45#define CR3_ADDR_MASK 0xFFFFFFFFull
44#define CR3_PCID_MASK 0ull 46#define CR3_PCID_MASK 0ull
47#define CR3_NOFLUSH 0
45#endif 48#endif
46 49
47#endif /* _ASM_X86_PROCESSOR_FLAGS_H */ 50#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
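A worked example of the CR3 layout these masks describe may help (standalone sketch, not kernel code; the C-bit position and the sample values are assumptions): bits 11:0 hold the PCID, bit 63 is the no-flush hint, and the physical address is whatever remains after masking off both of those plus the SME bit.

#include <stdio.h>
#include <stdint.h>

#define CR3_PCID_MASK 0xFFFull
#define CR3_NOFLUSH   (1ULL << 63)

int main(void)
{
	uint64_t sme_me_mask   = 1ULL << 47;	/* assumed C-bit */
	uint64_t cr3_addr_mask = 0x7FFFFFFFFFFFF000ull & ~sme_me_mask;

	/* pgd at 0x1234000, ASID 5, C-bit set, "don't flush" requested */
	uint64_t cr3 = 0x1234000ull | sme_me_mask | 5 | CR3_NOFLUSH;

	printf("pa      = %#llx\n", (unsigned long long)(cr3 & cr3_addr_mask));
	printf("pcid    = %llu\n",  (unsigned long long)(cr3 & CR3_PCID_MASK));
	printf("noflush = %d\n",    !!(cr3 & CR3_NOFLUSH));
	return 0;
}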
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index abc99b9c7ffd..3fa26a61eabc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -30,6 +30,7 @@ struct vm86;
30#include <linux/math64.h> 30#include <linux/math64.h>
31#include <linux/err.h> 31#include <linux/err.h>
32#include <linux/irqflags.h> 32#include <linux/irqflags.h>
33#include <linux/mem_encrypt.h>
33 34
34/* 35/*
35 * We handle most unaligned accesses in hardware. On the other hand 36 * We handle most unaligned accesses in hardware. On the other hand
@@ -240,9 +241,14 @@ static inline unsigned long read_cr3_pa(void)
240 return __read_cr3() & CR3_ADDR_MASK; 241 return __read_cr3() & CR3_ADDR_MASK;
241} 242}
242 243
244static inline unsigned long native_read_cr3_pa(void)
245{
246 return __native_read_cr3() & CR3_ADDR_MASK;
247}
248
243static inline void load_cr3(pgd_t *pgdir) 249static inline void load_cr3(pgd_t *pgdir)
244{ 250{
245 write_cr3(__pa(pgdir)); 251 write_cr3(__sme_pa(pgdir));
246} 252}
247 253
248#ifdef CONFIG_X86_32 254#ifdef CONFIG_X86_32
@@ -805,7 +811,9 @@ static inline void spin_lock_prefetch(const void *x)
805 */ 811 */
806#define IA32_PAGE_OFFSET PAGE_OFFSET 812#define IA32_PAGE_OFFSET PAGE_OFFSET
807#define TASK_SIZE PAGE_OFFSET 813#define TASK_SIZE PAGE_OFFSET
814#define TASK_SIZE_LOW TASK_SIZE
808#define TASK_SIZE_MAX TASK_SIZE 815#define TASK_SIZE_MAX TASK_SIZE
816#define DEFAULT_MAP_WINDOW TASK_SIZE
809#define STACK_TOP TASK_SIZE 817#define STACK_TOP TASK_SIZE
810#define STACK_TOP_MAX STACK_TOP 818#define STACK_TOP_MAX STACK_TOP
811 819
@@ -845,7 +853,9 @@ static inline void spin_lock_prefetch(const void *x)
845 * particular problem by preventing anything from being mapped 853 * particular problem by preventing anything from being mapped
846 * at the maximum canonical address. 854 * at the maximum canonical address.
847 */ 855 */
848#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) 856#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
857
858#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
849 859
850/* This decides where the kernel will search for a free chunk of vm 860/* This decides where the kernel will search for a free chunk of vm
851 * space during mmap's. 861 * space during mmap's.
@@ -853,12 +863,14 @@ static inline void spin_lock_prefetch(const void *x)
853#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ 863#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
854 0xc0000000 : 0xFFFFe000) 864 0xc0000000 : 0xFFFFe000)
855 865
866#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \
867 IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
856#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ 868#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \
857 IA32_PAGE_OFFSET : TASK_SIZE_MAX) 869 IA32_PAGE_OFFSET : TASK_SIZE_MAX)
858#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ 870#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
859 IA32_PAGE_OFFSET : TASK_SIZE_MAX) 871 IA32_PAGE_OFFSET : TASK_SIZE_MAX)
860 872
861#define STACK_TOP TASK_SIZE 873#define STACK_TOP TASK_SIZE_LOW
862#define STACK_TOP_MAX TASK_SIZE_MAX 874#define STACK_TOP_MAX TASK_SIZE_MAX
863 875
864#define INIT_THREAD { \ 876#define INIT_THREAD { \
@@ -879,7 +891,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
879 * space during mmap's. 891 * space during mmap's.
880 */ 892 */
881#define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) 893#define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3))
882#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE) 894#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW)
883 895
884#define KSTK_EIP(task) (task_pt_regs(task)->ip) 896#define KSTK_EIP(task) (task_pt_regs(task)->ip)
885 897
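The split between TASK_SIZE_MAX and DEFAULT_MAP_WINDOW is the core of the 5-level userspace policy: the hard limit grows with __VIRTUAL_MASK_SHIFT, while mmap() keeps returning addresses below the old 47-bit boundary unless an explicit hint asks for more. A quick arithmetic sketch (userspace C, values taken from the definitions above):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL

static void show(const char *mode, int virtual_mask_shift)
{
	uint64_t task_size_max      = (1ULL << virtual_mask_shift) - PAGE_SIZE;
	uint64_t default_map_window = (1ULL << 47) - PAGE_SIZE;

	printf("%s: TASK_SIZE_MAX=%#llx DEFAULT_MAP_WINDOW=%#llx\n",
	       mode,
	       (unsigned long long)task_size_max,
	       (unsigned long long)default_map_window);
}

int main(void)
{
	show("4-level", 47);	/* classic page tables            */
	show("5-level", 56);	/* CONFIG_X86_5LEVEL=y, LA57 CPUs */
	return 0;
}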
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 230e1903acf0..90d91520c13a 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -1,6 +1,15 @@
1#ifndef _ARCH_X86_REALMODE_H 1#ifndef _ARCH_X86_REALMODE_H
2#define _ARCH_X86_REALMODE_H 2#define _ARCH_X86_REALMODE_H
3 3
4/*
5 * Flag bit definitions for use with the flags field of the trampoline header
6 * in the CONFIG_X86_64 variant.
7 */
8#define TH_FLAGS_SME_ACTIVE_BIT 0
9#define TH_FLAGS_SME_ACTIVE BIT(TH_FLAGS_SME_ACTIVE_BIT)
10
11#ifndef __ASSEMBLY__
12
4#include <linux/types.h> 13#include <linux/types.h>
5#include <asm/io.h> 14#include <asm/io.h>
6 15
@@ -38,6 +47,7 @@ struct trampoline_header {
38 u64 start; 47 u64 start;
39 u64 efer; 48 u64 efer;
40 u32 cr4; 49 u32 cr4;
50 u32 flags;
41#endif 51#endif
42}; 52};
43 53
@@ -69,4 +79,6 @@ static inline size_t real_mode_size_needed(void)
69void set_real_mode_mem(phys_addr_t mem, size_t size); 79void set_real_mode_mem(phys_addr_t mem, size_t size);
70void reserve_real_mode(void); 80void reserve_real_mode(void);
71 81
82#endif /* __ASSEMBLY__ */
83
72#endif /* _ARCH_X86_REALMODE_H */ 84#endif /* _ARCH_X86_REALMODE_H */

diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index eaec6c364e42..cd71273ec49d 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -11,6 +11,7 @@
11 * Executability : eXecutable, NoteXecutable 11 * Executability : eXecutable, NoteXecutable
12 * Read/Write : ReadOnly, ReadWrite 12 * Read/Write : ReadOnly, ReadWrite
13 * Presence : NotPresent 13 * Presence : NotPresent
14 * Encryption : Encrypted, Decrypted
14 * 15 *
15 * Within a category, the attributes are mutually exclusive. 16 * Within a category, the attributes are mutually exclusive.
16 * 17 *
@@ -42,6 +43,8 @@ int set_memory_wt(unsigned long addr, int numpages);
42int set_memory_wb(unsigned long addr, int numpages); 43int set_memory_wb(unsigned long addr, int numpages);
43int set_memory_np(unsigned long addr, int numpages); 44int set_memory_np(unsigned long addr, int numpages);
44int set_memory_4k(unsigned long addr, int numpages); 45int set_memory_4k(unsigned long addr, int numpages);
46int set_memory_encrypted(unsigned long addr, int numpages);
47int set_memory_decrypted(unsigned long addr, int numpages);
45 48
46int set_memory_array_uc(unsigned long *addr, int addrinarray); 49int set_memory_array_uc(unsigned long *addr, int addrinarray);
47int set_memory_array_wc(unsigned long *addr, int addrinarray); 50int set_memory_array_wc(unsigned long *addr, int addrinarray);
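A kernel-style usage sketch of the new encryption attribute calls (the helper names here are hypothetical, not part of this series): a page that must be shared in the clear with a device gets its C-bit cleared after allocation and restored again before it is freed.

#include <linux/gfp.h>
#include <asm/set_memory.h>

static void *shared_page_alloc(void)
{
	unsigned long addr = __get_free_page(GFP_KERNEL);

	if (!addr)
		return NULL;

	/* Clear the C-bit so the device sees plaintext. */
	if (set_memory_decrypted(addr, 1)) {
		free_page(addr);
		return NULL;
	}

	return (void *)addr;
}

static void shared_page_free(void *page)
{
	unsigned long addr = (unsigned long)page;

	/* Restore the C-bit before handing the page back. */
	set_memory_encrypted(addr, 1);
	free_page(addr);
}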
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index c7797307fc2b..79a4ca6a9606 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -15,4 +15,18 @@
15 15
16#include <asm-generic/tlb.h> 16#include <asm-generic/tlb.h>
17 17
18/*
19 * While the x86 architecture in general requires an IPI to perform TLB
20 * shootdown, enablement code for several hypervisors overrides
21 * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing
22 * a hypercall. To keep software pagetable walkers safe in this case we
23 * switch to RCU based table free (HAVE_RCU_TABLE_FREE). See the comment
24 * below 'ifdef CONFIG_HAVE_RCU_TABLE_FREE' in include/asm-generic/tlb.h
25 * for more details.
26 */
27static inline void __tlb_remove_table(void *table)
28{
29 free_page_and_swap_cache(table);
30}
31
18#endif /* _ASM_X86_TLB_H */ 32#endif /* _ASM_X86_TLB_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 50ea3482e1d1..d23e61dc0640 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void)
57 __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); 57 __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
58} 58}
59 59
60static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
61{
62 u64 new_tlb_gen;
63
64 /*
65 * Bump the generation count. This also serves as a full barrier
66 * that synchronizes with switch_mm(): callers are required to order
67 * their read of mm_cpumask after their writes to the paging
68 * structures.
69 */
70 smp_mb__before_atomic();
71 new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen);
72 smp_mb__after_atomic();
73
74 return new_tlb_gen;
75}
76
60#ifdef CONFIG_PARAVIRT 77#ifdef CONFIG_PARAVIRT
61#include <asm/paravirt.h> 78#include <asm/paravirt.h>
62#else 79#else
@@ -65,6 +82,17 @@ static inline void invpcid_flush_all_nonglobals(void)
65#define __flush_tlb_single(addr) __native_flush_tlb_single(addr) 82#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
66#endif 83#endif
67 84
85/*
86 * 6 because 6 should be plenty and struct tlb_state will fit in
87 * two cache lines.
88 */
89#define TLB_NR_DYN_ASIDS 6
90
91struct tlb_context {
92 u64 ctx_id;
93 u64 tlb_gen;
94};
95
68struct tlb_state { 96struct tlb_state {
69 /* 97 /*
70 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts 98 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
@@ -73,13 +101,35 @@ struct tlb_state {
73 * mode even if we've already switched back to swapper_pg_dir. 101 * mode even if we've already switched back to swapper_pg_dir.
74 */ 102 */
75 struct mm_struct *loaded_mm; 103 struct mm_struct *loaded_mm;
76 int state; 104 u16 loaded_mm_asid;
105 u16 next_asid;
77 106
78 /* 107 /*
79 * Access to this CR4 shadow and to H/W CR4 is protected by 108 * Access to this CR4 shadow and to H/W CR4 is protected by
80 * disabling interrupts when modifying either one. 109 * disabling interrupts when modifying either one.
81 */ 110 */
82 unsigned long cr4; 111 unsigned long cr4;
112
113 /*
114 * This is a list of all contexts that might exist in the TLB.
115 * There is one per ASID that we use, and the ASID (what the
116 * CPU calls PCID) is the index into ctxts.
117 *
118 * For each context, ctx_id indicates which mm the TLB's user
119 * entries came from. As an invariant, the TLB will never
120 * contain entries that are out-of-date with respect to the
121 * tlb_gen that mm had reached, as recorded in the list.
122 *
123 * To be clear, this means that it's legal for the TLB code to
124 * flush the TLB without updating tlb_gen. This can happen
125 * (for now, at least) due to paravirt remote flushes.
126 *
127 * NB: context 0 is a bit special, since it's also used by
128 * various bits of init code. This is fine -- code that
129 * isn't aware of PCID will end up harmlessly flushing
130 * context 0.
131 */
132 struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
83}; 133};
84DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); 134DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
85 135
@@ -207,6 +257,14 @@ static inline void __flush_tlb_all(void)
207 __flush_tlb_global(); 257 __flush_tlb_global();
208 else 258 else
209 __flush_tlb(); 259 __flush_tlb();
260
261 /*
262 * Note: if we somehow had PCID but not PGE, then this wouldn't work --
263 * we'd end up flushing kernel translations for the current ASID but
264 * we might fail to flush kernel translations for other cached ASIDs.
265 *
266 * To avoid this issue, we force PCID off if PGE is off.
267 */
210} 268}
211 269
212static inline void __flush_tlb_one(unsigned long addr) 270static inline void __flush_tlb_one(unsigned long addr)
@@ -231,9 +289,26 @@ static inline void __flush_tlb_one(unsigned long addr)
231 * and page-granular flushes are available only on i486 and up. 289 * and page-granular flushes are available only on i486 and up.
232 */ 290 */
233struct flush_tlb_info { 291struct flush_tlb_info {
234 struct mm_struct *mm; 292 /*
235 unsigned long start; 293 * We support several kinds of flushes.
236 unsigned long end; 294 *
295 * - Fully flush a single mm. .mm will be set, .end will be
296 * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
297 * which the IPI sender is trying to catch us up.
298 *
299 * - Partially flush a single mm. .mm will be set, .start and
300 * .end will indicate the range, and .new_tlb_gen will be set
301 * such that the changes between generation .new_tlb_gen-1 and
302 * .new_tlb_gen are entirely contained in the indicated range.
303 *
304 * - Fully flush all mms whose tlb_gens have been updated. .mm
305 * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
306 * will be zero.
307 */
308 struct mm_struct *mm;
309 unsigned long start;
310 unsigned long end;
311 u64 new_tlb_gen;
237}; 312};
238 313
239#define local_flush_tlb() __flush_tlb() 314#define local_flush_tlb() __flush_tlb()
@@ -256,12 +331,10 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
256void native_flush_tlb_others(const struct cpumask *cpumask, 331void native_flush_tlb_others(const struct cpumask *cpumask,
257 const struct flush_tlb_info *info); 332 const struct flush_tlb_info *info);
258 333
259#define TLBSTATE_OK 1
260#define TLBSTATE_LAZY 2
261
262static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, 334static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
263 struct mm_struct *mm) 335 struct mm_struct *mm)
264{ 336{
337 inc_mm_tlb_gen(mm);
265 cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); 338 cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
266} 339}
267 340
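The invariant that ctxs[] maintains is worth spelling out: a cached ASID only needs a flush when it belongs to the mm in question and its recorded generation lags that mm's tlb_gen. A small userspace model of that check (not kernel code; the helper name is hypothetical):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct tlb_context { uint64_t ctx_id; uint64_t tlb_gen; };

static bool needs_flush(const struct tlb_context *slot,
			uint64_t mm_ctx_id, uint64_t mm_tlb_gen)
{
	return slot->ctx_id == mm_ctx_id && slot->tlb_gen < mm_tlb_gen;
}

int main(void)
{
	struct tlb_context slot = { .ctx_id = 7, .tlb_gen = 3 };

	printf("%d\n", needs_flush(&slot, 7, 4));  /* 1: stale, flush */
	slot.tlb_gen = 4;
	printf("%d\n", needs_flush(&slot, 7, 4));  /* 0: up to date   */
	printf("%d\n", needs_flush(&slot, 9, 4));  /* 0: different mm */
	return 0;
}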
diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
index c4b9dc2f67c5..9f42beefc67a 100644
--- a/arch/x86/include/asm/vga.h
+++ b/arch/x86/include/asm/vga.h
@@ -7,12 +7,24 @@
7#ifndef _ASM_X86_VGA_H 7#ifndef _ASM_X86_VGA_H
8#define _ASM_X86_VGA_H 8#define _ASM_X86_VGA_H
9 9
10#include <asm/set_memory.h>
11
10/* 12/*
11 * On the PC, we can just recalculate addresses and then 13 * On the PC, we can just recalculate addresses and then
12 * access the videoram directly without any black magic. 14 * access the videoram directly without any black magic.
15 * To support memory encryption, however, we need to access
16 * the videoram as decrypted memory.
13 */ 17 */
14 18
15#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x) 19#define VGA_MAP_MEM(x, s) \
20({ \
21 unsigned long start = (unsigned long)phys_to_virt(x); \
22 \
23 if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) \
24 set_memory_decrypted(start, (s) >> PAGE_SHIFT); \
25 \
26 start; \
27})
16 28
17#define vga_readb(x) (*(x)) 29#define vga_readb(x) (*(x))
18#define vga_writeb(x, y) (*(y) = (x)) 30#define vga_writeb(x, y) (*(y) = (x))
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7491e73d9253..97bb2caf3428 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -115,7 +115,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
115#define ACPI_INVALID_GSI INT_MIN 115#define ACPI_INVALID_GSI INT_MIN
116 116
117/* 117/*
118 * This is just a simple wrapper around early_ioremap(), 118 * This is just a simple wrapper around early_memremap(),
119 * with sanity checks for phys == 0 and size == 0. 119 * with sanity checks for phys == 0 and size == 0.
120 */ 120 */
121char *__init __acpi_map_table(unsigned long phys, unsigned long size) 121char *__init __acpi_map_table(unsigned long phys, unsigned long size)
@@ -124,7 +124,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
124 if (!phys || !size) 124 if (!phys || !size)
125 return NULL; 125 return NULL;
126 126
127 return early_ioremap(phys, size); 127 return early_memremap(phys, size);
128} 128}
129 129
130void __init __acpi_unmap_table(char *map, unsigned long size) 130void __init __acpi_unmap_table(char *map, unsigned long size)
@@ -132,7 +132,7 @@ void __init __acpi_unmap_table(char *map, unsigned long size)
132 if (!map || !size) 132 if (!map || !size)
133 return; 133 return;
134 134
135 early_iounmap(map, size); 135 early_memunmap(map, size);
136} 136}
137 137
138#ifdef CONFIG_X86_LOCAL_APIC 138#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e44338dd62dd..9862e2cd6d93 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -558,8 +558,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
558 558
559static void early_init_amd(struct cpuinfo_x86 *c) 559static void early_init_amd(struct cpuinfo_x86 *c)
560{ 560{
561 u32 dummy;
562
561 early_init_amd_mc(c); 563 early_init_amd_mc(c);
562 564
565 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
566
563 /* 567 /*
564 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate 568 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
565 * with P/T states and does not stop in deep C-states 569 * with P/T states and does not stop in deep C-states
@@ -622,6 +626,27 @@ static void early_init_amd(struct cpuinfo_x86 *c)
622 */ 626 */
623 if (cpu_has_amd_erratum(c, amd_erratum_400)) 627 if (cpu_has_amd_erratum(c, amd_erratum_400))
624 set_cpu_bug(c, X86_BUG_AMD_E400); 628 set_cpu_bug(c, X86_BUG_AMD_E400);
629
630 /*
631 * BIOS support is required for SME. If BIOS has enabled SME then
632 * adjust x86_phys_bits by the SME physical address space reduction
633 * value. If BIOS has not enabled SME then don't advertise the
634 * feature (set in scattered.c). Also, since the SME support requires
635 * long mode, don't advertise the feature under CONFIG_X86_32.
636 */
637 if (cpu_has(c, X86_FEATURE_SME)) {
638 u64 msr;
639
640 /* Check if SME is enabled */
641 rdmsrl(MSR_K8_SYSCFG, msr);
642 if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) {
643 c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
644 if (IS_ENABLED(CONFIG_X86_32))
645 clear_cpu_cap(c, X86_FEATURE_SME);
646 } else {
647 clear_cpu_cap(c, X86_FEATURE_SME);
648 }
649 }
625} 650}
626 651
627static void init_amd_k8(struct cpuinfo_x86 *c) 652static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -740,8 +765,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
740 765
741static void init_amd(struct cpuinfo_x86 *c) 766static void init_amd(struct cpuinfo_x86 *c)
742{ 767{
743 u32 dummy;
744
745 early_init_amd(c); 768 early_init_amd(c);
746 769
747 /* 770 /*
@@ -803,8 +826,6 @@ static void init_amd(struct cpuinfo_x86 *c)
803 if (c->x86 > 0x11) 826 if (c->x86 > 0x11)
804 set_cpu_cap(c, X86_FEATURE_ARAT); 827 set_cpu_cap(c, X86_FEATURE_ARAT);
805 828
806 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
807
808 /* 3DNow or LM implies PREFETCHW */ 829 /* 3DNow or LM implies PREFETCHW */
809 if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) 830 if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
810 if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) 831 if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
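The early_init_amd() hunk decodes CPUID Fn8000_001F: EBX[5:0] gives the C-bit position and EBX[11:6] the physical-address-bit reduction that is subtracted from x86_phys_bits. A standalone sketch with a made-up leaf value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t ebx = 0x0000016f;			/* hypothetical CPUID output */
	unsigned int cbit_pos = ebx & 0x3f;		/* bits 5:0  */
	unsigned int phys_red = (ebx >> 6) & 0x3f;	/* bits 11:6 */

	printf("C-bit position: %u\n", cbit_pos);
	printf("x86_phys_bits reduction: %u\n", phys_red);
	return 0;
}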
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 0af86d9242da..db684880d74a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -21,6 +21,14 @@
21 21
22void __init check_bugs(void) 22void __init check_bugs(void)
23{ 23{
24#ifdef CONFIG_X86_32
25 /*
26 * Regardless of whether PCID is enumerated, the SDM says
27 * that it can't be enabled in 32-bit mode.
28 */
29 setup_clear_cpu_cap(X86_FEATURE_PCID);
30#endif
31
24 identify_boot_cpu(); 32 identify_boot_cpu();
25 33
26 if (!IS_ENABLED(CONFIG_SMP)) { 34 if (!IS_ENABLED(CONFIG_SMP)) {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c8b39870f33e..b95cd94ca97b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s)
168} 168}
169__setup("nompx", x86_mpx_setup); 169__setup("nompx", x86_mpx_setup);
170 170
171#ifdef CONFIG_X86_64
172static int __init x86_pcid_setup(char *s)
173{
174 /* require an exact match without trailing characters */
175 if (strlen(s))
176 return 0;
177
178 /* do not emit a message if the feature is not present */
179 if (!boot_cpu_has(X86_FEATURE_PCID))
180 return 1;
181
182 setup_clear_cpu_cap(X86_FEATURE_PCID);
183 pr_info("nopcid: PCID feature disabled\n");
184 return 1;
185}
186__setup("nopcid", x86_pcid_setup);
187#endif
188
171static int __init x86_noinvpcid_setup(char *s) 189static int __init x86_noinvpcid_setup(char *s)
172{ 190{
173 /* noinvpcid doesn't accept parameters */ 191 /* noinvpcid doesn't accept parameters */
@@ -311,6 +329,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
311 } 329 }
312} 330}
313 331
332static void setup_pcid(struct cpuinfo_x86 *c)
333{
334 if (cpu_has(c, X86_FEATURE_PCID)) {
335 if (cpu_has(c, X86_FEATURE_PGE)) {
336 cr4_set_bits(X86_CR4_PCIDE);
337 } else {
338 /*
339 * flush_tlb_all(), as currently implemented, won't
340 * work if PCID is on but PGE is not. Since that
341 * combination doesn't exist on real hardware, there's
342 * no reason to try to fully support it, but it's
343 * polite to avoid corrupting data if we're on
344 * an improperly configured VM.
345 */
346 clear_cpu_cap(c, X86_FEATURE_PCID);
347 }
348 }
349}
350
314/* 351/*
315 * Protection Keys are not available in 32-bit mode. 352 * Protection Keys are not available in 32-bit mode.
316 */ 353 */
@@ -1125,6 +1162,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
1125 setup_smep(c); 1162 setup_smep(c);
1126 setup_smap(c); 1163 setup_smap(c);
1127 1164
1165 /* Set up PCID */
1166 setup_pcid(c);
1167
1128 /* 1168 /*
1129 * The vendor-specific functions might have changed features. 1169 * The vendor-specific functions might have changed features.
1130 * Now we do "generic changes." 1170 * Now we do "generic changes."
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 6dde0497efc7..3b413065c613 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -51,6 +51,7 @@
51#include <asm/mce.h> 51#include <asm/mce.h>
52#include <asm/msr.h> 52#include <asm/msr.h>
53#include <asm/reboot.h> 53#include <asm/reboot.h>
54#include <asm/set_memory.h>
54 55
55#include "mce-internal.h" 56#include "mce-internal.h"
56 57
@@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m)
1051 return ret; 1052 return ret;
1052} 1053}
1053 1054
1055#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
1056
1057void arch_unmap_kpfn(unsigned long pfn)
1058{
1059 unsigned long decoy_addr;
1060
1061 /*
1062 * Unmap this page from the kernel 1:1 mappings to make sure
1063 * we don't log more errors because of speculative access to
1064 * the page.
1065 * We would like to just call:
1066 * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
1067 * but doing that would radically increase the odds of a
1068 * speculative access to the poison page because we'd have
1069 * the virtual address of the kernel 1:1 mapping sitting
1070 * around in registers.
1071 * Instead we get tricky. We create a non-canonical address
1072 * that looks just like the one we want, but has bit 63 flipped.
1073 * This relies on set_memory_np() not checking whether we passed
1074 * a legal address.
1075 */
1076
1077/*
1078 * Build time check to see if we have a spare virtual bit. Don't want
1079 * to leave this until run time because most developers don't have a
1080 * system that can exercise this code path. This will only become a
1081 * problem if/when we move beyond 5-level page tables.
1082 *
1083 * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
1084 */
1085#if PGDIR_SHIFT + 9 < 63
1086 decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
1087#else
1088#error "no unused virtual bit available"
1089#endif
1090
1091 if (set_memory_np(decoy_addr, 1))
1092 pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
1093
1094}
1095#endif
1096
1054/* 1097/*
1055 * The actual machine check handler. This only handles real 1098 * The actual machine check handler. This only handles real
1056 * exceptions when something got corrupted coming in through int 18. 1099 * exceptions when something got corrupted coming in through int 18.
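The decoy-address trick above boils down to flipping bit 63 of the direct-map virtual address, which yields a non-canonical alias that set_memory_np() still resolves to the right pfn while the canonical address never sits in a register. A standalone sketch of the arithmetic (the PAGE_OFFSET and pfn values are assumptions for illustration; this is the 4-level direct-map base):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT  12
#define PAGE_OFFSET 0xffff880000000000ull	/* assumed direct-map base */

int main(void)
{
	uint64_t pfn   = 0x123456;		/* made-up poison pfn */
	uint64_t vaddr = (pfn << PAGE_SHIFT) + PAGE_OFFSET;
	uint64_t decoy = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ (1ULL << 63));

	printf("1:1 vaddr : %#llx\n", (unsigned long long)vaddr);
	printf("decoy     : %#llx\n", (unsigned long long)decoy);
	return 0;
}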
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 23c23508c012..05459ad3db46 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -31,6 +31,7 @@ static const struct cpuid_bit cpuid_bits[] = {
31 { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, 31 { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
32 { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, 32 { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
33 { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, 33 { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
34 { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
34 { 0, 0, 0, 0, 0 } 35 { 0, 0, 0, 0, 0 }
35}; 36};
36 37
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 532da61d605c..71c11ad5643e 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -96,7 +96,8 @@ EXPORT_SYMBOL_GPL(e820__mapped_any);
96 * Note: this function only works correctly once the E820 table is sorted and 96 * Note: this function only works correctly once the E820 table is sorted and
97 * not-overlapping (at least for the range specified), which is the case normally. 97 * not-overlapping (at least for the range specified), which is the case normally.
98 */ 98 */
99bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) 99static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
100 enum e820_type type)
100{ 101{
101 int i; 102 int i;
102 103
@@ -122,9 +123,28 @@ bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
122 * coverage of the desired range exists: 123 * coverage of the desired range exists:
123 */ 124 */
124 if (start >= end) 125 if (start >= end)
125 return 1; 126 return entry;
126 } 127 }
127 return 0; 128
129 return NULL;
130}
131
132/*
133 * This function checks if the entire range <start,end> is mapped with type.
134 */
135bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
136{
137 return __e820__mapped_all(start, end, type);
138}
139
140/*
141 * This function returns the type associated with the range <start,end>.
142 */
143int e820__get_entry_type(u64 start, u64 end)
144{
145 struct e820_entry *entry = __e820__mapped_all(start, end, 0);
146
147 return entry ? entry->type : -EINVAL;
128} 148}
129 149
130/* 150/*
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 6b91e2eb8d3f..9c4e7ba6870c 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -195,7 +195,7 @@ void init_espfix_ap(int cpu)
195 195
196 pte_p = pte_offset_kernel(&pmd, addr); 196 pte_p = pte_offset_kernel(&pmd, addr);
197 stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); 197 stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
198 pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); 198 pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask));
199 for (n = 0; n < ESPFIX_PTE_CLONES; n++) 199 for (n = 0; n < ESPFIX_PTE_CLONES; n++)
200 set_pte(&pte_p[n*PTE_STRIDE], pte); 200 set_pte(&pte_p[n*PTE_STRIDE], pte);
201 201
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 9ba79543d9ee..6a193b93fd95 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -14,6 +14,7 @@
14#include <linux/start_kernel.h> 14#include <linux/start_kernel.h>
15#include <linux/io.h> 15#include <linux/io.h>
16#include <linux/memblock.h> 16#include <linux/memblock.h>
17#include <linux/mem_encrypt.h>
17 18
18#include <asm/processor.h> 19#include <asm/processor.h>
19#include <asm/proto.h> 20#include <asm/proto.h>
@@ -33,7 +34,6 @@
33/* 34/*
34 * Manage page tables very early on. 35 * Manage page tables very early on.
35 */ 36 */
36extern pgd_t early_top_pgt[PTRS_PER_PGD];
37extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; 37extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
38static unsigned int __initdata next_early_pgt; 38static unsigned int __initdata next_early_pgt;
39pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); 39pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
@@ -45,9 +45,11 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
45 return ptr - (void *)_text + (void *)physaddr; 45 return ptr - (void *)_text + (void *)physaddr;
46} 46}
47 47
48void __head __startup_64(unsigned long physaddr) 48unsigned long __head __startup_64(unsigned long physaddr,
49 struct boot_params *bp)
49{ 50{
50 unsigned long load_delta, *p; 51 unsigned long load_delta, *p;
52 unsigned long pgtable_flags;
51 pgdval_t *pgd; 53 pgdval_t *pgd;
52 p4dval_t *p4d; 54 p4dval_t *p4d;
53 pudval_t *pud; 55 pudval_t *pud;
@@ -69,6 +71,12 @@ void __head __startup_64(unsigned long physaddr)
69 if (load_delta & ~PMD_PAGE_MASK) 71 if (load_delta & ~PMD_PAGE_MASK)
70 for (;;); 72 for (;;);
71 73
74 /* Activate Secure Memory Encryption (SME) if supported and enabled */
75 sme_enable(bp);
76
77 /* Include the SME encryption mask in the fixup value */
78 load_delta += sme_get_me_mask();
79
72 /* Fixup the physical addresses in the page table */ 80 /* Fixup the physical addresses in the page table */
73 81
74 pgd = fixup_pointer(&early_top_pgt, physaddr); 82 pgd = fixup_pointer(&early_top_pgt, physaddr);
@@ -92,31 +100,35 @@ void __head __startup_64(unsigned long physaddr)
92 * creates a bunch of nonsense entries but that is fine -- 100 * creates a bunch of nonsense entries but that is fine --
93 * it avoids problems around wraparound. 101 * it avoids problems around wraparound.
94 */ 102 */
103
95 next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); 104 next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
96 pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); 105 pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
97 pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); 106 pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
98 107
108 pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
109
99 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 110 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
100 p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); 111 p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
101 112
102 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; 113 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
103 pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE; 114 pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
104 pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE; 115 pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
105 116
106 i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; 117 i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
107 p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; 118 p4d[i + 0] = (pgdval_t)pud + pgtable_flags;
108 p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; 119 p4d[i + 1] = (pgdval_t)pud + pgtable_flags;
109 } else { 120 } else {
110 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; 121 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
111 pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; 122 pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
112 pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; 123 pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
113 } 124 }
114 125
115 i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; 126 i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
116 pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE; 127 pud[i + 0] = (pudval_t)pmd + pgtable_flags;
117 pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE; 128 pud[i + 1] = (pudval_t)pmd + pgtable_flags;
118 129
119 pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; 130 pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
131 pmd_entry += sme_get_me_mask();
120 pmd_entry += physaddr; 132 pmd_entry += physaddr;
121 133
122 for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { 134 for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
@@ -137,9 +149,30 @@ void __head __startup_64(unsigned long physaddr)
137 pmd[i] += load_delta; 149 pmd[i] += load_delta;
138 } 150 }
139 151
140 /* Fixup phys_base */ 152 /*
153 * Fixup phys_base - remove the memory encryption mask to obtain
154 * the true physical address.
155 */
141 p = fixup_pointer(&phys_base, physaddr); 156 p = fixup_pointer(&phys_base, physaddr);
142 *p += load_delta; 157 *p += load_delta - sme_get_me_mask();
158
159 /* Encrypt the kernel (if SME is active) */
160 sme_encrypt_kernel();
161
162 /*
163 * Return the SME encryption mask (if SME is active) to be used as a
164 * modifier for the initial pgdir entry programmed into CR3.
165 */
166 return sme_get_me_mask();
167}
168
169unsigned long __startup_secondary_64(void)
170{
171 /*
172 * Return the SME encryption mask (if SME is active) to be used as a
173 * modifier for the initial pgdir entry programmed into CR3.
174 */
175 return sme_get_me_mask();
143} 176}
144 177
145/* Wipe all early page tables except for the kernel symbol map */ 178/* Wipe all early page tables except for the kernel symbol map */
@@ -147,17 +180,17 @@ static void __init reset_early_page_tables(void)
147{ 180{
148 memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); 181 memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
149 next_early_pgt = 0; 182 next_early_pgt = 0;
150 write_cr3(__pa_nodebug(early_top_pgt)); 183 write_cr3(__sme_pa_nodebug(early_top_pgt));
151} 184}
152 185
153/* Create a new PMD entry */ 186/* Create a new PMD entry */
154int __init early_make_pgtable(unsigned long address) 187int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
155{ 188{
156 unsigned long physaddr = address - __PAGE_OFFSET; 189 unsigned long physaddr = address - __PAGE_OFFSET;
157 pgdval_t pgd, *pgd_p; 190 pgdval_t pgd, *pgd_p;
158 p4dval_t p4d, *p4d_p; 191 p4dval_t p4d, *p4d_p;
159 pudval_t pud, *pud_p; 192 pudval_t pud, *pud_p;
160 pmdval_t pmd, *pmd_p; 193 pmdval_t *pmd_p;
161 194
162 /* Invalid address or early pgt is done ? */ 195 /* Invalid address or early pgt is done ? */
163 if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) 196 if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
@@ -216,12 +249,21 @@ again:
216 memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); 249 memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
217 *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; 250 *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
218 } 251 }
219 pmd = (physaddr & PMD_MASK) + early_pmd_flags;
220 pmd_p[pmd_index(address)] = pmd; 252 pmd_p[pmd_index(address)] = pmd;
221 253
222 return 0; 254 return 0;
223} 255}
224 256
257int __init early_make_pgtable(unsigned long address)
258{
259 unsigned long physaddr = address - __PAGE_OFFSET;
260 pmdval_t pmd;
261
262 pmd = (physaddr & PMD_MASK) + early_pmd_flags;
263
264 return __early_make_pgtable(address, pmd);
265}
266
225/* Don't add a printk in there. printk relies on the PDA which is not initialized 267/* Don't add a printk in there. printk relies on the PDA which is not initialized
226 yet. */ 268 yet. */
227static void __init clear_bss(void) 269static void __init clear_bss(void)
@@ -244,6 +286,12 @@ static void __init copy_bootdata(char *real_mode_data)
244 char * command_line; 286 char * command_line;
245 unsigned long cmd_line_ptr; 287 unsigned long cmd_line_ptr;
246 288
289 /*
290 * If SME is active, this will create decrypted mappings of the
291 * boot data in advance of the copy operations.
292 */
293 sme_map_bootdata(real_mode_data);
294
247 memcpy(&boot_params, real_mode_data, sizeof boot_params); 295 memcpy(&boot_params, real_mode_data, sizeof boot_params);
248 sanitize_boot_params(&boot_params); 296 sanitize_boot_params(&boot_params);
249 cmd_line_ptr = get_cmd_line_ptr(); 297 cmd_line_ptr = get_cmd_line_ptr();
@@ -251,6 +299,14 @@ static void __init copy_bootdata(char *real_mode_data)
251 command_line = __va(cmd_line_ptr); 299 command_line = __va(cmd_line_ptr);
252 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); 300 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
253 } 301 }
302
303 /*
304 * The old boot data is no longer needed and won't be reserved,
305 * freeing up that memory for use by the system. If SME is active,
306 * we need to remove the mappings that were created so that the
307 * memory doesn't remain mapped as decrypted.
308 */
309 sme_unmap_bootdata(real_mode_data);
254} 310}
255 311
256asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) 312asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
@@ -280,6 +336,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
280 336
281 clear_page(init_top_pgt); 337 clear_page(init_top_pgt);
282 338
339 /*
340 * SME support may update early_pmd_flags to include the memory
341 * encryption mask, so it needs to be called before anything
342 * that may generate a page fault.
343 */
344 sme_early_init();
345
283 kasan_early_init(); 346 kasan_early_init();
284 347
285 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) 348 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
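The sign conventions in __startup_64() are subtle enough to merit a toy calculation: the SME mask rides along inside load_delta so that every fixed-up page-table entry picks up the C-bit, and it is then subtracted back out of phys_base, which must remain a true physical address. All numbers below are made up (standalone sketch, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sme_me_mask = 1ULL << 47;	/* assumed C-bit, SME on */
	uint64_t compiled_at = 0x01000000;	/* link-time physical    */
	uint64_t loaded_at   = 0x03000000;	/* boot-time physical    */

	uint64_t load_delta = loaded_at - compiled_at + sme_me_mask;

	/* Page-table fixups add load_delta, so they gain the C-bit ... */
	uint64_t pmd_fixup = compiled_at + load_delta;
	/* ... while phys_base has the mask removed again. */
	uint64_t phys_base = 0 + load_delta - sme_me_mask;

	printf("pmd entry -> %#llx (C-bit set)\n",
	       (unsigned long long)pmd_fixup);
	printf("phys_base -> %#llx (plain physical delta)\n",
	       (unsigned long long)phys_base);
	return 0;
}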
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6225550883df..513cbb012ecc 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -73,12 +73,19 @@ startup_64:
73 /* Sanitize CPU configuration */ 73 /* Sanitize CPU configuration */
74 call verify_cpu 74 call verify_cpu
75 75
76 /*
77 * Perform pagetable fixups. Additionally, if SME is active, encrypt
78 * the kernel and retrieve the resulting modifier (the SME encryption
79 * mask) to be added to the initial pgdir entry that will be
80 * programmed into CR3.
81 */
76 leaq _text(%rip), %rdi 82 leaq _text(%rip), %rdi
77 pushq %rsi 83 pushq %rsi
78 call __startup_64 84 call __startup_64
79 popq %rsi 85 popq %rsi
80 86
81 movq $(early_top_pgt - __START_KERNEL_map), %rax 87 /* Form the CR3 value being sure to include the CR3 modifier */
88 addq $(early_top_pgt - __START_KERNEL_map), %rax
82 jmp 1f 89 jmp 1f
83ENTRY(secondary_startup_64) 90ENTRY(secondary_startup_64)
84 /* 91 /*
@@ -98,7 +105,16 @@ ENTRY(secondary_startup_64)
98 /* Sanitize CPU configuration */ 105 /* Sanitize CPU configuration */
99 call verify_cpu 106 call verify_cpu
100 107
101 movq $(init_top_pgt - __START_KERNEL_map), %rax 108 /*
109 * Retrieve the modifier (SME encryption mask if SME is active) to be
110 * added to the initial pgdir entry that will be programmed into CR3.
111 */
112 pushq %rsi
113 call __startup_secondary_64
114 popq %rsi
115
116 /* Form the CR3 value being sure to include the CR3 modifier */
117 addq $(init_top_pgt - __START_KERNEL_map), %rax
1021: 1181:
103 119
104 /* Enable PAE mode, PGE and LA57 */ 120 /* Enable PAE mode, PGE and LA57 */
@@ -335,9 +351,9 @@ GLOBAL(name)
335NEXT_PAGE(early_top_pgt) 351NEXT_PAGE(early_top_pgt)
336 .fill 511,8,0 352 .fill 511,8,0
337#ifdef CONFIG_X86_5LEVEL 353#ifdef CONFIG_X86_5LEVEL
338 .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 354 .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
339#else 355#else
340 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 356 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
341#endif 357#endif
342 358
343NEXT_PAGE(early_dynamic_pgts) 359NEXT_PAGE(early_dynamic_pgts)
@@ -350,15 +366,15 @@ NEXT_PAGE(init_top_pgt)
350 .fill 512,8,0 366 .fill 512,8,0
351#else 367#else
352NEXT_PAGE(init_top_pgt) 368NEXT_PAGE(init_top_pgt)
353 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 369 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
354 .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 370 .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
355 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 371 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
356 .org init_top_pgt + PGD_START_KERNEL*8, 0 372 .org init_top_pgt + PGD_START_KERNEL*8, 0
357 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 373 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
358 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 374 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
359 375
360NEXT_PAGE(level3_ident_pgt) 376NEXT_PAGE(level3_ident_pgt)
361 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 377 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
362 .fill 511, 8, 0 378 .fill 511, 8, 0
363NEXT_PAGE(level2_ident_pgt) 379NEXT_PAGE(level2_ident_pgt)
364 /* Since I easily can, map the first 1G. 380 /* Since I easily can, map the first 1G.
@@ -370,14 +386,14 @@ NEXT_PAGE(level2_ident_pgt)
370#ifdef CONFIG_X86_5LEVEL 386#ifdef CONFIG_X86_5LEVEL
371NEXT_PAGE(level4_kernel_pgt) 387NEXT_PAGE(level4_kernel_pgt)
372 .fill 511,8,0 388 .fill 511,8,0
373 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 389 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
374#endif 390#endif
375 391
376NEXT_PAGE(level3_kernel_pgt) 392NEXT_PAGE(level3_kernel_pgt)
377 .fill L3_START_KERNEL,8,0 393 .fill L3_START_KERNEL,8,0
378 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ 394 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
379 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE 395 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
380 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE 396 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
381 397
382NEXT_PAGE(level2_kernel_pgt) 398NEXT_PAGE(level2_kernel_pgt)
383 /* 399 /*
@@ -395,7 +411,7 @@ NEXT_PAGE(level2_kernel_pgt)
395 411
396NEXT_PAGE(level2_fixmap_pgt) 412NEXT_PAGE(level2_fixmap_pgt)
397 .fill 506,8,0 413 .fill 506,8,0
398 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE 414 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
399 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ 415 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
400 .fill 5,8,0 416 .fill 5,8,0
401 417
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 38b64587b31b..fd6f8fbbe6f2 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -33,7 +33,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
33 struct setup_data_node *node = file->private_data; 33 struct setup_data_node *node = file->private_data;
34 unsigned long remain; 34 unsigned long remain;
35 loff_t pos = *ppos; 35 loff_t pos = *ppos;
36 struct page *pg;
37 void *p; 36 void *p;
38 u64 pa; 37 u64 pa;
39 38
@@ -47,18 +46,13 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
47 count = node->len - pos; 46 count = node->len - pos;
48 47
49 pa = node->paddr + sizeof(struct setup_data) + pos; 48 pa = node->paddr + sizeof(struct setup_data) + pos;
50 pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); 49 p = memremap(pa, count, MEMREMAP_WB);
51 if (PageHighMem(pg)) { 50 if (!p)
52 p = ioremap_cache(pa, count); 51 return -ENOMEM;
53 if (!p)
54 return -ENXIO;
55 } else
56 p = __va(pa);
57 52
58 remain = copy_to_user(user_buf, p, count); 53 remain = copy_to_user(user_buf, p, count);
59 54
60 if (PageHighMem(pg)) 55 memunmap(p);
61 iounmap(p);
62 56
63 if (remain) 57 if (remain)
64 return -EFAULT; 58 return -EFAULT;
@@ -109,7 +103,6 @@ static int __init create_setup_data_nodes(struct dentry *parent)
109 struct setup_data *data; 103 struct setup_data *data;
110 int error; 104 int error;
111 struct dentry *d; 105 struct dentry *d;
112 struct page *pg;
113 u64 pa_data; 106 u64 pa_data;
114 int no = 0; 107 int no = 0;
115 108
@@ -126,16 +119,12 @@ static int __init create_setup_data_nodes(struct dentry *parent)
126 goto err_dir; 119 goto err_dir;
127 } 120 }
128 121
129 pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); 122 data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
130 if (PageHighMem(pg)) { 123 if (!data) {
131 data = ioremap_cache(pa_data, sizeof(*data)); 124 kfree(node);
132 if (!data) { 125 error = -ENOMEM;
133 kfree(node); 126 goto err_dir;
134 error = -ENXIO; 127 }
135 goto err_dir;
136 }
137 } else
138 data = __va(pa_data);
139 128
140 node->paddr = pa_data; 129 node->paddr = pa_data;
141 node->type = data->type; 130 node->type = data->type;
@@ -143,8 +132,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
143 error = create_setup_data_node(d, no, node); 132 error = create_setup_data_node(d, no, node);
144 pa_data = data->next; 133 pa_data = data->next;
145 134
146 if (PageHighMem(pg)) 135 memunmap(data);
147 iounmap(data);
148 if (error) 136 if (error)
149 goto err_dir; 137 goto err_dir;
150 no++; 138 no++;
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index 06e1ff5562c0..4b0592ca9e47 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -16,8 +16,8 @@
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/io.h>
19 20
20#include <asm/io.h>
21#include <asm/setup.h> 21#include <asm/setup.h>
22 22
23static ssize_t version_show(struct kobject *kobj, 23static ssize_t version_show(struct kobject *kobj,
@@ -79,12 +79,12 @@ static int get_setup_data_paddr(int nr, u64 *paddr)
79 *paddr = pa_data; 79 *paddr = pa_data;
80 return 0; 80 return 0;
81 } 81 }
82 data = ioremap_cache(pa_data, sizeof(*data)); 82 data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
83 if (!data) 83 if (!data)
84 return -ENOMEM; 84 return -ENOMEM;
85 85
86 pa_data = data->next; 86 pa_data = data->next;
87 iounmap(data); 87 memunmap(data);
88 i++; 88 i++;
89 } 89 }
90 return -EINVAL; 90 return -EINVAL;
@@ -97,17 +97,17 @@ static int __init get_setup_data_size(int nr, size_t *size)
97 u64 pa_data = boot_params.hdr.setup_data; 97 u64 pa_data = boot_params.hdr.setup_data;
98 98
99 while (pa_data) { 99 while (pa_data) {
100 data = ioremap_cache(pa_data, sizeof(*data)); 100 data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
101 if (!data) 101 if (!data)
102 return -ENOMEM; 102 return -ENOMEM;
103 if (nr == i) { 103 if (nr == i) {
104 *size = data->len; 104 *size = data->len;
105 iounmap(data); 105 memunmap(data);
106 return 0; 106 return 0;
107 } 107 }
108 108
109 pa_data = data->next; 109 pa_data = data->next;
110 iounmap(data); 110 memunmap(data);
111 i++; 111 i++;
112 } 112 }
113 return -EINVAL; 113 return -EINVAL;
@@ -127,12 +127,12 @@ static ssize_t type_show(struct kobject *kobj,
127 ret = get_setup_data_paddr(nr, &paddr); 127 ret = get_setup_data_paddr(nr, &paddr);
128 if (ret) 128 if (ret)
129 return ret; 129 return ret;
130 data = ioremap_cache(paddr, sizeof(*data)); 130 data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
131 if (!data) 131 if (!data)
132 return -ENOMEM; 132 return -ENOMEM;
133 133
134 ret = sprintf(buf, "0x%x\n", data->type); 134 ret = sprintf(buf, "0x%x\n", data->type);
135 iounmap(data); 135 memunmap(data);
136 return ret; 136 return ret;
137} 137}
138 138
@@ -154,7 +154,7 @@ static ssize_t setup_data_data_read(struct file *fp,
154 ret = get_setup_data_paddr(nr, &paddr); 154 ret = get_setup_data_paddr(nr, &paddr);
155 if (ret) 155 if (ret)
156 return ret; 156 return ret;
157 data = ioremap_cache(paddr, sizeof(*data)); 157 data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
158 if (!data) 158 if (!data)
159 return -ENOMEM; 159 return -ENOMEM;
160 160
@@ -170,15 +170,15 @@ static ssize_t setup_data_data_read(struct file *fp,
170 goto out; 170 goto out;
171 171
172 ret = count; 172 ret = count;
173 p = ioremap_cache(paddr + sizeof(*data), data->len); 173 p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB);
174 if (!p) { 174 if (!p) {
175 ret = -ENOMEM; 175 ret = -ENOMEM;
176 goto out; 176 goto out;
177 } 177 }
178 memcpy(buf, p + off, count); 178 memcpy(buf, p + off, count);
179 iounmap(p); 179 memunmap(p);
180out: 180out:
181 iounmap(data); 181 memunmap(data);
182 return ret; 182 return ret;
183} 183}
184 184
@@ -250,13 +250,13 @@ static int __init get_setup_data_total_num(u64 pa_data, int *nr)
250 *nr = 0; 250 *nr = 0;
251 while (pa_data) { 251 while (pa_data) {
252 *nr += 1; 252 *nr += 1;
253 data = ioremap_cache(pa_data, sizeof(*data)); 253 data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
254 if (!data) { 254 if (!data) {
255 ret = -ENOMEM; 255 ret = -ENOMEM;
256 goto out; 256 goto out;
257 } 257 }
258 pa_data = data->next; 258 pa_data = data->next;
259 iounmap(data); 259 memunmap(data);
260 } 260 }
261 261
262out: 262out:
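The ioremap_cache()/__va() call sites replaced above all take the same shape once converted: memremap() with MEMREMAP_WB returns a mapping that honors the SME encryption attribute of the underlying range, and memunmap() releases it. A kernel-style sketch of that pattern (the helper function is hypothetical):

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/io.h>
#include <linux/string.h>
#include <asm/setup.h>

/* Hypothetical helper: copy one setup_data header out of boot memory. */
static int read_setup_data_header(u64 pa, struct setup_data *out)
{
	struct setup_data *data;

	data = memremap(pa, sizeof(*data), MEMREMAP_WB);
	if (!data)
		return -ENOMEM;

	memcpy(out, data, sizeof(*out));
	memunmap(data);

	return 0;
}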
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index cb0a30473c23..1f790cf9d38f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -87,7 +87,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
87 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); 87 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
88 } 88 }
89 pte = pte_offset_kernel(pmd, vaddr); 89 pte = pte_offset_kernel(pmd, vaddr);
90 set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); 90 set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
91 return 0; 91 return 0;
92err: 92err:
93 free_transition_pgtable(image); 93 free_transition_pgtable(image);
@@ -115,6 +115,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
115 .alloc_pgt_page = alloc_pgt_page, 115 .alloc_pgt_page = alloc_pgt_page,
116 .context = image, 116 .context = image,
117 .page_flag = __PAGE_KERNEL_LARGE_EXEC, 117 .page_flag = __PAGE_KERNEL_LARGE_EXEC,
118 .kernpg_flag = _KERNPG_TABLE_NOENC,
118 }; 119 };
119 unsigned long mstart, mend; 120 unsigned long mstart, mend;
120 pgd_t *level4p; 121 pgd_t *level4p;
@@ -334,7 +335,8 @@ void machine_kexec(struct kimage *image)
334 image->start = relocate_kernel((unsigned long)image->head, 335 image->start = relocate_kernel((unsigned long)image->head,
335 (unsigned long)page_list, 336 (unsigned long)page_list,
336 image->start, 337 image->start,
337 image->preserve_context); 338 image->preserve_context,
339 sme_active());
338 340
339#ifdef CONFIG_KEXEC_JUMP 341#ifdef CONFIG_KEXEC_JUMP
340 if (image->preserve_context) 342 if (image->preserve_context)
@@ -602,3 +604,22 @@ void arch_kexec_unprotect_crashkres(void)
602{ 604{
603 kexec_mark_crashkres(false); 605 kexec_mark_crashkres(false);
604} 606}
607
608int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
609{
610 /*
611 * If SME is active we need to be sure that kexec pages are
612 * not encrypted because when we boot to the new kernel the
613 * pages won't be accessed encrypted (initially).
614 */
615 return set_memory_decrypted((unsigned long)vaddr, pages);
616}
617
618void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
619{
620 /*
621 * If SME is active we need to reset the pages back to being
622 * an encrypted mapping before freeing them.
623 */
624 set_memory_encrypted((unsigned long)vaddr, pages);
625}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 0d904d759ff1..5cbb3177ed17 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -429,16 +429,16 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
429 } 429 }
430} 430}
431 431
432static struct mpf_intel *mpf_found; 432static unsigned long mpf_base;
433 433
434static unsigned long __init get_mpc_size(unsigned long physptr) 434static unsigned long __init get_mpc_size(unsigned long physptr)
435{ 435{
436 struct mpc_table *mpc; 436 struct mpc_table *mpc;
437 unsigned long size; 437 unsigned long size;
438 438
439 mpc = early_ioremap(physptr, PAGE_SIZE); 439 mpc = early_memremap(physptr, PAGE_SIZE);
440 size = mpc->length; 440 size = mpc->length;
441 early_iounmap(mpc, PAGE_SIZE); 441 early_memunmap(mpc, PAGE_SIZE);
442 apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); 442 apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
443 443
444 return size; 444 return size;
@@ -450,7 +450,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
450 unsigned long size; 450 unsigned long size;
451 451
452 size = get_mpc_size(mpf->physptr); 452 size = get_mpc_size(mpf->physptr);
453 mpc = early_ioremap(mpf->physptr, size); 453 mpc = early_memremap(mpf->physptr, size);
454
454 /* 455 /*
455 * Read the physical hardware table. Anything here will 456 * Read the physical hardware table. Anything here will
456 * override the defaults. 457 * override the defaults.
@@ -461,10 +462,10 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
461#endif 462#endif
462 pr_err("BIOS bug, MP table errors detected!...\n"); 463 pr_err("BIOS bug, MP table errors detected!...\n");
463 pr_cont("... disabling SMP support. (tell your hw vendor)\n"); 464 pr_cont("... disabling SMP support. (tell your hw vendor)\n");
464 early_iounmap(mpc, size); 465 early_memunmap(mpc, size);
465 return -1; 466 return -1;
466 } 467 }
467 early_iounmap(mpc, size); 468 early_memunmap(mpc, size);
468 469
469 if (early) 470 if (early)
470 return -1; 471 return -1;
@@ -497,12 +498,12 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
497 */ 498 */
498void __init default_get_smp_config(unsigned int early) 499void __init default_get_smp_config(unsigned int early)
499{ 500{
500 struct mpf_intel *mpf = mpf_found; 501 struct mpf_intel *mpf;
501 502
502 if (!smp_found_config) 503 if (!smp_found_config)
503 return; 504 return;
504 505
505 if (!mpf) 506 if (!mpf_base)
506 return; 507 return;
507 508
508 if (acpi_lapic && early) 509 if (acpi_lapic && early)
@@ -515,6 +516,12 @@ void __init default_get_smp_config(unsigned int early)
515 if (acpi_lapic && acpi_ioapic) 516 if (acpi_lapic && acpi_ioapic)
516 return; 517 return;
517 518
519 mpf = early_memremap(mpf_base, sizeof(*mpf));
520 if (!mpf) {
521 pr_err("MPTABLE: error mapping MP table\n");
522 return;
523 }
524
518 pr_info("Intel MultiProcessor Specification v1.%d\n", 525 pr_info("Intel MultiProcessor Specification v1.%d\n",
519 mpf->specification); 526 mpf->specification);
520#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) 527#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
@@ -529,7 +536,7 @@ void __init default_get_smp_config(unsigned int early)
529 /* 536 /*
530 * Now see if we need to read further. 537 * Now see if we need to read further.
531 */ 538 */
532 if (mpf->feature1 != 0) { 539 if (mpf->feature1) {
533 if (early) { 540 if (early) {
534 /* 541 /*
535 * local APIC has default address 542 * local APIC has default address
@@ -542,8 +549,10 @@ void __init default_get_smp_config(unsigned int early)
542 construct_default_ISA_mptable(mpf->feature1); 549 construct_default_ISA_mptable(mpf->feature1);
543 550
544 } else if (mpf->physptr) { 551 } else if (mpf->physptr) {
545 if (check_physptr(mpf, early)) 552 if (check_physptr(mpf, early)) {
553 early_memunmap(mpf, sizeof(*mpf));
546 return; 554 return;
555 }
547 } else 556 } else
548 BUG(); 557 BUG();
549 558
@@ -552,6 +561,8 @@ void __init default_get_smp_config(unsigned int early)
552 /* 561 /*
553 * Only use the first configuration found. 562 * Only use the first configuration found.
554 */ 563 */
564
565 early_memunmap(mpf, sizeof(*mpf));
555} 566}
556 567
557static void __init smp_reserve_memory(struct mpf_intel *mpf) 568static void __init smp_reserve_memory(struct mpf_intel *mpf)
@@ -561,15 +572,16 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
561 572
562static int __init smp_scan_config(unsigned long base, unsigned long length) 573static int __init smp_scan_config(unsigned long base, unsigned long length)
563{ 574{
564 unsigned int *bp = phys_to_virt(base); 575 unsigned int *bp;
565 struct mpf_intel *mpf; 576 struct mpf_intel *mpf;
566 unsigned long mem; 577 int ret = 0;
567 578
568 apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", 579 apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
569 base, base + length - 1); 580 base, base + length - 1);
570 BUILD_BUG_ON(sizeof(*mpf) != 16); 581 BUILD_BUG_ON(sizeof(*mpf) != 16);
571 582
572 while (length > 0) { 583 while (length > 0) {
584 bp = early_memremap(base, length);
573 mpf = (struct mpf_intel *)bp; 585 mpf = (struct mpf_intel *)bp;
574 if ((*bp == SMP_MAGIC_IDENT) && 586 if ((*bp == SMP_MAGIC_IDENT) &&
575 (mpf->length == 1) && 587 (mpf->length == 1) &&
@@ -579,24 +591,26 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
579#ifdef CONFIG_X86_LOCAL_APIC 591#ifdef CONFIG_X86_LOCAL_APIC
580 smp_found_config = 1; 592 smp_found_config = 1;
581#endif 593#endif
582 mpf_found = mpf; 594 mpf_base = base;
583 595
584 pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", 596 pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n",
585 (unsigned long long) virt_to_phys(mpf), 597 base, base + sizeof(*mpf) - 1, mpf);
586 (unsigned long long) virt_to_phys(mpf) +
587 sizeof(*mpf) - 1, mpf);
588 598
589 mem = virt_to_phys(mpf); 599 memblock_reserve(base, sizeof(*mpf));
590 memblock_reserve(mem, sizeof(*mpf));
591 if (mpf->physptr) 600 if (mpf->physptr)
592 smp_reserve_memory(mpf); 601 smp_reserve_memory(mpf);
593 602
594 return 1; 603 ret = 1;
595 } 604 }
596 bp += 4; 605 early_memunmap(bp, length);
606
607 if (ret)
608 break;
609
610 base += 16;
597 length -= 16; 611 length -= 16;
598 } 612 }
599 return 0; 613 return ret;
600} 614}
601 615
602void __init default_find_smp_config(void) 616void __init default_find_smp_config(void)
@@ -838,29 +852,40 @@ static int __init update_mp_table(void)
838 char oem[10]; 852 char oem[10];
839 struct mpf_intel *mpf; 853 struct mpf_intel *mpf;
840 struct mpc_table *mpc, *mpc_new; 854 struct mpc_table *mpc, *mpc_new;
855 unsigned long size;
841 856
842 if (!enable_update_mptable) 857 if (!enable_update_mptable)
843 return 0; 858 return 0;
844 859
845 mpf = mpf_found; 860 if (!mpf_base)
846 if (!mpf)
847 return 0; 861 return 0;
848 862
863 mpf = early_memremap(mpf_base, sizeof(*mpf));
864 if (!mpf) {
865 pr_err("MPTABLE: mpf early_memremap() failed\n");
866 return 0;
867 }
868
849 /* 869 /*
850 * Now see if we need to go further. 870 * Now see if we need to go further.
851 */ 871 */
852 if (mpf->feature1 != 0) 872 if (mpf->feature1)
853 return 0; 873 goto do_unmap_mpf;
854 874
855 if (!mpf->physptr) 875 if (!mpf->physptr)
856 return 0; 876 goto do_unmap_mpf;
857 877
858 mpc = phys_to_virt(mpf->physptr); 878 size = get_mpc_size(mpf->physptr);
879 mpc = early_memremap(mpf->physptr, size);
880 if (!mpc) {
881 pr_err("MPTABLE: mpc early_memremap() failed\n");
882 goto do_unmap_mpf;
883 }
859 884
860 if (!smp_check_mpc(mpc, oem, str)) 885 if (!smp_check_mpc(mpc, oem, str))
861 return 0; 886 goto do_unmap_mpc;
862 887
863 pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf)); 888 pr_info("mpf: %llx\n", (u64)mpf_base);
864 pr_info("physptr: %x\n", mpf->physptr); 889 pr_info("physptr: %x\n", mpf->physptr);
865 890
866 if (mpc_new_phys && mpc->length > mpc_new_length) { 891 if (mpc_new_phys && mpc->length > mpc_new_length) {
@@ -878,21 +903,32 @@ static int __init update_mp_table(void)
878 new = mpf_checksum((unsigned char *)mpc, mpc->length); 903 new = mpf_checksum((unsigned char *)mpc, mpc->length);
879 if (old == new) { 904 if (old == new) {
880 pr_info("mpc is readonly, please try alloc_mptable instead\n"); 905 pr_info("mpc is readonly, please try alloc_mptable instead\n");
881 return 0; 906 goto do_unmap_mpc;
882 } 907 }
883 pr_info("use in-position replacing\n"); 908 pr_info("use in-position replacing\n");
884 } else { 909 } else {
910 mpc_new = early_memremap(mpc_new_phys, mpc_new_length);
911 if (!mpc_new) {
912 pr_err("MPTABLE: new mpc early_memremap() failed\n");
913 goto do_unmap_mpc;
914 }
885 mpf->physptr = mpc_new_phys; 915 mpf->physptr = mpc_new_phys;
886 mpc_new = phys_to_virt(mpc_new_phys);
887 memcpy(mpc_new, mpc, mpc->length); 916 memcpy(mpc_new, mpc, mpc->length);
917 early_memunmap(mpc, size);
888 mpc = mpc_new; 918 mpc = mpc_new;
919 size = mpc_new_length;
889 /* check if we can modify that */ 920 /* check if we can modify that */
890 if (mpc_new_phys - mpf->physptr) { 921 if (mpc_new_phys - mpf->physptr) {
891 struct mpf_intel *mpf_new; 922 struct mpf_intel *mpf_new;
892 /* steal 16 bytes from [0, 1k) */ 923 /* steal 16 bytes from [0, 1k) */
924 mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new));
925 if (!mpf_new) {
926 pr_err("MPTABLE: new mpf early_memremap() failed\n");
927 goto do_unmap_mpc;
928 }
893 pr_info("mpf new: %x\n", 0x400 - 16); 929 pr_info("mpf new: %x\n", 0x400 - 16);
894 mpf_new = phys_to_virt(0x400 - 16);
895 memcpy(mpf_new, mpf, 16); 930 memcpy(mpf_new, mpf, 16);
931 early_memunmap(mpf, sizeof(*mpf));
896 mpf = mpf_new; 932 mpf = mpf_new;
897 mpf->physptr = mpc_new_phys; 933 mpf->physptr = mpc_new_phys;
898 } 934 }
@@ -909,6 +945,12 @@ static int __init update_mp_table(void)
909 */ 945 */
910 replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); 946 replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
911 947
948do_unmap_mpc:
949 early_memunmap(mpc, size);
950
951do_unmap_mpf:
952 early_memunmap(mpf, sizeof(*mpf));
953
912 return 0; 954 return 0;
913} 955}
914 956
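The mpparse.c conversion above replaces phys_to_virt()/early_ioremap() access to the BIOS MP tables with early_memremap(), and keeps only the physical address (mpf_base) so each consumer maps the table for exactly as long as it needs it; under SME that routes the access through a mapping with the correct encryption attribute instead of assuming the direct map is usable. A hedged sketch of the resulting map/copy/unmap pattern (the helper name and the include lines are illustrative, not part of the patch):

#include <linux/init.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <asm/io.h>		/* early_memremap()/early_memunmap(), header indicative */

static int __init copy_firmware_table(phys_addr_t base, void *dst, size_t len)
{
	void *src;

	src = early_memremap(base, len);	/* temporary early-boot mapping */
	if (!src)
		return -ENOMEM;

	memcpy(dst, src, len);			/* consume the table while mapped */
	early_memunmap(src, len);		/* always undo the early mapping */

	return 0;
}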
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 5e16d3f29594..0accc2404b92 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -93,9 +93,12 @@ again:
93 if (gfpflags_allow_blocking(flag)) { 93 if (gfpflags_allow_blocking(flag)) {
94 page = dma_alloc_from_contiguous(dev, count, get_order(size), 94 page = dma_alloc_from_contiguous(dev, count, get_order(size),
95 flag); 95 flag);
96 if (page && page_to_phys(page) + size > dma_mask) { 96 if (page) {
97 dma_release_from_contiguous(dev, page, count); 97 addr = phys_to_dma(dev, page_to_phys(page));
98 page = NULL; 98 if (addr + size > dma_mask) {
99 dma_release_from_contiguous(dev, page, count);
100 page = NULL;
101 }
99 } 102 }
100 } 103 }
101 /* fallback */ 104 /* fallback */
@@ -104,7 +107,7 @@ again:
104 if (!page) 107 if (!page)
105 return NULL; 108 return NULL;
106 109
107 addr = page_to_phys(page); 110 addr = phys_to_dma(dev, page_to_phys(page));
108 if (addr + size > dma_mask) { 111 if (addr + size > dma_mask) {
109 __free_pages(page, get_order(size)); 112 __free_pages(page, get_order(size));
110 113
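The pci-dma.c hunk compares the device-visible address, phys_to_dma(dev, page_to_phys(page)), against dma_mask instead of the bare physical address, because with SME the DMA address carries the encryption bit. A standalone illustration of why that changes the outcome for a 32-bit-capable device (the bit position and the sample addresses are assumptions picked for the example):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t sme_me_mask = 1ULL << 47;	   /* assumed C-bit position */
	uint64_t dma_mask    = 0xffffffffULL;	   /* 32-bit capable device */
	uint64_t phys        = 0x12345000ULL;	   /* page physical address */
	uint64_t dev_addr    = phys | sme_me_mask; /* what phys_to_dma() yields with SME on */

	printf("phys alone fits the mask:  %d\n", phys     <= dma_mask);
	printf("device view fits the mask: %d\n", dev_addr <= dma_mask);
	return 0;	/* the second check fails, forcing the fallback path */
}

With the old check the allocation would have been accepted and the device handed an address it cannot reach; with the new one it is released and the fallback (ultimately swiotlb bounce buffering, see the pci-swiotlb.c hunk below) takes over.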
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a6d404087fe3..4fc3cb60ea11 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -32,7 +32,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
32 enum dma_data_direction dir, 32 enum dma_data_direction dir,
33 unsigned long attrs) 33 unsigned long attrs)
34{ 34{
35 dma_addr_t bus = page_to_phys(page) + offset; 35 dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset;
36 WARN_ON(size == 0); 36 WARN_ON(size == 0);
37 if (!check_addr("map_single", dev, bus, size)) 37 if (!check_addr("map_single", dev, bus, size))
38 return NOMMU_MAPPING_ERROR; 38 return NOMMU_MAPPING_ERROR;
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 1e23577e17cf..677077510e30 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -6,12 +6,14 @@
6#include <linux/swiotlb.h> 6#include <linux/swiotlb.h>
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/dma-mapping.h> 8#include <linux/dma-mapping.h>
9#include <linux/mem_encrypt.h>
9 10
10#include <asm/iommu.h> 11#include <asm/iommu.h>
11#include <asm/swiotlb.h> 12#include <asm/swiotlb.h>
12#include <asm/dma.h> 13#include <asm/dma.h>
13#include <asm/xen/swiotlb-xen.h> 14#include <asm/xen/swiotlb-xen.h>
14#include <asm/iommu_table.h> 15#include <asm/iommu_table.h>
16
15int swiotlb __read_mostly; 17int swiotlb __read_mostly;
16 18
17void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, 19void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -79,8 +81,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
79 pci_swiotlb_late_init); 81 pci_swiotlb_late_init);
80 82
81/* 83/*
82 * if 4GB or more detected (and iommu=off not set) return 1 84 * If 4GB or more detected (and iommu=off not set) or if SME is active
83 * and set swiotlb to 1. 85 * then set swiotlb to 1 and return 1.
84 */ 86 */
85int __init pci_swiotlb_detect_4gb(void) 87int __init pci_swiotlb_detect_4gb(void)
86{ 88{
@@ -89,6 +91,15 @@ int __init pci_swiotlb_detect_4gb(void)
89 if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) 91 if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
90 swiotlb = 1; 92 swiotlb = 1;
91#endif 93#endif
94
95 /*
96 * If SME is active then swiotlb will be set to 1 so that bounce
97 * buffers are allocated and used for devices that do not support
98 * the addressing range required for the encryption mask.
99 */
100 if (sme_active())
101 swiotlb = 1;
102
92 return swiotlb; 103 return swiotlb;
93} 104}
94IOMMU_INIT(pci_swiotlb_detect_4gb, 105IOMMU_INIT(pci_swiotlb_detect_4gb,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3ca198080ea9..bd6b85fac666 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -355,6 +355,7 @@ bool xen_set_default_idle(void)
355 return ret; 355 return ret;
356} 356}
357#endif 357#endif
358
358void stop_this_cpu(void *dummy) 359void stop_this_cpu(void *dummy)
359{ 360{
360 local_irq_disable(); 361 local_irq_disable();
@@ -365,8 +366,20 @@ void stop_this_cpu(void *dummy)
365 disable_local_APIC(); 366 disable_local_APIC();
366 mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); 367 mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
367 368
368 for (;;) 369 for (;;) {
369 halt(); 370 /*
371 * Use wbinvd followed by hlt to stop the processor. This
372 * provides support for kexec on a processor that supports
373 * SME. With kexec, going from SME inactive to SME active
374 * requires clearing cache entries so that addresses without
375 * the encryption bit set don't corrupt the same physical
376 * address that has the encryption bit set when caches are
377 * flushed. To achieve this a wbinvd is performed followed by
378 * a hlt. Even if the processor is not in the kexec/SME
379 * scenario this only adds a wbinvd to a halting processor.
380 */
381 asm volatile("wbinvd; hlt" : : : "memory");
382 }
370} 383}
371 384
372/* 385/*
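In stop_this_cpu() the write-back-invalidate and the halt are emitted as a single asm statement, which also keeps the compiler from scheduling memory accesses between the flush and the halt that could repopulate the lines just written back. A minimal sketch of the idiom in isolation (the wrapper name is hypothetical):

static void halt_with_cache_flush(void)
{
	for (;;)
		asm volatile("wbinvd; hlt" : : : "memory");	/* flush, then park the CPU */
}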
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 98111b38ebfd..307d3bac5f04 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -47,6 +47,7 @@ relocate_kernel:
47 * %rsi page_list 47 * %rsi page_list
48 * %rdx start address 48 * %rdx start address
49 * %rcx preserve_context 49 * %rcx preserve_context
50 * %r8 sme_active
50 */ 51 */
51 52
52 /* Save the CPU context, used for jumping back */ 53 /* Save the CPU context, used for jumping back */
@@ -71,6 +72,9 @@ relocate_kernel:
71 pushq $0 72 pushq $0
72 popfq 73 popfq
73 74
75 /* Save SME active flag */
76 movq %r8, %r12
77
74 /* 78 /*
75 * get physical address of control page now 79 * get physical address of control page now
76 * this is impossible after page table switch 80 * this is impossible after page table switch
@@ -132,6 +136,16 @@ identity_mapped:
132 /* Flush the TLB (needed?) */ 136 /* Flush the TLB (needed?) */
133 movq %r9, %cr3 137 movq %r9, %cr3
134 138
139 /*
140 * If SME is active, there could be old encrypted cache line
141 * entries that will conflict with the now unencrypted memory
142 * used by kexec. Flush the caches before copying the kernel.
143 */
144 testq %r12, %r12
145 jz 1f
146 wbinvd
147 1:
148
135 movq %rcx, %r11 149 movq %rcx, %r11
136 call swap_pages 150 call swap_pages
137 151
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ecab32282f0f..022ebddb3734 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -69,6 +69,7 @@
69#include <linux/crash_dump.h> 69#include <linux/crash_dump.h>
70#include <linux/tboot.h> 70#include <linux/tboot.h>
71#include <linux/jiffies.h> 71#include <linux/jiffies.h>
72#include <linux/mem_encrypt.h>
72 73
73#include <linux/usb/xhci-dbgp.h> 74#include <linux/usb/xhci-dbgp.h>
74#include <video/edid.h> 75#include <video/edid.h>
@@ -375,6 +376,14 @@ static void __init reserve_initrd(void)
375 !ramdisk_image || !ramdisk_size) 376 !ramdisk_image || !ramdisk_size)
376 return; /* No initrd provided by bootloader */ 377 return; /* No initrd provided by bootloader */
377 378
379 /*
380 * If SME is active, this memory will be marked encrypted by the
381 * kernel when it is accessed (including relocation). However, the
382 * ramdisk image was loaded decrypted by the bootloader, so make
383 * sure that it is encrypted before accessing it.
384 */
385 sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
386
378 initrd_start = 0; 387 initrd_start = 0;
379 388
380 mapped_size = memblock_mem_size(max_pfn_mapped); 389 mapped_size = memblock_mem_size(max_pfn_mapped);
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 213ddf3e937d..73e4d28112f8 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -21,6 +21,7 @@
21#include <asm/compat.h> 21#include <asm/compat.h>
22#include <asm/ia32.h> 22#include <asm/ia32.h>
23#include <asm/syscalls.h> 23#include <asm/syscalls.h>
24#include <asm/mpx.h>
24 25
25/* 26/*
26 * Align a virtual address to avoid aliasing in the I$ on AMD F15h. 27 * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
@@ -100,8 +101,8 @@ out:
100 return error; 101 return error;
101} 102}
102 103
103static void find_start_end(unsigned long flags, unsigned long *begin, 104static void find_start_end(unsigned long addr, unsigned long flags,
104 unsigned long *end) 105 unsigned long *begin, unsigned long *end)
105{ 106{
106 if (!in_compat_syscall() && (flags & MAP_32BIT)) { 107 if (!in_compat_syscall() && (flags & MAP_32BIT)) {
107 /* This is usually used needed to map code in small 108 /* This is usually used needed to map code in small
@@ -120,7 +121,10 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
120 } 121 }
121 122
122 *begin = get_mmap_base(1); 123 *begin = get_mmap_base(1);
123 *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit(); 124 if (in_compat_syscall())
125 *end = task_size_32bit();
126 else
127 *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
124} 128}
125 129
126unsigned long 130unsigned long
@@ -132,10 +136,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
132 struct vm_unmapped_area_info info; 136 struct vm_unmapped_area_info info;
133 unsigned long begin, end; 137 unsigned long begin, end;
134 138
139 addr = mpx_unmapped_area_check(addr, len, flags);
140 if (IS_ERR_VALUE(addr))
141 return addr;
142
135 if (flags & MAP_FIXED) 143 if (flags & MAP_FIXED)
136 return addr; 144 return addr;
137 145
138 find_start_end(flags, &begin, &end); 146 find_start_end(addr, flags, &begin, &end);
139 147
140 if (len > end) 148 if (len > end)
141 return -ENOMEM; 149 return -ENOMEM;
@@ -171,6 +179,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
171 unsigned long addr = addr0; 179 unsigned long addr = addr0;
172 struct vm_unmapped_area_info info; 180 struct vm_unmapped_area_info info;
173 181
182 addr = mpx_unmapped_area_check(addr, len, flags);
183 if (IS_ERR_VALUE(addr))
184 return addr;
185
174 /* requested length too big for entire address space */ 186 /* requested length too big for entire address space */
175 if (len > TASK_SIZE) 187 if (len > TASK_SIZE)
176 return -ENOMEM; 188 return -ENOMEM;
@@ -195,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
195 info.length = len; 207 info.length = len;
196 info.low_limit = PAGE_SIZE; 208 info.low_limit = PAGE_SIZE;
197 info.high_limit = get_mmap_base(0); 209 info.high_limit = get_mmap_base(0);
210
211 /*
212 * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
213 * in the full address space.
214 *
215 * !in_compat_syscall() check to avoid high addresses for x32.
216 */
217 if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
218 info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
219
198 info.align_mask = 0; 220 info.align_mask = 0;
199 info.align_offset = pgoff << PAGE_SHIFT; 221 info.align_offset = pgoff << PAGE_SHIFT;
200 if (filp) { 222 if (filp) {
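The find_start_end() and topdown changes implement the opt-in policy for the enlarged address space: with a default hint, mmap() results stay below the 47-bit DEFAULT_MAP_WINDOW even on 5-level hardware, and only a hint above that boundary lets the search extend to the full range. A small user-space illustration of the contract (no error handling; whether the second call actually lands high depends on CONFIG_X86_5LEVEL=y and LA57-capable hardware):

#include <stddef.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2UL << 20;		/* 2 MiB */
	void *low  = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *high = mmap((void *)(1UL << 47), len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	printf("default hint: %p\n", low);	/* stays below the 47-bit window */
	printf("high hint:    %p\n", high);	/* may land above it on LA57 systems */
	return 0;
}

Legacy software that stashes tags in the upper pointer bits keeps working because it never passes a high hint; the mpx_unmapped_area_check() calls added above guard the same boundary from the other direction for MPX users, whose bound tables cannot describe the larger space.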
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9b1dd114956a..04d750813c9d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -108,7 +108,7 @@ module_param(dbg, bool, 0644);
108 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) 108 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
109 109
110 110
111#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) 111#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
112#define PT64_DIR_BASE_ADDR_MASK \ 112#define PT64_DIR_BASE_ADDR_MASK \
113 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) 113 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
114#define PT64_LVL_ADDR_MASK(level) \ 114#define PT64_LVL_ADDR_MASK(level) \
@@ -126,7 +126,7 @@ module_param(dbg, bool, 0644);
126 * PT32_LEVEL_BITS))) - 1)) 126 * PT32_LEVEL_BITS))) - 1))
127 127
128#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ 128#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
129 | shadow_x_mask | shadow_nx_mask) 129 | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
130 130
131#define ACC_EXEC_MASK 1 131#define ACC_EXEC_MASK 1
132#define ACC_WRITE_MASK PT_WRITABLE_MASK 132#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -186,6 +186,7 @@ static u64 __read_mostly shadow_dirty_mask;
186static u64 __read_mostly shadow_mmio_mask; 186static u64 __read_mostly shadow_mmio_mask;
187static u64 __read_mostly shadow_mmio_value; 187static u64 __read_mostly shadow_mmio_value;
188static u64 __read_mostly shadow_present_mask; 188static u64 __read_mostly shadow_present_mask;
189static u64 __read_mostly shadow_me_mask;
189 190
190/* 191/*
191 * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. 192 * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
@@ -349,7 +350,7 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
349 */ 350 */
350void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 351void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
351 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, 352 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
352 u64 acc_track_mask) 353 u64 acc_track_mask, u64 me_mask)
353{ 354{
354 BUG_ON(!dirty_mask != !accessed_mask); 355 BUG_ON(!dirty_mask != !accessed_mask);
355 BUG_ON(!accessed_mask && !acc_track_mask); 356 BUG_ON(!accessed_mask && !acc_track_mask);
@@ -362,6 +363,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
362 shadow_x_mask = x_mask; 363 shadow_x_mask = x_mask;
363 shadow_present_mask = p_mask; 364 shadow_present_mask = p_mask;
364 shadow_acc_track_mask = acc_track_mask; 365 shadow_acc_track_mask = acc_track_mask;
366 shadow_me_mask = me_mask;
365} 367}
366EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 368EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
367 369
@@ -2433,7 +2435,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2433 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); 2435 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2434 2436
2435 spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | 2437 spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
2436 shadow_user_mask | shadow_x_mask; 2438 shadow_user_mask | shadow_x_mask | shadow_me_mask;
2437 2439
2438 if (sp_ad_disabled(sp)) 2440 if (sp_ad_disabled(sp))
2439 spte |= shadow_acc_track_value; 2441 spte |= shadow_acc_track_value;
@@ -2745,6 +2747,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2745 pte_access &= ~ACC_WRITE_MASK; 2747 pte_access &= ~ACC_WRITE_MASK;
2746 2748
2747 spte |= (u64)pfn << PAGE_SHIFT; 2749 spte |= (u64)pfn << PAGE_SHIFT;
2750 spte |= shadow_me_mask;
2748 2751
2749 if (pte_access & ACC_WRITE_MASK) { 2752 if (pte_access & ACC_WRITE_MASK) {
2750 2753
@@ -4106,16 +4109,28 @@ void
4106reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) 4109reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4107{ 4110{
4108 bool uses_nx = context->nx || context->base_role.smep_andnot_wp; 4111 bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
4112 struct rsvd_bits_validate *shadow_zero_check;
4113 int i;
4109 4114
4110 /* 4115 /*
4111 * Passing "true" to the last argument is okay; it adds a check 4116 * Passing "true" to the last argument is okay; it adds a check
4112 * on bit 8 of the SPTEs which KVM doesn't use anyway. 4117 * on bit 8 of the SPTEs which KVM doesn't use anyway.
4113 */ 4118 */
4114 __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, 4119 shadow_zero_check = &context->shadow_zero_check;
4120 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4115 boot_cpu_data.x86_phys_bits, 4121 boot_cpu_data.x86_phys_bits,
4116 context->shadow_root_level, uses_nx, 4122 context->shadow_root_level, uses_nx,
4117 guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), 4123 guest_cpuid_has_gbpages(vcpu), is_pse(vcpu),
4118 true); 4124 true);
4125
4126 if (!shadow_me_mask)
4127 return;
4128
4129 for (i = context->shadow_root_level; --i >= 0;) {
4130 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4131 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4132 }
4133
4119} 4134}
4120EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); 4135EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
4121 4136
@@ -4133,17 +4148,29 @@ static void
4133reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 4148reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4134 struct kvm_mmu *context) 4149 struct kvm_mmu *context)
4135{ 4150{
4151 struct rsvd_bits_validate *shadow_zero_check;
4152 int i;
4153
4154 shadow_zero_check = &context->shadow_zero_check;
4155
4136 if (boot_cpu_is_amd()) 4156 if (boot_cpu_is_amd())
4137 __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, 4157 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4138 boot_cpu_data.x86_phys_bits, 4158 boot_cpu_data.x86_phys_bits,
4139 context->shadow_root_level, false, 4159 context->shadow_root_level, false,
4140 boot_cpu_has(X86_FEATURE_GBPAGES), 4160 boot_cpu_has(X86_FEATURE_GBPAGES),
4141 true, true); 4161 true, true);
4142 else 4162 else
4143 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, 4163 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4144 boot_cpu_data.x86_phys_bits, 4164 boot_cpu_data.x86_phys_bits,
4145 false); 4165 false);
4146 4166
4167 if (!shadow_me_mask)
4168 return;
4169
4170 for (i = context->shadow_root_level; --i >= 0;) {
4171 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4172 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4173 }
4147} 4174}
4148 4175
4149/* 4176/*
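The loops appended to reset_shadow_zero_bits_mask() and reset_tdp_shadow_zero_bits_mask() strip shadow_me_mask out of the reserved-bit masks, because on an SME-capable host the encryption bit is now set in shadow/NPT entries and would otherwise be reported as a reserved-bit violation on every present SPTE. A standalone sketch of the arithmetic (the bit position and mask values are assumptions chosen for illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t shadow_me_mask = 1ULL << 47;			/* assumed C-bit */
	uint64_t rsvd_mask      = 0xfff0000000000000ULL | shadow_me_mask;
	uint64_t spte           = 0x12345000ULL | shadow_me_mask | 0x7; /* present entry */

	printf("before: flagged as reserved-bit fault = %d\n", (spte & rsvd_mask) != 0);
	rsvd_mask &= ~shadow_me_mask;				/* what the new loops do */
	printf("after:  flagged as reserved-bit fault = %d\n", (spte & rsvd_mask) != 0);
	return 0;
}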
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index af256b786a70..8dbd8dbc83eb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1167,9 +1167,9 @@ static void avic_init_vmcb(struct vcpu_svm *svm)
1167{ 1167{
1168 struct vmcb *vmcb = svm->vmcb; 1168 struct vmcb *vmcb = svm->vmcb;
1169 struct kvm_arch *vm_data = &svm->vcpu.kvm->arch; 1169 struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
1170 phys_addr_t bpa = page_to_phys(svm->avic_backing_page); 1170 phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
1171 phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page); 1171 phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page));
1172 phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page); 1172 phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page));
1173 1173
1174 vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; 1174 vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
1175 vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; 1175 vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
@@ -1232,8 +1232,8 @@ static void init_vmcb(struct vcpu_svm *svm)
1232 set_intercept(svm, INTERCEPT_MWAIT); 1232 set_intercept(svm, INTERCEPT_MWAIT);
1233 } 1233 }
1234 1234
1235 control->iopm_base_pa = iopm_base; 1235 control->iopm_base_pa = __sme_set(iopm_base);
1236 control->msrpm_base_pa = __pa(svm->msrpm); 1236 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1237 control->int_ctl = V_INTR_MASKING_MASK; 1237 control->int_ctl = V_INTR_MASKING_MASK;
1238 1238
1239 init_seg(&save->es); 1239 init_seg(&save->es);
@@ -1377,9 +1377,9 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
1377 return -EINVAL; 1377 return -EINVAL;
1378 1378
1379 new_entry = READ_ONCE(*entry); 1379 new_entry = READ_ONCE(*entry);
1380 new_entry = (page_to_phys(svm->avic_backing_page) & 1380 new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
1381 AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | 1381 AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
1382 AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; 1382 AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
1383 WRITE_ONCE(*entry, new_entry); 1383 WRITE_ONCE(*entry, new_entry);
1384 1384
1385 svm->avic_physical_id_cache = entry; 1385 svm->avic_physical_id_cache = entry;
@@ -1647,7 +1647,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1647 1647
1648 svm->vmcb = page_address(page); 1648 svm->vmcb = page_address(page);
1649 clear_page(svm->vmcb); 1649 clear_page(svm->vmcb);
1650 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 1650 svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
1651 svm->asid_generation = 0; 1651 svm->asid_generation = 0;
1652 init_vmcb(svm); 1652 init_vmcb(svm);
1653 1653
@@ -1675,7 +1675,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1675{ 1675{
1676 struct vcpu_svm *svm = to_svm(vcpu); 1676 struct vcpu_svm *svm = to_svm(vcpu);
1677 1677
1678 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); 1678 __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
1679 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); 1679 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
1680 __free_page(virt_to_page(svm->nested.hsave)); 1680 __free_page(virt_to_page(svm->nested.hsave));
1681 __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); 1681 __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
@@ -2330,7 +2330,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
2330 u64 pdpte; 2330 u64 pdpte;
2331 int ret; 2331 int ret;
2332 2332
2333 ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, 2333 ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
2334 offset_in_page(cr3) + index * 8, 8); 2334 offset_in_page(cr3) + index * 8, 8);
2335 if (ret) 2335 if (ret)
2336 return 0; 2336 return 0;
@@ -2342,7 +2342,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
2342{ 2342{
2343 struct vcpu_svm *svm = to_svm(vcpu); 2343 struct vcpu_svm *svm = to_svm(vcpu);
2344 2344
2345 svm->vmcb->control.nested_cr3 = root; 2345 svm->vmcb->control.nested_cr3 = __sme_set(root);
2346 mark_dirty(svm->vmcb, VMCB_NPT); 2346 mark_dirty(svm->vmcb, VMCB_NPT);
2347 svm_flush_tlb(vcpu); 2347 svm_flush_tlb(vcpu);
2348} 2348}
@@ -2873,7 +2873,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2873 svm->nested.msrpm[p] = svm->msrpm[p] | value; 2873 svm->nested.msrpm[p] = svm->msrpm[p] | value;
2874 } 2874 }
2875 2875
2876 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); 2876 svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
2877 2877
2878 return true; 2878 return true;
2879} 2879}
@@ -4506,7 +4506,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
4506 pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, 4506 pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
4507 irq.vector); 4507 irq.vector);
4508 *svm = to_svm(vcpu); 4508 *svm = to_svm(vcpu);
4509 vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page); 4509 vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
4510 vcpu_info->vector = irq.vector; 4510 vcpu_info->vector = irq.vector;
4511 4511
4512 return 0; 4512 return 0;
@@ -4557,7 +4557,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
4557 struct amd_iommu_pi_data pi; 4557 struct amd_iommu_pi_data pi;
4558 4558
4559 /* Try to enable guest_mode in IRTE */ 4559 /* Try to enable guest_mode in IRTE */
4560 pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK; 4560 pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
4561 AVIC_HPA_MASK);
4561 pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, 4562 pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
4562 svm->vcpu.vcpu_id); 4563 svm->vcpu.vcpu_id);
4563 pi.is_guest_mode = true; 4564 pi.is_guest_mode = true;
@@ -5006,7 +5007,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
5006{ 5007{
5007 struct vcpu_svm *svm = to_svm(vcpu); 5008 struct vcpu_svm *svm = to_svm(vcpu);
5008 5009
5009 svm->vmcb->save.cr3 = root; 5010 svm->vmcb->save.cr3 = __sme_set(root);
5010 mark_dirty(svm->vmcb, VMCB_CR); 5011 mark_dirty(svm->vmcb, VMCB_CR);
5011 svm_flush_tlb(vcpu); 5012 svm_flush_tlb(vcpu);
5012} 5013}
@@ -5015,7 +5016,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
5015{ 5016{
5016 struct vcpu_svm *svm = to_svm(vcpu); 5017 struct vcpu_svm *svm = to_svm(vcpu);
5017 5018
5018 svm->vmcb->control.nested_cr3 = root; 5019 svm->vmcb->control.nested_cr3 = __sme_set(root);
5019 mark_dirty(svm->vmcb, VMCB_NPT); 5020 mark_dirty(svm->vmcb, VMCB_NPT);
5020 5021
5021 /* Also sync guest cr3 here in case we live migrate */ 5022 /* Also sync guest cr3 here in case we live migrate */
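All of the svm.c changes follow one pattern: physical addresses handed to hardware (the VMCB itself, the MSR and I/O permission maps, the AVIC pages, nested CR3) gain the encryption mask, and values about to be used as plain physical addresses have it removed. The helpers reduce to mask arithmetic on sme_me_mask, which is zero when SME is inactive, so they compile away on unaffected systems; shown here as a sketch rather than a quote of the header:

#define __sme_set(x)	((x) | sme_me_mask)	/* give hardware the encrypted view   */
#define __sme_clr(x)	((x) & ~sme_me_mask)	/* recover the plain physical address */

/* typical use, as in init_vmcb() above */
control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));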
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c6ef2940119b..d40900914a72 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6556,7 +6556,7 @@ void vmx_enable_tdp(void)
6556 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 6556 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
6557 0ull, VMX_EPT_EXECUTABLE_MASK, 6557 0ull, VMX_EPT_EXECUTABLE_MASK,
6558 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, 6558 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
6559 VMX_EPT_RWX_MASK); 6559 VMX_EPT_RWX_MASK, 0ull);
6560 6560
6561 ept_set_mmio_spte_mask(); 6561 ept_set_mmio_spte_mask();
6562 kvm_enable_tdp(); 6562 kvm_enable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 272320eb328c..ef5102f80497 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -54,6 +54,7 @@
54#include <linux/kvm_irqfd.h> 54#include <linux/kvm_irqfd.h>
55#include <linux/irqbypass.h> 55#include <linux/irqbypass.h>
56#include <linux/sched/stat.h> 56#include <linux/sched/stat.h>
57#include <linux/mem_encrypt.h>
57 58
58#include <trace/events/kvm.h> 59#include <trace/events/kvm.h>
59 60
@@ -6125,7 +6126,7 @@ int kvm_arch_init(void *opaque)
6125 6126
6126 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 6127 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
6127 PT_DIRTY_MASK, PT64_NX_MASK, 0, 6128 PT_DIRTY_MASK, PT64_NX_MASK, 0,
6128 PT_PRESENT_MASK, 0); 6129 PT_PRESENT_MASK, 0, sme_me_mask);
6129 kvm_timer_init(); 6130 kvm_timer_init();
6130 6131
6131 perf_register_guest_info_callbacks(&kvm_guest_cbs); 6132 perf_register_guest_info_callbacks(&kvm_guest_cbs);
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 5cc78bf57232..3261abb21ef4 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
104 return 0; /* Buffer overrun */ 104 return 0; /* Buffer overrun */
105} 105}
106 106
107/*
108 * Find a non-boolean option (i.e. option=argument). In accordance with
109 * standard Linux practice, if this option is repeated, this returns the
110 * last instance on the command line.
111 *
112 * @cmdline: the cmdline string
113 * @max_cmdline_size: the maximum size of cmdline
114 * @option: option string to look for
115 * @buffer: memory buffer to return the option argument
116 * @bufsize: size of the supplied memory buffer
117 *
118 * Returns the length of the argument (regardless of if it was
119 * truncated to fit in the buffer), or -1 on not found.
120 */
121static int
122__cmdline_find_option(const char *cmdline, int max_cmdline_size,
123 const char *option, char *buffer, int bufsize)
124{
125 char c;
126 int pos = 0, len = -1;
127 const char *opptr = NULL;
128 char *bufptr = buffer;
129 enum {
130 st_wordstart = 0, /* Start of word/after whitespace */
131 st_wordcmp, /* Comparing this word */
132 st_wordskip, /* Miscompare, skip */
133 st_bufcpy, /* Copying this to buffer */
134 } state = st_wordstart;
135
136 if (!cmdline)
137 return -1; /* No command line */
138
139 /*
140 * This 'pos' check ensures we do not overrun
141 * a non-NULL-terminated 'cmdline'
142 */
143 while (pos++ < max_cmdline_size) {
144 c = *(char *)cmdline++;
145 if (!c)
146 break;
147
148 switch (state) {
149 case st_wordstart:
150 if (myisspace(c))
151 break;
152
153 state = st_wordcmp;
154 opptr = option;
155 /* fall through */
156
157 case st_wordcmp:
158 if ((c == '=') && !*opptr) {
159 /*
160 * We matched all the way to the end of the
161 * option we were looking for, prepare to
162 * copy the argument.
163 */
164 len = 0;
165 bufptr = buffer;
166 state = st_bufcpy;
167 break;
168 } else if (c == *opptr++) {
169 /*
170 * We are currently matching, so continue
171 * to the next character on the cmdline.
172 */
173 break;
174 }
175 state = st_wordskip;
176 /* fall through */
177
178 case st_wordskip:
179 if (myisspace(c))
180 state = st_wordstart;
181 break;
182
183 case st_bufcpy:
184 if (myisspace(c)) {
185 state = st_wordstart;
186 } else {
187 /*
188 * Increment len, but don't overrun the
189 * supplied buffer and leave room for the
190 * NULL terminator.
191 */
192 if (++len < bufsize)
193 *bufptr++ = c;
194 }
195 break;
196 }
197 }
198
199 if (bufsize)
200 *bufptr = '\0';
201
202 return len;
203}
204
107int cmdline_find_option_bool(const char *cmdline, const char *option) 205int cmdline_find_option_bool(const char *cmdline, const char *option)
108{ 206{
109 return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); 207 return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
110} 208}
209
210int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
211 int bufsize)
212{
213 return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
214 buffer, bufsize);
215}
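cmdline_find_option() completes the pair with the existing boolean helper: it walks the command line with the state machine above, copies the argument of the last matching option=value occurrence into the caller's buffer, NUL-terminates it when bufsize allows, and returns the full argument length (so truncation is detectable) or -1 when the option is absent. A hedged usage sketch of the intended consumer pattern, modeled on the SME setup code added elsewhere in this series (buffer size and the exact comparisons are assumptions):

	char buffer[16];
	int len;

	len = cmdline_find_option(boot_command_line, "mem_encrypt",
				  buffer, sizeof(buffer));
	if (len == 2 && !strncmp(buffer, "on", 2))
		;	/* activate memory encryption */
	else if (len == 3 && !strncmp(buffer, "off", 3))
		;	/* leave memory unencrypted */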
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 0fbdcb64f9f8..72bf8c01c6e3 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -39,3 +39,5 @@ obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
39obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o 39obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
40obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o 40obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
41 41
42obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
43obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0470826d2bdc..5e3ac6fe6c9e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -13,12 +13,12 @@
13 */ 13 */
14 14
15#include <linux/debugfs.h> 15#include <linux/debugfs.h>
16#include <linux/kasan.h>
16#include <linux/mm.h> 17#include <linux/mm.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/seq_file.h> 20#include <linux/seq_file.h>
20 21
21#include <asm/kasan.h>
22#include <asm/pgtable.h> 22#include <asm/pgtable.h>
23 23
24/* 24/*
@@ -138,7 +138,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
138{ 138{
139 pgprotval_t pr = pgprot_val(prot); 139 pgprotval_t pr = pgprot_val(prot);
140 static const char * const level_name[] = 140 static const char * const level_name[] =
141 { "cr3", "pgd", "pud", "pmd", "pte" }; 141 { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
142 142
143 if (!pgprot_val(prot)) { 143 if (!pgprot_val(prot)) {
144 /* Not present */ 144 /* Not present */
@@ -162,12 +162,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
162 pt_dump_cont_printf(m, dmsg, " "); 162 pt_dump_cont_printf(m, dmsg, " ");
163 163
164 /* Bit 7 has a different meaning on level 3 vs 4 */ 164 /* Bit 7 has a different meaning on level 3 vs 4 */
165 if (level <= 3 && pr & _PAGE_PSE) 165 if (level <= 4 && pr & _PAGE_PSE)
166 pt_dump_cont_printf(m, dmsg, "PSE "); 166 pt_dump_cont_printf(m, dmsg, "PSE ");
167 else 167 else
168 pt_dump_cont_printf(m, dmsg, " "); 168 pt_dump_cont_printf(m, dmsg, " ");
169 if ((level == 4 && pr & _PAGE_PAT) || 169 if ((level == 5 && pr & _PAGE_PAT) ||
170 ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) 170 ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
171 pt_dump_cont_printf(m, dmsg, "PAT "); 171 pt_dump_cont_printf(m, dmsg, "PAT ");
172 else 172 else
173 pt_dump_cont_printf(m, dmsg, " "); 173 pt_dump_cont_printf(m, dmsg, " ");
@@ -188,11 +188,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
188 */ 188 */
189static unsigned long normalize_addr(unsigned long u) 189static unsigned long normalize_addr(unsigned long u)
190{ 190{
191#ifdef CONFIG_X86_64 191 int shift;
192 return (signed long)(u << 16) >> 16; 192 if (!IS_ENABLED(CONFIG_X86_64))
193#else 193 return u;
194 return u; 194
195#endif 195 shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
196 return (signed long)(u << shift) >> shift;
196} 197}
197 198
198/* 199/*
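The generalized normalize_addr() in the hunk above sign-extends from whatever virtual-address width the kernel was built for instead of hard-coding the 48-bit case. A standalone worked example for the first kernel-half slot under 4-level (48 VA bits) and 5-level (57 VA bits) paging, with the kernel's __VIRTUAL_MASK_SHIFT + 1 passed in as a parameter:

#include <stdint.h>
#include <stdio.h>

static uint64_t normalize(uint64_t u, int va_bits)
{
	int shift = 64 - va_bits;	/* 16 for 4-level, 7 for 5-level */

	return (uint64_t)(((int64_t)(u << shift)) >> shift);
}

int main(void)
{
	/* raw walk offsets of the first kernel-half pgd slot */
	printf("%#llx\n", (unsigned long long)normalize(1ULL << 47, 48)); /* 0xffff800000000000 */
	printf("%#llx\n", (unsigned long long)normalize(1ULL << 56, 57)); /* 0xff00000000000000 */
	return 0;
}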
@@ -297,32 +298,62 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
297 for (i = 0; i < PTRS_PER_PTE; i++) { 298 for (i = 0; i < PTRS_PER_PTE; i++) {
298 prot = pte_flags(*start); 299 prot = pte_flags(*start);
299 st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); 300 st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
300 note_page(m, st, __pgprot(prot), 4); 301 note_page(m, st, __pgprot(prot), 5);
301 start++; 302 start++;
302 } 303 }
303} 304}
305#ifdef CONFIG_KASAN
306
307/*
308 * This is an optimization for KASAN=y case. Since all kasan page tables
309 * eventually point to the kasan_zero_page we could call note_page()
310 * right away without walking through lower level page tables. This saves
311 * us dozens of seconds (minutes for 5-level config) while checking for
312 * W+X mapping or reading kernel_page_tables debugfs file.
313 */
314static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
315 void *pt)
316{
317 if (__pa(pt) == __pa(kasan_zero_pmd) ||
318#ifdef CONFIG_X86_5LEVEL
319 __pa(pt) == __pa(kasan_zero_p4d) ||
320#endif
321 __pa(pt) == __pa(kasan_zero_pud)) {
322 pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
323 note_page(m, st, __pgprot(prot), 5);
324 return true;
325 }
326 return false;
327}
328#else
329static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
330 void *pt)
331{
332 return false;
333}
334#endif
304 335
305#if PTRS_PER_PMD > 1 336#if PTRS_PER_PMD > 1
306 337
307static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) 338static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
308{ 339{
309 int i; 340 int i;
310 pmd_t *start; 341 pmd_t *start, *pmd_start;
311 pgprotval_t prot; 342 pgprotval_t prot;
312 343
313 start = (pmd_t *)pud_page_vaddr(addr); 344 pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
314 for (i = 0; i < PTRS_PER_PMD; i++) { 345 for (i = 0; i < PTRS_PER_PMD; i++) {
315 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); 346 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
316 if (!pmd_none(*start)) { 347 if (!pmd_none(*start)) {
317 if (pmd_large(*start) || !pmd_present(*start)) { 348 if (pmd_large(*start) || !pmd_present(*start)) {
318 prot = pmd_flags(*start); 349 prot = pmd_flags(*start);
319 note_page(m, st, __pgprot(prot), 3); 350 note_page(m, st, __pgprot(prot), 4);
320 } else { 351 } else if (!kasan_page_table(m, st, pmd_start)) {
321 walk_pte_level(m, st, *start, 352 walk_pte_level(m, st, *start,
322 P + i * PMD_LEVEL_MULT); 353 P + i * PMD_LEVEL_MULT);
323 } 354 }
324 } else 355 } else
325 note_page(m, st, __pgprot(0), 3); 356 note_page(m, st, __pgprot(0), 4);
326 start++; 357 start++;
327 } 358 }
328} 359}
@@ -335,39 +366,27 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
335 366
336#if PTRS_PER_PUD > 1 367#if PTRS_PER_PUD > 1
337 368
338/*
339 * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y
340 * KASAN fills page tables with the same values. Since there is no
341 * point in checking page table more than once we just skip repeated
342 * entries. This saves us dozens of seconds during boot.
343 */
344static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
345{
346 return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
347}
348
349static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) 369static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
350{ 370{
351 int i; 371 int i;
352 pud_t *start; 372 pud_t *start, *pud_start;
353 pgprotval_t prot; 373 pgprotval_t prot;
354 pud_t *prev_pud = NULL; 374 pud_t *prev_pud = NULL;
355 375
356 start = (pud_t *)p4d_page_vaddr(addr); 376 pud_start = start = (pud_t *)p4d_page_vaddr(addr);
357 377
358 for (i = 0; i < PTRS_PER_PUD; i++) { 378 for (i = 0; i < PTRS_PER_PUD; i++) {
359 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); 379 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
360 if (!pud_none(*start) && 380 if (!pud_none(*start)) {
361 !pud_already_checked(prev_pud, start, st->check_wx)) {
362 if (pud_large(*start) || !pud_present(*start)) { 381 if (pud_large(*start) || !pud_present(*start)) {
363 prot = pud_flags(*start); 382 prot = pud_flags(*start);
364 note_page(m, st, __pgprot(prot), 2); 383 note_page(m, st, __pgprot(prot), 3);
365 } else { 384 } else if (!kasan_page_table(m, st, pud_start)) {
366 walk_pmd_level(m, st, *start, 385 walk_pmd_level(m, st, *start,
367 P + i * PUD_LEVEL_MULT); 386 P + i * PUD_LEVEL_MULT);
368 } 387 }
369 } else 388 } else
370 note_page(m, st, __pgprot(0), 2); 389 note_page(m, st, __pgprot(0), 3);
371 390
372 prev_pud = start; 391 prev_pud = start;
373 start++; 392 start++;
@@ -385,10 +404,10 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
385static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) 404static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
386{ 405{
387 int i; 406 int i;
388 p4d_t *start; 407 p4d_t *start, *p4d_start;
389 pgprotval_t prot; 408 pgprotval_t prot;
390 409
391 start = (p4d_t *)pgd_page_vaddr(addr); 410 p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);
392 411
393 for (i = 0; i < PTRS_PER_P4D; i++) { 412 for (i = 0; i < PTRS_PER_P4D; i++) {
394 st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); 413 st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
@@ -396,7 +415,7 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
396 if (p4d_large(*start) || !p4d_present(*start)) { 415 if (p4d_large(*start) || !p4d_present(*start)) {
397 prot = p4d_flags(*start); 416 prot = p4d_flags(*start);
398 note_page(m, st, __pgprot(prot), 2); 417 note_page(m, st, __pgprot(prot), 2);
399 } else { 418 } else if (!kasan_page_table(m, st, p4d_start)) {
400 walk_pud_level(m, st, *start, 419 walk_pud_level(m, st, *start,
401 P + i * P4D_LEVEL_MULT); 420 P + i * P4D_LEVEL_MULT);
402 } 421 }
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2a1fa10c6a98..0cdf14cf3270 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -396,14 +396,18 @@ static void dump_pagetable(unsigned long address)
396 pte_t *pte; 396 pte_t *pte;
397 397
398#ifdef CONFIG_X86_PAE 398#ifdef CONFIG_X86_PAE
399 printk("*pdpt = %016Lx ", pgd_val(*pgd)); 399 pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
400 if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) 400 if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
401 goto out; 401 goto out;
402#define pr_pde pr_cont
403#else
404#define pr_pde pr_info
402#endif 405#endif
403 p4d = p4d_offset(pgd, address); 406 p4d = p4d_offset(pgd, address);
404 pud = pud_offset(p4d, address); 407 pud = pud_offset(p4d, address);
405 pmd = pmd_offset(pud, address); 408 pmd = pmd_offset(pud, address);
406 printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); 409 pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
410#undef pr_pde
407 411
408 /* 412 /*
409 * We must not directly access the pte in the highpte 413 * We must not directly access the pte in the highpte
@@ -415,9 +419,9 @@ static void dump_pagetable(unsigned long address)
415 goto out; 419 goto out;
416 420
417 pte = pte_offset_kernel(pmd, address); 421 pte = pte_offset_kernel(pmd, address);
418 printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); 422 pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
419out: 423out:
420 printk("\n"); 424 pr_cont("\n");
421} 425}
422 426
423#else /* CONFIG_X86_64: */ 427#else /* CONFIG_X86_64: */
@@ -565,7 +569,7 @@ static void dump_pagetable(unsigned long address)
565 if (bad_address(pgd)) 569 if (bad_address(pgd))
566 goto bad; 570 goto bad;
567 571
568 printk("PGD %lx ", pgd_val(*pgd)); 572 pr_info("PGD %lx ", pgd_val(*pgd));
569 573
570 if (!pgd_present(*pgd)) 574 if (!pgd_present(*pgd))
571 goto out; 575 goto out;
@@ -574,7 +578,7 @@ static void dump_pagetable(unsigned long address)
574 if (bad_address(p4d)) 578 if (bad_address(p4d))
575 goto bad; 579 goto bad;
576 580
577 printk("P4D %lx ", p4d_val(*p4d)); 581 pr_cont("P4D %lx ", p4d_val(*p4d));
578 if (!p4d_present(*p4d) || p4d_large(*p4d)) 582 if (!p4d_present(*p4d) || p4d_large(*p4d))
579 goto out; 583 goto out;
580 584
@@ -582,7 +586,7 @@ static void dump_pagetable(unsigned long address)
582 if (bad_address(pud)) 586 if (bad_address(pud))
583 goto bad; 587 goto bad;
584 588
585 printk("PUD %lx ", pud_val(*pud)); 589 pr_cont("PUD %lx ", pud_val(*pud));
586 if (!pud_present(*pud) || pud_large(*pud)) 590 if (!pud_present(*pud) || pud_large(*pud))
587 goto out; 591 goto out;
588 592
@@ -590,7 +594,7 @@ static void dump_pagetable(unsigned long address)
590 if (bad_address(pmd)) 594 if (bad_address(pmd))
591 goto bad; 595 goto bad;
592 596
593 printk("PMD %lx ", pmd_val(*pmd)); 597 pr_cont("PMD %lx ", pmd_val(*pmd));
594 if (!pmd_present(*pmd) || pmd_large(*pmd)) 598 if (!pmd_present(*pmd) || pmd_large(*pmd))
595 goto out; 599 goto out;
596 600
@@ -598,12 +602,12 @@ static void dump_pagetable(unsigned long address)
598 if (bad_address(pte)) 602 if (bad_address(pte))
599 goto bad; 603 goto bad;
600 604
601 printk("PTE %lx", pte_val(*pte)); 605 pr_cont("PTE %lx", pte_val(*pte));
602out: 606out:
603 printk("\n"); 607 pr_cont("\n");
604 return; 608 return;
605bad: 609bad:
606 printk("BAD\n"); 610 pr_info("BAD\n");
607} 611}
608 612
609#endif /* CONFIG_X86_64 */ 613#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 2824607df108..6d06cf33e3de 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -18,6 +18,7 @@
18#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
19#include <asm/pgalloc.h> 19#include <asm/pgalloc.h>
20#include <asm/elf.h> 20#include <asm/elf.h>
21#include <asm/mpx.h>
21 22
22#if 0 /* This is just for testing */ 23#if 0 /* This is just for testing */
23struct page * 24struct page *
@@ -85,25 +86,38 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
85 info.flags = 0; 86 info.flags = 0;
86 info.length = len; 87 info.length = len;
87 info.low_limit = get_mmap_base(1); 88 info.low_limit = get_mmap_base(1);
89
90 /*
91 * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
92 * in the full address space.
93 */
88 info.high_limit = in_compat_syscall() ? 94 info.high_limit = in_compat_syscall() ?
89 tasksize_32bit() : tasksize_64bit(); 95 task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
96
90 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 97 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
91 info.align_offset = 0; 98 info.align_offset = 0;
92 return vm_unmapped_area(&info); 99 return vm_unmapped_area(&info);
93} 100}
94 101
95static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, 102static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
96 unsigned long addr0, unsigned long len, 103 unsigned long addr, unsigned long len,
97 unsigned long pgoff, unsigned long flags) 104 unsigned long pgoff, unsigned long flags)
98{ 105{
99 struct hstate *h = hstate_file(file); 106 struct hstate *h = hstate_file(file);
100 struct vm_unmapped_area_info info; 107 struct vm_unmapped_area_info info;
101 unsigned long addr;
102 108
103 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 109 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
104 info.length = len; 110 info.length = len;
105 info.low_limit = PAGE_SIZE; 111 info.low_limit = PAGE_SIZE;
106 info.high_limit = get_mmap_base(0); 112 info.high_limit = get_mmap_base(0);
113
114 /*
115 * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
116 * in the full address space.
117 */
118 if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
119 info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
120
107 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 121 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
108 info.align_offset = 0; 122 info.align_offset = 0;
109 addr = vm_unmapped_area(&info); 123 addr = vm_unmapped_area(&info);
@@ -118,7 +132,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
118 VM_BUG_ON(addr != -ENOMEM); 132 VM_BUG_ON(addr != -ENOMEM);
119 info.flags = 0; 133 info.flags = 0;
120 info.low_limit = TASK_UNMAPPED_BASE; 134 info.low_limit = TASK_UNMAPPED_BASE;
121 info.high_limit = TASK_SIZE; 135 info.high_limit = TASK_SIZE_LOW;
122 addr = vm_unmapped_area(&info); 136 addr = vm_unmapped_area(&info);
123 } 137 }
124 138
@@ -135,6 +149,11 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
135 149
136 if (len & ~huge_page_mask(h)) 150 if (len & ~huge_page_mask(h))
137 return -EINVAL; 151 return -EINVAL;
152
153 addr = mpx_unmapped_area_check(addr, len, flags);
154 if (IS_ERR_VALUE(addr))
155 return addr;
156
138 if (len > TASK_SIZE) 157 if (len > TASK_SIZE)
139 return -ENOMEM; 158 return -ENOMEM;
140 159
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index adab1595f4bd..31cea988fa36 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -51,7 +51,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
51 if (!pmd) 51 if (!pmd)
52 return -ENOMEM; 52 return -ENOMEM;
53 ident_pmd_init(info, pmd, addr, next); 53 ident_pmd_init(info, pmd, addr, next);
54 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); 54 set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag));
55 } 55 }
56 56
57 return 0; 57 return 0;
@@ -79,7 +79,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
79 if (!pud) 79 if (!pud)
80 return -ENOMEM; 80 return -ENOMEM;
81 ident_pud_init(info, pud, addr, next); 81 ident_pud_init(info, pud, addr, next);
82 set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); 82 set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag));
83 } 83 }
84 84
85 return 0; 85 return 0;
@@ -93,6 +93,10 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
93 unsigned long next; 93 unsigned long next;
94 int result; 94 int result;
95 95
96 /* Set the default pagetable flags if not supplied */
97 if (!info->kernpg_flag)
98 info->kernpg_flag = _KERNPG_TABLE;
99
96 for (; addr < end; addr = next) { 100 for (; addr < end; addr = next) {
97 pgd_t *pgd = pgd_page + pgd_index(addr); 101 pgd_t *pgd = pgd_page + pgd_index(addr);
98 p4d_t *p4d; 102 p4d_t *p4d;
@@ -116,14 +120,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
116 if (result) 120 if (result)
117 return result; 121 return result;
118 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 122 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
119 set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); 123 set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
120 } else { 124 } else {
121 /* 125 /*
122 * With p4d folded, pgd is equal to p4d. 126 * With p4d folded, pgd is equal to p4d.
123 * The pgd entry has to point to the pud page table in this case. 127 * The pgd entry has to point to the pud page table in this case.
124 */ 128 */
125 pud_t *pud = pud_offset(p4d, 0); 129 pud_t *pud = pud_offset(p4d, 0);
126 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); 130 set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag));
127 } 131 }
128 } 132 }
129 133
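kernel_ident_mapping_init() now takes the intermediate page-table flags from the caller via info->kernpg_flag, defaulting to the old _KERNPG_TABLE when the field is left at zero, which is what lets the SME and kexec paths build identity maps whose page tables themselves carry the encryption attribute. A hedged usage sketch (the allocator, its context and the remaining fields are placeholders, not taken from the patch):

	struct x86_mapping_info info = {
		.alloc_pgt_page	= alloc_pgt_page,		/* caller-supplied allocator */
		.kernpg_flag	= _KERNPG_TABLE | _PAGE_ENC,	/* encrypt the tables too */
		/* leaf page flags, allocator context etc. omitted */
	};

	if (kernel_ident_mapping_init(&info, pgd, 0, end))
		;	/* allocation failure inside the walk */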
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index bf3f1065d6ad..7777ccc0e9f9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -815,7 +815,7 @@ void __init zone_sizes_init(void)
815 815
816DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { 816DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
817 .loaded_mm = &init_mm, 817 .loaded_mm = &init_mm,
818 .state = 0, 818 .next_asid = 1,
819 .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ 819 .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
820}; 820};
821EXPORT_SYMBOL_GPL(cpu_tlbstate); 821EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 4c1b5fd0c7ad..34f0e1847dd6 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -13,6 +13,8 @@
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/mmiotrace.h> 15#include <linux/mmiotrace.h>
16#include <linux/mem_encrypt.h>
17#include <linux/efi.h>
16 18
17#include <asm/set_memory.h> 19#include <asm/set_memory.h>
18#include <asm/e820/api.h> 20#include <asm/e820/api.h>
@@ -21,6 +23,7 @@
21#include <asm/tlbflush.h> 23#include <asm/tlbflush.h>
22#include <asm/pgalloc.h> 24#include <asm/pgalloc.h>
23#include <asm/pat.h> 25#include <asm/pat.h>
26#include <asm/setup.h>
24 27
25#include "physaddr.h" 28#include "physaddr.h"
26 29
@@ -106,12 +109,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
106 } 109 }
107 110
108 /* 111 /*
109 * Don't remap the low PCI/ISA area, it's always mapped..
110 */
111 if (is_ISA_range(phys_addr, last_addr))
112 return (__force void __iomem *)phys_to_virt(phys_addr);
113
114 /*
115 * Don't allow anybody to remap normal RAM that we're using.. 112 * Don't allow anybody to remap normal RAM that we're using..
116 */ 113 */
117 pfn = phys_addr >> PAGE_SHIFT; 114 pfn = phys_addr >> PAGE_SHIFT;
@@ -340,13 +337,17 @@ void iounmap(volatile void __iomem *addr)
340 return; 337 return;
341 338
342 /* 339 /*
343 * __ioremap special-cases the PCI/ISA range by not instantiating a 340 * The PCI/ISA range special-casing was removed from __ioremap()
344 * vm_area and by simply returning an address into the kernel mapping 341 * so this check, in theory, can be removed. However, there are
345 * of ISA space. So handle that here. 342 * cases where iounmap() is called for addresses not obtained via
343 * ioremap() (vga16fb for example). Add a warning so that these
344 * cases can be caught and fixed.
346 */ 345 */
347 if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && 346 if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
348 (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) 347 (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) {
348 WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n");
349 return; 349 return;
350 }
350 351
351 addr = (volatile void __iomem *) 352 addr = (volatile void __iomem *)
352 (PAGE_MASK & (unsigned long __force)addr); 353 (PAGE_MASK & (unsigned long __force)addr);
@@ -399,12 +400,10 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
399 unsigned long offset = phys & ~PAGE_MASK; 400 unsigned long offset = phys & ~PAGE_MASK;
400 void *vaddr; 401 void *vaddr;
401 402
402 /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ 403 /* memremap() maps if RAM, otherwise falls back to ioremap() */
403 if (page_is_ram(start >> PAGE_SHIFT)) 404 vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB);
404 return __va(phys);
405 405
406 vaddr = ioremap_cache(start, PAGE_SIZE); 406 /* Only add the offset on success and return NULL if memremap() failed */
407 /* Only add the offset on success and return NULL if the ioremap() failed: */
408 if (vaddr) 407 if (vaddr)
409 vaddr += offset; 408 vaddr += offset;
410 409
@@ -413,11 +412,263 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
413 412
414void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) 413void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
415{ 414{
416 if (page_is_ram(phys >> PAGE_SHIFT)) 415 memunmap((void *)((unsigned long)addr & PAGE_MASK));
417 return; 416}
417
418/*
419 * Examine the physical address to determine if it is an area of memory
420 * that should be mapped decrypted. If the memory is not part of the
 421 * kernel usable area, it was accessed and created decrypted, so these
422 * areas should be mapped decrypted. And since the encryption key can
423 * change across reboots, persistent memory should also be mapped
424 * decrypted.
425 */
426static bool memremap_should_map_decrypted(resource_size_t phys_addr,
427 unsigned long size)
428{
429 int is_pmem;
430
431 /*
432 * Check if the address is part of a persistent memory region.
433 * This check covers areas added by E820, EFI and ACPI.
434 */
435 is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM,
436 IORES_DESC_PERSISTENT_MEMORY);
437 if (is_pmem != REGION_DISJOINT)
438 return true;
439
440 /*
441 * Check if the non-volatile attribute is set for an EFI
442 * reserved area.
443 */
444 if (efi_enabled(EFI_BOOT)) {
445 switch (efi_mem_type(phys_addr)) {
446 case EFI_RESERVED_TYPE:
447 if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV)
448 return true;
449 break;
450 default:
451 break;
452 }
453 }
454
455 /* Check if the address is outside kernel usable area */
456 switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) {
457 case E820_TYPE_RESERVED:
458 case E820_TYPE_ACPI:
459 case E820_TYPE_NVS:
460 case E820_TYPE_UNUSABLE:
461 case E820_TYPE_PRAM:
462 return true;
463 default:
464 break;
465 }
466
467 return false;
468}
469
470/*
471 * Examine the physical address to determine if it is EFI data. Check
472 * it against the boot params structure and EFI tables and memory types.
473 */
474static bool memremap_is_efi_data(resource_size_t phys_addr,
475 unsigned long size)
476{
477 u64 paddr;
478
479 /* Check if the address is part of EFI boot/runtime data */
480 if (!efi_enabled(EFI_BOOT))
481 return false;
482
483 paddr = boot_params.efi_info.efi_memmap_hi;
484 paddr <<= 32;
485 paddr |= boot_params.efi_info.efi_memmap;
486 if (phys_addr == paddr)
487 return true;
488
489 paddr = boot_params.efi_info.efi_systab_hi;
490 paddr <<= 32;
491 paddr |= boot_params.efi_info.efi_systab;
492 if (phys_addr == paddr)
493 return true;
494
495 if (efi_is_table_address(phys_addr))
496 return true;
497
498 switch (efi_mem_type(phys_addr)) {
499 case EFI_BOOT_SERVICES_DATA:
500 case EFI_RUNTIME_SERVICES_DATA:
501 return true;
502 default:
503 break;
504 }
505
506 return false;
507}
508
509/*
510 * Examine the physical address to determine if it is boot data by checking
511 * it against the boot params setup_data chain.
512 */
513static bool memremap_is_setup_data(resource_size_t phys_addr,
514 unsigned long size)
515{
516 struct setup_data *data;
517 u64 paddr, paddr_next;
518
519 paddr = boot_params.hdr.setup_data;
520 while (paddr) {
521 unsigned int len;
522
523 if (phys_addr == paddr)
524 return true;
525
526 data = memremap(paddr, sizeof(*data),
527 MEMREMAP_WB | MEMREMAP_DEC);
528
529 paddr_next = data->next;
530 len = data->len;
531
532 memunmap(data);
533
534 if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
535 return true;
536
537 paddr = paddr_next;
538 }
539
540 return false;
541}
542
543/*
544 * Examine the physical address to determine if it is boot data by checking
545 * it against the boot params setup_data chain (early boot version).
546 */
547static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
548 unsigned long size)
549{
550 struct setup_data *data;
551 u64 paddr, paddr_next;
552
553 paddr = boot_params.hdr.setup_data;
554 while (paddr) {
555 unsigned int len;
556
557 if (phys_addr == paddr)
558 return true;
559
560 data = early_memremap_decrypted(paddr, sizeof(*data));
561
562 paddr_next = data->next;
563 len = data->len;
564
565 early_memunmap(data, sizeof(*data));
566
567 if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
568 return true;
569
570 paddr = paddr_next;
571 }
572
573 return false;
574}
575
576/*
577 * Architecture function to determine if RAM remap is allowed. By default, a
578 * RAM remap will map the data as encrypted. Determine if a RAM remap should
579 * not be done so that the data will be mapped decrypted.
580 */
581bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
582 unsigned long flags)
583{
584 if (!sme_active())
585 return true;
586
587 if (flags & MEMREMAP_ENC)
588 return true;
589
590 if (flags & MEMREMAP_DEC)
591 return false;
592
593 if (memremap_is_setup_data(phys_addr, size) ||
594 memremap_is_efi_data(phys_addr, size) ||
595 memremap_should_map_decrypted(phys_addr, size))
596 return false;
597
598 return true;
599}
600
601/*
602 * Architecture override of __weak function to adjust the protection attributes
603 * used when remapping memory. By default, early_memremap() will map the data
604 * as encrypted. Determine if an encrypted mapping should not be done and set
605 * the appropriate protection attributes.
606 */
607pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
608 unsigned long size,
609 pgprot_t prot)
610{
611 if (!sme_active())
612 return prot;
613
614 if (early_memremap_is_setup_data(phys_addr, size) ||
615 memremap_is_efi_data(phys_addr, size) ||
616 memremap_should_map_decrypted(phys_addr, size))
617 prot = pgprot_decrypted(prot);
618 else
619 prot = pgprot_encrypted(prot);
620
621 return prot;
622}
623
624bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
625{
626 return arch_memremap_can_ram_remap(phys_addr, size, 0);
627}
628
629#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
630/* Remap memory with encryption */
631void __init *early_memremap_encrypted(resource_size_t phys_addr,
632 unsigned long size)
633{
634 return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC);
635}
636
637/*
638 * Remap memory with encryption and write-protected - cannot be called
639 * before pat_init() is called
640 */
641void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
642 unsigned long size)
643{
644 /* Be sure the write-protect PAT entry is set for write-protect */
645 if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
646 return NULL;
647
648 return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP);
649}
650
651/* Remap memory without encryption */
652void __init *early_memremap_decrypted(resource_size_t phys_addr,
653 unsigned long size)
654{
655 return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC);
656}
657
658/*
659 * Remap memory without encryption and write-protected - cannot be called
660 * before pat_init() is called
661 */
662void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
663 unsigned long size)
664{
665 /* Be sure the write-protect PAT entry is set for write-protect */
666 if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
667 return NULL;
418 668
419 iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); 669 return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
420} 670}
671#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */
421 672
422static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; 673static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
423 674
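Taken together, the ioremap.c additions reduce to one policy question: should a RAM remap be encrypted (the SME default) or decrypted (boot data, EFI data, persistent memory, or an explicit MEMREMAP_DEC request)? A compressed standalone model of arch_memremap_can_ram_remap()'s decision, with the flag values and the classification helpers stubbed out as assumptions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAP_ENC 0x1                       /* stand-in for MEMREMAP_ENC */
#define MAP_DEC 0x2                       /* stand-in for MEMREMAP_DEC */

/* Stubs for the real classification helpers; addresses are arbitrary. */
static bool sme_on = true;
static bool is_setup_data(uint64_t pa)        { return pa == 0x100000; }
static bool is_efi_data(uint64_t pa)          { return pa == 0x200000; }
static bool should_map_decrypted(uint64_t pa) { return pa >= 0x80000000ULL; }

/* true -> map as encrypted RAM, false -> caller must map it decrypted */
static bool can_ram_remap(uint64_t pa, unsigned long flags)
{
    if (!sme_on)
        return true;                      /* no SME: nothing to decide */
    if (flags & MAP_ENC)
        return true;                      /* caller forced encrypted */
    if (flags & MAP_DEC)
        return false;                     /* caller forced decrypted */
    return !(is_setup_data(pa) || is_efi_data(pa) || should_map_decrypted(pa));
}

int main(void)
{
    printf("ordinary RAM: %s\n", can_ram_remap(0x1000000, 0) ? "encrypted" : "decrypted");
    printf("setup_data:   %s\n", can_ram_remap(0x100000, 0) ? "encrypted" : "decrypted");
    printf("forced DEC:   %s\n", can_ram_remap(0x1000000, MAP_DEC) ? "encrypted" : "decrypted");
    return 0;
}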
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 02c9d7553409..bc84b73684b7 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -11,8 +11,8 @@
11#include <asm/e820/types.h> 11#include <asm/e820/types.h>
12#include <asm/tlbflush.h> 12#include <asm/tlbflush.h>
13#include <asm/sections.h> 13#include <asm/sections.h>
14#include <asm/pgtable.h>
14 15
15extern pgd_t early_top_pgt[PTRS_PER_PGD];
16extern struct range pfn_mapped[E820_MAX_ENTRIES]; 16extern struct range pfn_mapped[E820_MAX_ENTRIES];
17 17
18static int __init map_range(struct range *range) 18static int __init map_range(struct range *range)
@@ -87,7 +87,7 @@ static struct notifier_block kasan_die_notifier = {
87void __init kasan_early_init(void) 87void __init kasan_early_init(void)
88{ 88{
89 int i; 89 int i;
90 pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; 90 pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC;
91 pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; 91 pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
92 pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; 92 pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
93 p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; 93 p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
@@ -153,7 +153,7 @@ void __init kasan_init(void)
153 */ 153 */
154 memset(kasan_zero_page, 0, PAGE_SIZE); 154 memset(kasan_zero_page, 0, PAGE_SIZE);
155 for (i = 0; i < PTRS_PER_PTE; i++) { 155 for (i = 0; i < PTRS_PER_PTE; i++) {
156 pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); 156 pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC);
157 set_pte(&kasan_zero_pte[i], pte); 157 set_pte(&kasan_zero_pte[i], pte);
158 } 158 }
159 /* Flush TLBs again to be sure that write protection applied. */ 159 /* Flush TLBs again to be sure that write protection applied. */
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
new file mode 100644
index 000000000000..0fbd09269757
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt.c
@@ -0,0 +1,593 @@
1/*
2 * AMD Memory Encryption Support
3 *
4 * Copyright (C) 2016 Advanced Micro Devices, Inc.
5 *
6 * Author: Tom Lendacky <thomas.lendacky@amd.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/linkage.h>
14#include <linux/init.h>
15#include <linux/mm.h>
16#include <linux/dma-mapping.h>
17#include <linux/swiotlb.h>
18#include <linux/mem_encrypt.h>
19
20#include <asm/tlbflush.h>
21#include <asm/fixmap.h>
22#include <asm/setup.h>
23#include <asm/bootparam.h>
24#include <asm/set_memory.h>
25#include <asm/cacheflush.h>
26#include <asm/sections.h>
27#include <asm/processor-flags.h>
28#include <asm/msr.h>
29#include <asm/cmdline.h>
30
31static char sme_cmdline_arg[] __initdata = "mem_encrypt";
32static char sme_cmdline_on[] __initdata = "on";
33static char sme_cmdline_off[] __initdata = "off";
34
35/*
36 * Since SME related variables are set early in the boot process they must
37 * reside in the .data section so as not to be zeroed out when the .bss
38 * section is later cleared.
39 */
40unsigned long sme_me_mask __section(.data) = 0;
41EXPORT_SYMBOL_GPL(sme_me_mask);
42
43/* Buffer used for early in-place encryption by BSP, no locking needed */
44static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
45
46/*
47 * This routine does not change the underlying encryption setting of the
48 * page(s) that map this memory. It assumes that eventually the memory is
49 * meant to be accessed as either encrypted or decrypted but the contents
50 * are currently not in the desired state.
51 *
52 * This routine follows the steps outlined in the AMD64 Architecture
53 * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
54 */
55static void __init __sme_early_enc_dec(resource_size_t paddr,
56 unsigned long size, bool enc)
57{
58 void *src, *dst;
59 size_t len;
60
61 if (!sme_me_mask)
62 return;
63
64 local_flush_tlb();
65 wbinvd();
66
67 /*
 68	 * There are a limited number of early mapping slots, so map (at most)
 69	 * one page at a time.
70 */
71 while (size) {
72 len = min_t(size_t, sizeof(sme_early_buffer), size);
73
74 /*
75 * Create mappings for the current and desired format of
76 * the memory. Use a write-protected mapping for the source.
77 */
78 src = enc ? early_memremap_decrypted_wp(paddr, len) :
79 early_memremap_encrypted_wp(paddr, len);
80
81 dst = enc ? early_memremap_encrypted(paddr, len) :
82 early_memremap_decrypted(paddr, len);
83
84 /*
85 * If a mapping can't be obtained to perform the operation,
86 * then eventual access of that area in the desired mode
87 * will cause a crash.
88 */
89 BUG_ON(!src || !dst);
90
91 /*
92 * Use a temporary buffer, of cache-line multiple size, to
93 * avoid data corruption as documented in the APM.
94 */
95 memcpy(sme_early_buffer, src, len);
96 memcpy(dst, sme_early_buffer, len);
97
98 early_memunmap(dst, len);
99 early_memunmap(src, len);
100
101 paddr += len;
102 size -= len;
103 }
104}
105
106void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
107{
108 __sme_early_enc_dec(paddr, size, true);
109}
110
111void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
112{
113 __sme_early_enc_dec(paddr, size, false);
114}
115
116static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
117 bool map)
118{
119 unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
120 pmdval_t pmd_flags, pmd;
121
122 /* Use early_pmd_flags but remove the encryption mask */
123 pmd_flags = __sme_clr(early_pmd_flags);
124
125 do {
126 pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
127 __early_make_pgtable((unsigned long)vaddr, pmd);
128
129 vaddr += PMD_SIZE;
130 paddr += PMD_SIZE;
131 size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
132 } while (size);
133
134 __native_flush_tlb();
135}
136
137void __init sme_unmap_bootdata(char *real_mode_data)
138{
139 struct boot_params *boot_data;
140 unsigned long cmdline_paddr;
141
142 if (!sme_active())
143 return;
144
145 /* Get the command line address before unmapping the real_mode_data */
146 boot_data = (struct boot_params *)real_mode_data;
147 cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
148
149 __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
150
151 if (!cmdline_paddr)
152 return;
153
154 __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
155}
156
157void __init sme_map_bootdata(char *real_mode_data)
158{
159 struct boot_params *boot_data;
160 unsigned long cmdline_paddr;
161
162 if (!sme_active())
163 return;
164
165 __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
166
167 /* Get the command line address after mapping the real_mode_data */
168 boot_data = (struct boot_params *)real_mode_data;
169 cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
170
171 if (!cmdline_paddr)
172 return;
173
174 __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
175}
176
177void __init sme_early_init(void)
178{
179 unsigned int i;
180
181 if (!sme_me_mask)
182 return;
183
184 early_pmd_flags = __sme_set(early_pmd_flags);
185
186 __supported_pte_mask = __sme_set(__supported_pte_mask);
187
188 /* Update the protection map with memory encryption mask */
189 for (i = 0; i < ARRAY_SIZE(protection_map); i++)
190 protection_map[i] = pgprot_encrypted(protection_map[i]);
191}
192
193/* Architecture __weak replacement functions */
194void __init mem_encrypt_init(void)
195{
196 if (!sme_me_mask)
197 return;
198
199 /* Call into SWIOTLB to update the SWIOTLB DMA buffers */
200 swiotlb_update_mem_attributes();
201
202 pr_info("AMD Secure Memory Encryption (SME) active\n");
203}
204
205void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
206{
207 WARN(PAGE_ALIGN(size) != size,
208 "size is not page-aligned (%#lx)\n", size);
209
210 /* Make the SWIOTLB buffer area decrypted */
211 set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
212}
213
214static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
215 unsigned long end)
216{
217 unsigned long pgd_start, pgd_end, pgd_size;
218 pgd_t *pgd_p;
219
220 pgd_start = start & PGDIR_MASK;
221 pgd_end = end & PGDIR_MASK;
222
223 pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1);
224 pgd_size *= sizeof(pgd_t);
225
226 pgd_p = pgd_base + pgd_index(start);
227
228 memset(pgd_p, 0, pgd_size);
229}
230
231#define PGD_FLAGS _KERNPG_TABLE_NOENC
232#define P4D_FLAGS _KERNPG_TABLE_NOENC
233#define PUD_FLAGS _KERNPG_TABLE_NOENC
234#define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
235
236static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
237 unsigned long vaddr, pmdval_t pmd_val)
238{
239 pgd_t *pgd_p;
240 p4d_t *p4d_p;
241 pud_t *pud_p;
242 pmd_t *pmd_p;
243
244 pgd_p = pgd_base + pgd_index(vaddr);
245 if (native_pgd_val(*pgd_p)) {
246 if (IS_ENABLED(CONFIG_X86_5LEVEL))
247 p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
248 else
249 pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
250 } else {
251 pgd_t pgd;
252
253 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
254 p4d_p = pgtable_area;
255 memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
256 pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
257
258 pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
259 } else {
260 pud_p = pgtable_area;
261 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
262 pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
263
264 pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
265 }
266 native_set_pgd(pgd_p, pgd);
267 }
268
269 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
270 p4d_p += p4d_index(vaddr);
271 if (native_p4d_val(*p4d_p)) {
272 pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
273 } else {
274 p4d_t p4d;
275
276 pud_p = pgtable_area;
277 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
278 pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
279
280 p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
281 native_set_p4d(p4d_p, p4d);
282 }
283 }
284
285 pud_p += pud_index(vaddr);
286 if (native_pud_val(*pud_p)) {
287 if (native_pud_val(*pud_p) & _PAGE_PSE)
288 goto out;
289
290 pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
291 } else {
292 pud_t pud;
293
294 pmd_p = pgtable_area;
295 memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
296 pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
297
298 pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
299 native_set_pud(pud_p, pud);
300 }
301
302 pmd_p += pmd_index(vaddr);
303 if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
304 native_set_pmd(pmd_p, native_make_pmd(pmd_val));
305
306out:
307 return pgtable_area;
308}
309
310static unsigned long __init sme_pgtable_calc(unsigned long len)
311{
312 unsigned long p4d_size, pud_size, pmd_size;
313 unsigned long total;
314
315 /*
316 * Perform a relatively simplistic calculation of the pagetable
 317	 * entries that are needed. The mappings will be covered by 2MB
318 * PMD entries so we can conservatively calculate the required
319 * number of P4D, PUD and PMD structures needed to perform the
320 * mappings. Incrementing the count for each covers the case where
321 * the addresses cross entries.
322 */
323 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
324 p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
325 p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
326 pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
327 pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
328 } else {
329 p4d_size = 0;
330 pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
331 pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
332 }
333 pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
334 pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
335
336 total = p4d_size + pud_size + pmd_size;
337
338 /*
339 * Now calculate the added pagetable structures needed to populate
340 * the new pagetables.
341 */
342 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
343 p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
344 p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
345 pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
346 pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
347 } else {
348 p4d_size = 0;
349 pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
350 pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
351 }
352 pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
353 pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
354
355 total += p4d_size + pud_size + pmd_size;
356
357 return total;
358}
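For the common 4-level case the calculation above boils down to one PUD page per PGDIR_SIZE span plus one, and one PMD page per PUD_SIZE span plus one, applied once for the mapping itself and once more for the tables that map those tables. A standalone version with the usual x86-64 4-level constants written out here as assumptions:

#include <stdio.h>

#define PGDIR_SIZE (1ULL << 39)            /* 512 GB per PGD entry (4-level) */
#define PUD_SIZE   (1ULL << 30)            /* 1 GB per PUD entry */
#define TABLE_SZ   (512ULL * 8ULL)         /* 4 KB per page-table page */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

static unsigned long long pgtable_calc(unsigned long long len)
{
    /* Tables needed to map 'len' bytes with 2 MB PMD entries. */
    unsigned long long pud = (ALIGN_UP(len, PGDIR_SIZE) / PGDIR_SIZE + 1) * TABLE_SZ;
    unsigned long long pmd = (ALIGN_UP(len, PUD_SIZE) / PUD_SIZE + 1) * TABLE_SZ;
    unsigned long long total = pud + pmd;

    /* Tables needed to map the tables themselves. */
    pud = (ALIGN_UP(total, PGDIR_SIZE) / PGDIR_SIZE) * TABLE_SZ;
    pmd = (ALIGN_UP(total, PUD_SIZE) / PUD_SIZE) * TABLE_SZ;
    return total + pud + pmd;
}

int main(void)
{
    printf("64 MB -> %llu KB of tables\n", pgtable_calc(64ULL << 20) >> 10);
    printf("2 GB  -> %llu KB of tables\n", pgtable_calc(2ULL << 30) >> 10);
    return 0;
}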
359
360void __init sme_encrypt_kernel(void)
361{
362 unsigned long workarea_start, workarea_end, workarea_len;
363 unsigned long execute_start, execute_end, execute_len;
364 unsigned long kernel_start, kernel_end, kernel_len;
365 unsigned long pgtable_area_len;
366 unsigned long paddr, pmd_flags;
367 unsigned long decrypted_base;
368 void *pgtable_area;
369 pgd_t *pgd;
370
371 if (!sme_active())
372 return;
373
374 /*
375 * Prepare for encrypting the kernel by building new pagetables with
376 * the necessary attributes needed to encrypt the kernel in place.
377 *
378 * One range of virtual addresses will map the memory occupied
379 * by the kernel as encrypted.
380 *
381 * Another range of virtual addresses will map the memory occupied
382 * by the kernel as decrypted and write-protected.
383 *
 384	 * The use of the write-protect attribute will prevent any of the
385 * memory from being cached.
386 */
387
 388	 /* Physical addresses give us the identity-mapped virtual addresses */
389 kernel_start = __pa_symbol(_text);
390 kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
391 kernel_len = kernel_end - kernel_start;
392
393 /* Set the encryption workarea to be immediately after the kernel */
394 workarea_start = kernel_end;
395
396 /*
397 * Calculate required number of workarea bytes needed:
398 * executable encryption area size:
399 * stack page (PAGE_SIZE)
400 * encryption routine page (PAGE_SIZE)
401 * intermediate copy buffer (PMD_PAGE_SIZE)
402 * pagetable structures for the encryption of the kernel
403 * pagetable structures for workarea (in case not currently mapped)
404 */
405 execute_start = workarea_start;
406 execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
407 execute_len = execute_end - execute_start;
408
409 /*
410 * One PGD for both encrypted and decrypted mappings and a set of
411 * PUDs and PMDs for each of the encrypted and decrypted mappings.
412 */
413 pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
414 pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
415
416 /* PUDs and PMDs needed in the current pagetables for the workarea */
417 pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
418
419 /*
420 * The total workarea includes the executable encryption area and
421 * the pagetable area.
422 */
423 workarea_len = execute_len + pgtable_area_len;
424 workarea_end = workarea_start + workarea_len;
425
426 /*
427 * Set the address to the start of where newly created pagetable
428 * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
429 * structures are created when the workarea is added to the current
430 * pagetables and when the new encrypted and decrypted kernel
431 * mappings are populated.
432 */
433 pgtable_area = (void *)execute_end;
434
435 /*
436 * Make sure the current pagetable structure has entries for
437 * addressing the workarea.
438 */
439 pgd = (pgd_t *)native_read_cr3_pa();
440 paddr = workarea_start;
441 while (paddr < workarea_end) {
442 pgtable_area = sme_populate_pgd(pgd, pgtable_area,
443 paddr,
444 paddr + PMD_FLAGS);
445
446 paddr += PMD_PAGE_SIZE;
447 }
448
449 /* Flush the TLB - no globals so cr3 is enough */
450 native_write_cr3(__native_read_cr3());
451
452 /*
453 * A new pagetable structure is being built to allow for the kernel
454 * to be encrypted. It starts with an empty PGD that will then be
455 * populated with new PUDs and PMDs as the encrypted and decrypted
456 * kernel mappings are created.
457 */
458 pgd = pgtable_area;
459 memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
460 pgtable_area += sizeof(*pgd) * PTRS_PER_PGD;
461
462 /* Add encrypted kernel (identity) mappings */
463 pmd_flags = PMD_FLAGS | _PAGE_ENC;
464 paddr = kernel_start;
465 while (paddr < kernel_end) {
466 pgtable_area = sme_populate_pgd(pgd, pgtable_area,
467 paddr,
468 paddr + pmd_flags);
469
470 paddr += PMD_PAGE_SIZE;
471 }
472
473 /*
474 * A different PGD index/entry must be used to get different
475 * pagetable entries for the decrypted mapping. Choose the next
476 * PGD index and convert it to a virtual address to be used as
477 * the base of the mapping.
478 */
479 decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
480 decrypted_base <<= PGDIR_SHIFT;
481
482 /* Add decrypted, write-protected kernel (non-identity) mappings */
483 pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
484 paddr = kernel_start;
485 while (paddr < kernel_end) {
486 pgtable_area = sme_populate_pgd(pgd, pgtable_area,
487 paddr + decrypted_base,
488 paddr + pmd_flags);
489
490 paddr += PMD_PAGE_SIZE;
491 }
492
493 /* Add decrypted workarea mappings to both kernel mappings */
494 paddr = workarea_start;
495 while (paddr < workarea_end) {
496 pgtable_area = sme_populate_pgd(pgd, pgtable_area,
497 paddr,
498 paddr + PMD_FLAGS);
499
500 pgtable_area = sme_populate_pgd(pgd, pgtable_area,
501 paddr + decrypted_base,
502 paddr + PMD_FLAGS);
503
504 paddr += PMD_PAGE_SIZE;
505 }
506
507 /* Perform the encryption */
508 sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
509 kernel_len, workarea_start, (unsigned long)pgd);
510
511 /*
512 * At this point we are running encrypted. Remove the mappings for
513 * the decrypted areas - all that is needed for this is to remove
514 * the PGD entry/entries.
515 */
516 sme_clear_pgd(pgd, kernel_start + decrypted_base,
517 kernel_end + decrypted_base);
518
519 sme_clear_pgd(pgd, workarea_start + decrypted_base,
520 workarea_end + decrypted_base);
521
522 /* Flush the TLB - no globals so cr3 is enough */
523 native_write_cr3(__native_read_cr3());
524}
525
526void __init __nostackprotector sme_enable(struct boot_params *bp)
527{
528 const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
529 unsigned int eax, ebx, ecx, edx;
530 bool active_by_default;
531 unsigned long me_mask;
532 char buffer[16];
533 u64 msr;
534
535 /* Check for the SME support leaf */
536 eax = 0x80000000;
537 ecx = 0;
538 native_cpuid(&eax, &ebx, &ecx, &edx);
539 if (eax < 0x8000001f)
540 return;
541
542 /*
543 * Check for the SME feature:
544 * CPUID Fn8000_001F[EAX] - Bit 0
545 * Secure Memory Encryption support
546 * CPUID Fn8000_001F[EBX] - Bits 5:0
547 * Pagetable bit position used to indicate encryption
548 */
549 eax = 0x8000001f;
550 ecx = 0;
551 native_cpuid(&eax, &ebx, &ecx, &edx);
552 if (!(eax & 1))
553 return;
554
555 me_mask = 1UL << (ebx & 0x3f);
556
557 /* Check if SME is enabled */
558 msr = __rdmsr(MSR_K8_SYSCFG);
559 if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
560 return;
561
562 /*
563 * Fixups have not been applied to phys_base yet and we're running
564 * identity mapped, so we must obtain the address to the SME command
565 * line argument data using rip-relative addressing.
566 */
567 asm ("lea sme_cmdline_arg(%%rip), %0"
568 : "=r" (cmdline_arg)
569 : "p" (sme_cmdline_arg));
570 asm ("lea sme_cmdline_on(%%rip), %0"
571 : "=r" (cmdline_on)
572 : "p" (sme_cmdline_on));
573 asm ("lea sme_cmdline_off(%%rip), %0"
574 : "=r" (cmdline_off)
575 : "p" (sme_cmdline_off));
576
577 if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
578 active_by_default = true;
579 else
580 active_by_default = false;
581
582 cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
583 ((u64)bp->ext_cmd_line_ptr << 32));
584
585 cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
586
587 if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
588 sme_me_mask = me_mask;
589 else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
590 sme_me_mask = 0;
591 else
592 sme_me_mask = active_by_default ? me_mask : 0;
593}
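The CPUID half of sme_enable()'s probe (leaf 0x8000001F, EAX bit 0 for SME support, EBX[5:0] for the C-bit position) can be reproduced from user space on x86 with the compiler's <cpuid.h> helpers; the SYSCFG MSR check has no user-space equivalent, so this sketch stops at computing the would-be sme_me_mask:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    /* Is the 0x8000001F extended leaf implemented at all? */
    if (!__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx) || eax < 0x8000001f) {
        puts("no SME/SEV feature leaf");
        return 0;
    }

    __get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
    if (!(eax & 1)) {
        puts("SME not supported");
        return 0;
    }

    /* EBX[5:0] is the page-table bit used as the encryption mask. */
    unsigned int c_bit = ebx & 0x3f;
    printf("SME supported, C-bit = %u, sme_me_mask = %#llx\n",
           c_bit, 1ULL << c_bit);
    return 0;
}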
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
new file mode 100644
index 000000000000..730e6d541df1
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -0,0 +1,149 @@
1/*
2 * AMD Memory Encryption Support
3 *
4 * Copyright (C) 2016 Advanced Micro Devices, Inc.
5 *
6 * Author: Tom Lendacky <thomas.lendacky@amd.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/linkage.h>
14#include <asm/pgtable.h>
15#include <asm/page.h>
16#include <asm/processor-flags.h>
17#include <asm/msr-index.h>
18
19 .text
20 .code64
21ENTRY(sme_encrypt_execute)
22
23 /*
24 * Entry parameters:
25 * RDI - virtual address for the encrypted kernel mapping
26 * RSI - virtual address for the decrypted kernel mapping
27 * RDX - length of kernel
28 * RCX - virtual address of the encryption workarea, including:
29 * - stack page (PAGE_SIZE)
30 * - encryption routine page (PAGE_SIZE)
31 * - intermediate copy buffer (PMD_PAGE_SIZE)
 32	 * R8  - physical address of the pagetables to use for encryption
33 */
34
35 push %rbp
36 movq %rsp, %rbp /* RBP now has original stack pointer */
37
38 /* Set up a one page stack in the non-encrypted memory area */
39 movq %rcx, %rax /* Workarea stack page */
40 leaq PAGE_SIZE(%rax), %rsp /* Set new stack pointer */
41 addq $PAGE_SIZE, %rax /* Workarea encryption routine */
42
43 push %r12
44 movq %rdi, %r10 /* Encrypted kernel */
45 movq %rsi, %r11 /* Decrypted kernel */
46 movq %rdx, %r12 /* Kernel length */
47
48 /* Copy encryption routine into the workarea */
49 movq %rax, %rdi /* Workarea encryption routine */
50 leaq __enc_copy(%rip), %rsi /* Encryption routine */
51 movq $(.L__enc_copy_end - __enc_copy), %rcx /* Encryption routine length */
52 rep movsb
53
54 /* Setup registers for call */
55 movq %r10, %rdi /* Encrypted kernel */
56 movq %r11, %rsi /* Decrypted kernel */
57 movq %r8, %rdx /* Pagetables used for encryption */
58 movq %r12, %rcx /* Kernel length */
59 movq %rax, %r8 /* Workarea encryption routine */
60 addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */
61
62 call *%rax /* Call the encryption routine */
63
64 pop %r12
65
66 movq %rbp, %rsp /* Restore original stack pointer */
67 pop %rbp
68
69 ret
70ENDPROC(sme_encrypt_execute)
71
72ENTRY(__enc_copy)
73/*
74 * Routine used to encrypt kernel.
75 * This routine must be run outside of the kernel proper since
76 * the kernel will be encrypted during the process. So this
77 * routine is defined here and then copied to an area outside
78 * of the kernel where it will remain and run decrypted
79 * during execution.
80 *
81 * On entry the registers must be:
82 * RDI - virtual address for the encrypted kernel mapping
83 * RSI - virtual address for the decrypted kernel mapping
84 * RDX - address of the pagetables to use for encryption
85 * RCX - length of kernel
86 * R8 - intermediate copy buffer
87 *
88 * RAX - points to this routine
89 *
90 * The kernel will be encrypted by copying from the non-encrypted
91 * kernel space to an intermediate buffer and then copying from the
92 * intermediate buffer back to the encrypted kernel space. The physical
93 * addresses of the two kernel space mappings are the same which
94 * results in the kernel being encrypted "in place".
95 */
96 /* Enable the new page tables */
97 mov %rdx, %cr3
98
99 /* Flush any global TLBs */
100 mov %cr4, %rdx
101 andq $~X86_CR4_PGE, %rdx
102 mov %rdx, %cr4
103 orq $X86_CR4_PGE, %rdx
104 mov %rdx, %cr4
105
106 /* Set the PAT register PA5 entry to write-protect */
107 push %rcx
108 movl $MSR_IA32_CR_PAT, %ecx
109 rdmsr
110 push %rdx /* Save original PAT value */
111 andl $0xffff00ff, %edx /* Clear PA5 */
112 orl $0x00000500, %edx /* Set PA5 to WP */
113 wrmsr
114 pop %rdx /* RDX contains original PAT value */
115 pop %rcx
116
117 movq %rcx, %r9 /* Save kernel length */
118 movq %rdi, %r10 /* Save encrypted kernel address */
119 movq %rsi, %r11 /* Save decrypted kernel address */
120
121 wbinvd /* Invalidate any cache entries */
122
123 /* Copy/encrypt 2MB at a time */
1241:
125 movq %r11, %rsi /* Source - decrypted kernel */
126 movq %r8, %rdi /* Dest - intermediate copy buffer */
127 movq $PMD_PAGE_SIZE, %rcx /* 2MB length */
128 rep movsb
129
130 movq %r8, %rsi /* Source - intermediate copy buffer */
131 movq %r10, %rdi /* Dest - encrypted kernel */
132 movq $PMD_PAGE_SIZE, %rcx /* 2MB length */
133 rep movsb
134
135 addq $PMD_PAGE_SIZE, %r11
136 addq $PMD_PAGE_SIZE, %r10
137 subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */
138 jnz 1b /* Kernel length not zero? */
139
140 /* Restore PAT register */
141 push %rdx /* Save original PAT value */
142 movl $MSR_IA32_CR_PAT, %ecx
143 rdmsr
144 pop %rdx /* Restore original PAT value */
145 wrmsr
146
147 ret
148.L__enc_copy_end:
149ENDPROC(__enc_copy)
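Stripped of the paging and PAT details, __enc_copy's data flow is: the same physical bytes are visible through a decrypted view and an encrypted view, and each 2 MB chunk is bounced through an intermediate buffer rather than copied directly between the two views, as the APM's encrypt-in-place procedure requires. A toy model in which the hardware transform is faked with an XOR; chunk size, key and names are arbitrary assumptions:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CHUNK 16                     /* stands in for the 2 MB PMD_PAGE_SIZE chunk */

/*
 * Toy stand-in for the hardware: writes through the "encrypted" view are
 * transformed on the way to memory.  The XOR is only an illustration.
 */
static void write_encrypted(uint8_t *mem, const uint8_t *src, size_t len)
{
    for (size_t i = 0; i < len; i++)
        mem[i] = src[i] ^ 0xAA;
}

/* Encrypt 'len' bytes in place, one chunk at a time, via a bounce buffer. */
static void encrypt_in_place(uint8_t *mem, size_t len)
{
    uint8_t bounce[CHUNK];

    while (len) {
        size_t n = len < CHUNK ? len : CHUNK;

        memcpy(bounce, mem, n);          /* read through the decrypted view   */
        write_encrypted(mem, bounce, n); /* write back via the encrypted view */
        mem += n;
        len -= n;
    }
}

int main(void)
{
    uint8_t image[40];

    memset(image, 0x11, sizeof(image));
    encrypt_in_place(image, sizeof(image));
    printf("first byte after encryption: %#x\n", (unsigned)image[0]); /* 0x11 ^ 0xAA */
    return 0;
}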
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index a88cfbfbd078..a99679826846 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -37,21 +37,21 @@ struct va_alignment __read_mostly va_align = {
37 .flags = -1, 37 .flags = -1,
38}; 38};
39 39
40unsigned long tasksize_32bit(void) 40unsigned long task_size_32bit(void)
41{ 41{
42 return IA32_PAGE_OFFSET; 42 return IA32_PAGE_OFFSET;
43} 43}
44 44
45unsigned long tasksize_64bit(void) 45unsigned long task_size_64bit(int full_addr_space)
46{ 46{
47 return TASK_SIZE_MAX; 47 return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW;
48} 48}
49 49
50static unsigned long stack_maxrandom_size(unsigned long task_size) 50static unsigned long stack_maxrandom_size(unsigned long task_size)
51{ 51{
52 unsigned long max = 0; 52 unsigned long max = 0;
53 if (current->flags & PF_RANDOMIZE) { 53 if (current->flags & PF_RANDOMIZE) {
54 max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); 54 max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit());
55 max <<= PAGE_SHIFT; 55 max <<= PAGE_SHIFT;
56 } 56 }
57 57
@@ -141,7 +141,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
141 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 141 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
142 142
143 arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, 143 arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
144 arch_rnd(mmap64_rnd_bits), tasksize_64bit()); 144 arch_rnd(mmap64_rnd_bits), task_size_64bit(0));
145 145
146#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES 146#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
147 /* 147 /*
@@ -151,7 +151,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
151 * mmap_base, the compat syscall uses mmap_compat_base. 151 * mmap_base, the compat syscall uses mmap_compat_base.
152 */ 152 */
153 arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, 153 arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
154 arch_rnd(mmap32_rnd_bits), tasksize_32bit()); 154 arch_rnd(mmap32_rnd_bits), task_size_32bit());
155#endif 155#endif
156} 156}
157 157
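task_size_64bit() now distinguishes the legacy 47-bit map window from the full address space that 5-level paging exposes, and only callers that explicitly pass full_addr_space get the latter. A quick standalone check of the numbers involved, with the usual x86-64 limits written out as assumptions:

#include <stdio.h>

#define PAGE_SZ            4096ULL
#define DEFAULT_MAP_WINDOW ((1ULL << 47) - PAGE_SZ)   /* legacy 47-bit limit  */
#define TASK_SIZE_MAX_5LVL ((1ULL << 56) - PAGE_SZ)   /* 5-level (LA57) limit */

static unsigned long long task_size_64bit(int full_addr_space)
{
    return full_addr_space ? TASK_SIZE_MAX_5LVL : DEFAULT_MAP_WINDOW;
}

int main(void)
{
    printf("default window: %#llx\n", task_size_64bit(0));
    printf("full (LA57):    %#llx\n", task_size_64bit(1));
    return 0;
}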
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 1c34b767c84c..9ceaa955d2ba 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -355,10 +355,19 @@ int mpx_enable_management(void)
355 */ 355 */
356 bd_base = mpx_get_bounds_dir(); 356 bd_base = mpx_get_bounds_dir();
357 down_write(&mm->mmap_sem); 357 down_write(&mm->mmap_sem);
358
359 /* MPX doesn't support addresses above 47 bits yet. */
360 if (find_vma(mm, DEFAULT_MAP_WINDOW)) {
361 pr_warn_once("%s (%d): MPX cannot handle addresses "
362 "above 47-bits. Disabling.",
363 current->comm, current->pid);
364 ret = -ENXIO;
365 goto out;
366 }
358 mm->context.bd_addr = bd_base; 367 mm->context.bd_addr = bd_base;
359 if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) 368 if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR)
360 ret = -ENXIO; 369 ret = -ENXIO;
361 370out:
362 up_write(&mm->mmap_sem); 371 up_write(&mm->mmap_sem);
363 return ret; 372 return ret;
364} 373}
@@ -1030,3 +1039,25 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
1030 if (ret) 1039 if (ret)
1031 force_sig(SIGSEGV, current); 1040 force_sig(SIGSEGV, current);
1032} 1041}
1042
1043/* MPX cannot handle addresses above 47 bits yet. */
1044unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
1045 unsigned long flags)
1046{
1047 if (!kernel_managing_mpx_tables(current->mm))
1048 return addr;
1049 if (addr + len <= DEFAULT_MAP_WINDOW)
1050 return addr;
1051 if (flags & MAP_FIXED)
1052 return -ENOMEM;
1053
1054 /*
1055 * Requested len is larger than the whole area we're allowed to map in.
 1056	 * Resetting the hinting address wouldn't do much good -- fail early.
1057 */
1058 if (len > DEFAULT_MAP_WINDOW)
1059 return -ENOMEM;
1060
1061 /* Look for unmap area within DEFAULT_MAP_WINDOW */
1062 return 0;
1063}
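mpx_unmapped_area_check() acts as a gatekeeper: while MPX manages the bounds tables, a hint above the 47-bit window is either dropped (so the search falls back below the window) or, for MAP_FIXED and oversized requests, rejected outright. A standalone model of that decision; the window constant, flag value and error sentinel are assumptions:

#include <stdio.h>

#define DEFAULT_MAP_WINDOW ((1ULL << 47) - 4096ULL)  /* assumed 47-bit limit */
#define MAP_FIXED_FLAG     0x10ULL                   /* stand-in for MAP_FIXED */
#define ENOMEM_ERR         (~0ULL)                   /* stand-in for -ENOMEM */

/* Returns the hint to use, 0 to let the allocator search below the window,
 * or ENOMEM_ERR when the request cannot be satisfied at all. */
static unsigned long long mpx_area_check(unsigned long long addr,
                                         unsigned long long len,
                                         unsigned long long flags,
                                         int mpx_in_use)
{
    if (!mpx_in_use)
        return addr;                 /* MPX not active: no restriction */
    if (addr + len <= DEFAULT_MAP_WINDOW)
        return addr;                 /* already below the 47-bit window */
    if (flags & MAP_FIXED_FLAG)
        return ENOMEM_ERR;           /* fixed mapping above the window: fail */
    if (len > DEFAULT_MAP_WINDOW)
        return ENOMEM_ERR;           /* cannot fit below the window at all */
    return 0;                        /* drop the hint, search below the window */
}

int main(void)
{
    printf("low hint kept:   %#llx\n", mpx_area_check(0x10000000ULL, 0x1000, 0, 1));
    printf("high hint reset: %#llx\n", mpx_area_check(1ULL << 50, 0x1000, 0, 1));
    return 0;
}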
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 757b0bcdf712..dfb7d657cf43 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1775,6 +1775,70 @@ int set_memory_4k(unsigned long addr, int numpages)
1775 __pgprot(0), 1, 0, NULL); 1775 __pgprot(0), 1, 0, NULL);
1776} 1776}
1777 1777
1778static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
1779{
1780 struct cpa_data cpa;
1781 unsigned long start;
1782 int ret;
1783
1784 /* Nothing to do if the SME is not active */
1785 if (!sme_active())
1786 return 0;
1787
1788 /* Should not be working on unaligned addresses */
1789 if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
1790 addr &= PAGE_MASK;
1791
1792 start = addr;
1793
1794 memset(&cpa, 0, sizeof(cpa));
1795 cpa.vaddr = &addr;
1796 cpa.numpages = numpages;
1797 cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0);
1798 cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC);
1799 cpa.pgd = init_mm.pgd;
1800
1801 /* Must avoid aliasing mappings in the highmem code */
1802 kmap_flush_unused();
1803 vm_unmap_aliases();
1804
1805 /*
1806 * Before changing the encryption attribute, we need to flush caches.
1807 */
1808 if (static_cpu_has(X86_FEATURE_CLFLUSH))
1809 cpa_flush_range(start, numpages, 1);
1810 else
1811 cpa_flush_all(1);
1812
1813 ret = __change_page_attr_set_clr(&cpa, 1);
1814
1815 /*
1816 * After changing the encryption attribute, we need to flush TLBs
1817 * again in case any speculative TLB caching occurred (but no need
1818 * to flush caches again). We could just use cpa_flush_all(), but
1819 * in case TLB flushing gets optimized in the cpa_flush_range()
1820 * path use the same logic as above.
1821 */
1822 if (static_cpu_has(X86_FEATURE_CLFLUSH))
1823 cpa_flush_range(start, numpages, 0);
1824 else
1825 cpa_flush_all(0);
1826
1827 return ret;
1828}
1829
1830int set_memory_encrypted(unsigned long addr, int numpages)
1831{
1832 return __set_memory_enc_dec(addr, numpages, true);
1833}
1834EXPORT_SYMBOL_GPL(set_memory_encrypted);
1835
1836int set_memory_decrypted(unsigned long addr, int numpages)
1837{
1838 return __set_memory_enc_dec(addr, numpages, false);
1839}
1840EXPORT_SYMBOL_GPL(set_memory_decrypted);
1841
1778int set_pages_uc(struct page *page, int numpages) 1842int set_pages_uc(struct page *page, int numpages)
1779{ 1843{
1780 unsigned long addr = (unsigned long)page_address(page); 1844 unsigned long addr = (unsigned long)page_address(page);
@@ -2020,6 +2084,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
2020 if (!(page_flags & _PAGE_RW)) 2084 if (!(page_flags & _PAGE_RW))
2021 cpa.mask_clr = __pgprot(_PAGE_RW); 2085 cpa.mask_clr = __pgprot(_PAGE_RW);
2022 2086
2087 if (!(page_flags & _PAGE_ENC))
2088 cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);
2089
2023 cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); 2090 cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
2024 2091
2025 retval = __change_page_attr_set_clr(&cpa, 0); 2092 retval = __change_page_attr_set_clr(&cpa, 0);
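The core of __set_memory_enc_dec() is the choice of mask_set and mask_clr: encrypting sets _PAGE_ENC and clears nothing, decrypting clears _PAGE_ENC and sets nothing, and everything else is the usual change_page_attr machinery plus cache/TLB flushing. The bit manipulation on its own, as a standalone sketch (the C-bit position is assumed; real hardware reports it via CPUID 0x8000001F[EBX]):

#include <stdint.h>
#include <stdio.h>

#define C_BIT_POS 47                          /* assumed C-bit position */
#define PAGE_ENC  (1ULL << C_BIT_POS)         /* stand-in for _PAGE_ENC */

/* Apply an encrypt/decrypt attribute change to one PTE value. */
static uint64_t change_enc_attr(uint64_t pte, int enc)
{
    uint64_t mask_set = enc ? PAGE_ENC : 0;
    uint64_t mask_clr = enc ? 0 : PAGE_ENC;

    return (pte & ~mask_clr) | mask_set;
}

int main(void)
{
    uint64_t pte = 0x1234000ULL | 0x63;       /* some pfn bits + flag bits */

    printf("encrypted: %#llx\n", (unsigned long long)change_enc_attr(pte, 1));
    printf("decrypted: %#llx\n", (unsigned long long)change_enc_attr(pte, 0));
    return 0;
}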
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 45979502f64b..fe7d57a8fb60 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -293,7 +293,7 @@ void init_cache_modes(void)
293 * pat_init - Initialize PAT MSR and PAT table 293 * pat_init - Initialize PAT MSR and PAT table
294 * 294 *
295 * This function initializes PAT MSR and PAT table with an OS-defined value 295 * This function initializes PAT MSR and PAT table with an OS-defined value
296 * to enable additional cache attributes, WC and WT. 296 * to enable additional cache attributes, WC, WT and WP.
297 * 297 *
298 * This function must be called on all CPUs using the specific sequence of 298 * This function must be called on all CPUs using the specific sequence of
299 * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this 299 * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
@@ -352,7 +352,7 @@ void pat_init(void)
352 * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS 352 * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
353 * 011 3 UC : _PAGE_CACHE_MODE_UC 353 * 011 3 UC : _PAGE_CACHE_MODE_UC
354 * 100 4 WB : Reserved 354 * 100 4 WB : Reserved
355 * 101 5 WC : Reserved 355 * 101 5 WP : _PAGE_CACHE_MODE_WP
356 * 110 6 UC-: Reserved 356 * 110 6 UC-: Reserved
357 * 111 7 WT : _PAGE_CACHE_MODE_WT 357 * 111 7 WT : _PAGE_CACHE_MODE_WT
358 * 358 *
@@ -360,7 +360,7 @@ void pat_init(void)
360 * corresponding types in the presence of PAT errata. 360 * corresponding types in the presence of PAT errata.
361 */ 361 */
362 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | 362 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
363 PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT); 363 PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
364 } 364 }
365 365
366 if (!boot_cpu_done) { 366 if (!boot_cpu_done) {
@@ -744,6 +744,9 @@ EXPORT_SYMBOL(arch_io_free_memtype_wc);
744pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 744pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
745 unsigned long size, pgprot_t vma_prot) 745 unsigned long size, pgprot_t vma_prot)
746{ 746{
747 if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
748 vma_prot = pgprot_decrypted(vma_prot);
749
747 return vma_prot; 750 return vma_prot;
748} 751}
749 752
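With PAT slot 5 repurposed from a WC alias to WP, the full IA32_PAT value the kernel now programs can be written out byte by byte. The per-type encodings below are the architectural memory-type codes, included here as assumptions to be checked against the SDM/APM:

#include <stdint.h>
#include <stdio.h>

/* Architectural PAT memory-type encodings (assumed). */
enum { UC = 0x00, WC = 0x01, WT = 0x04, WP = 0x05, WB = 0x06, UC_MINUS = 0x07 };

#define PAT(slot, type) ((uint64_t)(type) << ((slot) * 8))

int main(void)
{
    /* Mirrors the new table: WB, WC, UC-, UC, WB, WP, UC-, WT */
    uint64_t pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
                   PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);

    printf("IA32_PAT = %#018llx\n", (unsigned long long)pat);
    /* Slot 5 (0x05 = WP) is also what __enc_copy forces before encrypting. */
    return 0;
}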
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 508a708eb9a6..218834a3e9ad 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -56,7 +56,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
56{ 56{
57 pgtable_page_dtor(pte); 57 pgtable_page_dtor(pte);
58 paravirt_release_pte(page_to_pfn(pte)); 58 paravirt_release_pte(page_to_pfn(pte));
59 tlb_remove_page(tlb, pte); 59 tlb_remove_table(tlb, pte);
60} 60}
61 61
62#if CONFIG_PGTABLE_LEVELS > 2 62#if CONFIG_PGTABLE_LEVELS > 2
@@ -72,21 +72,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
72 tlb->need_flush_all = 1; 72 tlb->need_flush_all = 1;
73#endif 73#endif
74 pgtable_pmd_page_dtor(page); 74 pgtable_pmd_page_dtor(page);
75 tlb_remove_page(tlb, page); 75 tlb_remove_table(tlb, page);
76} 76}
77 77
78#if CONFIG_PGTABLE_LEVELS > 3 78#if CONFIG_PGTABLE_LEVELS > 3
79void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) 79void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
80{ 80{
81 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 81 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
82 tlb_remove_page(tlb, virt_to_page(pud)); 82 tlb_remove_table(tlb, virt_to_page(pud));
83} 83}
84 84
85#if CONFIG_PGTABLE_LEVELS > 4 85#if CONFIG_PGTABLE_LEVELS > 4
86void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) 86void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
87{ 87{
88 paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); 88 paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
89 tlb_remove_page(tlb, virt_to_page(p4d)); 89 tlb_remove_table(tlb, virt_to_page(p4d));
90} 90}
91#endif /* CONFIG_PGTABLE_LEVELS > 4 */ 91#endif /* CONFIG_PGTABLE_LEVELS > 4 */
92#endif /* CONFIG_PGTABLE_LEVELS > 3 */ 92#endif /* CONFIG_PGTABLE_LEVELS > 3 */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 014d07a80053..ce104b962a17 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,42 @@
28 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi 28 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
29 */ 29 */
30 30
31atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
32
33static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
34 u16 *new_asid, bool *need_flush)
35{
36 u16 asid;
37
38 if (!static_cpu_has(X86_FEATURE_PCID)) {
39 *new_asid = 0;
40 *need_flush = true;
41 return;
42 }
43
44 for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
45 if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
46 next->context.ctx_id)
47 continue;
48
49 *new_asid = asid;
50 *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
51 next_tlb_gen);
52 return;
53 }
54
55 /*
56 * We don't currently own an ASID slot on this CPU.
57 * Allocate a slot.
58 */
59 *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
60 if (*new_asid >= TLB_NR_DYN_ASIDS) {
61 *new_asid = 0;
62 this_cpu_write(cpu_tlbstate.next_asid, 1);
63 }
64 *need_flush = true;
65}
66
31void leave_mm(int cpu) 67void leave_mm(int cpu)
32{ 68{
33 struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); 69 struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -43,12 +79,11 @@ void leave_mm(int cpu)
43 if (loaded_mm == &init_mm) 79 if (loaded_mm == &init_mm)
44 return; 80 return;
45 81
46 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) 82 /* Warn if we're not lazy. */
47 BUG(); 83 WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
48 84
49 switch_mm(NULL, &init_mm, NULL); 85 switch_mm(NULL, &init_mm, NULL);
50} 86}
51EXPORT_SYMBOL_GPL(leave_mm);
52 87
53void switch_mm(struct mm_struct *prev, struct mm_struct *next, 88void switch_mm(struct mm_struct *prev, struct mm_struct *next,
54 struct task_struct *tsk) 89 struct task_struct *tsk)
@@ -63,115 +98,219 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
63void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, 98void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
64 struct task_struct *tsk) 99 struct task_struct *tsk)
65{ 100{
66 unsigned cpu = smp_processor_id();
67 struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); 101 struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
102 u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
103 unsigned cpu = smp_processor_id();
104 u64 next_tlb_gen;
68 105
69 /* 106 /*
70 * NB: The scheduler will call us with prev == next when 107 * NB: The scheduler will call us with prev == next when switching
71 * switching from lazy TLB mode to normal mode if active_mm 108 * from lazy TLB mode to normal mode if active_mm isn't changing.
72 * isn't changing. When this happens, there is no guarantee 109 * When this happens, we don't assume that CR3 (and hence
73 * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next. 110 * cpu_tlbstate.loaded_mm) matches next.
74 * 111 *
75 * NB: leave_mm() calls us with prev == NULL and tsk == NULL. 112 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
76 */ 113 */
77 114
78 this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); 115 /* We don't want flush_tlb_func_* to run concurrently with us. */
116 if (IS_ENABLED(CONFIG_PROVE_LOCKING))
117 WARN_ON_ONCE(!irqs_disabled());
118
119 /*
120 * Verify that CR3 is what we think it is. This will catch
121 * hypothetical buggy code that directly switches to swapper_pg_dir
122 * without going through leave_mm() / switch_mm_irqs_off() or that
123 * does something like write_cr3(read_cr3_pa()).
124 */
125 VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid));
79 126
80 if (real_prev == next) { 127 if (real_prev == next) {
81 /* 128 VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
82 * There's nothing to do: we always keep the per-mm control 129 next->context.ctx_id);
83 * regs in sync with cpu_tlbstate.loaded_mm. Just 130
84 * sanity-check mm_cpumask. 131 if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
85 */ 132 /*
86 if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next)))) 133 * There's nothing to do: we weren't lazy, and we
87 cpumask_set_cpu(cpu, mm_cpumask(next)); 134 * aren't changing our mm. We don't need to flush
88 return; 135 * anything, nor do we need to update CR3, CR4, or
89 } 136 * LDTR.
137 */
138 return;
139 }
140
141 /* Resume remote flushes and then read tlb_gen. */
142 cpumask_set_cpu(cpu, mm_cpumask(next));
143 next_tlb_gen = atomic64_read(&next->context.tlb_gen);
144
145 if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
146 next_tlb_gen) {
147 /*
148 * Ideally, we'd have a flush_tlb() variant that
149 * takes the known CR3 value as input. This would
150 * be faster on Xen PV and on hypothetical CPUs
151 * on which INVPCID is fast.
152 */
153 this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
154 next_tlb_gen);
155 write_cr3(__sme_pa(next->pgd) | prev_asid);
156 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
157 TLB_FLUSH_ALL);
158 }
90 159
91 if (IS_ENABLED(CONFIG_VMAP_STACK)) {
92 /* 160 /*
93 * If our current stack is in vmalloc space and isn't 161 * We just exited lazy mode, which means that CR4 and/or LDTR
94 * mapped in the new pgd, we'll double-fault. Forcibly 162 * may be stale. (Changes to the required CR4 and LDTR states
95 * map it. 163 * are not reflected in tlb_gen.)
96 */ 164 */
97 unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); 165 } else {
98 166 u16 new_asid;
99 pgd_t *pgd = next->pgd + stack_pgd_index; 167 bool need_flush;
100 168
101 if (unlikely(pgd_none(*pgd))) 169 if (IS_ENABLED(CONFIG_VMAP_STACK)) {
102 set_pgd(pgd, init_mm.pgd[stack_pgd_index]); 170 /*
103 } 171 * If our current stack is in vmalloc space and isn't
172 * mapped in the new pgd, we'll double-fault. Forcibly
173 * map it.
174 */
175 unsigned int index = pgd_index(current_stack_pointer());
176 pgd_t *pgd = next->pgd + index;
177
178 if (unlikely(pgd_none(*pgd)))
179 set_pgd(pgd, init_mm.pgd[index]);
180 }
104 181
105 this_cpu_write(cpu_tlbstate.loaded_mm, next); 182 /* Stop remote flushes for the previous mm */
183 if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
184 cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
106 185
107 WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); 186 VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
108 cpumask_set_cpu(cpu, mm_cpumask(next));
109 187
110 /* 188 /*
111 * Re-load page tables. 189 * Start remote flushes and then read tlb_gen.
112 * 190 */
113 * This logic has an ordering constraint: 191 cpumask_set_cpu(cpu, mm_cpumask(next));
114 * 192 next_tlb_gen = atomic64_read(&next->context.tlb_gen);
115 * CPU 0: Write to a PTE for 'next' 193
116 * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. 194 choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
117 * CPU 1: set bit 1 in next's mm_cpumask 195
118 * CPU 1: load from the PTE that CPU 0 writes (implicit) 196 if (need_flush) {
119 * 197 this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
120 * We need to prevent an outcome in which CPU 1 observes 198 this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
121 * the new PTE value and CPU 0 observes bit 1 clear in 199 write_cr3(__sme_pa(next->pgd) | new_asid);
122 * mm_cpumask. (If that occurs, then the IPI will never 200 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
123 * be sent, and CPU 0's TLB will contain a stale entry.) 201 TLB_FLUSH_ALL);
124 * 202 } else {
125 * The bad outcome can occur if either CPU's load is 203 /* The new ASID is already up to date. */
126 * reordered before that CPU's store, so both CPUs must 204 write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
127 * execute full barriers to prevent this from happening. 205 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
128 * 206 }
129 * Thus, switch_mm needs a full barrier between the
130 * store to mm_cpumask and any operation that could load
131 * from next->pgd. TLB fills are special and can happen
132 * due to instruction fetches or for no reason at all,
133 * and neither LOCK nor MFENCE orders them.
134 * Fortunately, load_cr3() is serializing and gives the
135 * ordering guarantee we need.
136 */
137 load_cr3(next->pgd);
138
139 /*
140 * This gets called via leave_mm() in the idle path where RCU
141 * functions differently. Tracing normally uses RCU, so we have to
142 * call the tracepoint specially here.
143 */
144 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
145 207
146 /* Stop flush ipis for the previous mm */ 208 this_cpu_write(cpu_tlbstate.loaded_mm, next);
147 WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && 209 this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
148 real_prev != &init_mm); 210 }
149 cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
150 211
151 /* Load per-mm CR4 and LDTR state */
152 load_mm_cr4(next); 212 load_mm_cr4(next);
153 switch_ldt(real_prev, next); 213 switch_ldt(real_prev, next);
154} 214}
155 215
216/*
217 * flush_tlb_func_common()'s memory ordering requirement is that any
218 * TLB fills that happen after we flush the TLB are ordered after we
219 * read active_mm's tlb_gen. We don't need any explicit barriers
220 * because all x86 flush operations are serializing and the
221 * atomic64_read operation won't be reordered by the compiler.
222 */
156static void flush_tlb_func_common(const struct flush_tlb_info *f, 223static void flush_tlb_func_common(const struct flush_tlb_info *f,
157 bool local, enum tlb_flush_reason reason) 224 bool local, enum tlb_flush_reason reason)
158{ 225{
226 /*
227 * We have three different tlb_gen values in here. They are:
228 *
229 * - mm_tlb_gen: the latest generation.
230 * - local_tlb_gen: the generation that this CPU has already caught
231 * up to.
232 * - f->new_tlb_gen: the generation that the requester of the flush
233 * wants us to catch up to.
234 */
235 struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
236 u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
237 u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
238 u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
239
159 /* This code cannot presently handle being reentered. */ 240 /* This code cannot presently handle being reentered. */
160 VM_WARN_ON(!irqs_disabled()); 241 VM_WARN_ON(!irqs_disabled());
161 242
162 if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { 243 VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
163 leave_mm(smp_processor_id()); 244 loaded_mm->context.ctx_id);
245
246 if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
247 /*
248 * We're in lazy mode -- don't flush. We can get here on
249 * remote flushes due to races and on local flushes if a
250 * kernel thread coincidentally flushes the mm it's lazily
251 * still using.
252 */
164 return; 253 return;
165 } 254 }
166 255
167 if (f->end == TLB_FLUSH_ALL) { 256 if (unlikely(local_tlb_gen == mm_tlb_gen)) {
168 local_flush_tlb(); 257 /*
169 if (local) 258 * There's nothing to do: we're already up to date. This can
170 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); 259 * happen if two concurrent flushes happen -- the first flush to
171 trace_tlb_flush(reason, TLB_FLUSH_ALL); 260 * be handled can catch us all the way up, leaving no work for
172 } else { 261 * the second flush.
262 */
263 trace_tlb_flush(reason, 0);
264 return;
265 }
266
267 WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
268 WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
269
270 /*
271 * If we get to this point, we know that our TLB is out of date.
272 * This does not strictly imply that we need to flush (it's
273 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
274 * going to need to flush in the very near future, so we might
275 * as well get it over with.
276 *
277 * The only question is whether to do a full or partial flush.
278 *
279 * We do a partial flush if requested and two extra conditions
280 * are met:
281 *
282 * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
283 * we've always done all needed flushes to catch up to
284 * local_tlb_gen. If, for example, local_tlb_gen == 2 and
285 * f->new_tlb_gen == 3, then we know that the flush needed to bring
286 * us up to date for tlb_gen 3 is the partial flush we're
287 * processing.
288 *
289 * As an example of why this check is needed, suppose that there
290 * are two concurrent flushes. The first is a full flush that
291 * changes context.tlb_gen from 1 to 2. The second is a partial
292 * flush that changes context.tlb_gen from 2 to 3. If they get
293 * processed on this CPU in reverse order, we'll see
294 * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
295 * If we were to use __flush_tlb_single() and set local_tlb_gen to
296 * 3, we'd break the invariant: we'd update local_tlb_gen above
297 * 1 without the full flush that's needed for tlb_gen 2.
298 *
299 * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
300 * Partial TLB flushes are not all that much cheaper than full TLB
301 * flushes, so it seems unlikely that it would be a performance win
302 * to do a partial flush if that won't bring our TLB fully up to
303 * date. By doing a full flush instead, we can increase
304 * local_tlb_gen all the way to mm_tlb_gen and we can probably
305 * avoid another flush in the very near future.
306 */
307 if (f->end != TLB_FLUSH_ALL &&
308 f->new_tlb_gen == local_tlb_gen + 1 &&
309 f->new_tlb_gen == mm_tlb_gen) {
310 /* Partial flush */
173 unsigned long addr; 311 unsigned long addr;
174 unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; 312 unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
313
175 addr = f->start; 314 addr = f->start;
176 while (addr < f->end) { 315 while (addr < f->end) {
177 __flush_tlb_single(addr); 316 __flush_tlb_single(addr);
@@ -180,7 +319,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
180 if (local) 319 if (local)
181 count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); 320 count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
182 trace_tlb_flush(reason, nr_pages); 321 trace_tlb_flush(reason, nr_pages);
322 } else {
323 /* Full flush. */
324 local_flush_tlb();
325 if (local)
326 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
327 trace_tlb_flush(reason, TLB_FLUSH_ALL);
183 } 328 }
329
330 /* Both paths above update our state to mm_tlb_gen. */
331 this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
184} 332}
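The comment block above packs the whole decision into prose. As a rough standalone model (plain integers stand in for the kernel's atomic64_t and per-CPU state, and the names are simplified), the choice between skipping, a partial flush and a full flush reduces to:

#include <stdint.h>

#define TLB_FLUSH_ALL_MODEL (~0UL)

struct flush_req_model {
	unsigned long start, end;	/* range to invalidate */
	uint64_t new_tlb_gen;		/* generation the requester produced */
};

enum flush_kind { FLUSH_NONE, FLUSH_PARTIAL, FLUSH_FULL };

static enum flush_kind pick_flush(const struct flush_req_model *f,
				  uint64_t local_tlb_gen, uint64_t mm_tlb_gen)
{
	/* Already caught up: a racing flush did our work for us. */
	if (local_tlb_gen == mm_tlb_gen)
		return FLUSH_NONE;

	/*
	 * Partial only if the request is ranged, covers exactly the next
	 * generation we need, and leaves us fully up to date afterwards.
	 */
	if (f->end != TLB_FLUSH_ALL_MODEL &&
	    f->new_tlb_gen == local_tlb_gen + 1 &&
	    f->new_tlb_gen == mm_tlb_gen)
		return FLUSH_PARTIAL;

	return FLUSH_FULL;
}

Whichever branch runs, flush_tlb_func_common() then publishes mm_tlb_gen as this CPU's local generation, which is what lets the "nothing to do" case trigger on the next racing request.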
185 333
186static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) 334static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
@@ -214,6 +362,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
214 (info->end - info->start) >> PAGE_SHIFT); 362 (info->end - info->start) >> PAGE_SHIFT);
215 363
216 if (is_uv_system()) { 364 if (is_uv_system()) {
365 /*
366 * This whole special case is confused. UV has a "Broadcast
367 * Assist Unit", which seems to be a fancy way to send IPIs.
368 * Back when x86 used an explicit TLB flush IPI, UV was
369 * optimized to use its own mechanism. These days, x86 uses
370 * smp_call_function_many(), but UV still uses a manual IPI,
371 * and that IPI's action is out of date -- it does a manual
372 * flush instead of calling flush_tlb_func_remote(). This
373 * means that the percpu tlb_gen variables won't be updated
374 * and we'll do pointless flushes on future context switches.
375 *
376 * Rather than hooking native_flush_tlb_others() here, I think
377 * that UV should be updated so that smp_call_function_many(),
378 * etc, are optimal on UV.
379 */
217 unsigned int cpu; 380 unsigned int cpu;
218 381
219 cpu = smp_processor_id(); 382 cpu = smp_processor_id();
@@ -250,8 +413,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
250 413
251 cpu = get_cpu(); 414 cpu = get_cpu();
252 415
253 /* Synchronize with switch_mm. */ 416 /* This is also a barrier that synchronizes with switch_mm(). */
254 smp_mb(); 417 info.new_tlb_gen = inc_mm_tlb_gen(mm);
255 418
256 /* Should we flush just the requested range? */ 419 /* Should we flush just the requested range? */
257 if ((end != TLB_FLUSH_ALL) && 420 if ((end != TLB_FLUSH_ALL) &&
@@ -273,6 +436,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
273 436
274 if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) 437 if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
275 flush_tlb_others(mm_cpumask(mm), &info); 438 flush_tlb_others(mm_cpumask(mm), &info);
439
276 put_cpu(); 440 put_cpu();
277} 441}
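The inc_mm_tlb_gen() helper that replaced the explicit smp_mb() is presumably an atomic increment-and-return on mm->context.tlb_gen, so the ordering now comes from the atomic read-modify-write itself. A standalone C11 model of that contract (names and types are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdint.h>

struct mm_model {
	_Atomic uint64_t tlb_gen;	/* monotonically increasing flush generation */
};

static uint64_t inc_mm_tlb_gen_model(struct mm_model *mm)
{
	/*
	 * Sequentially consistent RMW: orders the caller's earlier PTE
	 * updates before the generation bump and returns the new value,
	 * which the flush request then carries in new_tlb_gen.
	 */
	return atomic_fetch_add(&mm->tlb_gen, 1) + 1;
}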
278 442
@@ -281,8 +445,6 @@ static void do_flush_tlb_all(void *info)
281{ 445{
282 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); 446 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
283 __flush_tlb_all(); 447 __flush_tlb_all();
284 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
285 leave_mm(smp_processor_id());
286} 448}
287 449
288void flush_tlb_all(void) 450void flush_tlb_all(void)
@@ -335,6 +497,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
335 497
336 if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) 498 if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
337 flush_tlb_others(&batch->cpumask, &info); 499 flush_tlb_others(&batch->cpumask, &info);
500
338 cpumask_clear(&batch->cpumask); 501 cpumask_clear(&batch->cpumask);
339 502
340 put_cpu(); 503 put_cpu();
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index dbe2132b0ed4..7a5350d08cef 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -674,7 +674,7 @@ int pcibios_add_device(struct pci_dev *dev)
674 674
675 pa_data = boot_params.hdr.setup_data; 675 pa_data = boot_params.hdr.setup_data;
676 while (pa_data) { 676 while (pa_data) {
677 data = ioremap(pa_data, sizeof(*rom)); 677 data = memremap(pa_data, sizeof(*rom), MEMREMAP_WB);
678 if (!data) 678 if (!data)
679 return -ENOMEM; 679 return -ENOMEM;
680 680
@@ -693,7 +693,7 @@ int pcibios_add_device(struct pci_dev *dev)
693 } 693 }
694 } 694 }
695 pa_data = data->next; 695 pa_data = data->next;
696 iounmap(data); 696 memunmap(data);
697 } 697 }
698 set_dma_domain_ops(dev); 698 set_dma_domain_ops(dev);
699 set_dev_domain_options(dev); 699 set_dev_domain_options(dev);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index f084d8718ac4..6217b23e85f6 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -1035,12 +1035,12 @@ void __init efi_enter_virtual_mode(void)
1035/* 1035/*
1036 * Convenience functions to obtain memory types and attributes 1036 * Convenience functions to obtain memory types and attributes
1037 */ 1037 */
1038u32 efi_mem_type(unsigned long phys_addr) 1038int efi_mem_type(unsigned long phys_addr)
1039{ 1039{
1040 efi_memory_desc_t *md; 1040 efi_memory_desc_t *md;
1041 1041
1042 if (!efi_enabled(EFI_MEMMAP)) 1042 if (!efi_enabled(EFI_MEMMAP))
1043 return 0; 1043 return -ENOTSUPP;
1044 1044
1045 for_each_efi_memory_desc(md) { 1045 for_each_efi_memory_desc(md) {
1046 if ((md->phys_addr <= phys_addr) && 1046 if ((md->phys_addr <= phys_addr) &&
@@ -1048,7 +1048,7 @@ u32 efi_mem_type(unsigned long phys_addr)
1048 (md->num_pages << EFI_PAGE_SHIFT)))) 1048 (md->num_pages << EFI_PAGE_SHIFT))))
1049 return md->type; 1049 return md->type;
1050 } 1050 }
1051 return 0; 1051 return -EINVAL;
1052} 1052}
1053 1053
1054static int __init arch_parse_efi_cmdline(char *str) 1054static int __init arch_parse_efi_cmdline(char *str)
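With the return type changed to int, a value of 0 is now a real EFI memory type (EFI_RESERVED_TYPE) rather than a "not found" marker, so callers are expected to check for negative errno values instead. A hypothetical caller, just to show the pattern (the helper name is made up):

#include <stdbool.h>

extern int efi_mem_type(unsigned long phys_addr);

/* Hypothetical helper: negative means error (-ENOTSUPP when there is
 * no EFI memmap, -EINVAL when the address is not described by it),
 * anything >= 0 is a genuine EFI memory type number. */
static bool phys_addr_has_efi_type(unsigned long phys_addr)
{
	return efi_mem_type(phys_addr) >= 0;
}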
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 9bf72f5bfedb..12e83888e5b9 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -327,7 +327,7 @@ virt_to_phys_or_null_size(void *va, unsigned long size)
327 327
328int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) 328int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
329{ 329{
330 unsigned long pfn, text; 330 unsigned long pfn, text, pf;
331 struct page *page; 331 struct page *page;
332 unsigned npages; 332 unsigned npages;
333 pgd_t *pgd; 333 pgd_t *pgd;
@@ -335,7 +335,12 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
335 if (efi_enabled(EFI_OLD_MEMMAP)) 335 if (efi_enabled(EFI_OLD_MEMMAP))
336 return 0; 336 return 0;
337 337
338 efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd); 338 /*
339 * Since the PGD is encrypted, set the encryption mask so that when
340 * this value is loaded into cr3 the PGD will be decrypted during
341 * the pagetable walk.
342 */
343 efi_scratch.efi_pgt = (pgd_t *)__sme_pa(efi_pgd);
339 pgd = efi_pgd; 344 pgd = efi_pgd;
340 345
341 /* 346 /*
@@ -345,7 +350,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
345 * phys_efi_set_virtual_address_map(). 350 * phys_efi_set_virtual_address_map().
346 */ 351 */
347 pfn = pa_memmap >> PAGE_SHIFT; 352 pfn = pa_memmap >> PAGE_SHIFT;
348 if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX | _PAGE_RW)) { 353 pf = _PAGE_NX | _PAGE_RW | _PAGE_ENC;
354 if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, pf)) {
349 pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap); 355 pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap);
350 return 1; 356 return 1;
351 } 357 }
@@ -388,7 +394,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
388 text = __pa(_text); 394 text = __pa(_text);
389 pfn = text >> PAGE_SHIFT; 395 pfn = text >> PAGE_SHIFT;
390 396
391 if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, _PAGE_RW)) { 397 pf = _PAGE_RW | _PAGE_ENC;
398 if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, pf)) {
392 pr_err("Failed to map kernel text 1:1\n"); 399 pr_err("Failed to map kernel text 1:1\n");
393 return 1; 400 return 1;
394 } 401 }
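Both hunks apply the same rule: anything the hardware page-table walker consumes must carry the encryption mask, whether that is the physical address loaded into CR3 (via __sme_pa()) or the protection bits passed to kernel_map_pages_in_pgd() (via _PAGE_ENC). A minimal sketch of the CR3 side, assuming sme_me_mask is the single C-bit and zero when SME is off:

#include <stdint.h>

/* Illustrative only, not the kernel's definition: with SME active the
 * PGD itself lives in encrypted memory, so the physical address put
 * into CR3 must carry the C-bit or the walker would read ciphertext. */
static uint64_t cr3_value_model(uint64_t pgd_pa, uint64_t sme_me_mask)
{
	return pgd_pa | sme_me_mask;	/* what __sme_pa(efi_pgd) produces */
}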
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index cd4be19c36dc..1f71980fc5e0 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -1,6 +1,7 @@
1#include <linux/io.h> 1#include <linux/io.h>
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/memblock.h> 3#include <linux/memblock.h>
4#include <linux/mem_encrypt.h>
4 5
5#include <asm/set_memory.h> 6#include <asm/set_memory.h>
6#include <asm/pgtable.h> 7#include <asm/pgtable.h>
@@ -59,6 +60,13 @@ static void __init setup_real_mode(void)
59 60
60 base = (unsigned char *)real_mode_header; 61 base = (unsigned char *)real_mode_header;
61 62
63 /*
64 * If SME is active, the trampoline area will need to be in
65 * decrypted memory in order to bring up other processors
66 * successfully.
67 */
68 set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT);
69
62 memcpy(base, real_mode_blob, size); 70 memcpy(base, real_mode_blob, size);
63 71
64 phys_base = __pa(base); 72 phys_base = __pa(base);
@@ -100,6 +108,10 @@ static void __init setup_real_mode(void)
100 trampoline_cr4_features = &trampoline_header->cr4; 108 trampoline_cr4_features = &trampoline_header->cr4;
101 *trampoline_cr4_features = mmu_cr4_features; 109 *trampoline_cr4_features = mmu_cr4_features;
102 110
111 trampoline_header->flags = 0;
112 if (sme_active())
113 trampoline_header->flags |= TH_FLAGS_SME_ACTIVE;
114
103 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); 115 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
104 trampoline_pgd[0] = trampoline_pgd_entry.pgd; 116 trampoline_pgd[0] = trampoline_pgd_entry.pgd;
105 trampoline_pgd[511] = init_top_pgt[511].pgd; 117 trampoline_pgd[511] = init_top_pgt[511].pgd;
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index dac7b20d2f9d..614fd7064d0a 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -30,6 +30,7 @@
30#include <asm/msr.h> 30#include <asm/msr.h>
31#include <asm/segment.h> 31#include <asm/segment.h>
32#include <asm/processor-flags.h> 32#include <asm/processor-flags.h>
33#include <asm/realmode.h>
33#include "realmode.h" 34#include "realmode.h"
34 35
35 .text 36 .text
@@ -92,6 +93,28 @@ ENTRY(startup_32)
92 movl %edx, %fs 93 movl %edx, %fs
93 movl %edx, %gs 94 movl %edx, %gs
94 95
96 /*
97 * Check for memory encryption support. This is a safety net in
98 * case BIOS hasn't done the necessary step of setting the bit in
99 * the MSR for this AP. If SME is active and we've gotten this far
100 * then it is safe for us to set the MSR bit and continue. If we
101 * don't we'll eventually crash trying to execute encrypted
102 * instructions.
103 */
104 bt $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags
105 jnc .Ldone
106 movl $MSR_K8_SYSCFG, %ecx
107 rdmsr
108 bts $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax
109 jc .Ldone
110
111 /*
112 * Memory encryption is enabled but the SME enable bit for this
113 * CPU has not been set. It is safe to set it, so do so.
114 */
115 wrmsr
116.Ldone:
117
95 movl pa_tr_cr4, %eax 118 movl pa_tr_cr4, %eax
96 movl %eax, %cr4 # Enable PAE mode 119 movl %eax, %cr4 # Enable PAE mode
97 120
@@ -147,6 +170,7 @@ GLOBAL(trampoline_header)
147 tr_start: .space 8 170 tr_start: .space 8
148 GLOBAL(tr_efer) .space 8 171 GLOBAL(tr_efer) .space 8
149 GLOBAL(tr_cr4) .space 4 172 GLOBAL(tr_cr4) .space 4
173 GLOBAL(tr_flags) .space 4
150END(trampoline_header) 174END(trampoline_header)
151 175
152#include "trampoline_common.S" 176#include "trampoline_common.S"
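Taken together, the realmode/init.c and trampoline_64.S changes form a small handshake: the boot CPU decrypts the trampoline, records whether SME is active in tr_flags, and each AP checks that flag and, if necessary, sets the SYSCFG memory-encryption enable bit before it turns on paging. A standalone restatement of the AP side (both bit positions below are assumptions for the example, not copied from the headers):

#include <stdint.h>

#define TH_FLAGS_SME_ACTIVE_MODEL	(1u << 0)	/* assumed flag bit */
#define SYSCFG_MEM_ENCRYPT_MODEL	(1ull << 23)	/* assumed MSR bit */

/* Returns the SYSCFG value the AP should run with: if the boot CPU says
 * SME is active but firmware did not set the enable bit on this AP, set
 * it here rather than crash later on encrypted instructions. */
static uint64_t ap_syscfg_fixup(uint32_t tr_flags, uint64_t syscfg)
{
	if ((tr_flags & TH_FLAGS_SME_ACTIVE_MODEL) &&
	    !(syscfg & SYSCFG_MEM_ENCRYPT_MODEL))
		syscfg |= SYSCFG_MEM_ENCRYPT_MODEL;

	return syscfg;
}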
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 027987638e98..1ecd419811a2 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -17,6 +17,9 @@ config XEN_PV
17 bool "Xen PV guest support" 17 bool "Xen PV guest support"
18 default y 18 default y
19 depends on XEN 19 depends on XEN
20 # XEN_PV is not ready to work with 5-level paging.
21 # Changes to hypervisor are also required.
22 depends on !X86_5LEVEL
20 select XEN_HAVE_PVMMU 23 select XEN_HAVE_PVMMU
21 select XEN_HAVE_VPMU 24 select XEN_HAVE_VPMU
22 help 25 help
@@ -75,4 +78,6 @@ config XEN_DEBUG_FS
75config XEN_PVH 78config XEN_PVH
76 bool "Support for running as a PVH guest" 79 bool "Support for running as a PVH guest"
77 depends on XEN && XEN_PVHVM && ACPI 80 depends on XEN && XEN_PVHVM && ACPI
81 # Pre-built page tables are not ready to handle 5-level paging.
82 depends on !X86_5LEVEL
78 def_bool n 83 def_bool n
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 98491521bb43..6c279c8f0a0e 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -263,6 +263,13 @@ static void __init xen_init_capabilities(void)
263 setup_clear_cpu_cap(X86_FEATURE_MTRR); 263 setup_clear_cpu_cap(X86_FEATURE_MTRR);
264 setup_clear_cpu_cap(X86_FEATURE_ACC); 264 setup_clear_cpu_cap(X86_FEATURE_ACC);
265 setup_clear_cpu_cap(X86_FEATURE_X2APIC); 265 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
266 setup_clear_cpu_cap(X86_FEATURE_SME);
267
268 /*
269 * Xen PV would need some work to support PCID: CR3 handling as well
270 * as xen_flush_tlb_others() would need updating.
271 */
272 setup_clear_cpu_cap(X86_FEATURE_PCID);
266 273
267 if (!xen_initial_domain()) 274 if (!xen_initial_domain())
268 setup_clear_cpu_cap(X86_FEATURE_ACPI); 275 setup_clear_cpu_cap(X86_FEATURE_ACPI);
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index cab28cf2cffb..e437714750f8 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1005,14 +1005,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1005 /* Get the "official" set of cpus referring to our pagetable. */ 1005 /* Get the "official" set of cpus referring to our pagetable. */
1006 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { 1006 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1007 for_each_online_cpu(cpu) { 1007 for_each_online_cpu(cpu) {
1008 if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) 1008 if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1009 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1010 continue; 1009 continue;
1011 smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1); 1010 smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
1012 } 1011 }
1013 return; 1012 return;
1014 } 1013 }
1015 cpumask_copy(mask, mm_cpumask(mm));
1016 1014
1017 /* 1015 /*
1018 * It's possible that a vcpu may have a stale reference to our 1016 * It's possible that a vcpu may have a stale reference to our
@@ -1021,6 +1019,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1021 * look at its actual current cr3 value, and force it to flush 1019 * look at its actual current cr3 value, and force it to flush
1022 * if needed. 1020 * if needed.
1023 */ 1021 */
1022 cpumask_clear(mask);
1024 for_each_online_cpu(cpu) { 1023 for_each_online_cpu(cpu) {
1025 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) 1024 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1026 cpumask_set_cpu(cpu, mask); 1025 cpumask_set_cpu(cpu, mask);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 72a8e6adebe6..a7525e95d53f 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -58,7 +58,7 @@ ENTRY(hypercall_page)
58#else 58#else
59 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) 59 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
60 /* Map the p2m table to a 512GB-aligned user address. */ 60 /* Map the p2m table to a 512GB-aligned user address. */
61 ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE) 61 ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD))
62#endif 62#endif
63#ifdef CONFIG_XEN_PV 63#ifdef CONFIG_XEN_PV
64 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) 64 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 5c8aa9cf62d7..fe3d2a40f311 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -708,8 +708,6 @@ static DEFINE_RAW_SPINLOCK(c3_lock);
708static void acpi_idle_enter_bm(struct acpi_processor *pr, 708static void acpi_idle_enter_bm(struct acpi_processor *pr,
709 struct acpi_processor_cx *cx, bool timer_bc) 709 struct acpi_processor_cx *cx, bool timer_bc)
710{ 710{
711 acpi_unlazy_tlb(smp_processor_id());
712
713 /* 711 /*
714 * Must be done before busmaster disable as we might need to 712 * Must be done before busmaster disable as we might need to
715 * access HPET ! 713 * access HPET !
diff --git a/drivers/firmware/dmi-sysfs.c b/drivers/firmware/dmi-sysfs.c
index ef76e5eecf0b..d5de6ee8466d 100644
--- a/drivers/firmware/dmi-sysfs.c
+++ b/drivers/firmware/dmi-sysfs.c
@@ -25,6 +25,7 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/list.h> 26#include <linux/list.h>
27#include <linux/io.h> 27#include <linux/io.h>
28#include <asm/dmi.h>
28 29
29#define MAX_ENTRY_TYPE 255 /* Most of these aren't used, but we consider 30#define MAX_ENTRY_TYPE 255 /* Most of these aren't used, but we consider
30 the top entry type is only 8 bits */ 31 the top entry type is only 8 bits */
@@ -380,7 +381,7 @@ static ssize_t dmi_sel_raw_read_phys32(struct dmi_sysfs_entry *entry,
380 u8 __iomem *mapped; 381 u8 __iomem *mapped;
381 ssize_t wrote = 0; 382 ssize_t wrote = 0;
382 383
383 mapped = ioremap(sel->access_method_address, sel->area_length); 384 mapped = dmi_remap(sel->access_method_address, sel->area_length);
384 if (!mapped) 385 if (!mapped)
385 return -EIO; 386 return -EIO;
386 387
@@ -390,7 +391,7 @@ static ssize_t dmi_sel_raw_read_phys32(struct dmi_sysfs_entry *entry,
390 wrote++; 391 wrote++;
391 } 392 }
392 393
393 iounmap(mapped); 394 dmi_unmap(mapped);
394 return wrote; 395 return wrote;
395} 396}
396 397
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 045d6d311bde..69d4d130e055 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -55,6 +55,25 @@ struct efi __read_mostly efi = {
55}; 55};
56EXPORT_SYMBOL(efi); 56EXPORT_SYMBOL(efi);
57 57
58static unsigned long *efi_tables[] = {
59 &efi.mps,
60 &efi.acpi,
61 &efi.acpi20,
62 &efi.smbios,
63 &efi.smbios3,
64 &efi.sal_systab,
65 &efi.boot_info,
66 &efi.hcdp,
67 &efi.uga,
68 &efi.uv_systab,
69 &efi.fw_vendor,
70 &efi.runtime,
71 &efi.config_table,
72 &efi.esrt,
73 &efi.properties_table,
74 &efi.mem_attr_table,
75};
76
58static bool disable_runtime; 77static bool disable_runtime;
59static int __init setup_noefi(char *arg) 78static int __init setup_noefi(char *arg)
60{ 79{
@@ -855,6 +874,20 @@ int efi_status_to_err(efi_status_t status)
855 return err; 874 return err;
856} 875}
857 876
877bool efi_is_table_address(unsigned long phys_addr)
878{
879 unsigned int i;
880
881 if (phys_addr == EFI_INVALID_TABLE_ADDR)
882 return false;
883
884 for (i = 0; i < ARRAY_SIZE(efi_tables); i++)
885 if (*(efi_tables[i]) == phys_addr)
886 return true;
887
888 return false;
889}
890
858#ifdef CONFIG_KEXEC 891#ifdef CONFIG_KEXEC
859static int update_efi_random_seed(struct notifier_block *nb, 892static int update_efi_random_seed(struct notifier_block *nb,
860 unsigned long code, void *unused) 893 unsigned long code, void *unused)
diff --git a/drivers/firmware/pcdp.c b/drivers/firmware/pcdp.c
index 75273a251603..e83d6aec0c13 100644
--- a/drivers/firmware/pcdp.c
+++ b/drivers/firmware/pcdp.c
@@ -95,7 +95,7 @@ efi_setup_pcdp_console(char *cmdline)
95 if (efi.hcdp == EFI_INVALID_TABLE_ADDR) 95 if (efi.hcdp == EFI_INVALID_TABLE_ADDR)
96 return -ENODEV; 96 return -ENODEV;
97 97
98 pcdp = early_ioremap(efi.hcdp, 4096); 98 pcdp = early_memremap(efi.hcdp, 4096);
99 printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, efi.hcdp); 99 printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, efi.hcdp);
100 100
101 if (strstr(cmdline, "console=hcdp")) { 101 if (strstr(cmdline, "console=hcdp")) {
@@ -131,6 +131,6 @@ efi_setup_pcdp_console(char *cmdline)
131 } 131 }
132 132
133out: 133out:
134 early_iounmap(pcdp, 4096); 134 early_memunmap(pcdp, 4096);
135 return rc; 135 return rc;
136} 136}
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 88c6d78ee2d5..c55f338e380b 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -36,6 +36,7 @@
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/shmem_fs.h> 37#include <linux/shmem_fs.h>
38#include <linux/dma-buf.h> 38#include <linux/dma-buf.h>
39#include <linux/mem_encrypt.h>
39#include <drm/drmP.h> 40#include <drm/drmP.h>
40#include <drm/drm_vma_manager.h> 41#include <drm/drm_vma_manager.h>
41#include <drm/drm_gem.h> 42#include <drm/drm_gem.h>
@@ -965,6 +966,7 @@ int drm_gem_mmap_obj(struct drm_gem_object *obj, unsigned long obj_size,
965 vma->vm_ops = dev->driver->gem_vm_ops; 966 vma->vm_ops = dev->driver->gem_vm_ops;
966 vma->vm_private_data = obj; 967 vma->vm_private_data = obj;
967 vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); 968 vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
969 vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
968 970
969 /* Take a ref for this mapping of the object, so that the fault 971 /* Take a ref for this mapping of the object, so that the fault
970 * handler can dereference the mmap offset's pointer to the object. 972 * handler can dereference the mmap offset's pointer to the object.
diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c
index 13a59ed2afbc..2660543ad86a 100644
--- a/drivers/gpu/drm/drm_vm.c
+++ b/drivers/gpu/drm/drm_vm.c
@@ -40,6 +40,7 @@
40#include <linux/efi.h> 40#include <linux/efi.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#endif 42#endif
43#include <linux/mem_encrypt.h>
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
44#include "drm_internal.h" 45#include "drm_internal.h"
45#include "drm_legacy.h" 46#include "drm_legacy.h"
@@ -58,6 +59,9 @@ static pgprot_t drm_io_prot(struct drm_local_map *map,
58{ 59{
59 pgprot_t tmp = vm_get_page_prot(vma->vm_flags); 60 pgprot_t tmp = vm_get_page_prot(vma->vm_flags);
60 61
62 /* We don't want graphics memory to be mapped encrypted */
63 tmp = pgprot_decrypted(tmp);
64
61#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) 65#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__)
62 if (map->type == _DRM_REGISTERS && !(map->flags & _DRM_WRITE_COMBINING)) 66 if (map->type == _DRM_REGISTERS && !(map->flags & _DRM_WRITE_COMBINING))
63 tmp = pgprot_noncached(tmp); 67 tmp = pgprot_noncached(tmp);
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index a01e5c90fd87..c8ebb757e36b 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -39,6 +39,7 @@
39#include <linux/rbtree.h> 39#include <linux/rbtree.h>
40#include <linux/module.h> 40#include <linux/module.h>
41#include <linux/uaccess.h> 41#include <linux/uaccess.h>
42#include <linux/mem_encrypt.h>
42 43
43#define TTM_BO_VM_NUM_PREFAULT 16 44#define TTM_BO_VM_NUM_PREFAULT 16
44 45
@@ -230,9 +231,11 @@ static int ttm_bo_vm_fault(struct vm_fault *vmf)
230 * first page. 231 * first page.
231 */ 232 */
232 for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) { 233 for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) {
233 if (bo->mem.bus.is_iomem) 234 if (bo->mem.bus.is_iomem) {
235 /* Iomem should not be marked encrypted */
236 cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot);
234 pfn = bdev->driver->io_mem_pfn(bo, page_offset); 237 pfn = bdev->driver->io_mem_pfn(bo, page_offset);
235 else { 238 } else {
236 page = ttm->pages[page_offset]; 239 page = ttm->pages[page_offset];
237 if (unlikely(!page && i == 0)) { 240 if (unlikely(!page && i == 0)) {
238 retval = VM_FAULT_OOM; 241 retval = VM_FAULT_OOM;
diff --git a/drivers/gpu/drm/udl/udl_fb.c b/drivers/gpu/drm/udl/udl_fb.c
index b7ca90db4e80..b5b335c9b2bb 100644
--- a/drivers/gpu/drm/udl/udl_fb.c
+++ b/drivers/gpu/drm/udl/udl_fb.c
@@ -14,6 +14,7 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/fb.h> 15#include <linux/fb.h>
16#include <linux/dma-buf.h> 16#include <linux/dma-buf.h>
17#include <linux/mem_encrypt.h>
17 18
18#include <drm/drmP.h> 19#include <drm/drmP.h>
19#include <drm/drm_crtc.h> 20#include <drm/drm_crtc.h>
@@ -169,6 +170,9 @@ static int udl_fb_mmap(struct fb_info *info, struct vm_area_struct *vma)
169 pr_notice("mmap() framebuffer addr:%lu size:%lu\n", 170 pr_notice("mmap() framebuffer addr:%lu size:%lu\n",
170 pos, size); 171 pos, size);
171 172
173 /* We don't want the framebuffer to be mapped encrypted */
174 vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
175
172 while (size > 0) { 176 while (size > 0) {
173 page = vmalloc_to_pfn((void *)pos); 177 page = vmalloc_to_pfn((void *)pos);
174 if (remap_pfn_range(vma, start, page, PAGE_SIZE, PAGE_SHARED)) 178 if (remap_pfn_range(vma, start, page, PAGE_SIZE, PAGE_SHARED))
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index c2ae819a871c..e87ffb3c31a9 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -913,16 +913,15 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
913 struct cpuidle_state *state = &drv->states[index]; 913 struct cpuidle_state *state = &drv->states[index];
914 unsigned long eax = flg2MWAIT(state->flags); 914 unsigned long eax = flg2MWAIT(state->flags);
915 unsigned int cstate; 915 unsigned int cstate;
916 int cpu = smp_processor_id();
917 916
918 cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1; 917 cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;
919 918
920 /* 919 /*
921 * leave_mm() to avoid costly and often unnecessary wakeups 920 * NB: if CPUIDLE_FLAG_TLB_FLUSHED is set, this idle transition
922 * for flushing the user TLB's associated with the active mm. 921 * will probably flush the TLB. It's not guaranteed to flush
922 * the TLB, though, so it's not clear that we can do anything
923 * useful with this knowledge.
923 */ 924 */
924 if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
925 leave_mm(cpu);
926 925
927 if (!(lapic_timer_reliable_states & (1 << (cstate)))) 926 if (!(lapic_timer_reliable_states & (1 << (cstate))))
928 tick_broadcast_enter(); 927 tick_broadcast_enter();
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 354cbd6392cd..4ad7e5e31943 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -575,7 +575,7 @@ static void dump_dte_entry(u16 devid)
575 575
576static void dump_command(unsigned long phys_addr) 576static void dump_command(unsigned long phys_addr)
577{ 577{
578 struct iommu_cmd *cmd = phys_to_virt(phys_addr); 578 struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
579 int i; 579 int i;
580 580
581 for (i = 0; i < 4; ++i) 581 for (i = 0; i < 4; ++i)
@@ -919,11 +919,13 @@ static void copy_cmd_to_buffer(struct amd_iommu *iommu,
919 919
920static void build_completion_wait(struct iommu_cmd *cmd, u64 address) 920static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
921{ 921{
922 u64 paddr = iommu_virt_to_phys((void *)address);
923
922 WARN_ON(address & 0x7ULL); 924 WARN_ON(address & 0x7ULL);
923 925
924 memset(cmd, 0, sizeof(*cmd)); 926 memset(cmd, 0, sizeof(*cmd));
925 cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK; 927 cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
926 cmd->data[1] = upper_32_bits(__pa(address)); 928 cmd->data[1] = upper_32_bits(paddr);
927 cmd->data[2] = 1; 929 cmd->data[2] = 1;
928 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); 930 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
929} 931}
@@ -1383,7 +1385,7 @@ static bool increase_address_space(struct protection_domain *domain,
1383 return false; 1385 return false;
1384 1386
1385 *pte = PM_LEVEL_PDE(domain->mode, 1387 *pte = PM_LEVEL_PDE(domain->mode,
1386 virt_to_phys(domain->pt_root)); 1388 iommu_virt_to_phys(domain->pt_root));
1387 domain->pt_root = pte; 1389 domain->pt_root = pte;
1388 domain->mode += 1; 1390 domain->mode += 1;
1389 domain->updated = true; 1391 domain->updated = true;
@@ -1420,7 +1422,7 @@ static u64 *alloc_pte(struct protection_domain *domain,
1420 if (!page) 1422 if (!page)
1421 return NULL; 1423 return NULL;
1422 1424
1423 __npte = PM_LEVEL_PDE(level, virt_to_phys(page)); 1425 __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
1424 1426
1425 /* pte could have been changed somewhere. */ 1427 /* pte could have been changed somewhere. */
1426 if (cmpxchg64(pte, __pte, __npte) != __pte) { 1428 if (cmpxchg64(pte, __pte, __npte) != __pte) {
@@ -1536,10 +1538,10 @@ static int iommu_map_page(struct protection_domain *dom,
1536 return -EBUSY; 1538 return -EBUSY;
1537 1539
1538 if (count > 1) { 1540 if (count > 1) {
1539 __pte = PAGE_SIZE_PTE(phys_addr, page_size); 1541 __pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size);
1540 __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC; 1542 __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
1541 } else 1543 } else
1542 __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC; 1544 __pte = __sme_set(phys_addr) | IOMMU_PTE_P | IOMMU_PTE_FC;
1543 1545
1544 if (prot & IOMMU_PROT_IR) 1546 if (prot & IOMMU_PROT_IR)
1545 __pte |= IOMMU_PTE_IR; 1547 __pte |= IOMMU_PTE_IR;
@@ -1755,7 +1757,7 @@ static void free_gcr3_tbl_level1(u64 *tbl)
1755 if (!(tbl[i] & GCR3_VALID)) 1757 if (!(tbl[i] & GCR3_VALID))
1756 continue; 1758 continue;
1757 1759
1758 ptr = __va(tbl[i] & PAGE_MASK); 1760 ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1759 1761
1760 free_page((unsigned long)ptr); 1762 free_page((unsigned long)ptr);
1761 } 1763 }
@@ -1770,7 +1772,7 @@ static void free_gcr3_tbl_level2(u64 *tbl)
1770 if (!(tbl[i] & GCR3_VALID)) 1772 if (!(tbl[i] & GCR3_VALID))
1771 continue; 1773 continue;
1772 1774
1773 ptr = __va(tbl[i] & PAGE_MASK); 1775 ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1774 1776
1775 free_gcr3_tbl_level1(ptr); 1777 free_gcr3_tbl_level1(ptr);
1776 } 1778 }
@@ -2049,7 +2051,7 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
2049 u64 flags = 0; 2051 u64 flags = 0;
2050 2052
2051 if (domain->mode != PAGE_MODE_NONE) 2053 if (domain->mode != PAGE_MODE_NONE)
2052 pte_root = virt_to_phys(domain->pt_root); 2054 pte_root = iommu_virt_to_phys(domain->pt_root);
2053 2055
2054 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) 2056 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
2055 << DEV_ENTRY_MODE_SHIFT; 2057 << DEV_ENTRY_MODE_SHIFT;
@@ -2061,7 +2063,7 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
2061 flags |= DTE_FLAG_IOTLB; 2063 flags |= DTE_FLAG_IOTLB;
2062 2064
2063 if (domain->flags & PD_IOMMUV2_MASK) { 2065 if (domain->flags & PD_IOMMUV2_MASK) {
2064 u64 gcr3 = __pa(domain->gcr3_tbl); 2066 u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
2065 u64 glx = domain->glx; 2067 u64 glx = domain->glx;
2066 u64 tmp; 2068 u64 tmp;
2067 2069
@@ -3606,10 +3608,10 @@ static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc)
3606 if (root == NULL) 3608 if (root == NULL)
3607 return NULL; 3609 return NULL;
3608 3610
3609 *pte = __pa(root) | GCR3_VALID; 3611 *pte = iommu_virt_to_phys(root) | GCR3_VALID;
3610 } 3612 }
3611 3613
3612 root = __va(*pte & PAGE_MASK); 3614 root = iommu_phys_to_virt(*pte & PAGE_MASK);
3613 3615
3614 level -= 1; 3616 level -= 1;
3615 } 3617 }
@@ -3788,7 +3790,7 @@ static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
3788 3790
3789 dte = amd_iommu_dev_table[devid].data[2]; 3791 dte = amd_iommu_dev_table[devid].data[2];
3790 dte &= ~DTE_IRQ_PHYS_ADDR_MASK; 3792 dte &= ~DTE_IRQ_PHYS_ADDR_MASK;
3791 dte |= virt_to_phys(table->table); 3793 dte |= iommu_virt_to_phys(table->table);
3792 dte |= DTE_IRQ_REMAP_INTCTL; 3794 dte |= DTE_IRQ_REMAP_INTCTL;
3793 dte |= DTE_IRQ_TABLE_LEN; 3795 dte |= DTE_IRQ_TABLE_LEN;
3794 dte |= DTE_IRQ_REMAP_ENABLE; 3796 dte |= DTE_IRQ_REMAP_ENABLE;
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 372303700566..2292a6cece76 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -30,6 +30,7 @@
30#include <linux/iommu.h> 30#include <linux/iommu.h>
31#include <linux/kmemleak.h> 31#include <linux/kmemleak.h>
32#include <linux/crash_dump.h> 32#include <linux/crash_dump.h>
33#include <linux/mem_encrypt.h>
33#include <asm/pci-direct.h> 34#include <asm/pci-direct.h>
34#include <asm/iommu.h> 35#include <asm/iommu.h>
35#include <asm/gart.h> 36#include <asm/gart.h>
@@ -348,7 +349,7 @@ static void iommu_set_device_table(struct amd_iommu *iommu)
348 349
349 BUG_ON(iommu->mmio_base == NULL); 350 BUG_ON(iommu->mmio_base == NULL);
350 351
351 entry = virt_to_phys(amd_iommu_dev_table); 352 entry = iommu_virt_to_phys(amd_iommu_dev_table);
352 entry |= (dev_table_size >> 12) - 1; 353 entry |= (dev_table_size >> 12) - 1;
353 memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET, 354 memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
354 &entry, sizeof(entry)); 355 &entry, sizeof(entry));
@@ -606,7 +607,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
606 607
607 BUG_ON(iommu->cmd_buf == NULL); 608 BUG_ON(iommu->cmd_buf == NULL);
608 609
609 entry = (u64)virt_to_phys(iommu->cmd_buf); 610 entry = iommu_virt_to_phys(iommu->cmd_buf);
610 entry |= MMIO_CMD_SIZE_512; 611 entry |= MMIO_CMD_SIZE_512;
611 612
612 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 613 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -635,7 +636,7 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu)
635 636
636 BUG_ON(iommu->evt_buf == NULL); 637 BUG_ON(iommu->evt_buf == NULL);
637 638
638 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; 639 entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
639 640
640 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, 641 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
641 &entry, sizeof(entry)); 642 &entry, sizeof(entry));
@@ -668,7 +669,7 @@ static void iommu_enable_ppr_log(struct amd_iommu *iommu)
668 if (iommu->ppr_log == NULL) 669 if (iommu->ppr_log == NULL)
669 return; 670 return;
670 671
671 entry = (u64)virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512; 672 entry = iommu_virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512;
672 673
673 memcpy_toio(iommu->mmio_base + MMIO_PPR_LOG_OFFSET, 674 memcpy_toio(iommu->mmio_base + MMIO_PPR_LOG_OFFSET,
674 &entry, sizeof(entry)); 675 &entry, sizeof(entry));
@@ -748,10 +749,10 @@ static int iommu_init_ga_log(struct amd_iommu *iommu)
748 if (!iommu->ga_log_tail) 749 if (!iommu->ga_log_tail)
749 goto err_out; 750 goto err_out;
750 751
751 entry = (u64)virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512; 752 entry = iommu_virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512;
752 memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET, 753 memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET,
753 &entry, sizeof(entry)); 754 &entry, sizeof(entry));
754 entry = ((u64)virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL; 755 entry = (iommu_virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL;
755 memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET, 756 memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET,
756 &entry, sizeof(entry)); 757 &entry, sizeof(entry));
757 writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET); 758 writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
@@ -2564,6 +2565,24 @@ static int __init amd_iommu_init(void)
2564 return ret; 2565 return ret;
2565} 2566}
2566 2567
2568static bool amd_iommu_sme_check(void)
2569{
2570 if (!sme_active() || (boot_cpu_data.x86 != 0x17))
2571 return true;
2572
2573 /* For Fam17h, a specific level of support is required */
2574 if (boot_cpu_data.microcode >= 0x08001205)
2575 return true;
2576
2577 if ((boot_cpu_data.microcode >= 0x08001126) &&
2578 (boot_cpu_data.microcode <= 0x080011ff))
2579 return true;
2580
2581 pr_notice("AMD-Vi: IOMMU not currently supported when SME is active\n");
2582
2583 return false;
2584}
2585
2567/**************************************************************************** 2586/****************************************************************************
2568 * 2587 *
2569 * Early detect code. This code runs at IOMMU detection time in the DMA 2588 * Early detect code. This code runs at IOMMU detection time in the DMA
@@ -2578,6 +2597,9 @@ int __init amd_iommu_detect(void)
2578 if (no_iommu || (iommu_detected && !gart_iommu_aperture)) 2597 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
2579 return -ENODEV; 2598 return -ENODEV;
2580 2599
2600 if (!amd_iommu_sme_check())
2601 return -ENODEV;
2602
2581 ret = iommu_go_to_state(IOMMU_IVRS_DETECTED); 2603 ret = iommu_go_to_state(IOMMU_IVRS_DETECTED);
2582 if (ret) 2604 if (ret)
2583 return ret; 2605 return ret;
diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h
index 466260f8a1df..3f12fb2338ea 100644
--- a/drivers/iommu/amd_iommu_proto.h
+++ b/drivers/iommu/amd_iommu_proto.h
@@ -87,4 +87,14 @@ static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
87 return !!(iommu->features & f); 87 return !!(iommu->features & f);
88} 88}
89 89
90static inline u64 iommu_virt_to_phys(void *vaddr)
91{
92 return (u64)__sme_set(virt_to_phys(vaddr));
93}
94
95static inline void *iommu_phys_to_virt(unsigned long paddr)
96{
97 return phys_to_virt(__sme_clr(paddr));
98}
99
90#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ 100#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index d6b873b57054..8e3a85759242 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -344,7 +344,7 @@
344 344
345#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) 345#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
346#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) 346#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
347#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) 347#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
348#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) 348#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
349 349
350#define IOMMU_PROT_MASK 0x03 350#define IOMMU_PROT_MASK 0x03
diff --git a/drivers/sfi/sfi_core.c b/drivers/sfi/sfi_core.c
index 296db7a69c27..153b3f3cc795 100644
--- a/drivers/sfi/sfi_core.c
+++ b/drivers/sfi/sfi_core.c
@@ -68,6 +68,7 @@
68#include <linux/init.h> 68#include <linux/init.h>
69#include <linux/sfi.h> 69#include <linux/sfi.h>
70#include <linux/slab.h> 70#include <linux/slab.h>
71#include <linux/io.h>
71 72
72#include "sfi_core.h" 73#include "sfi_core.h"
73 74
@@ -86,13 +87,13 @@ static struct sfi_table_simple *syst_va __read_mostly;
86/* 87/*
87 * FW creates and saves the SFI tables in memory. When these tables get 88 * FW creates and saves the SFI tables in memory. When these tables get
88 * used, they may need to be mapped to virtual address space, and the mapping 89 * used, they may need to be mapped to virtual address space, and the mapping
89 * can happen before or after the ioremap() is ready, so a flag is needed 90 * can happen before or after the memremap() is ready, so a flag is needed
90 * to indicating this 91 * to indicating this
91 */ 92 */
92static u32 sfi_use_ioremap __read_mostly; 93static u32 sfi_use_memremap __read_mostly;
93 94
94/* 95/*
95 * sfi_un/map_memory calls early_ioremap/iounmap which is a __init function 96 * sfi_un/map_memory calls early_memremap/memunmap which is a __init function
96 * and introduces section mismatch. So use __ref to make it calm. 97 * and introduces section mismatch. So use __ref to make it calm.
97 */ 98 */
98static void __iomem * __ref sfi_map_memory(u64 phys, u32 size) 99static void __iomem * __ref sfi_map_memory(u64 phys, u32 size)
@@ -100,10 +101,10 @@ static void __iomem * __ref sfi_map_memory(u64 phys, u32 size)
100 if (!phys || !size) 101 if (!phys || !size)
101 return NULL; 102 return NULL;
102 103
103 if (sfi_use_ioremap) 104 if (sfi_use_memremap)
104 return ioremap_cache(phys, size); 105 return memremap(phys, size, MEMREMAP_WB);
105 else 106 else
106 return early_ioremap(phys, size); 107 return early_memremap(phys, size);
107} 108}
108 109
109static void __ref sfi_unmap_memory(void __iomem *virt, u32 size) 110static void __ref sfi_unmap_memory(void __iomem *virt, u32 size)
@@ -111,10 +112,10 @@ static void __ref sfi_unmap_memory(void __iomem *virt, u32 size)
111 if (!virt || !size) 112 if (!virt || !size)
112 return; 113 return;
113 114
114 if (sfi_use_ioremap) 115 if (sfi_use_memremap)
115 iounmap(virt); 116 memunmap(virt);
116 else 117 else
117 early_iounmap(virt, size); 118 early_memunmap(virt, size);
118} 119}
119 120
120static void sfi_print_table_header(unsigned long long pa, 121static void sfi_print_table_header(unsigned long long pa,
@@ -507,8 +508,8 @@ void __init sfi_init_late(void)
507 length = syst_va->header.len; 508 length = syst_va->header.len;
508 sfi_unmap_memory(syst_va, sizeof(struct sfi_table_simple)); 509 sfi_unmap_memory(syst_va, sizeof(struct sfi_table_simple));
509 510
510 /* Use ioremap now after it is ready */ 511 /* Use memremap now after it is ready */
511 sfi_use_ioremap = 1; 512 sfi_use_memremap = 1;
512 syst_va = sfi_map_memory(syst_pa, length); 513 syst_va = sfi_map_memory(syst_pa, length);
513 514
514 sfi_acpi_init(); 515 sfi_acpi_init();
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index 7a42238db446..25e862c487f6 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -32,6 +32,7 @@
32#include <linux/device.h> 32#include <linux/device.h>
33#include <linux/efi.h> 33#include <linux/efi.h>
34#include <linux/fb.h> 34#include <linux/fb.h>
35#include <linux/mem_encrypt.h>
35 36
36#include <asm/fb.h> 37#include <asm/fb.h>
37 38
@@ -1396,6 +1397,12 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
1396 mutex_lock(&info->mm_lock); 1397 mutex_lock(&info->mm_lock);
1397 if (fb->fb_mmap) { 1398 if (fb->fb_mmap) {
1398 int res; 1399 int res;
1400
1401 /*
1402 * The framebuffer needs to be accessed decrypted, be sure
1403 * SME protection is removed ahead of the call
1404 */
1405 vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
1399 res = fb->fb_mmap(info, vma); 1406 res = fb->fb_mmap(info, vma);
1400 mutex_unlock(&info->mm_lock); 1407 mutex_unlock(&info->mm_lock);
1401 return res; 1408 return res;
@@ -1421,6 +1428,11 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
1421 mutex_unlock(&info->mm_lock); 1428 mutex_unlock(&info->mm_lock);
1422 1429
1423 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 1430 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
1431 /*
1432 * The framebuffer needs to be accessed decrypted, so be sure
1433 * SME protection is removed
1434 */
1435 vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
1424 fb_pgprotect(file, vma, start); 1436 fb_pgprotect(file, vma, start);
1425 1437
1426 return vm_iomap_memory(vma, start, len); 1438 return vm_iomap_memory(vma, start, len);
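The fbdev change above follows the same pattern as the DRM, TTM and UDL hunks earlier in this series: any mmap of device or framebuffer memory strips the encryption bit from vm_page_prot before pages are inserted, because the device only ever sees unencrypted bytes. A hypothetical minimal fb_mmap implementation, just to show where the call sits (not taken from any real driver):

#include <linux/fb.h>
#include <linux/mm.h>
#include <asm/pgtable.h>

static int example_fb_mmap(struct fb_info *info, struct vm_area_struct *vma)
{
	unsigned long start = info->fix.smem_start;
	unsigned long len = info->fix.smem_len;

	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
	/* Device memory must not be mapped encrypted. */
	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);

	return vm_iomap_memory(vma, start, len);
}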
diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h
index 734ad4db388c..2edef8d7fa6b 100644
--- a/include/asm-generic/early_ioremap.h
+++ b/include/asm-generic/early_ioremap.h
@@ -13,6 +13,8 @@ extern void *early_memremap(resource_size_t phys_addr,
13 unsigned long size); 13 unsigned long size);
14extern void *early_memremap_ro(resource_size_t phys_addr, 14extern void *early_memremap_ro(resource_size_t phys_addr,
15 unsigned long size); 15 unsigned long size);
16extern void *early_memremap_prot(resource_size_t phys_addr,
17 unsigned long size, unsigned long prot_val);
16extern void early_iounmap(void __iomem *addr, unsigned long size); 18extern void early_iounmap(void __iomem *addr, unsigned long size);
17extern void early_memunmap(void *addr, unsigned long size); 19extern void early_memunmap(void *addr, unsigned long size);
18 20
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 7dfa767dc680..4d7bb98f4134 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -583,6 +583,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
583#endif /* CONFIG_MMU */ 583#endif /* CONFIG_MMU */
584 584
585/* 585/*
586 * No-op macros that just return the current protection value. Defined here
587 * because these macros can be used used even if CONFIG_MMU is not defined.
588 */
589#ifndef pgprot_encrypted
590#define pgprot_encrypted(prot) (prot)
591#endif
592
593#ifndef pgprot_decrypted
594#define pgprot_decrypted(prot) (prot)
595#endif
596
597/*
586 * A facility to provide lazy MMU batching. This allows PTE updates and 598 * A facility to provide lazy MMU batching. This allows PTE updates and
587 * page invalidations to be delayed until a call to leave lazy MMU mode 599 * page invalidations to be delayed until a call to leave lazy MMU mode
588 * is issued. Some architectures may benefit from doing this, and it is 600 * is issued. Some architectures may benefit from doing this, and it is
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 310f51d42550..16d41de92ee3 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -166,6 +166,8 @@
166 166
167#if GCC_VERSION >= 40100 167#if GCC_VERSION >= 40100
168# define __compiletime_object_size(obj) __builtin_object_size(obj, 0) 168# define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
169
170#define __nostackprotector __attribute__((__optimize__("no-stack-protector")))
169#endif 171#endif
170 172
171#if GCC_VERSION >= 40300 173#if GCC_VERSION >= 40300
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index e786337cf5a7..e95a2631e545 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -501,6 +501,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
501#define __visible 501#define __visible
502#endif 502#endif
503 503
504#ifndef __nostackprotector
505# define __nostackprotector
506#endif
507
504/* 508/*
505 * Assume alignment of return value. 509 * Assume alignment of return value.
506 */ 510 */
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 03c0196a6f24..2189c79cde5d 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -10,6 +10,7 @@
10#include <linux/scatterlist.h> 10#include <linux/scatterlist.h>
11#include <linux/kmemcheck.h> 11#include <linux/kmemcheck.h>
12#include <linux/bug.h> 12#include <linux/bug.h>
13#include <linux/mem_encrypt.h>
13 14
14/** 15/**
15 * List of possible attributes associated with a DMA mapping. The semantics 16 * List of possible attributes associated with a DMA mapping. The semantics
@@ -572,6 +573,12 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
572 return 0; 573 return 0;
573} 574}
574 575
576static inline void dma_check_mask(struct device *dev, u64 mask)
577{
578 if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
579 dev_warn(dev, "SME is active, device will require DMA bounce buffers\n");
580}
581
575static inline int dma_supported(struct device *dev, u64 mask) 582static inline int dma_supported(struct device *dev, u64 mask)
576{ 583{
577 const struct dma_map_ops *ops = get_dma_ops(dev); 584 const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -588,6 +595,9 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
588{ 595{
589 if (!dev->dma_mask || !dma_supported(dev, mask)) 596 if (!dev->dma_mask || !dma_supported(dev, mask))
590 return -EIO; 597 return -EIO;
598
599 dma_check_mask(dev, mask);
600
591 *dev->dma_mask = mask; 601 *dev->dma_mask = mask;
592 return 0; 602 return 0;
593} 603}
@@ -607,6 +617,9 @@ static inline int dma_set_coherent_mask(struct device *dev, u64 mask)
607{ 617{
608 if (!dma_supported(dev, mask)) 618 if (!dma_supported(dev, mask))
609 return -EIO; 619 return -EIO;
620
621 dma_check_mask(dev, mask);
622
610 dev->coherent_dma_mask = mask; 623 dev->coherent_dma_mask = mask;
611 return 0; 624 return 0;
612} 625}
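The new dma_check_mask() boils down to one condition: if SME is active and the device's DMA mask cannot express an address with the C-bit set, the device cannot reach encrypted memory directly and its traffic will have to be bounced through SWIOTLB. A standalone restatement of that condition:

#include <stdbool.h>
#include <stdint.h>

static bool needs_sme_bounce_buffers(uint64_t dma_mask, uint64_t sme_me_mask)
{
	if (!sme_me_mask)	/* SME not active: nothing to worry about */
		return false;

	/*
	 * ((sme_me_mask << 1) - 1) is the smallest mask that still covers
	 * the C-bit; anything narrower cannot address encrypted pages.
	 */
	return dma_mask < ((sme_me_mask << 1) - 1);
}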
diff --git a/include/linux/efi.h b/include/linux/efi.h
index a686ca9a7e5c..4102b85217d5 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -985,7 +985,7 @@ static inline void efi_esrt_init(void) { }
985extern int efi_config_parse_tables(void *config_tables, int count, int sz, 985extern int efi_config_parse_tables(void *config_tables, int count, int sz,
986 efi_config_table_type_t *arch_tables); 986 efi_config_table_type_t *arch_tables);
987extern u64 efi_get_iobase (void); 987extern u64 efi_get_iobase (void);
988extern u32 efi_mem_type (unsigned long phys_addr); 988extern int efi_mem_type(unsigned long phys_addr);
989extern u64 efi_mem_attributes (unsigned long phys_addr); 989extern u64 efi_mem_attributes (unsigned long phys_addr);
990extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); 990extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size);
991extern int __init efi_uart_console_only (void); 991extern int __init efi_uart_console_only (void);
@@ -1113,6 +1113,8 @@ static inline bool efi_enabled(int feature)
1113 return test_bit(feature, &efi.flags) != 0; 1113 return test_bit(feature, &efi.flags) != 0;
1114} 1114}
1115extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused); 1115extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused);
1116
1117extern bool efi_is_table_address(unsigned long phys_addr);
1116#else 1118#else
1117static inline bool efi_enabled(int feature) 1119static inline bool efi_enabled(int feature)
1118{ 1120{
@@ -1126,6 +1128,11 @@ efi_capsule_pending(int *reset_type)
1126{ 1128{
1127 return false; 1129 return false;
1128} 1130}
1131
1132static inline bool efi_is_table_address(unsigned long phys_addr)
1133{
1134 return false;
1135}
1129#endif 1136#endif
1130 1137
1131extern int efi_status_to_err(efi_status_t status); 1138extern int efi_status_to_err(efi_status_t status);
diff --git a/include/linux/io.h b/include/linux/io.h
index 2195d9ea4aaa..32e30e8fb9db 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -157,6 +157,8 @@ enum {
157 MEMREMAP_WB = 1 << 0, 157 MEMREMAP_WB = 1 << 0,
158 MEMREMAP_WT = 1 << 1, 158 MEMREMAP_WT = 1 << 1,
159 MEMREMAP_WC = 1 << 2, 159 MEMREMAP_WC = 1 << 2,
160 MEMREMAP_ENC = 1 << 3,
161 MEMREMAP_DEC = 1 << 4,
160}; 162};
161 163
162void *memremap(resource_size_t offset, size_t size, unsigned long flags); 164void *memremap(resource_size_t offset, size_t size, unsigned long flags);
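MEMREMAP_ENC and MEMREMAP_DEC let a caller override the architecture's default encryption attribute for a mapping. A hypothetical helper (both the name and the use case are illustrative) that maps a buffer shared with firmware or a device as decrypted write-back memory:

#include <linux/io.h>

/* Hypothetical: both the kernel and the other party must see the same
 * plaintext bytes, so force a decrypted mapping regardless of the
 * platform's default policy. */
static void *map_shared_buffer_decrypted(resource_size_t phys, size_t size)
{
	return memremap(phys, size, MEMREMAP_WB | MEMREMAP_DEC);
}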
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index dd056fab9e35..2b7590f5483a 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -327,6 +327,14 @@ static inline void *boot_phys_to_virt(unsigned long entry)
327 return phys_to_virt(boot_phys_to_phys(entry)); 327 return phys_to_virt(boot_phys_to_phys(entry));
328} 328}
329 329
330#ifndef arch_kexec_post_alloc_pages
331static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { return 0; }
332#endif
333
334#ifndef arch_kexec_pre_free_pages
335static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { }
336#endif
337
330#else /* !CONFIG_KEXEC_CORE */ 338#else /* !CONFIG_KEXEC_CORE */
331struct pt_regs; 339struct pt_regs;
332struct task_struct; 340struct task_struct;
diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
new file mode 100644
index 000000000000..1255f09f5e42
--- /dev/null
+++ b/include/linux/mem_encrypt.h
@@ -0,0 +1,48 @@
1/*
2 * AMD Memory Encryption Support
3 *
4 * Copyright (C) 2016 Advanced Micro Devices, Inc.
5 *
6 * Author: Tom Lendacky <thomas.lendacky@amd.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#ifndef __MEM_ENCRYPT_H__
14#define __MEM_ENCRYPT_H__
15
16#ifndef __ASSEMBLY__
17
18#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
19
20#include <asm/mem_encrypt.h>
21
22#else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */
23
24#define sme_me_mask 0UL
25
26#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */
27
28static inline bool sme_active(void)
29{
30 return !!sme_me_mask;
31}
32
33static inline unsigned long sme_get_me_mask(void)
34{
35 return sme_me_mask;
36}
37
38/*
39 * The __sme_set() and __sme_clr() macros are useful for adding or removing
40 * the encryption mask from a value (e.g. when dealing with pagetable
41 * entries).
42 */
43#define __sme_set(x) ((unsigned long)(x) | sme_me_mask)
44#define __sme_clr(x) ((unsigned long)(x) & ~sme_me_mask)
45
46#endif /* __ASSEMBLY__ */
47
48#endif /* __MEM_ENCRYPT_H__ */
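A quick user-space illustration of how the new helpers compose; the mask value below is invented for the example, whereas on real hardware sme_me_mask is derived from CPUID/MSR state at boot and is zero whenever SME is off:

#include <stdio.h>

#define sme_me_mask	0x0000800000000000UL	/* example C-bit position only */
#define __sme_set(x)	((unsigned long)(x) | sme_me_mask)
#define __sme_clr(x)	((unsigned long)(x) & ~sme_me_mask)

int main(void)
{
	unsigned long pt_phys = 0x12345000UL;

	/* What iommu_virt_to_phys()/iommu_phys_to_virt() do around
	 * virt_to_phys()/phys_to_virt() elsewhere in this series. */
	printf("with C-bit:    %#lx\n", __sme_set(pt_phys));
	printf("C-bit cleared: %#lx\n", __sme_clr(__sme_set(pt_phys)));
	return 0;
}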
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index e030a68ead7e..25438b2b6f22 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -126,4 +126,10 @@ static __always_inline enum lru_list page_lru(struct page *page)
126 126
127#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) 127#define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
128 128
129#ifdef arch_unmap_kpfn
130extern void arch_unmap_kpfn(unsigned long pfn);
131#else
132static __always_inline void arch_unmap_kpfn(unsigned long pfn) { }
133#endif
134
129#endif 135#endif
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 4ee479f2f355..15e7160751a8 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -35,6 +35,7 @@ int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
35extern unsigned long swiotlb_nr_tbl(void); 35extern unsigned long swiotlb_nr_tbl(void);
36unsigned long swiotlb_size_or_default(void); 36unsigned long swiotlb_size_or_default(void);
37extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); 37extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
38extern void __init swiotlb_update_mem_attributes(void);
38 39
39/* 40/*
40 * Enumeration for sync targets 41 * Enumeration for sync targets
diff --git a/init/main.c b/init/main.c
index b78f63c30b17..8828fc148670 100644
--- a/init/main.c
+++ b/init/main.c
@@ -487,6 +487,8 @@ void __init __weak thread_stack_cache_init(void)
487} 487}
488#endif 488#endif
489 489
490void __init __weak mem_encrypt_init(void) { }
491
490/* 492/*
491 * Set up kernel memory allocators 493 * Set up kernel memory allocators
492 */ 494 */
@@ -640,6 +642,14 @@ asmlinkage __visible void __init start_kernel(void)
 	 */
 	locking_selftest();
 
+	/*
+	 * This needs to be called before any devices perform DMA
+	 * operations that might use the SWIOTLB bounce buffers. It will
+	 * mark the bounce buffers as decrypted so that their usage will
+	 * not cause "plain-text" data to be decrypted when accessed.
+	 */
+	mem_encrypt_init();
+
 #ifdef CONFIG_BLK_DEV_INITRD
 	if (initrd_start && !initrd_below_start_ok &&
 	    page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
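
The new call site relies on the empty __weak definition of mem_encrypt_init() added earlier in this file: architectures without memory encryption link against the no-op, while x86 supplies a strong override. A standalone sketch of that weak-default-hook pattern follows (GCC/Clang weak symbols; the demo_* names are invented for illustration and are not the kernel API).

#include <stdio.h>

/*
 * Generic default, marked weak.  In a multi-file build, a strong
 * demo_encrypt_init() in another object file would replace it at link
 * time, which is how an architecture-specific override is picked up.
 */
void __attribute__((weak)) demo_encrypt_init(void)
{
	printf("demo_encrypt_init: generic no-op default\n");
}

int main(void)
{
	/* Mirror the ordering constraint: run before any "DMA" setup. */
	demo_encrypt_init();
	printf("pretend SWIOTLB/DMA setup happens after this point\n");
	return 0;
}
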
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 1ae7c41c33c1..20fef1a38602 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -301,7 +301,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
 	struct page *pages;
 
-	pages = alloc_pages(gfp_mask, order);
+	pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
 	if (pages) {
 		unsigned int count, i;
 
@@ -310,6 +310,13 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 		count = 1 << order;
 		for (i = 0; i < count; i++)
 			SetPageReserved(pages + i);
+
+		arch_kexec_post_alloc_pages(page_address(pages), count,
+					    gfp_mask);
+
+		if (gfp_mask & __GFP_ZERO)
+			for (i = 0; i < count; i++)
+				clear_highpage(pages + i);
 	}
 
 	return pages;
@@ -321,6 +328,9 @@ static void kimage_free_pages(struct page *page)
 
 	order = page_private(page);
 	count = 1 << order;
+
+	arch_kexec_pre_free_pages(page_address(page), count);
+
 	for (i = 0; i < count; i++)
 		ClearPageReserved(page + i);
 	__free_pages(page, order);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 124bed776532..9afdc434fb49 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -34,13 +34,24 @@ static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
 }
 #endif
 
-static void *try_ram_remap(resource_size_t offset, size_t size)
+#ifndef arch_memremap_can_ram_remap
+static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
+					unsigned long flags)
+{
+	return true;
+}
+#endif
+
+static void *try_ram_remap(resource_size_t offset, size_t size,
+			   unsigned long flags)
 {
 	unsigned long pfn = PHYS_PFN(offset);
 
 	/* In the simple case just return the existing linear address */
-	if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
+	if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
+	    arch_memremap_can_ram_remap(offset, size, flags))
 		return __va(offset);
+
 	return NULL; /* fallback to arch_memremap_wb */
 }
 
@@ -48,7 +59,8 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
  * memremap() - remap an iomem_resource as cacheable memory
  * @offset: iomem resource start address
  * @size: size of remap
- * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC
+ * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
+ *		  MEMREMAP_ENC, MEMREMAP_DEC
  *
  * memremap() is "ioremap" for cases where it is known that the resource
  * being mapped does not have i/o side effects and the __iomem
@@ -95,7 +107,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
 		 * the requested range is potentially in System RAM.
 		 */
 		if (is_ram == REGION_INTERSECTS)
-			addr = try_ram_remap(offset, size);
+			addr = try_ram_remap(offset, size, flags);
 		if (!addr)
 			addr = arch_memremap_wb(offset, size);
 	}
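
try_ram_remap() now consults arch_memremap_can_ram_remap(), and the generic fallback above is compiled only when the architecture has not supplied its own version behind a matching #define. Below is a standalone sketch of that compile-time override pattern; the demo_* names are illustrative and are not the kernel API.

#include <stdio.h>
#include <stdbool.h>

/*
 * An architecture header would provide its own helper and then do
 *   #define demo_can_ram_remap demo_can_ram_remap
 * so that this generic fallback is never compiled.
 */
#ifndef demo_can_ram_remap
static bool demo_can_ram_remap(unsigned long offset, unsigned long size,
			       unsigned long flags)
{
	return true;	/* generic code places no restriction */
}
#endif

int main(void)
{
	printf("remap of 0x1000+4096 allowed: %d\n",
	       (int)demo_can_ram_remap(0x1000, 4096, 0));
	return 0;
}

With no override in place the default simply returns true, so the added condition in the remap path costs little.
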
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index a8d74a733a38..8c6c83ef57a4 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -30,6 +30,7 @@
 #include <linux/highmem.h>
 #include <linux/gfp.h>
 #include <linux/scatterlist.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/io.h>
 #include <asm/dma.h>
@@ -155,6 +156,15 @@ unsigned long swiotlb_size_or_default(void)
 	return size ? size : (IO_TLB_DEFAULT_SIZE);
 }
 
+void __weak swiotlb_set_mem_attributes(void *vaddr, unsigned long size) { }
+
+/* For swiotlb, clear memory encryption mask from dma addresses */
+static dma_addr_t swiotlb_phys_to_dma(struct device *hwdev,
+				      phys_addr_t address)
+{
+	return __sme_clr(phys_to_dma(hwdev, address));
+}
+
 /* Note that this doesn't work with highmem page */
 static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
 				      volatile void *address)
@@ -183,6 +193,31 @@ void swiotlb_print_info(void)
 	       bytes >> 20, vstart, vend - 1);
 }
 
+/*
+ * Early SWIOTLB allocation may be too early to allow an architecture to
+ * perform the desired operations. This function allows the architecture to
+ * call SWIOTLB when the operations are possible. It needs to be called
+ * before the SWIOTLB memory is used.
+ */
+void __init swiotlb_update_mem_attributes(void)
+{
+	void *vaddr;
+	unsigned long bytes;
+
+	if (no_iotlb_memory || late_alloc)
+		return;
+
+	vaddr = phys_to_virt(io_tlb_start);
+	bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT);
+	swiotlb_set_mem_attributes(vaddr, bytes);
+	memset(vaddr, 0, bytes);
+
+	vaddr = phys_to_virt(io_tlb_overflow_buffer);
+	bytes = PAGE_ALIGN(io_tlb_overflow);
+	swiotlb_set_mem_attributes(vaddr, bytes);
+	memset(vaddr, 0, bytes);
+}
+
 int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 {
 	void *v_overflow_buffer;
@@ -320,6 +355,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 	io_tlb_start = virt_to_phys(tlb);
 	io_tlb_end = io_tlb_start + bytes;
 
+	swiotlb_set_mem_attributes(tlb, bytes);
 	memset(tlb, 0, bytes);
 
 	/*
@@ -330,6 +366,8 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 	if (!v_overflow_buffer)
 		goto cleanup2;
 
+	swiotlb_set_mem_attributes(v_overflow_buffer, io_tlb_overflow);
+	memset(v_overflow_buffer, 0, io_tlb_overflow);
 	io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer);
 
 	/*
@@ -469,6 +507,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
 	if (no_iotlb_memory)
 		panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
 
+	if (sme_active())
+		pr_warn_once("SME is active and system is using DMA bounce buffers\n");
+
 	mask = dma_get_seg_boundary(hwdev);
 
 	tbl_dma_addr &= mask;
@@ -581,7 +622,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size,
 		return SWIOTLB_MAP_ERROR;
 	}
 
-	start_dma_addr = phys_to_dma(hwdev, io_tlb_start);
+	start_dma_addr = swiotlb_phys_to_dma(hwdev, io_tlb_start);
 	return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size,
 				      dir, attrs);
 }
@@ -702,7 +743,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 		goto err_warn;
 
 	ret = phys_to_virt(paddr);
-	dev_addr = phys_to_dma(hwdev, paddr);
+	dev_addr = swiotlb_phys_to_dma(hwdev, paddr);
 
 	/* Confirm address can be DMA'd by device */
 	if (dev_addr + size - 1 > dma_mask) {
@@ -812,10 +853,10 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 	map = map_single(dev, phys, size, dir, attrs);
 	if (map == SWIOTLB_MAP_ERROR) {
 		swiotlb_full(dev, size, dir, 1);
-		return phys_to_dma(dev, io_tlb_overflow_buffer);
+		return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer);
 	}
 
-	dev_addr = phys_to_dma(dev, map);
+	dev_addr = swiotlb_phys_to_dma(dev, map);
 
 	/* Ensure that the address returned is DMA'ble */
 	if (dma_capable(dev, dev_addr, size))
@@ -824,7 +865,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 	attrs |= DMA_ATTR_SKIP_CPU_SYNC;
 	swiotlb_tbl_unmap_single(dev, map, size, dir, attrs);
 
-	return phys_to_dma(dev, io_tlb_overflow_buffer);
+	return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer);
 }
 EXPORT_SYMBOL_GPL(swiotlb_map_page);
 
@@ -958,7 +999,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 				sg_dma_len(sgl) = 0;
 				return 0;
 			}
-			sg->dma_address = phys_to_dma(hwdev, map);
+			sg->dma_address = swiotlb_phys_to_dma(hwdev, map);
 		} else
 			sg->dma_address = dev_addr;
 		sg_dma_len(sg) = sg->length;
@@ -1026,7 +1067,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
 int
 swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 {
-	return (dma_addr == phys_to_dma(hwdev, io_tlb_overflow_buffer));
+	return (dma_addr == swiotlb_phys_to_dma(hwdev, io_tlb_overflow_buffer));
 }
 EXPORT_SYMBOL(swiotlb_dma_mapping_error);
 
@@ -1039,6 +1080,6 @@ EXPORT_SYMBOL(swiotlb_dma_mapping_error);
 int
 swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
-	return phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
+	return swiotlb_phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
 }
 EXPORT_SYMBOL(swiotlb_dma_supported);
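
swiotlb_update_mem_attributes() above sizes the bounce buffer as the slab count shifted by IO_TLB_SHIFT and page-aligned, changes the memory attributes, and only then zeroes the region. A standalone sketch of just that size arithmetic follows; the constants mirror the kernel's defaults (2 KB slabs, 64 MB default pool) but are hard-coded here purely for illustration.

#include <stdio.h>

#define DEMO_PAGE_SIZE		4096UL
#define DEMO_PAGE_ALIGN(x)	(((x) + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1))
#define DEMO_IO_TLB_SHIFT	11	/* one SWIOTLB slab is 2 KB */

int main(void)
{
	unsigned long nslabs = 32768UL;	/* the default 64 MB bounce buffer */
	unsigned long bytes = DEMO_PAGE_ALIGN(nslabs << DEMO_IO_TLB_SHIFT);

	/*
	 * The real function would now update the page attributes of this
	 * whole region (for SME, mark it decrypted) and then memset() it,
	 * in that order, before any device is allowed to DMA into it.
	 */
	printf("%lu slabs -> %lu bytes (%lu MB)\n", nslabs, bytes, bytes >> 20);
	return 0;
}

Changing the attributes before the memset means the freshly written zeroes go through the new (decrypted) mapping, matching the order used in the hunk above.
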
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 6d5717bd7197..b1dd4a948fc0 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -30,6 +30,13 @@ early_param("early_ioremap_debug", early_ioremap_debug_setup);
 
 static int after_paging_init __initdata;
 
+pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr,
+						    unsigned long size,
+						    pgprot_t prot)
+{
+	return prot;
+}
+
 void __init __weak early_ioremap_shutdown(void)
 {
 }
@@ -215,14 +222,29 @@ early_ioremap(resource_size_t phys_addr, unsigned long size)
 void __init *
 early_memremap(resource_size_t phys_addr, unsigned long size)
 {
-	return (__force void *)__early_ioremap(phys_addr, size,
-					       FIXMAP_PAGE_NORMAL);
+	pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+						     FIXMAP_PAGE_NORMAL);
+
+	return (__force void *)__early_ioremap(phys_addr, size, prot);
 }
 #ifdef FIXMAP_PAGE_RO
 void __init *
 early_memremap_ro(resource_size_t phys_addr, unsigned long size)
 {
-	return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
+	pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+						     FIXMAP_PAGE_RO);
+
+	return (__force void *)__early_ioremap(phys_addr, size, prot);
+}
+#endif
+
+#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+void __init *
+early_memremap_prot(resource_size_t phys_addr, unsigned long size,
+		    unsigned long prot_val)
+{
+	return (__force void *)__early_ioremap(phys_addr, size,
+					       __pgprot(prot_val));
 }
 #endif
 
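
early_memremap_pgprot_adjust() gives an architecture the chance to fold extra protection bits (for SME, the encryption mask) into early mappings; the weak generic version above returns the protection unchanged. A standalone sketch of that kind of adjustment hook follows, with made-up names and bit values, not the kernel API.

#include <stdio.h>

#define DEMO_PROT_ENC	(1UL << 47)	/* pretend "encrypted" protection bit */

/*
 * Generic shape of a pgprot-adjust hook: hand back the protection
 * unchanged unless the extra bit is wanted.  All names and values here
 * are invented for illustration.
 */
static unsigned long demo_pgprot_adjust(unsigned long phys, unsigned long size,
					unsigned long prot, int encrypt)
{
	(void)phys;
	(void)size;
	return encrypt ? (prot | DEMO_PROT_ENC) : prot;
}

int main(void)
{
	unsigned long base_prot = 0x63UL;	/* fake "normal page" protection */

	printf("plain:     %#lx\n", demo_pgprot_adjust(0x100000UL, 4096UL, base_prot, 0));
	printf("encrypted: %#lx\n", demo_pgprot_adjust(0x100000UL, 4096UL, base_prot, 1));
	return 0;
}

An override would typically inspect the physical range to decide whether the extra bit applies to that mapping.
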
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1cd3b3569af8..88366626c0b7 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 		return 0;
 	}
 
+	arch_unmap_kpfn(pfn);
+
 	orig_head = hpage = compound_head(p);
 	num_poisoned_pages_inc();
 