author		Linus Torvalds <torvalds@linux-foundation.org>	2017-05-02 02:54:56 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-05-02 02:54:56 -0400
commit		d3b5d35290d729a2518af00feca867385a1b08fa (patch)
tree		7b56c0863d59bc57f7c7dcf5d5665c56b05f1d1b
parent		aa2a4b6569d5b10491b606a86e574dff3852597a (diff)
parent		71389703839ebe9cb426c72d5f0bd549592e583c (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
 "The main x86 MM changes in this cycle were:

   - continued native kernel PCID support preparation patches to the TLB
     flushing code (Andy Lutomirski)

   - various fixes related to 32-bit compat syscall returning address over
     4Gb in applications, launched from 64-bit binaries - motivated by C/R
     frameworks such as Virtuozzo. (Dmitry Safonov)

   - continued Intel 5-level paging enablement: in particular the
     conversion of x86 GUP to the generic GUP code. (Kirill A. Shutemov)

   - x86/mpx ABI corner case fixes/enhancements (Joerg Roedel)

   - ... plus misc updates, fixes and cleanups"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (62 commits)
  mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash
  x86/mm: Fix flush_tlb_page() on Xen
  x86/mm: Make flush_tlb_mm_range() more predictable
  x86/mm: Remove flush_tlb() and flush_tlb_current_task()
  x86/vm86/32: Switch to flush_tlb_mm_range() in mark_screen_rdonly()
  x86/mm/64: Fix crash in remove_pagetable()
  Revert "x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation"
  x86/boot/e820: Remove a redundant self assignment
  x86/mm: Fix dump pagetables for 4 levels of page tables
  x86/mpx, selftests: Only check bounds-vs-shadow when we keep shadow
  x86/mpx: Correctly report do_mpx_bt_fault() failures to user-space
  Revert "x86/mm/numa: Remove numa_nodemask_from_meminfo()"
  x86/espfix: Add support for 5-level paging
  x86/kasan: Extend KASAN to support 5-level paging
  x86/mm: Add basic defines/helpers for CONFIG_X86_5LEVEL=y
  x86/paravirt: Add 5-level support to the paravirt code
  x86/mm: Define virtual memory map for 5-level paging
  x86/asm: Remove __VIRTUAL_MASK_SHIFT==47 assert
  x86/boot: Detect 5-level paging support
  x86/mm/numa: Remove numa_nodemask_from_meminfo()
  ...
-rw-r--r--Documentation/x86/x86_64/mm.txt36
-rw-r--r--arch/Kconfig7
-rw-r--r--arch/powerpc/include/asm/mmu_context.h6
-rw-r--r--arch/s390/include/asm/mmu_context.h6
-rw-r--r--arch/um/include/asm/mmu_context.h6
-rw-r--r--arch/unicore32/include/asm/mmu_context.h6
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/boot/cpucheck.c9
-rw-r--r--arch/x86/boot/cpuflags.c12
-rw-r--r--arch/x86/entry/entry_64.S7
-rw-r--r--arch/x86/entry/vdso/vma.c2
-rw-r--r--arch/x86/include/asm/desc.h147
-rw-r--r--arch/x86/include/asm/disabled-features.h8
-rw-r--r--arch/x86/include/asm/elf.h28
-rw-r--r--arch/x86/include/asm/fixmap.h4
-rw-r--r--arch/x86/include/asm/kasan.h9
-rw-r--r--arch/x86/include/asm/kexec.h1
-rw-r--r--arch/x86/include/asm/mmu_context.h4
-rw-r--r--arch/x86/include/asm/page_64_types.h10
-rw-r--r--arch/x86/include/asm/paravirt.h54
-rw-r--r--arch/x86/include/asm/paravirt_types.h17
-rw-r--r--arch/x86/include/asm/pgalloc.h37
-rw-r--r--arch/x86/include/asm/pgtable-2level_types.h1
-rw-r--r--arch/x86/include/asm/pgtable-3level_types.h1
-rw-r--r--arch/x86/include/asm/pgtable.h84
-rw-r--r--arch/x86/include/asm/pgtable_32.h1
-rw-r--r--arch/x86/include/asm/pgtable_64.h23
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h32
-rw-r--r--arch/x86/include/asm/pgtable_types.h46
-rw-r--r--arch/x86/include/asm/processor.h6
-rw-r--r--arch/x86/include/asm/required-features.h8
-rw-r--r--arch/x86/include/asm/sparsemem.h9
-rw-r--r--arch/x86/include/asm/stackprotector.h2
-rw-r--r--arch/x86/include/asm/tlbflush.h9
-rw-r--r--arch/x86/include/asm/xen/page.h8
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/apm_32.c6
-rw-r--r--arch/x86/kernel/cpu/common.c59
-rw-r--r--arch/x86/kernel/e820.c1
-rw-r--r--arch/x86/kernel/espfix_64.c12
-rw-r--r--arch/x86/kernel/machine_kexec_32.c4
-rw-r--r--arch/x86/kernel/machine_kexec_64.c14
-rw-r--r--arch/x86/kernel/paravirt.c13
-rw-r--r--arch/x86/kernel/process_64.c67
-rw-r--r--arch/x86/kernel/setup.c15
-rw-r--r--arch/x86/kernel/setup_percpu.c23
-rw-r--r--arch/x86/kernel/smpboot.c2
-rw-r--r--arch/x86/kernel/sys_x86_64.c15
-rw-r--r--arch/x86/kernel/tboot.c6
-rw-r--r--arch/x86/kernel/tls.c11
-rw-r--r--arch/x86/kernel/vm86_32.c8
-rw-r--r--arch/x86/kvm/svm.c4
-rw-r--r--arch/x86/kvm/vmx.c12
-rw-r--r--arch/x86/mm/dump_pagetables.c59
-rw-r--r--arch/x86/mm/fault.c66
-rw-r--r--arch/x86/mm/gup.c33
-rw-r--r--arch/x86/mm/hugetlbpage.c9
-rw-r--r--arch/x86/mm/ident_map.c51
-rw-r--r--arch/x86/mm/init_32.c68
-rw-r--r--arch/x86/mm/init_64.c185
-rw-r--r--arch/x86/mm/ioremap.c3
-rw-r--r--arch/x86/mm/kasan_init_64.c33
-rw-r--r--arch/x86/mm/mmap.c125
-rw-r--r--arch/x86/mm/mpx.c10
-rw-r--r--arch/x86/mm/numa.c4
-rw-r--r--arch/x86/mm/pageattr.c54
-rw-r--r--arch/x86/mm/pgtable.c36
-rw-r--r--arch/x86/mm/pgtable_32.c8
-rw-r--r--arch/x86/mm/tlb.c33
-rw-r--r--arch/x86/platform/efi/efi_32.c4
-rw-r--r--arch/x86/platform/efi/efi_64.c41
-rw-r--r--arch/x86/power/cpu.c7
-rw-r--r--arch/x86/power/hibernate_32.c7
-rw-r--r--arch/x86/power/hibernate_64.c47
-rw-r--r--arch/x86/xen/enlighten.c2
-rw-r--r--arch/x86/xen/mmu.c398
-rw-r--r--arch/x86/xen/mmu.h1
-rw-r--r--arch/x86/xen/smp.c2
-rw-r--r--drivers/dax/pmem.c2
-rw-r--r--drivers/lguest/x86/core.c6
-rw-r--r--drivers/nvdimm/pmem.c13
-rw-r--r--drivers/pnp/pnpbios/bioscalls.c10
-rw-r--r--include/asm-generic/mm_hooks.h6
-rw-r--r--include/asm-generic/pgtable.h25
-rw-r--r--include/linux/mm.h18
-rw-r--r--include/linux/mm_types.h5
-rw-r--r--include/linux/pagemap.h4
-rw-r--r--include/trace/events/xen.h28
-rw-r--r--kernel/memremap.c22
-rw-r--r--mm/gup.c148
-rw-r--r--mm/swap.c10
-rw-r--r--tools/testing/selftests/x86/ldt_gdt.c46
-rw-r--r--tools/testing/selftests/x86/mpx-mini-test.c5
93 files changed, 1845 insertions, 711 deletions
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 5724092db811..b0798e281aa6 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -4,7 +4,7 @@
 Virtual memory map with 4 level page tables:
 
 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
-hole caused by [48:63] sign extension
+hole caused by [47:63] sign extension
 ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor
 ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
 ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
@@ -19,16 +19,43 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
+ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
+ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
+
+Virtual memory map with 5 level page tables:
+
+0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm
+hole caused by [56:63] sign extension
+ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
+ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
+ff90000000000000 - ff91ffffffffffff (=49 bits) hole
+ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
+ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
+ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
+... unused hole ...
+ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB)
+... unused hole ...
+ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
+... unused hole ...
+ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
+... unused hole ...
+ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
 ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
 ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
+Architecture defines a 64-bit virtual address. Implementations can support
+less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
+through to the most-significant implemented bit are set to either all ones
+or all zero. This causes hole between user space and kernel addresses.
+
 The direct mapping covers all memory in the system up to the highest
 memory address (this means in some cases it can also include PCI memory
 holes).
 
-vmalloc space is lazily synchronized into the different PML4 pages of
-the processes using the page fault handler, with init_level4_pgt as
+vmalloc space is lazily synchronized into the different PML4/PML5 pages of
+the processes using the page fault handler, with init_top_pgt as
 reference.
 
 Current X86-64 implementations support up to 46 bits of address space (64 TB),
@@ -39,6 +66,9 @@ memory window (this size is arbitrary, it can be raised later if needed).
 The mappings are not part of any other kernel PGD and are only available
 during EFI runtime calls.
 
+The module mapping space size changes based on the CONFIG requirements for the
+following fixmap section.
+
 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
 physical memory, vmalloc/ioremap space and virtual memory map are randomized.
 Their order is preserved but their base will be offset early at boot time.
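For illustration only (plain user-space C, not part of the patch): the canonical-address rule described above says an address is valid only if every bit from the most-significant implemented bit up to bit 63 is a copy of that bit, i.e. the address sign-extends cleanly for 48-bit (4-level) or 57-bit (5-level) virtual addresses.

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch: with "bits" implemented virtual-address bits (48 or 57),
 * the address is canonical if shifting the implemented bits to the
 * top and sign-extending them back reproduces the original value.
 */
static bool is_canonical(uint64_t addr, unsigned int bits)
{
	unsigned int shift = 64 - bits;

	return ((int64_t)(addr << shift) >> shift) == (int64_t)addr;
}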
diff --git a/arch/Kconfig b/arch/Kconfig
index cd211a14a88f..c4d6833aacd9 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -700,6 +700,13 @@ config ARCH_MMAP_RND_COMPAT_BITS
 	  This value can be changed after boot using the
 	  /proc/sys/vm/mmap_rnd_compat_bits tunable
 
+config HAVE_ARCH_COMPAT_MMAP_BASES
+	bool
+	help
+	  This allows 64bit applications to invoke 32-bit mmap() syscall
+	  and vice-versa 32-bit applications to call 64-bit mmap().
+	  Required for applications doing different bitness syscalls.
+
 config HAVE_COPY_THREAD_TLS
 	bool
 	help
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index b9e3f0aca261..ecf9885ab660 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -163,11 +163,5 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	/* by default, allow everything */
 	return true;
 }
-
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
-	/* by default, allow everything */
-	return true;
-}
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 6e31d87fb669..fa2bf69be182 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -156,10 +156,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	/* by default, allow everything */
 	return true;
 }
-
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
-	/* by default, allow everything */
-	return true;
-}
 #endif /* __S390_MMU_CONTEXT_H */
diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h
index 94ac2739918c..b668e351fd6c 100644
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -37,12 +37,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	return true;
 }
 
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
-	/* by default, allow everything */
-	return true;
-}
-
 /*
  * end asm-generic/mm_hooks.h functions
  */
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index 62dfc644c908..59b06b48f27d 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -103,10 +103,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	/* by default, allow everything */
 	return true;
 }
-
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
-	/* by default, allow everything */
-	return true;
-}
 #endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2b899858532a..8d4f87e5bba3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -105,6 +105,7 @@ config X86
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_ARCH_MMAP_RND_BITS		if MMU
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if MMU && COMPAT
+	select HAVE_ARCH_COMPAT_MMAP_BASES	if MMU && COMPAT
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
@@ -289,6 +290,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 config KASAN_SHADOW_OFFSET
 	hex
 	depends on KASAN
+	default 0xdff8000000000000 if X86_5LEVEL
 	default 0xdffffc0000000000
 
 config HAVE_INTEL_TXT
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 4ad7d70e8739..8f0c4c9fc904 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -44,6 +44,15 @@ static const u32 req_flags[NCAPINTS] =
 	0, /* REQUIRED_MASK5 not implemented in this file */
 	REQUIRED_MASK6,
 	0, /* REQUIRED_MASK7 not implemented in this file */
+	0, /* REQUIRED_MASK8 not implemented in this file */
+	0, /* REQUIRED_MASK9 not implemented in this file */
+	0, /* REQUIRED_MASK10 not implemented in this file */
+	0, /* REQUIRED_MASK11 not implemented in this file */
+	0, /* REQUIRED_MASK12 not implemented in this file */
+	0, /* REQUIRED_MASK13 not implemented in this file */
+	0, /* REQUIRED_MASK14 not implemented in this file */
+	0, /* REQUIRED_MASK15 not implemented in this file */
+	REQUIRED_MASK16,
 };
 
 #define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index 6687ab953257..9e77c23c2422 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -70,16 +70,19 @@ int has_eflag(unsigned long mask)
 # define EBX_REG "=b"
 #endif
 
-static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d)
+static inline void cpuid_count(u32 id, u32 count,
+			       u32 *a, u32 *b, u32 *c, u32 *d)
 {
 	asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t"
 		     "cpuid \n\t"
 		     ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t"
 		     : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
-		     : "a" (id)
+		     : "a" (id), "c" (count)
 	);
 }
 
+#define cpuid(id, a, b, c, d) cpuid_count(id, 0, a, b, c, d)
+
 void get_cpuflags(void)
 {
 	u32 max_intel_level, max_amd_level;
@@ -108,6 +111,11 @@ void get_cpuflags(void)
 		cpu.model += ((tfms >> 16) & 0xf) << 4;
 	}
 
+	if (max_intel_level >= 0x00000007) {
+		cpuid_count(0x00000007, 0, &ignored, &ignored,
+			    &cpu.flags[16], &ignored);
+	}
+
 	cpuid(0x80000000, &max_amd_level, &ignored, &ignored,
 	      &ignored);
 
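The new cpuid_count() helper lets the boot code read CPUID leaf 7, subleaf 0, whose ECX word lands in cpu.flags[16]; the LA57 feature bit (57-bit linear addresses, i.e. 5-level paging) lives in bit 16 of that word. A user-space equivalent using GCC/Clang's <cpuid.h> helper, purely for illustration:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID leaf 0x7, subleaf 0: ECX bit 16 is LA57. */
	if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) && (ecx & (1u << 16)))
		puts("LA57 (5-level paging) supported");
	else
		puts("LA57 not supported");
	return 0;
}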
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d2b2a2948ffe..607d72c4a485 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -265,12 +265,9 @@ return_from_SYSCALL_64:
 	 *
 	 * If width of "canonical tail" ever becomes variable, this will need
 	 * to be updated to remain correct on both old and new CPUs.
+	 *
+	 * Change top 16 bits to be the sign-extension of 47th bit
 	 */
-	.ifne __VIRTUAL_MASK_SHIFT - 47
-	.error "virtual address width changed -- SYSRET checks need update"
-	.endif
-
-	/* Change top 16 bits to be the sign-extension of 47th bit */
 	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
 	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
 
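The SHL/SAR pair kept above already works for any address width, which is why the hard-coded __VIRTUAL_MASK_SHIFT == 47 assert can be dropped. In C the same trick looks roughly like this (illustration only; it relies on arithmetic right shift of signed values, which x86-64 compilers provide):

#include <stdint.h>

static inline uint64_t canonicalize(uint64_t addr, unsigned int virtual_mask_shift)
{
	unsigned int s = 64 - (virtual_mask_shift + 1);	/* 16 for shift 47, 7 for shift 56 */

	/* Shift the implemented bits up, then sign-extend them back down. */
	return (uint64_t)(((int64_t)(addr << s)) >> s);
}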
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index faf80fdeeacc..139ad7726e10 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -361,7 +361,7 @@ static void vgetcpu_cpu_init(void *arg)
 	d.p = 1;		/* Present */
 	d.d = 1;		/* 32-bit */
 
-	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
+	write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
 }
 
 static int vgetcpu_online(unsigned int cpu)
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 1548ca92ad3f..d0a21b12dd58 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -4,6 +4,7 @@
4#include <asm/desc_defs.h> 4#include <asm/desc_defs.h>
5#include <asm/ldt.h> 5#include <asm/ldt.h>
6#include <asm/mmu.h> 6#include <asm/mmu.h>
7#include <asm/fixmap.h>
7 8
8#include <linux/smp.h> 9#include <linux/smp.h>
9#include <linux/percpu.h> 10#include <linux/percpu.h>
@@ -45,11 +46,43 @@ struct gdt_page {
45 46
46DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); 47DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
47 48
48static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) 49/* Provide the original GDT */
50static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu)
49{ 51{
50 return per_cpu(gdt_page, cpu).gdt; 52 return per_cpu(gdt_page, cpu).gdt;
51} 53}
52 54
55/* Provide the current original GDT */
56static inline struct desc_struct *get_current_gdt_rw(void)
57{
58 return this_cpu_ptr(&gdt_page)->gdt;
59}
60
61/* Get the fixmap index for a specific processor */
62static inline unsigned int get_cpu_gdt_ro_index(int cpu)
63{
64 return FIX_GDT_REMAP_BEGIN + cpu;
65}
66
67/* Provide the fixmap address of the remapped GDT */
68static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
69{
70 unsigned int idx = get_cpu_gdt_ro_index(cpu);
71 return (struct desc_struct *)__fix_to_virt(idx);
72}
73
74/* Provide the current read-only GDT */
75static inline struct desc_struct *get_current_gdt_ro(void)
76{
77 return get_cpu_gdt_ro(smp_processor_id());
78}
79
80/* Provide the physical address of the GDT page. */
81static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu)
82{
83 return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu));
84}
85
53#ifdef CONFIG_X86_64 86#ifdef CONFIG_X86_64
54 87
55static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, 88static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
@@ -174,7 +207,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned t
174 207
175static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) 208static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
176{ 209{
177 struct desc_struct *d = get_cpu_gdt_table(cpu); 210 struct desc_struct *d = get_cpu_gdt_rw(cpu);
178 tss_desc tss; 211 tss_desc tss;
179 212
180 set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, 213 set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
@@ -194,22 +227,90 @@ static inline void native_set_ldt(const void *addr, unsigned int entries)
194 227
195 set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT, 228 set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
196 entries * LDT_ENTRY_SIZE - 1); 229 entries * LDT_ENTRY_SIZE - 1);
197 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, 230 write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_LDT,
198 &ldt, DESC_LDT); 231 &ldt, DESC_LDT);
199 asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); 232 asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
200 } 233 }
201} 234}
202 235
236static inline void native_load_gdt(const struct desc_ptr *dtr)
237{
238 asm volatile("lgdt %0"::"m" (*dtr));
239}
240
241static inline void native_load_idt(const struct desc_ptr *dtr)
242{
243 asm volatile("lidt %0"::"m" (*dtr));
244}
245
246static inline void native_store_gdt(struct desc_ptr *dtr)
247{
248 asm volatile("sgdt %0":"=m" (*dtr));
249}
250
251static inline void native_store_idt(struct desc_ptr *dtr)
252{
253 asm volatile("sidt %0":"=m" (*dtr));
254}
255
256/*
257 * The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is
258 * a read-only remapping. To prevent a page fault, the GDT is switched to the
259 * original writeable version when needed.
260 */
261#ifdef CONFIG_X86_64
262static inline void native_load_tr_desc(void)
263{
264 struct desc_ptr gdt;
265 int cpu = raw_smp_processor_id();
266 bool restore = 0;
267 struct desc_struct *fixmap_gdt;
268
269 native_store_gdt(&gdt);
270 fixmap_gdt = get_cpu_gdt_ro(cpu);
271
272 /*
273 * If the current GDT is the read-only fixmap, swap to the original
274 * writeable version. Swap back at the end.
275 */
276 if (gdt.address == (unsigned long)fixmap_gdt) {
277 load_direct_gdt(cpu);
278 restore = 1;
279 }
280 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
281 if (restore)
282 load_fixmap_gdt(cpu);
283}
284#else
203static inline void native_load_tr_desc(void) 285static inline void native_load_tr_desc(void)
204{ 286{
205 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); 287 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
206} 288}
289#endif
290
291static inline unsigned long native_store_tr(void)
292{
293 unsigned long tr;
294
295 asm volatile("str %0":"=r" (tr));
296
297 return tr;
298}
299
300static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
301{
302 struct desc_struct *gdt = get_cpu_gdt_rw(cpu);
303 unsigned int i;
304
305 for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
306 gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
307}
207 308
208DECLARE_PER_CPU(bool, __tss_limit_invalid); 309DECLARE_PER_CPU(bool, __tss_limit_invalid);
209 310
210static inline void force_reload_TR(void) 311static inline void force_reload_TR(void)
211{ 312{
212 struct desc_struct *d = get_cpu_gdt_table(smp_processor_id()); 313 struct desc_struct *d = get_current_gdt_rw();
213 tss_desc tss; 314 tss_desc tss;
214 315
215 memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc)); 316 memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc));
@@ -257,44 +358,6 @@ static inline void invalidate_tss_limit(void)
257 this_cpu_write(__tss_limit_invalid, true); 358 this_cpu_write(__tss_limit_invalid, true);
258} 359}
259 360
260static inline void native_load_gdt(const struct desc_ptr *dtr)
261{
262 asm volatile("lgdt %0"::"m" (*dtr));
263}
264
265static inline void native_load_idt(const struct desc_ptr *dtr)
266{
267 asm volatile("lidt %0"::"m" (*dtr));
268}
269
270static inline void native_store_gdt(struct desc_ptr *dtr)
271{
272 asm volatile("sgdt %0":"=m" (*dtr));
273}
274
275static inline void native_store_idt(struct desc_ptr *dtr)
276{
277 asm volatile("sidt %0":"=m" (*dtr));
278}
279
280static inline unsigned long native_store_tr(void)
281{
282 unsigned long tr;
283
284 asm volatile("str %0":"=r" (tr));
285
286 return tr;
287}
288
289static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
290{
291 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
292 unsigned int i;
293
294 for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
295 gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
296}
297
298/* This intentionally ignores lm, since 32-bit apps don't have that field. */ 361/* This intentionally ignores lm, since 32-bit apps don't have that field. */
299#define LDT_empty(info) \ 362#define LDT_empty(info) \
300 ((info)->base_addr == 0 && \ 363 ((info)->base_addr == 0 && \
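The writeable GDT stays in the per-cpu area and a read-only alias of it now lives in the new fixmap range. load_direct_gdt() and load_fixmap_gdt(), declared in processor.h further down, are presumably implemented in arch/x86/kernel/cpu/common.c (changed by this pull but not shown in this hunk) roughly as follows; this is a sketch, not the verbatim patch:

/* Load the original, writeable per-cpu GDT (sketch). */
void load_direct_gdt(int cpu)
{
	struct desc_ptr gdt_descr;

	gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
	gdt_descr.size = GDT_SIZE - 1;
	load_gdt(&gdt_descr);
}

/* Load the read-only fixmap alias of the same GDT (sketch). */
void load_fixmap_gdt(int cpu)
{
	struct desc_ptr gdt_descr;

	gdt_descr.address = (long)get_cpu_gdt_ro(cpu);
	gdt_descr.size = GDT_SIZE - 1;
	load_gdt(&gdt_descr);
}

That split is also why native_load_tr_desc() above temporarily switches back to the writeable GDT: LTR sets the busy bit in the TSS descriptor, which would fault on the read-only alias.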
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 85599ad4d024..5dff775af7cd 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -36,6 +36,12 @@
 # define DISABLE_OSPKE		(1<<(X86_FEATURE_OSPKE & 31))
 #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
 
+#ifdef CONFIG_X86_5LEVEL
+# define DISABLE_LA57	0
+#else
+# define DISABLE_LA57	(1<<(X86_FEATURE_LA57 & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -55,7 +61,7 @@
 #define DISABLED_MASK13	0
 #define DISABLED_MASK14	0
 #define DISABLED_MASK15	0
-#define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE)
+#define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57)
 #define DISABLED_MASK17	0
 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
 
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 3762536619f8..e8ab9a46bc68 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -293,8 +293,23 @@ do { \
 	}				\
 } while (0)
 
+/*
+ * True on X86_32 or when emulating IA32 on X86_64
+ */
+static inline int mmap_is_ia32(void)
+{
+	return IS_ENABLED(CONFIG_X86_32) ||
+	       (IS_ENABLED(CONFIG_COMPAT) &&
+		test_thread_flag(TIF_ADDR32));
+}
+
+extern unsigned long tasksize_32bit(void);
+extern unsigned long tasksize_64bit(void);
+extern unsigned long get_mmap_base(int is_legacy);
+
 #ifdef CONFIG_X86_32
 
+#define __STACK_RND_MASK(is32bit) (0x7ff)
 #define STACK_RND_MASK (0x7ff)
 
 #define ARCH_DLINFO		ARCH_DLINFO_IA32
@@ -304,7 +319,8 @@ do { \
 #else /* CONFIG_X86_32 */
 
 /* 1GB for 64bit, 8MB for 32bit */
-#define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 0x7ff : 0x3fffff)
+#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff)
+#define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32())
 
 #define ARCH_DLINFO		\
 do {				\
@@ -348,16 +364,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
 				int uses_interp);
 #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
 
-/*
- * True on X86_32 or when emulating IA32 on X86_64
- */
-static inline int mmap_is_ia32(void)
-{
-	return IS_ENABLED(CONFIG_X86_32) ||
-	       (IS_ENABLED(CONFIG_COMPAT) &&
-		test_thread_flag(TIF_ADDR32));
-}
-
 /* Do not change the values. See get_align_mask() */
 enum align_flags {
 	ALIGN_VA_32	= BIT(0),
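mmap_is_ia32() moves above the 32/64-bit split so the new tasksize_32bit()/tasksize_64bit()/get_mmap_base() helpers can use it outside ELF-specific code. get_mmap_base() itself is presumably implemented in arch/x86/mm/mmap.c (part of this pull, not shown in this hunk) roughly as below; the mmap_compat_* mm_struct fields are the per-bitness bases added for HAVE_ARCH_COMPAT_MMAP_BASES and are an assumption of this sketch:

unsigned long get_mmap_base(int is_legacy)
{
	struct mm_struct *mm = current->mm;

#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
	/* A 64-bit task making a 32-bit mmap() gets the compat base, and vice versa. */
	if (in_compat_syscall())
		return is_legacy ? mm->mmap_compat_legacy_base
				 : mm->mmap_compat_base;
#endif
	return is_legacy ? mm->mmap_legacy_base : mm->mmap_base;
}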
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 8554f960e21b..b65155cc3760 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -100,6 +100,10 @@ enum fixed_addresses {
 #ifdef CONFIG_X86_INTEL_MID
 	FIX_LNW_VRTC,
 #endif
+	/* Fixmap entries to remap the GDTs, one per processor. */
+	FIX_GDT_REMAP_BEGIN,
+	FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
+
 	__end_of_permanent_fixed_addresses,
 
 	/*
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index 1410b567ecde..f527b02a0ee3 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -11,9 +11,12 @@
  * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT
  */
 #define KASAN_SHADOW_START      (KASAN_SHADOW_OFFSET + \
-					(0xffff800000000000ULL >> 3))
-/* 47 bits for kernel address -> (47 - 3) bits for shadow */
-#define KASAN_SHADOW_END        (KASAN_SHADOW_START + (1ULL << (47 - 3)))
+					((-1UL << __VIRTUAL_MASK_SHIFT) >> 3))
+/*
+ * 47 bits for kernel address -> (47 - 3) bits for shadow
+ * 56 bits for kernel address -> (56 - 3) bits for shadow
+ */
+#define KASAN_SHADOW_END        (KASAN_SHADOW_START + (1ULL << (__VIRTUAL_MASK_SHIFT - 3)))
 
 #ifndef __ASSEMBLY__
 
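A small stand-alone illustration of the arithmetic behind these definitions (the offsets are the KASAN_SHADOW_OFFSET values from the x86 Kconfig hunk above): every 8 bytes of address space map to one shadow byte, so shadow = (addr >> 3) + offset.

#include <stdint.h>
#include <stdio.h>

static uint64_t kasan_shadow(uint64_t addr, uint64_t offset)
{
	return (addr >> 3) + offset;
}

int main(void)
{
	/* 4-level: kernel space starts at -(1UL << 47) -> shadow starts at 0xffffec0000000000 */
	printf("%#llx\n", (unsigned long long)
	       kasan_shadow(0xffff800000000000ULL, 0xdffffc0000000000ULL));
	/* 5-level: kernel space starts at -(1UL << 56) -> shadow starts at 0xffd8000000000000,
	 * matching the "kasan shadow memory" range in the updated mm.txt map. */
	printf("%#llx\n", (unsigned long long)
	       kasan_shadow(0xff00000000000000ULL, 0xdff8000000000000ULL));
	return 0;
}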
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 282630e4c6ea..70ef205489f0 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -164,6 +164,7 @@ struct kimage_arch {
 };
 #else
 struct kimage_arch {
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 306c7e12af55..68b329d77b3a 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -268,8 +268,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	return __pkru_allows_pkey(vma_pkey(vma), write);
 }
 
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
-	return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
-}
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 9215e0527647..3f5f08b010d0 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -36,7 +36,12 @@
  * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
  * what Xen requires.
  */
+#ifdef CONFIG_X86_5LEVEL
+#define __PAGE_OFFSET_BASE      _AC(0xff10000000000000, UL)
+#else
 #define __PAGE_OFFSET_BASE      _AC(0xffff880000000000, UL)
+#endif
+
 #ifdef CONFIG_RANDOMIZE_MEMORY
 #define __PAGE_OFFSET           page_offset_base
 #else
@@ -46,8 +51,13 @@
 #define __START_KERNEL_map	_AC(0xffffffff80000000, UL)
 
 /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
+#ifdef CONFIG_X86_5LEVEL
+#define __PHYSICAL_MASK_SHIFT	52
+#define __VIRTUAL_MASK_SHIFT	56
+#else
 #define __PHYSICAL_MASK_SHIFT	46
 #define __VIRTUAL_MASK_SHIFT	47
+#endif
 
 /*
  * Kernel image size is limited to 1GiB due to the fixmap living in the
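For scale (illustration, not part of the patch): a __VIRTUAL_MASK_SHIFT of 56 means user space spans 0 through (1 << 56) - 1 = 0x00ffffffffffffff, exactly the 5-level user-space range in the mm.txt hunk above, while the 52-bit physical mask allows PTEs to address up to 2^52 bytes (4 PiB) of physical memory.

#include <stdio.h>

int main(void)
{
	printf("%#llx\n", (1ULL << 56) - 1);	/* 0xffffffffffffff: top of 5-level user space */
	printf("%#llx\n", (1ULL << 47) - 1);	/* 0x7fffffffffff: top of 4-level user space */
	return 0;
}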
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 0489884fdc44..55fa56fe4e45 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -357,6 +357,16 @@ static inline void paravirt_release_pud(unsigned long pfn)
357 PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); 357 PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
358} 358}
359 359
360static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)
361{
362 PVOP_VCALL2(pv_mmu_ops.alloc_p4d, mm, pfn);
363}
364
365static inline void paravirt_release_p4d(unsigned long pfn)
366{
367 PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn);
368}
369
360static inline void pte_update(struct mm_struct *mm, unsigned long addr, 370static inline void pte_update(struct mm_struct *mm, unsigned long addr,
361 pte_t *ptep) 371 pte_t *ptep)
362{ 372{
@@ -536,7 +546,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
536 PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, 546 PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
537 val); 547 val);
538} 548}
539#if CONFIG_PGTABLE_LEVELS == 4 549#if CONFIG_PGTABLE_LEVELS >= 4
540static inline pud_t __pud(pudval_t val) 550static inline pud_t __pud(pudval_t val)
541{ 551{
542 pudval_t ret; 552 pudval_t ret;
@@ -565,26 +575,54 @@ static inline pudval_t pud_val(pud_t pud)
565 return ret; 575 return ret;
566} 576}
567 577
568static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) 578static inline void pud_clear(pud_t *pudp)
569{ 579{
570 pgdval_t val = native_pgd_val(pgd); 580 set_pud(pudp, __pud(0));
581}
571 582
572 if (sizeof(pgdval_t) > sizeof(long)) 583static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
573 PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp, 584{
585 p4dval_t val = native_p4d_val(p4d);
586
587 if (sizeof(p4dval_t) > sizeof(long))
588 PVOP_VCALL3(pv_mmu_ops.set_p4d, p4dp,
574 val, (u64)val >> 32); 589 val, (u64)val >> 32);
575 else 590 else
576 PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, 591 PVOP_VCALL2(pv_mmu_ops.set_p4d, p4dp,
577 val); 592 val);
578} 593}
579 594
595#if CONFIG_PGTABLE_LEVELS >= 5
596
597static inline p4d_t __p4d(p4dval_t val)
598{
599 p4dval_t ret = PVOP_CALLEE1(p4dval_t, pv_mmu_ops.make_p4d, val);
600
601 return (p4d_t) { ret };
602}
603
604static inline p4dval_t p4d_val(p4d_t p4d)
605{
606 return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d);
607}
608
609static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
610{
611 pgdval_t val = native_pgd_val(pgd);
612
613 PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val);
614}
615
580static inline void pgd_clear(pgd_t *pgdp) 616static inline void pgd_clear(pgd_t *pgdp)
581{ 617{
582 set_pgd(pgdp, __pgd(0)); 618 set_pgd(pgdp, __pgd(0));
583} 619}
584 620
585static inline void pud_clear(pud_t *pudp) 621#endif /* CONFIG_PGTABLE_LEVELS == 5 */
622
623static inline void p4d_clear(p4d_t *p4dp)
586{ 624{
587 set_pud(pudp, __pud(0)); 625 set_p4d(p4dp, __p4d(0));
588} 626}
589 627
590#endif /* CONFIG_PGTABLE_LEVELS == 4 */ 628#endif /* CONFIG_PGTABLE_LEVELS == 4 */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b060f962d581..7465d6fe336f 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -238,9 +238,11 @@ struct pv_mmu_ops {
238 void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); 238 void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
239 void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); 239 void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
240 void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); 240 void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
241 void (*alloc_p4d)(struct mm_struct *mm, unsigned long pfn);
241 void (*release_pte)(unsigned long pfn); 242 void (*release_pte)(unsigned long pfn);
242 void (*release_pmd)(unsigned long pfn); 243 void (*release_pmd)(unsigned long pfn);
243 void (*release_pud)(unsigned long pfn); 244 void (*release_pud)(unsigned long pfn);
245 void (*release_p4d)(unsigned long pfn);
244 246
245 /* Pagetable manipulation functions */ 247 /* Pagetable manipulation functions */
246 void (*set_pte)(pte_t *ptep, pte_t pteval); 248 void (*set_pte)(pte_t *ptep, pte_t pteval);
@@ -279,12 +281,21 @@ struct pv_mmu_ops {
279 struct paravirt_callee_save pmd_val; 281 struct paravirt_callee_save pmd_val;
280 struct paravirt_callee_save make_pmd; 282 struct paravirt_callee_save make_pmd;
281 283
282#if CONFIG_PGTABLE_LEVELS == 4 284#if CONFIG_PGTABLE_LEVELS >= 4
283 struct paravirt_callee_save pud_val; 285 struct paravirt_callee_save pud_val;
284 struct paravirt_callee_save make_pud; 286 struct paravirt_callee_save make_pud;
285 287
286 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); 288 void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval);
287#endif /* CONFIG_PGTABLE_LEVELS == 4 */ 289
290#if CONFIG_PGTABLE_LEVELS >= 5
291 struct paravirt_callee_save p4d_val;
292 struct paravirt_callee_save make_p4d;
293
294 void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval);
295#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
296
297#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
298
288#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ 299#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
289 300
290 struct pv_lazy_ops lazy_mode; 301 struct pv_lazy_ops lazy_mode;
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index b6d425999f99..b2d0cd8288aa 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -17,9 +17,11 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {
17static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, 17static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
18 unsigned long start, unsigned long count) {} 18 unsigned long start, unsigned long count) {}
19static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {} 19static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
20static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) {}
20static inline void paravirt_release_pte(unsigned long pfn) {} 21static inline void paravirt_release_pte(unsigned long pfn) {}
21static inline void paravirt_release_pmd(unsigned long pfn) {} 22static inline void paravirt_release_pmd(unsigned long pfn) {}
22static inline void paravirt_release_pud(unsigned long pfn) {} 23static inline void paravirt_release_pud(unsigned long pfn) {}
24static inline void paravirt_release_p4d(unsigned long pfn) {}
23#endif 25#endif
24 26
25/* 27/*
@@ -121,10 +123,10 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
121#endif /* CONFIG_X86_PAE */ 123#endif /* CONFIG_X86_PAE */
122 124
123#if CONFIG_PGTABLE_LEVELS > 3 125#if CONFIG_PGTABLE_LEVELS > 3
124static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) 126static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
125{ 127{
126 paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); 128 paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
127 set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); 129 set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
128} 130}
129 131
130static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) 132static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -150,6 +152,37 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
150 ___pud_free_tlb(tlb, pud); 152 ___pud_free_tlb(tlb, pud);
151} 153}
152 154
155#if CONFIG_PGTABLE_LEVELS > 4
156static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
157{
158 paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
159 set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
160}
161
162static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
163{
164 gfp_t gfp = GFP_KERNEL_ACCOUNT;
165
166 if (mm == &init_mm)
167 gfp &= ~__GFP_ACCOUNT;
168 return (p4d_t *)get_zeroed_page(gfp);
169}
170
171static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
172{
173 BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
174 free_page((unsigned long)p4d);
175}
176
177extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
178
179static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
180 unsigned long address)
181{
182 ___p4d_free_tlb(tlb, p4d);
183}
184
185#endif /* CONFIG_PGTABLE_LEVELS > 4 */
153#endif /* CONFIG_PGTABLE_LEVELS > 3 */ 186#endif /* CONFIG_PGTABLE_LEVELS > 3 */
154#endif /* CONFIG_PGTABLE_LEVELS > 2 */ 187#endif /* CONFIG_PGTABLE_LEVELS > 2 */
155 188
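A hedged sketch (modeled on what the generic MM code does for the other levels, not code from this hunk) of how the new p4d helpers are expected to be used when an empty PGD entry has to be populated:

static int example_alloc_p4d(struct mm_struct *mm, pgd_t *pgd,
			     unsigned long address)
{
	p4d_t *new = p4d_alloc_one(mm, address);

	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (pgd_present(*pgd))		/* raced: someone else populated it */
		p4d_free(mm, new);
	else
		pgd_populate(mm, pgd, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}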
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index 392576433e77..373ab1de909f 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -7,6 +7,7 @@
 typedef unsigned long	pteval_t;
 typedef unsigned long	pmdval_t;
 typedef unsigned long	pudval_t;
+typedef unsigned long	p4dval_t;
 typedef unsigned long	pgdval_t;
 typedef unsigned long	pgprotval_t;
 
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index bcc89625ebe5..b8a4341faafa 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -7,6 +7,7 @@
 typedef u64	pteval_t;
 typedef u64	pmdval_t;
 typedef u64	pudval_t;
+typedef u64	p4dval_t;
 typedef u64	pgdval_t;
 typedef u64	pgprotval_t;
 
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2197e5322df9..f5af95a0c6b8 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -51,11 +51,19 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
51 51
52#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd) 52#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd)
53 53
54#ifndef __PAGETABLE_PUD_FOLDED 54#ifndef __PAGETABLE_P4D_FOLDED
55#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) 55#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
56#define pgd_clear(pgd) native_pgd_clear(pgd) 56#define pgd_clear(pgd) native_pgd_clear(pgd)
57#endif 57#endif
58 58
59#ifndef set_p4d
60# define set_p4d(p4dp, p4d) native_set_p4d(p4dp, p4d)
61#endif
62
63#ifndef __PAGETABLE_PUD_FOLDED
64#define p4d_clear(p4d) native_p4d_clear(p4d)
65#endif
66
59#ifndef set_pud 67#ifndef set_pud
60# define set_pud(pudp, pud) native_set_pud(pudp, pud) 68# define set_pud(pudp, pud) native_set_pud(pudp, pud)
61#endif 69#endif
@@ -72,6 +80,11 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
72#define pgd_val(x) native_pgd_val(x) 80#define pgd_val(x) native_pgd_val(x)
73#define __pgd(x) native_make_pgd(x) 81#define __pgd(x) native_make_pgd(x)
74 82
83#ifndef __PAGETABLE_P4D_FOLDED
84#define p4d_val(x) native_p4d_val(x)
85#define __p4d(x) native_make_p4d(x)
86#endif
87
75#ifndef __PAGETABLE_PUD_FOLDED 88#ifndef __PAGETABLE_PUD_FOLDED
76#define pud_val(x) native_pud_val(x) 89#define pud_val(x) native_pud_val(x)
77#define __pud(x) native_make_pud(x) 90#define __pud(x) native_make_pud(x)
@@ -177,6 +190,17 @@ static inline unsigned long pud_pfn(pud_t pud)
177 return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; 190 return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT;
178} 191}
179 192
193static inline unsigned long p4d_pfn(p4d_t p4d)
194{
195 return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
196}
197
198static inline int p4d_large(p4d_t p4d)
199{
200 /* No 512 GiB pages yet */
201 return 0;
202}
203
180#define pte_page(pte) pfn_to_page(pte_pfn(pte)) 204#define pte_page(pte) pfn_to_page(pte_pfn(pte))
181 205
182static inline int pmd_large(pmd_t pte) 206static inline int pmd_large(pmd_t pte)
@@ -536,6 +560,7 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
536#define pte_pgprot(x) __pgprot(pte_flags(x)) 560#define pte_pgprot(x) __pgprot(pte_flags(x))
537#define pmd_pgprot(x) __pgprot(pmd_flags(x)) 561#define pmd_pgprot(x) __pgprot(pmd_flags(x))
538#define pud_pgprot(x) __pgprot(pud_flags(x)) 562#define pud_pgprot(x) __pgprot(pud_flags(x))
563#define p4d_pgprot(x) __pgprot(p4d_flags(x))
539 564
540#define canon_pgprot(p) __pgprot(massage_pgprot(p)) 565#define canon_pgprot(p) __pgprot(massage_pgprot(p))
541 566
@@ -585,6 +610,7 @@ pte_t *populate_extra_pte(unsigned long vaddr);
585#include <linux/mm_types.h> 610#include <linux/mm_types.h>
586#include <linux/mmdebug.h> 611#include <linux/mmdebug.h>
587#include <linux/log2.h> 612#include <linux/log2.h>
613#include <asm/fixmap.h>
588 614
589static inline int pte_none(pte_t pte) 615static inline int pte_none(pte_t pte)
590{ 616{
@@ -768,7 +794,52 @@ static inline int pud_large(pud_t pud)
768} 794}
769#endif /* CONFIG_PGTABLE_LEVELS > 2 */ 795#endif /* CONFIG_PGTABLE_LEVELS > 2 */
770 796
797static inline unsigned long pud_index(unsigned long address)
798{
799 return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
800}
801
771#if CONFIG_PGTABLE_LEVELS > 3 802#if CONFIG_PGTABLE_LEVELS > 3
803static inline int p4d_none(p4d_t p4d)
804{
805 return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
806}
807
808static inline int p4d_present(p4d_t p4d)
809{
810 return p4d_flags(p4d) & _PAGE_PRESENT;
811}
812
813static inline unsigned long p4d_page_vaddr(p4d_t p4d)
814{
815 return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));
816}
817
818/*
819 * Currently stuck as a macro due to indirect forward reference to
820 * linux/mmzone.h's __section_mem_map_addr() definition:
821 */
822#define p4d_page(p4d) \
823 pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT)
824
825/* Find an entry in the third-level page table.. */
826static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
827{
828 return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address);
829}
830
831static inline int p4d_bad(p4d_t p4d)
832{
833 return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
834}
835#endif /* CONFIG_PGTABLE_LEVELS > 3 */
836
837static inline unsigned long p4d_index(unsigned long address)
838{
839 return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1);
840}
841
842#if CONFIG_PGTABLE_LEVELS > 4
772static inline int pgd_present(pgd_t pgd) 843static inline int pgd_present(pgd_t pgd)
773{ 844{
774 return pgd_flags(pgd) & _PAGE_PRESENT; 845 return pgd_flags(pgd) & _PAGE_PRESENT;
@@ -786,14 +857,9 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
786#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) 857#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
787 858
788/* to find an entry in a page-table-directory. */ 859/* to find an entry in a page-table-directory. */
789static inline unsigned long pud_index(unsigned long address) 860static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
790{
791 return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
792}
793
794static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
795{ 861{
796 return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address); 862 return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
797} 863}
798 864
799static inline int pgd_bad(pgd_t pgd) 865static inline int pgd_bad(pgd_t pgd)
@@ -811,7 +877,7 @@ static inline int pgd_none(pgd_t pgd)
811 */ 877 */
812 return !native_pgd_val(pgd); 878 return !native_pgd_val(pgd);
813} 879}
814#endif /* CONFIG_PGTABLE_LEVELS > 3 */ 880#endif /* CONFIG_PGTABLE_LEVELS > 4 */
815 881
816#endif /* __ASSEMBLY__ */ 882#endif /* __ASSEMBLY__ */
817 883
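With the p4d level slotted in between pgd and pud, a software page-table walk using the helpers defined or reworked above looks roughly like this (a sketch; when CONFIG_X86_5LEVEL is off, the pgtable-nop4d.h fallback folds the p4d level into the pgd, so the same code works for both 4- and 5-level configurations):

static pte_t *lookup_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_none(*pgd) || pgd_bad(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);		/* new level */
	if (p4d_none(*p4d) || p4d_bad(*p4d))
		return NULL;
	pud = pud_offset(p4d, addr);		/* pud_offset() now takes a p4d_t * */
	if (pud_none(*pud) || pud_bad(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, addr);
}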
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index fbc73360aea0..bfab55675c16 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -14,7 +14,6 @@
  */
 #ifndef __ASSEMBLY__
 #include <asm/processor.h>
-#include <asm/fixmap.h>
 #include <linux/threads.h>
 #include <asm/paravirt.h>
 
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 73c7ccc38912..9991224f6238 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -35,15 +35,22 @@ extern void paging_init(void);
35#define pud_ERROR(e) \ 35#define pud_ERROR(e) \
36 pr_err("%s:%d: bad pud %p(%016lx)\n", \ 36 pr_err("%s:%d: bad pud %p(%016lx)\n", \
37 __FILE__, __LINE__, &(e), pud_val(e)) 37 __FILE__, __LINE__, &(e), pud_val(e))
38
39#if CONFIG_PGTABLE_LEVELS >= 5
40#define p4d_ERROR(e) \
41 pr_err("%s:%d: bad p4d %p(%016lx)\n", \
42 __FILE__, __LINE__, &(e), p4d_val(e))
43#endif
44
38#define pgd_ERROR(e) \ 45#define pgd_ERROR(e) \
39 pr_err("%s:%d: bad pgd %p(%016lx)\n", \ 46 pr_err("%s:%d: bad pgd %p(%016lx)\n", \
40 __FILE__, __LINE__, &(e), pgd_val(e)) 47 __FILE__, __LINE__, &(e), pgd_val(e))
41 48
42struct mm_struct; 49struct mm_struct;
43 50
51void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
44void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); 52void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
45 53
46
47static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, 54static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
48 pte_t *ptep) 55 pte_t *ptep)
49{ 56{
@@ -121,6 +128,20 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
121#endif 128#endif
122} 129}
123 130
131static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
132{
133 *p4dp = p4d;
134}
135
136static inline void native_p4d_clear(p4d_t *p4d)
137{
138#ifdef CONFIG_X86_5LEVEL
139 native_set_p4d(p4d, native_make_p4d(0));
140#else
141 native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)});
142#endif
143}
144
124static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) 145static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
125{ 146{
126 *pgdp = pgd; 147 *pgdp = pgd;
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 3a264200c62f..06470da156ba 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -13,6 +13,7 @@
13typedef unsigned long pteval_t; 13typedef unsigned long pteval_t;
14typedef unsigned long pmdval_t; 14typedef unsigned long pmdval_t;
15typedef unsigned long pudval_t; 15typedef unsigned long pudval_t;
16typedef unsigned long p4dval_t;
16typedef unsigned long pgdval_t; 17typedef unsigned long pgdval_t;
17typedef unsigned long pgprotval_t; 18typedef unsigned long pgprotval_t;
18 19
@@ -22,12 +23,32 @@ typedef struct { pteval_t pte; } pte_t;
22 23
23#define SHARED_KERNEL_PMD 0 24#define SHARED_KERNEL_PMD 0
24 25
26#ifdef CONFIG_X86_5LEVEL
27
28/*
29 * PGDIR_SHIFT determines what a top-level page table entry can map
30 */
31#define PGDIR_SHIFT 48
32#define PTRS_PER_PGD 512
33
34/*
35 * 4th level page in 5-level paging case
36 */
37#define P4D_SHIFT 39
38#define PTRS_PER_P4D 512
39#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT)
40#define P4D_MASK (~(P4D_SIZE - 1))
41
42#else /* CONFIG_X86_5LEVEL */
43
25/* 44/*
26 * PGDIR_SHIFT determines what a top-level page table entry can map 45 * PGDIR_SHIFT determines what a top-level page table entry can map
27 */ 46 */
28#define PGDIR_SHIFT 39 47#define PGDIR_SHIFT 39
29#define PTRS_PER_PGD 512 48#define PTRS_PER_PGD 512
30 49
50#endif /* CONFIG_X86_5LEVEL */
51
31/* 52/*
32 * 3rd level page 53 * 3rd level page
33 */ 54 */
@@ -55,9 +76,15 @@ typedef struct { pteval_t pte; } pte_t;
55 76
56/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ 77/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
57#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) 78#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
79#ifdef CONFIG_X86_5LEVEL
80#define VMALLOC_SIZE_TB _AC(16384, UL)
81#define __VMALLOC_BASE _AC(0xff92000000000000, UL)
82#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
83#else
58#define VMALLOC_SIZE_TB _AC(32, UL) 84#define VMALLOC_SIZE_TB _AC(32, UL)
59#define __VMALLOC_BASE _AC(0xffffc90000000000, UL) 85#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
60#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) 86#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
87#endif
61#ifdef CONFIG_RANDOMIZE_MEMORY 88#ifdef CONFIG_RANDOMIZE_MEMORY
62#define VMALLOC_START vmalloc_base 89#define VMALLOC_START vmalloc_base
63#define VMEMMAP_START vmemmap_base 90#define VMEMMAP_START vmemmap_base
@@ -67,10 +94,11 @@ typedef struct { pteval_t pte; } pte_t;
67#endif /* CONFIG_RANDOMIZE_MEMORY */ 94#endif /* CONFIG_RANDOMIZE_MEMORY */
68#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) 95#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
69#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) 96#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
70#define MODULES_END _AC(0xffffffffff000000, UL) 97/* The module sections ends with the start of the fixmap */
98#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
71#define MODULES_LEN (MODULES_END - MODULES_VADDR) 99#define MODULES_LEN (MODULES_END - MODULES_VADDR)
72#define ESPFIX_PGD_ENTRY _AC(-2, UL) 100#define ESPFIX_PGD_ENTRY _AC(-2, UL)
73#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT) 101#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
74#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) 102#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
75#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) 103#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
76 104
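How the new shifts slice a virtual address (illustration only): with 5-level paging the PGD indexes bits 48-56 and the new p4d level takes over bits 39-47 that the PGD covered in the 4-level layout; every table still has 512 entries.

#define EXAMPLE_PTRS_PER_TABLE	512

static unsigned long example_pgd_index(unsigned long addr)
{
	return (addr >> 48) & (EXAMPLE_PTRS_PER_TABLE - 1);	/* PGDIR_SHIFT == 48 */
}

static unsigned long example_p4d_index(unsigned long addr)
{
	return (addr >> 39) & (EXAMPLE_PTRS_PER_TABLE - 1);	/* P4D_SHIFT == 39 */
}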
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 62484333673d..bf9638e1ee42 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -272,9 +272,28 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
272 return native_pgd_val(pgd) & PTE_FLAGS_MASK; 272 return native_pgd_val(pgd) & PTE_FLAGS_MASK;
273} 273}
274 274
275#if CONFIG_PGTABLE_LEVELS > 3 275#if CONFIG_PGTABLE_LEVELS > 4
276#include <asm-generic/5level-fixup.h> 276typedef struct { p4dval_t p4d; } p4d_t;
277
278static inline p4d_t native_make_p4d(pudval_t val)
279{
280 return (p4d_t) { val };
281}
282
283static inline p4dval_t native_p4d_val(p4d_t p4d)
284{
285 return p4d.p4d;
286}
287#else
288#include <asm-generic/pgtable-nop4d.h>
289
290static inline p4dval_t native_p4d_val(p4d_t p4d)
291{
292 return native_pgd_val(p4d.pgd);
293}
294#endif
277 295
296#if CONFIG_PGTABLE_LEVELS > 3
278typedef struct { pudval_t pud; } pud_t; 297typedef struct { pudval_t pud; } pud_t;
279 298
280static inline pud_t native_make_pud(pmdval_t val) 299static inline pud_t native_make_pud(pmdval_t val)
@@ -287,12 +306,11 @@ static inline pudval_t native_pud_val(pud_t pud)
287 return pud.pud; 306 return pud.pud;
288} 307}
289#else 308#else
290#define __ARCH_USE_5LEVEL_HACK
291#include <asm-generic/pgtable-nopud.h> 309#include <asm-generic/pgtable-nopud.h>
292 310
293static inline pudval_t native_pud_val(pud_t pud) 311static inline pudval_t native_pud_val(pud_t pud)
294{ 312{
295 return native_pgd_val(pud.pgd); 313 return native_pgd_val(pud.p4d.pgd);
296} 314}
297#endif 315#endif
298 316
@@ -309,15 +327,30 @@ static inline pmdval_t native_pmd_val(pmd_t pmd)
309 return pmd.pmd; 327 return pmd.pmd;
310} 328}
311#else 329#else
312#define __ARCH_USE_5LEVEL_HACK
313#include <asm-generic/pgtable-nopmd.h> 330#include <asm-generic/pgtable-nopmd.h>
314 331
315static inline pmdval_t native_pmd_val(pmd_t pmd) 332static inline pmdval_t native_pmd_val(pmd_t pmd)
316{ 333{
317 return native_pgd_val(pmd.pud.pgd); 334 return native_pgd_val(pmd.pud.p4d.pgd);
318} 335}
319#endif 336#endif
320 337
338static inline p4dval_t p4d_pfn_mask(p4d_t p4d)
339{
340 /* No 512 GiB huge pages yet */
341 return PTE_PFN_MASK;
342}
343
344static inline p4dval_t p4d_flags_mask(p4d_t p4d)
345{
346 return ~p4d_pfn_mask(p4d);
347}
348
349static inline p4dval_t p4d_flags(p4d_t p4d)
350{
351 return native_p4d_val(p4d) & p4d_flags_mask(p4d);
352}
353
321static inline pudval_t pud_pfn_mask(pud_t pud) 354static inline pudval_t pud_pfn_mask(pud_t pud)
322{ 355{
323 if (native_pud_val(pud) & _PAGE_PSE) 356 if (native_pud_val(pud) & _PAGE_PSE)
@@ -461,6 +494,7 @@ enum pg_level {
461 PG_LEVEL_4K, 494 PG_LEVEL_4K,
462 PG_LEVEL_2M, 495 PG_LEVEL_2M,
463 PG_LEVEL_1G, 496 PG_LEVEL_1G,
497 PG_LEVEL_512G,
464 PG_LEVEL_NUM 498 PG_LEVEL_NUM
465}; 499};
466 500
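
With CONFIG_PGTABLE_LEVELS <= 4 the new p4d level is folded: asm-generic/pgtable-nop4d.h makes p4d_t wrap a pgd_t, which is why native_p4d_val() and the folded native_pud_val() above simply chase .p4d.pgd down to the pgd value. A minimal stand-alone model of that nesting (types collapsed to a single pgdval_t; the real headers keep separate pudval_t/p4dval_t types):

    #include <stdio.h>

    typedef unsigned long pgdval_t;
    typedef struct { pgdval_t pgd; } pgd_t;
    typedef struct { pgd_t pgd; } p4d_t;   /* folded p4d: just a pgd */
    typedef struct { p4d_t p4d; } pud_t;   /* folded pud on 2/3-level configs */

    static pgdval_t native_pgd_val(pgd_t pgd) { return pgd.pgd; }
    static pgdval_t native_p4d_val(p4d_t p4d) { return native_pgd_val(p4d.pgd); }
    static pgdval_t native_pud_val(pud_t pud) { return native_pgd_val(pud.p4d.pgd); }

    int main(void)
    {
            pud_t pud = { .p4d = { .pgd = { .pgd = 0x1234067UL } } };

            printf("pud=%#lx p4d=%#lx pgd=%#lx\n",
                   native_pud_val(pud), native_p4d_val(pud.p4d),
                   native_pgd_val(pud.p4d.pgd));
            return 0;
    }

All three accessors return the same value, which is exactly what the one-line change from pud.pgd to pud.p4d.pgd preserves.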
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 78defd0aa220..3cada998a402 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -709,6 +709,8 @@ extern struct desc_ptr early_gdt_descr;
709 709
710extern void cpu_set_gdt(int); 710extern void cpu_set_gdt(int);
711extern void switch_to_new_gdt(int); 711extern void switch_to_new_gdt(int);
712extern void load_direct_gdt(int);
713extern void load_fixmap_gdt(int);
712extern void load_percpu_segment(int); 714extern void load_percpu_segment(int);
713extern void cpu_init(void); 715extern void cpu_init(void);
714 716
@@ -790,6 +792,7 @@ static inline void spin_lock_prefetch(const void *x)
790/* 792/*
791 * User space process size: 3GB (default). 793 * User space process size: 3GB (default).
792 */ 794 */
795#define IA32_PAGE_OFFSET PAGE_OFFSET
793#define TASK_SIZE PAGE_OFFSET 796#define TASK_SIZE PAGE_OFFSET
794#define TASK_SIZE_MAX TASK_SIZE 797#define TASK_SIZE_MAX TASK_SIZE
795#define STACK_TOP TASK_SIZE 798#define STACK_TOP TASK_SIZE
@@ -866,7 +869,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
866 * This decides where the kernel will search for a free chunk of vm 869 * This decides where the kernel will search for a free chunk of vm
867 * space during mmap's. 870 * space during mmap's.
868 */ 871 */
869#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) 872#define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3))
873#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE)
870 874
871#define KSTK_EIP(task) (task_pt_regs(task)->ip) 875#define KSTK_EIP(task) (task_pt_regs(task)->ip)
872 876
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index fac9a5c0abe9..d91ba04dd007 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -53,6 +53,12 @@
53# define NEED_MOVBE 0 53# define NEED_MOVBE 0
54#endif 54#endif
55 55
56#ifdef CONFIG_X86_5LEVEL
57# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31))
58#else
59# define NEED_LA57 0
60#endif
61
56#ifdef CONFIG_X86_64 62#ifdef CONFIG_X86_64
57#ifdef CONFIG_PARAVIRT 63#ifdef CONFIG_PARAVIRT
58/* Paravirtualized systems may not have PSE or PGE available */ 64/* Paravirtualized systems may not have PSE or PGE available */
@@ -98,7 +104,7 @@
98#define REQUIRED_MASK13 0 104#define REQUIRED_MASK13 0
99#define REQUIRED_MASK14 0 105#define REQUIRED_MASK14 0
100#define REQUIRED_MASK15 0 106#define REQUIRED_MASK15 0
101#define REQUIRED_MASK16 0 107#define REQUIRED_MASK16 (NEED_LA57)
102#define REQUIRED_MASK17 0 108#define REQUIRED_MASK17 0
103#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) 109#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
104 110
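
REQUIRED_MASK16 now carries NEED_LA57, so a CONFIG_X86_5LEVEL kernel refuses to run on CPUs without LA57. The "& 31" in the definition extracts the bit position inside the 32-bit feature word. A small illustration, assuming X86_FEATURE_LA57 is defined as word 16, bit 16 (16*32 + 16) in this cycle's cpufeatures.h:

    #include <stdio.h>

    #define X86_FEATURE_LA57 (16*32 + 16)	/* assumed definition */
    #define NEED_LA57 (1 << (X86_FEATURE_LA57 & 31))

    int main(void)
    {
            printf("feature word %d, required mask %#x\n",
                   X86_FEATURE_LA57 / 32, NEED_LA57);
            return 0;
    }

Under that assumption the mask lands in word 16 with value 0x10000, matching the REQUIRED_MASK16 slot patched above.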
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 4517d6b93188..1f5bee2c202f 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -26,8 +26,13 @@
26# endif 26# endif
27#else /* CONFIG_X86_32 */ 27#else /* CONFIG_X86_32 */
28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ 28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
29# define MAX_PHYSADDR_BITS 44 29# ifdef CONFIG_X86_5LEVEL
30# define MAX_PHYSMEM_BITS 46 30# define MAX_PHYSADDR_BITS 52
31# define MAX_PHYSMEM_BITS 52
32# else
33# define MAX_PHYSADDR_BITS 44
34# define MAX_PHYSMEM_BITS 46
35# endif
31#endif 36#endif
32 37
33#endif /* CONFIG_SPARSEMEM */ 38#endif /* CONFIG_SPARSEMEM */
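
Raising MAX_PHYSMEM_BITS from 46 to 52 multiplies the number of sparsemem sections the memory map has to cover. With the 128 MiB (2^27) section size kept by the hunk above, the section counts work out as follows (stand-alone arithmetic, nothing kernel-specific):

    #include <stdio.h>

    #define SECTION_SIZE_BITS 27	/* 128 MiB sections, unchanged above */

    int main(void)
    {
            printf("4-level (46 bits): %lu sections\n", 1UL << (46 - SECTION_SIZE_BITS));
            printf("5-level (52 bits): %lu sections\n", 1UL << (52 - SECTION_SIZE_BITS));
            return 0;
    }

That is roughly 512K sections today versus 32M sections once the full 52-bit physical address space is allowed.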
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 58505f01962f..dcbd9bcce714 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -87,7 +87,7 @@ static inline void setup_stack_canary_segment(int cpu)
87{ 87{
88#ifdef CONFIG_X86_32 88#ifdef CONFIG_X86_32
89 unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu); 89 unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu);
90 struct desc_struct *gdt_table = get_cpu_gdt_table(cpu); 90 struct desc_struct *gdt_table = get_cpu_gdt_rw(cpu);
91 struct desc_struct desc; 91 struct desc_struct desc;
92 92
93 desc = gdt_table[GDT_ENTRY_STACK_CANARY]; 93 desc = gdt_table[GDT_ENTRY_STACK_CANARY];
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 75d002bdb3f3..6ed9ea469b48 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -215,7 +215,6 @@ static inline void __flush_tlb_one(unsigned long addr)
215/* 215/*
216 * TLB flushing: 216 * TLB flushing:
217 * 217 *
218 * - flush_tlb() flushes the current mm struct TLBs
219 * - flush_tlb_all() flushes all processes TLBs 218 * - flush_tlb_all() flushes all processes TLBs
220 * - flush_tlb_mm(mm) flushes the specified mm context TLB's 219 * - flush_tlb_mm(mm) flushes the specified mm context TLB's
221 * - flush_tlb_page(vma, vmaddr) flushes one page 220 * - flush_tlb_page(vma, vmaddr) flushes one page
@@ -247,11 +246,6 @@ static inline void flush_tlb_all(void)
247 __flush_tlb_all(); 246 __flush_tlb_all();
248} 247}
249 248
250static inline void flush_tlb(void)
251{
252 __flush_tlb_up();
253}
254
255static inline void local_flush_tlb(void) 249static inline void local_flush_tlb(void)
256{ 250{
257 __flush_tlb_up(); 251 __flush_tlb_up();
@@ -313,14 +307,11 @@ static inline void flush_tlb_kernel_range(unsigned long start,
313 flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) 307 flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
314 308
315extern void flush_tlb_all(void); 309extern void flush_tlb_all(void);
316extern void flush_tlb_current_task(void);
317extern void flush_tlb_page(struct vm_area_struct *, unsigned long); 310extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
318extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, 311extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
319 unsigned long end, unsigned long vmflag); 312 unsigned long end, unsigned long vmflag);
320extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); 313extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
321 314
322#define flush_tlb() flush_tlb_current_task()
323
324void native_flush_tlb_others(const struct cpumask *cpumask, 315void native_flush_tlb_others(const struct cpumask *cpumask,
325 struct mm_struct *mm, 316 struct mm_struct *mm,
326 unsigned long start, unsigned long end); 317 unsigned long start, unsigned long end);
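
With flush_tlb() and flush_tlb_current_task() gone, a caller that used to flush after editing a bounded range of PTEs now has to name the mm and the virtual range explicitly. A fragment (not compilable on its own) showing the conversion pattern, taken from the mark_screen_rdonly() change further down in this merge:

            /* before: removed helper, flushed all of current->mm */
            flush_tlb();

            /* after: explicit mm and range; vmflag is 0UL when no VM_* hints apply */
            flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32 * PAGE_SIZE, 0UL);
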
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 64c5e745ebad..8a5a02b1dfba 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -280,13 +280,17 @@ static inline pte_t __pte_ma(pteval_t x)
280 280
281#define pmd_val_ma(v) ((v).pmd) 281#define pmd_val_ma(v) ((v).pmd)
282#ifdef __PAGETABLE_PUD_FOLDED 282#ifdef __PAGETABLE_PUD_FOLDED
283#define pud_val_ma(v) ((v).pgd.pgd) 283#define pud_val_ma(v) ((v).p4d.pgd.pgd)
284#else 284#else
285#define pud_val_ma(v) ((v).pud) 285#define pud_val_ma(v) ((v).pud)
286#endif 286#endif
287#define __pmd_ma(x) ((pmd_t) { (x) } ) 287#define __pmd_ma(x) ((pmd_t) { (x) } )
288 288
289#define pgd_val_ma(x) ((x).pgd) 289#ifdef __PAGETABLE_P4D_FOLDED
290#define p4d_val_ma(x) ((x).pgd.pgd)
291#else
292#define p4d_val_ma(x) ((x).p4d)
293#endif
290 294
291void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); 295void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid);
292 296
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 48587335ede8..ed014814ea35 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,7 +101,7 @@ int x86_acpi_suspend_lowlevel(void)
101#ifdef CONFIG_SMP 101#ifdef CONFIG_SMP
102 initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); 102 initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
103 early_gdt_descr.address = 103 early_gdt_descr.address =
104 (unsigned long)get_cpu_gdt_table(smp_processor_id()); 104 (unsigned long)get_cpu_gdt_rw(smp_processor_id());
105 initial_gs = per_cpu_offset(smp_processor_id()); 105 initial_gs = per_cpu_offset(smp_processor_id());
106#endif 106#endif
107 initial_code = (unsigned long)wakeup_long64; 107 initial_code = (unsigned long)wakeup_long64;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5a414545e8a3..446b0d3d4932 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -609,7 +609,7 @@ static long __apm_bios_call(void *_call)
609 609
610 cpu = get_cpu(); 610 cpu = get_cpu();
611 BUG_ON(cpu != 0); 611 BUG_ON(cpu != 0);
612 gdt = get_cpu_gdt_table(cpu); 612 gdt = get_cpu_gdt_rw(cpu);
613 save_desc_40 = gdt[0x40 / 8]; 613 save_desc_40 = gdt[0x40 / 8];
614 gdt[0x40 / 8] = bad_bios_desc; 614 gdt[0x40 / 8] = bad_bios_desc;
615 615
@@ -685,7 +685,7 @@ static long __apm_bios_call_simple(void *_call)
685 685
686 cpu = get_cpu(); 686 cpu = get_cpu();
687 BUG_ON(cpu != 0); 687 BUG_ON(cpu != 0);
688 gdt = get_cpu_gdt_table(cpu); 688 gdt = get_cpu_gdt_rw(cpu);
689 save_desc_40 = gdt[0x40 / 8]; 689 save_desc_40 = gdt[0x40 / 8];
690 gdt[0x40 / 8] = bad_bios_desc; 690 gdt[0x40 / 8] = bad_bios_desc;
691 691
@@ -2352,7 +2352,7 @@ static int __init apm_init(void)
2352 * Note we only set APM segments on CPU zero, since we pin the APM 2352 * Note we only set APM segments on CPU zero, since we pin the APM
2353 * code to that CPU. 2353 * code to that CPU.
2354 */ 2354 */
2355 gdt = get_cpu_gdt_table(0); 2355 gdt = get_cpu_gdt_rw(0);
2356 set_desc_base(&gdt[APM_CS >> 3], 2356 set_desc_base(&gdt[APM_CS >> 3],
2357 (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); 2357 (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4));
2358 set_desc_base(&gdt[APM_CS_16 >> 3], 2358 set_desc_base(&gdt[APM_CS_16 >> 3],
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 58094a1f9e9d..8ee32119144d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -448,19 +448,60 @@ void load_percpu_segment(int cpu)
448 load_stack_canary_segment(); 448 load_stack_canary_segment();
449} 449}
450 450
451/* Setup the fixmap mapping only once per-processor */
452static inline void setup_fixmap_gdt(int cpu)
453{
454#ifdef CONFIG_X86_64
455 /* On 64-bit systems, we use a read-only fixmap GDT. */
456 pgprot_t prot = PAGE_KERNEL_RO;
457#else
458 /*
459 * On native 32-bit systems, the GDT cannot be read-only because
460 * our double fault handler uses a task gate, and entering through
461 * a task gate needs to change an available TSS to busy. If the GDT
462 * is read-only, that will triple fault.
463 *
464 * On Xen PV, the GDT must be read-only because the hypervisor requires
465 * it.
466 */
467 pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
468 PAGE_KERNEL_RO : PAGE_KERNEL;
469#endif
470
471 __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
472}
473
474/* Load the original GDT from the per-cpu structure */
475void load_direct_gdt(int cpu)
476{
477 struct desc_ptr gdt_descr;
478
479 gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
480 gdt_descr.size = GDT_SIZE - 1;
481 load_gdt(&gdt_descr);
482}
483EXPORT_SYMBOL_GPL(load_direct_gdt);
484
485/* Load a fixmap remapping of the per-cpu GDT */
486void load_fixmap_gdt(int cpu)
487{
488 struct desc_ptr gdt_descr;
489
490 gdt_descr.address = (long)get_cpu_gdt_ro(cpu);
491 gdt_descr.size = GDT_SIZE - 1;
492 load_gdt(&gdt_descr);
493}
494EXPORT_SYMBOL_GPL(load_fixmap_gdt);
495
451/* 496/*
452 * Current gdt points %fs at the "master" per-cpu area: after this, 497 * Current gdt points %fs at the "master" per-cpu area: after this,
453 * it's on the real one. 498 * it's on the real one.
454 */ 499 */
455void switch_to_new_gdt(int cpu) 500void switch_to_new_gdt(int cpu)
456{ 501{
457 struct desc_ptr gdt_descr; 502 /* Load the original GDT */
458 503 load_direct_gdt(cpu);
459 gdt_descr.address = (long)get_cpu_gdt_table(cpu);
460 gdt_descr.size = GDT_SIZE - 1;
461 load_gdt(&gdt_descr);
462 /* Reload the per-cpu base */ 504 /* Reload the per-cpu base */
463
464 load_percpu_segment(cpu); 505 load_percpu_segment(cpu);
465} 506}
466 507
@@ -1526,6 +1567,9 @@ void cpu_init(void)
1526 1567
1527 if (is_uv_system()) 1568 if (is_uv_system())
1528 uv_cpu_init(); 1569 uv_cpu_init();
1570
1571 setup_fixmap_gdt(cpu);
1572 load_fixmap_gdt(cpu);
1529} 1573}
1530 1574
1531#else 1575#else
@@ -1581,6 +1625,9 @@ void cpu_init(void)
1581 dbg_restore_debug_regs(); 1625 dbg_restore_debug_regs();
1582 1626
1583 fpu__init_cpu(); 1627 fpu__init_cpu();
1628
1629 setup_fixmap_gdt(cpu);
1630 load_fixmap_gdt(cpu);
1584} 1631}
1585#endif 1632#endif
1586 1633
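
For reference, the per-CPU GDT handling introduced by this hunk boils down to the following ordering (a summary sketch in comment form, not literal code):

            /*
             * switch_to_new_gdt(cpu)
             *     load_direct_gdt(cpu);       // writable per-cpu GDT becomes active
             *     load_percpu_segment(cpu);   // per-cpu base now valid
             * ...
             * cpu_init()
             *     setup_fixmap_gdt(cpu);      // map the GDT at a fixmap slot
             *                                 // (read-only on 64-bit and Xen PV,
             *                                 //  read-write on native 32-bit)
             *     load_fixmap_gdt(cpu);       // run on the fixmap alias from here on
             */
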
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 6e9b26fa6d05..d78a586ba8dc 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -270,7 +270,6 @@ int __init e820__update_table(struct e820_table *table)
270 if (table->nr_entries < 2) 270 if (table->nr_entries < 2)
271 return -1; 271 return -1;
272 272
273 table->nr_entries = table->nr_entries;
274 BUG_ON(table->nr_entries > max_nr_entries); 273 BUG_ON(table->nr_entries > max_nr_entries);
275 274
276 /* Bail out if we find any unreasonable addresses in the map: */ 275 /* Bail out if we find any unreasonable addresses in the map: */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 04f89caef9c4..8e598a1ad986 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -50,11 +50,11 @@
50#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) 50#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
51 51
52/* There is address space for how many espfix pages? */ 52/* There is address space for how many espfix pages? */
53#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16)) 53#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16))
54 54
55#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) 55#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
56#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS 56#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
57# error "Need more than one PGD for the ESPFIX hack" 57# error "Need more virtual address space for the ESPFIX hack"
58#endif 58#endif
59 59
60#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) 60#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
@@ -121,11 +121,13 @@ static void init_espfix_random(void)
121 121
122void __init init_espfix_bsp(void) 122void __init init_espfix_bsp(void)
123{ 123{
124 pgd_t *pgd_p; 124 pgd_t *pgd;
125 p4d_t *p4d;
125 126
126 /* Install the espfix pud into the kernel page directory */ 127 /* Install the espfix pud into the kernel page directory */
127 pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; 128 pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
128 pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); 129 p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
130 p4d_populate(&init_mm, p4d, espfix_pud_page);
129 131
130 /* Randomize the locations */ 132 /* Randomize the locations */
131 init_espfix_random(); 133 init_espfix_random();
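
Keying the espfix region to P4D_SHIFT instead of PGDIR_SHIFT keeps its footprint constant once PGDIR_SHIFT grows to 48 under 5-level paging. A quick check of the old versus new formula for ESPFIX_PAGE_SPACE, assuming the usual x86-64 shifts (PAGE_SHIFT 12, P4D_SHIFT 39, 5-level PGDIR_SHIFT 48):

    #include <stdio.h>

    int main(void)
    {
            int page_shift = 12, p4d_shift = 39, pgdir_shift_5lvl = 48;

            printf("old formula under 5-level: %lu pages\n",
                   1UL << (pgdir_shift_5lvl - page_shift - 16));
            printf("new formula (P4D_SHIFT):   %lu pages\n",
                   1UL << (p4d_shift - page_shift - 16));
            return 0;
    }

Under those assumptions the new formula keeps the window at the 2048 pages it occupied on 4-level kernels, consistent with ESPFIX_BASE_ADDR now being derived from P4D_SHIFT as well.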
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 469b23d6acc2..5f43cec296c5 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -103,6 +103,7 @@ static void machine_kexec_page_table_set_one(
103 pgd_t *pgd, pmd_t *pmd, pte_t *pte, 103 pgd_t *pgd, pmd_t *pmd, pte_t *pte,
104 unsigned long vaddr, unsigned long paddr) 104 unsigned long vaddr, unsigned long paddr)
105{ 105{
106 p4d_t *p4d;
106 pud_t *pud; 107 pud_t *pud;
107 108
108 pgd += pgd_index(vaddr); 109 pgd += pgd_index(vaddr);
@@ -110,7 +111,8 @@ static void machine_kexec_page_table_set_one(
110 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) 111 if (!(pgd_val(*pgd) & _PAGE_PRESENT))
111 set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); 112 set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
112#endif 113#endif
113 pud = pud_offset(pgd, vaddr); 114 p4d = p4d_offset(pgd, vaddr);
115 pud = pud_offset(p4d, vaddr);
114 pmd = pmd_offset(pud, vaddr); 116 pmd = pmd_offset(pud, vaddr);
115 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) 117 if (!(pmd_val(*pmd) & _PAGE_PRESENT))
116 set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); 118 set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 857cdbd02867..085c3b300d32 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -36,6 +36,7 @@ static struct kexec_file_ops *kexec_file_loaders[] = {
36 36
37static void free_transition_pgtable(struct kimage *image) 37static void free_transition_pgtable(struct kimage *image)
38{ 38{
39 free_page((unsigned long)image->arch.p4d);
39 free_page((unsigned long)image->arch.pud); 40 free_page((unsigned long)image->arch.pud);
40 free_page((unsigned long)image->arch.pmd); 41 free_page((unsigned long)image->arch.pmd);
41 free_page((unsigned long)image->arch.pte); 42 free_page((unsigned long)image->arch.pte);
@@ -43,6 +44,7 @@ static void free_transition_pgtable(struct kimage *image)
43 44
44static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) 45static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
45{ 46{
47 p4d_t *p4d;
46 pud_t *pud; 48 pud_t *pud;
47 pmd_t *pmd; 49 pmd_t *pmd;
48 pte_t *pte; 50 pte_t *pte;
@@ -53,13 +55,21 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
53 paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); 55 paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
54 pgd += pgd_index(vaddr); 56 pgd += pgd_index(vaddr);
55 if (!pgd_present(*pgd)) { 57 if (!pgd_present(*pgd)) {
58 p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
59 if (!p4d)
60 goto err;
61 image->arch.p4d = p4d;
62 set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
63 }
64 p4d = p4d_offset(pgd, vaddr);
65 if (!p4d_present(*p4d)) {
56 pud = (pud_t *)get_zeroed_page(GFP_KERNEL); 66 pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
57 if (!pud) 67 if (!pud)
58 goto err; 68 goto err;
59 image->arch.pud = pud; 69 image->arch.pud = pud;
60 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); 70 set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
61 } 71 }
62 pud = pud_offset(pgd, vaddr); 72 pud = pud_offset(p4d, vaddr);
63 if (!pud_present(*pud)) { 73 if (!pud_present(*pud)) {
64 pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); 74 pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
65 if (!pmd) 75 if (!pmd)
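
The init_transition_pgtable() change above is the standard pattern for growing a one-off page table by one level. A condensed, hedged sketch of the walk (kernel-style fragment, error handling and the image->arch bookkeeping dropped; p4d_page and pud_page stand for freshly zeroed pages from get_zeroed_page()):

            pgd += pgd_index(vaddr);
            if (!pgd_present(*pgd))
                    set_pgd(pgd, __pgd(__pa(p4d_page) | _KERNPG_TABLE));
            p4d = p4d_offset(pgd, vaddr);
            if (!p4d_present(*p4d))
                    set_p4d(p4d, __p4d(__pa(pud_page) | _KERNPG_TABLE));
            pud = pud_offset(p4d, vaddr);
            /* ...the pmd and pte levels continue the same way... */
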
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 4797e87b0fb6..3586996fc50d 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -405,9 +405,11 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
405 .alloc_pte = paravirt_nop, 405 .alloc_pte = paravirt_nop,
406 .alloc_pmd = paravirt_nop, 406 .alloc_pmd = paravirt_nop,
407 .alloc_pud = paravirt_nop, 407 .alloc_pud = paravirt_nop,
408 .alloc_p4d = paravirt_nop,
408 .release_pte = paravirt_nop, 409 .release_pte = paravirt_nop,
409 .release_pmd = paravirt_nop, 410 .release_pmd = paravirt_nop,
410 .release_pud = paravirt_nop, 411 .release_pud = paravirt_nop,
412 .release_p4d = paravirt_nop,
411 413
412 .set_pte = native_set_pte, 414 .set_pte = native_set_pte,
413 .set_pte_at = native_set_pte_at, 415 .set_pte_at = native_set_pte_at,
@@ -430,12 +432,19 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
430 .pmd_val = PTE_IDENT, 432 .pmd_val = PTE_IDENT,
431 .make_pmd = PTE_IDENT, 433 .make_pmd = PTE_IDENT,
432 434
433#if CONFIG_PGTABLE_LEVELS == 4 435#if CONFIG_PGTABLE_LEVELS >= 4
434 .pud_val = PTE_IDENT, 436 .pud_val = PTE_IDENT,
435 .make_pud = PTE_IDENT, 437 .make_pud = PTE_IDENT,
436 438
439 .set_p4d = native_set_p4d,
440
441#if CONFIG_PGTABLE_LEVELS >= 5
442 .p4d_val = PTE_IDENT,
443 .make_p4d = PTE_IDENT,
444
437 .set_pgd = native_set_pgd, 445 .set_pgd = native_set_pgd,
438#endif 446#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
447#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
439#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ 448#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
440 449
441 .pte_val = PTE_IDENT, 450 .pte_val = PTE_IDENT,
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ea1a6180bf39..825a1e47cf3e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -53,6 +53,11 @@
53#include <asm/xen/hypervisor.h> 53#include <asm/xen/hypervisor.h>
54#include <asm/vdso.h> 54#include <asm/vdso.h>
55#include <asm/intel_rdt.h> 55#include <asm/intel_rdt.h>
56#include <asm/unistd.h>
57#ifdef CONFIG_IA32_EMULATION
58/* Not included via unistd.h */
59#include <asm/unistd_32_ia32.h>
60#endif
56 61
57__visible DEFINE_PER_CPU(unsigned long, rsp_scratch); 62__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
58 63
@@ -494,6 +499,8 @@ void set_personality_64bit(void)
494 clear_thread_flag(TIF_IA32); 499 clear_thread_flag(TIF_IA32);
495 clear_thread_flag(TIF_ADDR32); 500 clear_thread_flag(TIF_ADDR32);
496 clear_thread_flag(TIF_X32); 501 clear_thread_flag(TIF_X32);
502 /* Pretend that this comes from a 64bit execve */
503 task_pt_regs(current)->orig_ax = __NR_execve;
497 504
498 /* Ensure the corresponding mm is not marked. */ 505 /* Ensure the corresponding mm is not marked. */
499 if (current->mm) 506 if (current->mm)
@@ -506,32 +513,50 @@ void set_personality_64bit(void)
506 current->personality &= ~READ_IMPLIES_EXEC; 513 current->personality &= ~READ_IMPLIES_EXEC;
507} 514}
508 515
509void set_personality_ia32(bool x32) 516static void __set_personality_x32(void)
510{ 517{
511 /* inherit personality from parent */ 518#ifdef CONFIG_X86_X32
519 clear_thread_flag(TIF_IA32);
520 set_thread_flag(TIF_X32);
521 if (current->mm)
522 current->mm->context.ia32_compat = TIF_X32;
523 current->personality &= ~READ_IMPLIES_EXEC;
524 /*
525 * in_compat_syscall() uses the presence of the x32 syscall bit
526 * flag to determine compat status. The x86 mmap() code relies on
527 * the syscall bitness so set x32 syscall bit right here to make
528 * in_compat_syscall() work during exec().
529 *
530 * Pretend to come from a x32 execve.
531 */
532 task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
533 current->thread.status &= ~TS_COMPAT;
534#endif
535}
512 536
537static void __set_personality_ia32(void)
538{
539#ifdef CONFIG_IA32_EMULATION
540 set_thread_flag(TIF_IA32);
541 clear_thread_flag(TIF_X32);
542 if (current->mm)
543 current->mm->context.ia32_compat = TIF_IA32;
544 current->personality |= force_personality32;
545 /* Prepare the first "return" to user space */
546 task_pt_regs(current)->orig_ax = __NR_ia32_execve;
547 current->thread.status |= TS_COMPAT;
548#endif
549}
550
551void set_personality_ia32(bool x32)
552{
513 /* Make sure to be in 32bit mode */ 553 /* Make sure to be in 32bit mode */
514 set_thread_flag(TIF_ADDR32); 554 set_thread_flag(TIF_ADDR32);
515 555
516 /* Mark the associated mm as containing 32-bit tasks. */ 556 if (x32)
517 if (x32) { 557 __set_personality_x32();
518 clear_thread_flag(TIF_IA32); 558 else
519 set_thread_flag(TIF_X32); 559 __set_personality_ia32();
520 if (current->mm)
521 current->mm->context.ia32_compat = TIF_X32;
522 current->personality &= ~READ_IMPLIES_EXEC;
523 /* in_compat_syscall() uses the presence of the x32
524 syscall bit flag to determine compat status */
525 current->thread.status &= ~TS_COMPAT;
526 } else {
527 set_thread_flag(TIF_IA32);
528 clear_thread_flag(TIF_X32);
529 if (current->mm)
530 current->mm->context.ia32_compat = TIF_IA32;
531 current->personality |= force_personality32;
532 /* Prepare the first "return" to user space */
533 current->thread.status |= TS_COMPAT;
534 }
535} 560}
536EXPORT_SYMBOL_GPL(set_personality_ia32); 561EXPORT_SYMBOL_GPL(set_personality_ia32);
537 562
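
The new orig_ax assignments exist because, as the comment above says, in_compat_syscall() keys off the syscall that apparently brought the task here: an x32 execve is marked by a high bit in the syscall number. A tiny user-space illustration of that test, assuming __X32_SYSCALL_BIT is 0x40000000 and using a placeholder execve number:

    #include <stdio.h>

    #define __X32_SYSCALL_BIT 0x40000000UL	/* assumed value */
    #define FAKE_NR_EXECVE 59UL		/* placeholder, not the real x32 number */

    int main(void)
    {
            unsigned long orig_ax = FAKE_NR_EXECVE | __X32_SYSCALL_BIT;

            printf("looks like an x32 syscall: %s\n",
                   (orig_ax & __X32_SYSCALL_BIT) ? "yes" : "no");
            return 0;
    }

Setting orig_ax this way during exec makes the bitness check come out right before the task has actually issued any syscall of its own.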
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0b4d3c686b1e..603a1669a2ec 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1225,21 +1225,6 @@ void __init setup_arch(char **cmdline_p)
1225 1225
1226 kasan_init(); 1226 kasan_init();
1227 1227
1228#ifdef CONFIG_X86_32
1229 /* sync back kernel address range */
1230 clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
1231 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
1232 KERNEL_PGD_PTRS);
1233
1234 /*
1235 * sync back low identity map too. It is used for example
1236 * in the 32-bit EFI stub.
1237 */
1238 clone_pgd_range(initial_page_table,
1239 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
1240 min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
1241#endif
1242
1243 tboot_probe(); 1228 tboot_probe();
1244 1229
1245 map_vsyscall(); 1230 map_vsyscall();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 9820d6d977c6..bb1e8cc0bc84 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -160,7 +160,7 @@ static inline void setup_percpu_segment(int cpu)
160 pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, 160 pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
161 0x2 | DESCTYPE_S, 0x8); 161 0x2 | DESCTYPE_S, 0x8);
162 gdt.s = 1; 162 gdt.s = 1;
163 write_gdt_entry(get_cpu_gdt_table(cpu), 163 write_gdt_entry(get_cpu_gdt_rw(cpu),
164 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); 164 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
165#endif 165#endif
166} 166}
@@ -288,4 +288,25 @@ void __init setup_per_cpu_areas(void)
288 288
289 /* Setup cpu initialized, callin, callout masks */ 289 /* Setup cpu initialized, callin, callout masks */
290 setup_cpu_local_masks(); 290 setup_cpu_local_masks();
291
292#ifdef CONFIG_X86_32
293 /*
294 * Sync back kernel address range. We want to make sure that
295 * all kernel mappings, including percpu mappings, are available
296 * in the smpboot asm. We can't reliably pick up percpu
297 * mappings using vmalloc_fault(), because exception dispatch
298 * needs percpu data.
299 */
300 clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
301 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
302 KERNEL_PGD_PTRS);
303
304 /*
305 * sync back low identity map too. It is used for example
306 * in the 32-bit EFI stub.
307 */
308 clone_pgd_range(initial_page_table,
309 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
310 min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
311#endif
291} 312}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bd1f1ad35284..f04479a8f74f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -983,7 +983,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
983 unsigned long timeout; 983 unsigned long timeout;
984 984
985 idle->thread.sp = (unsigned long)task_pt_regs(idle); 985 idle->thread.sp = (unsigned long)task_pt_regs(idle);
986 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 986 early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
987 initial_code = (unsigned long)start_secondary; 987 initial_code = (unsigned long)start_secondary;
988 initial_stack = idle->thread.sp; 988 initial_stack = idle->thread.sp;
989 989
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 50215a4b9347..207b8f2582c7 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -17,6 +17,8 @@
17#include <linux/uaccess.h> 17#include <linux/uaccess.h>
18#include <linux/elf.h> 18#include <linux/elf.h>
19 19
20#include <asm/elf.h>
21#include <asm/compat.h>
20#include <asm/ia32.h> 22#include <asm/ia32.h>
21#include <asm/syscalls.h> 23#include <asm/syscalls.h>
22 24
@@ -101,7 +103,7 @@ out:
101static void find_start_end(unsigned long flags, unsigned long *begin, 103static void find_start_end(unsigned long flags, unsigned long *begin,
102 unsigned long *end) 104 unsigned long *end)
103{ 105{
104 if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) { 106 if (!in_compat_syscall() && (flags & MAP_32BIT)) {
105 /* This is usually used needed to map code in small 107 /* This is usually used needed to map code in small
106 model, so it needs to be in the first 31bit. Limit 108 model, so it needs to be in the first 31bit. Limit
107 it to that. This means we need to move the 109 it to that. This means we need to move the
@@ -114,10 +116,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
114 if (current->flags & PF_RANDOMIZE) { 116 if (current->flags & PF_RANDOMIZE) {
115 *begin = randomize_page(*begin, 0x02000000); 117 *begin = randomize_page(*begin, 0x02000000);
116 } 118 }
117 } else { 119 return;
118 *begin = current->mm->mmap_legacy_base;
119 *end = TASK_SIZE;
120 } 120 }
121
122 *begin = get_mmap_base(1);
123 *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit();
121} 124}
122 125
123unsigned long 126unsigned long
@@ -176,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
176 return addr; 179 return addr;
177 180
178 /* for MAP_32BIT mappings we force the legacy mmap base */ 181 /* for MAP_32BIT mappings we force the legacy mmap base */
179 if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) 182 if (!in_compat_syscall() && (flags & MAP_32BIT))
180 goto bottomup; 183 goto bottomup;
181 184
182 /* requesting a specific address */ 185 /* requesting a specific address */
@@ -191,7 +194,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
191 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 194 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
192 info.length = len; 195 info.length = len;
193 info.low_limit = PAGE_SIZE; 196 info.low_limit = PAGE_SIZE;
194 info.high_limit = mm->mmap_base; 197 info.high_limit = get_mmap_base(0);
195 info.align_mask = 0; 198 info.align_mask = 0;
196 info.align_offset = pgoff << PAGE_SHIFT; 199 info.align_offset = pgoff << PAGE_SHIFT;
197 if (filp) { 200 if (filp) {
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index ccccd335ae01..d4c8011a2293 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -118,12 +118,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
118 pgprot_t prot) 118 pgprot_t prot)
119{ 119{
120 pgd_t *pgd; 120 pgd_t *pgd;
121 p4d_t *p4d;
121 pud_t *pud; 122 pud_t *pud;
122 pmd_t *pmd; 123 pmd_t *pmd;
123 pte_t *pte; 124 pte_t *pte;
124 125
125 pgd = pgd_offset(&tboot_mm, vaddr); 126 pgd = pgd_offset(&tboot_mm, vaddr);
126 pud = pud_alloc(&tboot_mm, pgd, vaddr); 127 p4d = p4d_alloc(&tboot_mm, pgd, vaddr);
128 if (!p4d)
129 return -1;
130 pud = pud_alloc(&tboot_mm, p4d, vaddr);
127 if (!pud) 131 if (!pud)
128 return -1; 132 return -1;
129 pmd = pmd_alloc(&tboot_mm, pud, vaddr); 133 pmd = pmd_alloc(&tboot_mm, pud, vaddr);
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 6c8934406dc9..dcd699baea1b 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -92,10 +92,17 @@ static void set_tls_desc(struct task_struct *p, int idx,
92 cpu = get_cpu(); 92 cpu = get_cpu();
93 93
94 while (n-- > 0) { 94 while (n-- > 0) {
95 if (LDT_empty(info) || LDT_zero(info)) 95 if (LDT_empty(info) || LDT_zero(info)) {
96 desc->a = desc->b = 0; 96 desc->a = desc->b = 0;
97 else 97 } else {
98 fill_ldt(desc, info); 98 fill_ldt(desc, info);
99
100 /*
101 * Always set the accessed bit so that the CPU
102 * doesn't try to write to the (read-only) GDT.
103 */
104 desc->type |= 1;
105 }
99 ++info; 106 ++info;
100 ++desc; 107 ++desc;
101 } 108 }
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 23ee89ce59a9..7924a5356c8a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -164,6 +164,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
164 struct vm_area_struct *vma; 164 struct vm_area_struct *vma;
165 spinlock_t *ptl; 165 spinlock_t *ptl;
166 pgd_t *pgd; 166 pgd_t *pgd;
167 p4d_t *p4d;
167 pud_t *pud; 168 pud_t *pud;
168 pmd_t *pmd; 169 pmd_t *pmd;
169 pte_t *pte; 170 pte_t *pte;
@@ -173,7 +174,10 @@ static void mark_screen_rdonly(struct mm_struct *mm)
173 pgd = pgd_offset(mm, 0xA0000); 174 pgd = pgd_offset(mm, 0xA0000);
174 if (pgd_none_or_clear_bad(pgd)) 175 if (pgd_none_or_clear_bad(pgd))
175 goto out; 176 goto out;
176 pud = pud_offset(pgd, 0xA0000); 177 p4d = p4d_offset(pgd, 0xA0000);
178 if (p4d_none_or_clear_bad(p4d))
179 goto out;
180 pud = pud_offset(p4d, 0xA0000);
177 if (pud_none_or_clear_bad(pud)) 181 if (pud_none_or_clear_bad(pud))
178 goto out; 182 goto out;
179 pmd = pmd_offset(pud, 0xA0000); 183 pmd = pmd_offset(pud, 0xA0000);
@@ -193,7 +197,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
193 pte_unmap_unlock(pte, ptl); 197 pte_unmap_unlock(pte, ptl);
194out: 198out:
195 up_write(&mm->mmap_sem); 199 up_write(&mm->mmap_sem);
196 flush_tlb(); 200 flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL);
197} 201}
198 202
199 203
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5fba70646c32..5f48f62b8dc2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -741,7 +741,6 @@ static int svm_hardware_enable(void)
741 741
742 struct svm_cpu_data *sd; 742 struct svm_cpu_data *sd;
743 uint64_t efer; 743 uint64_t efer;
744 struct desc_ptr gdt_descr;
745 struct desc_struct *gdt; 744 struct desc_struct *gdt;
746 int me = raw_smp_processor_id(); 745 int me = raw_smp_processor_id();
747 746
@@ -763,8 +762,7 @@ static int svm_hardware_enable(void)
763 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 762 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
764 sd->next_asid = sd->max_asid + 1; 763 sd->next_asid = sd->max_asid + 1;
765 764
766 native_store_gdt(&gdt_descr); 765 gdt = get_current_gdt_rw();
767 gdt = (struct desc_struct *)gdt_descr.address;
768 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 766 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
769 767
770 wrmsrl(MSR_EFER, efer | EFER_SVME); 768 wrmsrl(MSR_EFER, efer | EFER_SVME);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 259e9b28ccf8..1a471e5f963f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -935,7 +935,6 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
935 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 935 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
936 */ 936 */
937static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 937static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
938static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
939 938
940/* 939/*
941 * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we 940 * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
@@ -2057,14 +2056,13 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
2057 */ 2056 */
2058static unsigned long segment_base(u16 selector) 2057static unsigned long segment_base(u16 selector)
2059{ 2058{
2060 struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
2061 struct desc_struct *table; 2059 struct desc_struct *table;
2062 unsigned long v; 2060 unsigned long v;
2063 2061
2064 if (!(selector & ~SEGMENT_RPL_MASK)) 2062 if (!(selector & ~SEGMENT_RPL_MASK))
2065 return 0; 2063 return 0;
2066 2064
2067 table = (struct desc_struct *)gdt->address; 2065 table = get_current_gdt_ro();
2068 2066
2069 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { 2067 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2070 u16 ldt_selector = kvm_read_ldt(); 2068 u16 ldt_selector = kvm_read_ldt();
@@ -2169,7 +2167,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
2169#endif 2167#endif
2170 if (vmx->host_state.msr_host_bndcfgs) 2168 if (vmx->host_state.msr_host_bndcfgs)
2171 wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); 2169 wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
2172 load_gdt(this_cpu_ptr(&host_gdt)); 2170 load_fixmap_gdt(raw_smp_processor_id());
2173} 2171}
2174 2172
2175static void vmx_load_host_state(struct vcpu_vmx *vmx) 2173static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -2271,7 +2269,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2271 } 2269 }
2272 2270
2273 if (!already_loaded) { 2271 if (!already_loaded) {
2274 struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); 2272 void *gdt = get_current_gdt_ro();
2275 unsigned long sysenter_esp; 2273 unsigned long sysenter_esp;
2276 2274
2277 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 2275 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
@@ -2282,7 +2280,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2282 */ 2280 */
2283 vmcs_writel(HOST_TR_BASE, 2281 vmcs_writel(HOST_TR_BASE,
2284 (unsigned long)this_cpu_ptr(&cpu_tss)); 2282 (unsigned long)this_cpu_ptr(&cpu_tss));
2285 vmcs_writel(HOST_GDTR_BASE, gdt->address); 2283 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
2286 2284
2287 /* 2285 /*
2288 * VM exits change the host TR limit to 0x67 after a VM 2286 * VM exits change the host TR limit to 0x67 after a VM
@@ -3471,8 +3469,6 @@ static int hardware_enable(void)
3471 ept_sync_global(); 3469 ept_sync_global();
3472 } 3470 }
3473 3471
3474 native_store_gdt(this_cpu_ptr(&host_gdt));
3475
3476 return 0; 3472 return 0;
3477} 3473}
3478 3474
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 58b5bee7ea27..bce6990b1d81 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -110,7 +110,8 @@ static struct addr_marker address_markers[] = {
110#define PTE_LEVEL_MULT (PAGE_SIZE) 110#define PTE_LEVEL_MULT (PAGE_SIZE)
111#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) 111#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
112#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) 112#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
113#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) 113#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
114#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)
114 115
115#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \ 116#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
116({ \ 117({ \
@@ -286,14 +287,13 @@ static void note_page(struct seq_file *m, struct pg_state *st,
286 } 287 }
287} 288}
288 289
289static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, 290static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P)
290 unsigned long P)
291{ 291{
292 int i; 292 int i;
293 pte_t *start; 293 pte_t *start;
294 pgprotval_t prot; 294 pgprotval_t prot;
295 295
296 start = (pte_t *) pmd_page_vaddr(addr); 296 start = (pte_t *)pmd_page_vaddr(addr);
297 for (i = 0; i < PTRS_PER_PTE; i++) { 297 for (i = 0; i < PTRS_PER_PTE; i++) {
298 prot = pte_flags(*start); 298 prot = pte_flags(*start);
299 st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); 299 st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
@@ -304,14 +304,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
304 304
305#if PTRS_PER_PMD > 1 305#if PTRS_PER_PMD > 1
306 306
307static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, 307static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
308 unsigned long P)
309{ 308{
310 int i; 309 int i;
311 pmd_t *start; 310 pmd_t *start;
312 pgprotval_t prot; 311 pgprotval_t prot;
313 312
314 start = (pmd_t *) pud_page_vaddr(addr); 313 start = (pmd_t *)pud_page_vaddr(addr);
315 for (i = 0; i < PTRS_PER_PMD; i++) { 314 for (i = 0; i < PTRS_PER_PMD; i++) {
316 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); 315 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
317 if (!pmd_none(*start)) { 316 if (!pmd_none(*start)) {
@@ -347,15 +346,14 @@ static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
347 return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); 346 return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
348} 347}
349 348
350static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, 349static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
351 unsigned long P)
352{ 350{
353 int i; 351 int i;
354 pud_t *start; 352 pud_t *start;
355 pgprotval_t prot; 353 pgprotval_t prot;
356 pud_t *prev_pud = NULL; 354 pud_t *prev_pud = NULL;
357 355
358 start = (pud_t *) pgd_page_vaddr(addr); 356 start = (pud_t *)p4d_page_vaddr(addr);
359 357
360 for (i = 0; i < PTRS_PER_PUD; i++) { 358 for (i = 0; i < PTRS_PER_PUD; i++) {
361 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); 359 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
@@ -377,9 +375,42 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
377} 375}
378 376
379#else 377#else
380#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p) 378#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
381#define pgd_large(a) pud_large(__pud(pgd_val(a))) 379#define p4d_large(a) pud_large(__pud(p4d_val(a)))
382#define pgd_none(a) pud_none(__pud(pgd_val(a))) 380#define p4d_none(a) pud_none(__pud(p4d_val(a)))
381#endif
382
383#if PTRS_PER_P4D > 1
384
385static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
386{
387 int i;
388 p4d_t *start;
389 pgprotval_t prot;
390
391 start = (p4d_t *)pgd_page_vaddr(addr);
392
393 for (i = 0; i < PTRS_PER_P4D; i++) {
394 st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
395 if (!p4d_none(*start)) {
396 if (p4d_large(*start) || !p4d_present(*start)) {
397 prot = p4d_flags(*start);
398 note_page(m, st, __pgprot(prot), 2);
399 } else {
400 walk_pud_level(m, st, *start,
401 P + i * P4D_LEVEL_MULT);
402 }
403 } else
404 note_page(m, st, __pgprot(0), 2);
405
406 start++;
407 }
408}
409
410#else
411#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
412#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
413#define pgd_none(a) p4d_none(__p4d(pgd_val(a)))
383#endif 414#endif
384 415
385static inline bool is_hypervisor_range(int idx) 416static inline bool is_hypervisor_range(int idx)
@@ -424,7 +455,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
424 prot = pgd_flags(*start); 455 prot = pgd_flags(*start);
425 note_page(m, &st, __pgprot(prot), 1); 456 note_page(m, &st, __pgprot(prot), 1);
426 } else { 457 } else {
427 walk_pud_level(m, &st, *start, 458 walk_p4d_level(m, &st, *start,
428 i * PGD_LEVEL_MULT); 459 i * PGD_LEVEL_MULT);
429 } 460 }
430 } else 461 } else
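
The new P4D_LEVEL_MULT slots in between PUD_LEVEL_MULT and PGD_LEVEL_MULT. With the usual x86-64 geometry (4 KiB pages, 512 entries per table, PTRS_PER_P4D of 1 when the p4d is folded and 512 with CONFIG_X86_5LEVEL) the per-entry address spans work out as below:

    #include <stdio.h>

    int main(void)
    {
            unsigned long pte_mult = 4096UL;		/* PAGE_SIZE */
            unsigned long pmd_mult = 512 * pte_mult;	/*   2 MiB  */
            unsigned long pud_mult = 512 * pmd_mult;	/*   1 GiB  */
            unsigned long p4d_mult = 512 * pud_mult;	/* 512 GiB  */

            printf("PGD_LEVEL_MULT: folded p4d %lu GiB, 5-level %lu TiB\n",
                   (1 * p4d_mult) >> 30, (512 * p4d_mult) >> 40);
            return 0;
    }

So each pgd slot still spans 512 GiB on 4-level kernels, and grows to 256 TiB once PTRS_PER_P4D becomes 512.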
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 428e31763cb9..8ad91a01cbc8 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -253,6 +253,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
253{ 253{
254 unsigned index = pgd_index(address); 254 unsigned index = pgd_index(address);
255 pgd_t *pgd_k; 255 pgd_t *pgd_k;
256 p4d_t *p4d, *p4d_k;
256 pud_t *pud, *pud_k; 257 pud_t *pud, *pud_k;
257 pmd_t *pmd, *pmd_k; 258 pmd_t *pmd, *pmd_k;
258 259
@@ -265,10 +266,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
265 /* 266 /*
266 * set_pgd(pgd, *pgd_k); here would be useless on PAE 267 * set_pgd(pgd, *pgd_k); here would be useless on PAE
267 * and redundant with the set_pmd() on non-PAE. As would 268 * and redundant with the set_pmd() on non-PAE. As would
268 * set_pud. 269 * set_p4d/set_pud.
269 */ 270 */
270 pud = pud_offset(pgd, address); 271 p4d = p4d_offset(pgd, address);
271 pud_k = pud_offset(pgd_k, address); 272 p4d_k = p4d_offset(pgd_k, address);
273 if (!p4d_present(*p4d_k))
274 return NULL;
275
276 pud = pud_offset(p4d, address);
277 pud_k = pud_offset(p4d_k, address);
272 if (!pud_present(*pud_k)) 278 if (!pud_present(*pud_k))
273 return NULL; 279 return NULL;
274 280
@@ -384,6 +390,8 @@ static void dump_pagetable(unsigned long address)
384{ 390{
385 pgd_t *base = __va(read_cr3()); 391 pgd_t *base = __va(read_cr3());
386 pgd_t *pgd = &base[pgd_index(address)]; 392 pgd_t *pgd = &base[pgd_index(address)];
393 p4d_t *p4d;
394 pud_t *pud;
387 pmd_t *pmd; 395 pmd_t *pmd;
388 pte_t *pte; 396 pte_t *pte;
389 397
@@ -392,7 +400,9 @@ static void dump_pagetable(unsigned long address)
392 if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) 400 if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
393 goto out; 401 goto out;
394#endif 402#endif
395 pmd = pmd_offset(pud_offset(pgd, address), address); 403 p4d = p4d_offset(pgd, address);
404 pud = pud_offset(p4d, address);
405 pmd = pmd_offset(pud, address);
396 printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); 406 printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
397 407
398 /* 408 /*
@@ -425,6 +435,7 @@ void vmalloc_sync_all(void)
425static noinline int vmalloc_fault(unsigned long address) 435static noinline int vmalloc_fault(unsigned long address)
426{ 436{
427 pgd_t *pgd, *pgd_ref; 437 pgd_t *pgd, *pgd_ref;
438 p4d_t *p4d, *p4d_ref;
428 pud_t *pud, *pud_ref; 439 pud_t *pud, *pud_ref;
429 pmd_t *pmd, *pmd_ref; 440 pmd_t *pmd, *pmd_ref;
430 pte_t *pte, *pte_ref; 441 pte_t *pte, *pte_ref;
@@ -448,17 +459,37 @@ static noinline int vmalloc_fault(unsigned long address)
448 if (pgd_none(*pgd)) { 459 if (pgd_none(*pgd)) {
449 set_pgd(pgd, *pgd_ref); 460 set_pgd(pgd, *pgd_ref);
450 arch_flush_lazy_mmu_mode(); 461 arch_flush_lazy_mmu_mode();
451 } else { 462 } else if (CONFIG_PGTABLE_LEVELS > 4) {
463 /*
464 * With folded p4d, pgd_none() is always false, so the pgd may
465 * point to an empty page table entry and pgd_page_vaddr()
466 * will return garbage.
467 *
468 * We will do the correct sanity check on the p4d level.
469 */
452 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); 470 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
453 } 471 }
454 472
473 /* With 4-level paging, copying happens on the p4d level. */
474 p4d = p4d_offset(pgd, address);
475 p4d_ref = p4d_offset(pgd_ref, address);
476 if (p4d_none(*p4d_ref))
477 return -1;
478
479 if (p4d_none(*p4d)) {
480 set_p4d(p4d, *p4d_ref);
481 arch_flush_lazy_mmu_mode();
482 } else {
483 BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref));
484 }
485
455 /* 486 /*
456 * Below here mismatches are bugs because these lower tables 487 * Below here mismatches are bugs because these lower tables
457 * are shared: 488 * are shared:
458 */ 489 */
459 490
460 pud = pud_offset(pgd, address); 491 pud = pud_offset(p4d, address);
461 pud_ref = pud_offset(pgd_ref, address); 492 pud_ref = pud_offset(p4d_ref, address);
462 if (pud_none(*pud_ref)) 493 if (pud_none(*pud_ref))
463 return -1; 494 return -1;
464 495
@@ -526,6 +557,7 @@ static void dump_pagetable(unsigned long address)
526{ 557{
527 pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); 558 pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
528 pgd_t *pgd = base + pgd_index(address); 559 pgd_t *pgd = base + pgd_index(address);
560 p4d_t *p4d;
529 pud_t *pud; 561 pud_t *pud;
530 pmd_t *pmd; 562 pmd_t *pmd;
531 pte_t *pte; 563 pte_t *pte;
@@ -538,7 +570,15 @@ static void dump_pagetable(unsigned long address)
538 if (!pgd_present(*pgd)) 570 if (!pgd_present(*pgd))
539 goto out; 571 goto out;
540 572
541 pud = pud_offset(pgd, address); 573 p4d = p4d_offset(pgd, address);
574 if (bad_address(p4d))
575 goto bad;
576
577 printk("P4D %lx ", p4d_val(*p4d));
578 if (!p4d_present(*p4d) || p4d_large(*p4d))
579 goto out;
580
581 pud = pud_offset(p4d, address);
542 if (bad_address(pud)) 582 if (bad_address(pud))
543 goto bad; 583 goto bad;
544 584
@@ -1082,6 +1122,7 @@ static noinline int
1082spurious_fault(unsigned long error_code, unsigned long address) 1122spurious_fault(unsigned long error_code, unsigned long address)
1083{ 1123{
1084 pgd_t *pgd; 1124 pgd_t *pgd;
1125 p4d_t *p4d;
1085 pud_t *pud; 1126 pud_t *pud;
1086 pmd_t *pmd; 1127 pmd_t *pmd;
1087 pte_t *pte; 1128 pte_t *pte;
@@ -1104,7 +1145,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
1104 if (!pgd_present(*pgd)) 1145 if (!pgd_present(*pgd))
1105 return 0; 1146 return 0;
1106 1147
1107 pud = pud_offset(pgd, address); 1148 p4d = p4d_offset(pgd, address);
1149 if (!p4d_present(*p4d))
1150 return 0;
1151
1152 if (p4d_large(*p4d))
1153 return spurious_fault_check(error_code, (pte_t *) p4d);
1154
1155 pud = pud_offset(p4d, address);
1108 if (!pud_present(*pud)) 1156 if (!pud_present(*pud))
1109 return 0; 1157 return 0;
1110 1158
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 1f3b6ef105cd..456dfdfd2249 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -76,9 +76,9 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
76} 76}
77 77
78/* 78/*
79 * 'pteval' can come from a pte, pmd or pud. We only check 79 * 'pteval' can come from a pte, pmd, pud or p4d. We only check
80 * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the 80 * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
81 * same value on all 3 types. 81 * same value on all 4 types.
82 */ 82 */
83static inline int pte_allows_gup(unsigned long pteval, int write) 83static inline int pte_allows_gup(unsigned long pteval, int write)
84{ 84{
@@ -295,13 +295,13 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
295 return 1; 295 return 1;
296} 296}
297 297
298static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, 298static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
299 int write, struct page **pages, int *nr) 299 int write, struct page **pages, int *nr)
300{ 300{
301 unsigned long next; 301 unsigned long next;
302 pud_t *pudp; 302 pud_t *pudp;
303 303
304 pudp = pud_offset(&pgd, addr); 304 pudp = pud_offset(&p4d, addr);
305 do { 305 do {
306 pud_t pud = *pudp; 306 pud_t pud = *pudp;
307 307
@@ -320,6 +320,27 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
320 return 1; 320 return 1;
321} 321}
322 322
323static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
324 int write, struct page **pages, int *nr)
325{
326 unsigned long next;
327 p4d_t *p4dp;
328
329 p4dp = p4d_offset(&pgd, addr);
330 do {
331 p4d_t p4d = *p4dp;
332
333 next = p4d_addr_end(addr, end);
334 if (p4d_none(p4d))
335 return 0;
336 BUILD_BUG_ON(p4d_large(p4d));
337 if (!gup_pud_range(p4d, addr, next, write, pages, nr))
338 return 0;
339 } while (p4dp++, addr = next, addr != end);
340
341 return 1;
342}
343
323/* 344/*
324 * Like get_user_pages_fast() except its IRQ-safe in that it won't fall 345 * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
325 * back to the regular GUP. 346 * back to the regular GUP.
@@ -368,7 +389,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
368 next = pgd_addr_end(addr, end); 389 next = pgd_addr_end(addr, end);
369 if (pgd_none(pgd)) 390 if (pgd_none(pgd))
370 break; 391 break;
371 if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) 392 if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
372 break; 393 break;
373 } while (pgdp++, addr = next, addr != end); 394 } while (pgdp++, addr = next, addr != end);
374 local_irq_restore(flags); 395 local_irq_restore(flags);
@@ -440,7 +461,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
440 next = pgd_addr_end(addr, end); 461 next = pgd_addr_end(addr, end);
441 if (pgd_none(pgd)) 462 if (pgd_none(pgd))
442 goto slow; 463 goto slow;
443 if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) 464 if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
444 goto slow; 465 goto slow;
445 } while (pgdp++, addr = next, addr != end); 466 } while (pgdp++, addr = next, addr != end);
446 local_irq_enable(); 467 local_irq_enable();
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index c5066a260803..302f43fd9c28 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -12,10 +12,12 @@
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/sysctl.h> 14#include <linux/sysctl.h>
15#include <linux/compat.h>
15#include <asm/mman.h> 16#include <asm/mman.h>
16#include <asm/tlb.h> 17#include <asm/tlb.h>
17#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
18#include <asm/pgalloc.h> 19#include <asm/pgalloc.h>
20#include <asm/elf.h>
19 21
20#if 0 /* This is just for testing */ 22#if 0 /* This is just for testing */
21struct page * 23struct page *
@@ -82,8 +84,9 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
82 84
83 info.flags = 0; 85 info.flags = 0;
84 info.length = len; 86 info.length = len;
85 info.low_limit = current->mm->mmap_legacy_base; 87 info.low_limit = get_mmap_base(1);
86 info.high_limit = TASK_SIZE; 88 info.high_limit = in_compat_syscall() ?
89 tasksize_32bit() : tasksize_64bit();
87 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 90 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
88 info.align_offset = 0; 91 info.align_offset = 0;
89 return vm_unmapped_area(&info); 92 return vm_unmapped_area(&info);
@@ -100,7 +103,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
100 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 103 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
101 info.length = len; 104 info.length = len;
102 info.low_limit = PAGE_SIZE; 105 info.low_limit = PAGE_SIZE;
103 info.high_limit = current->mm->mmap_base; 106 info.high_limit = get_mmap_base(0);
104 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 107 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
105 info.align_offset = 0; 108 info.align_offset = 0;
106 addr = vm_unmapped_area(&info); 109 addr = vm_unmapped_area(&info);
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 4473cb4f8b90..04210a29dd60 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -45,6 +45,34 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
45 return 0; 45 return 0;
46} 46}
47 47
48static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
49 unsigned long addr, unsigned long end)
50{
51 unsigned long next;
52
53 for (; addr < end; addr = next) {
54 p4d_t *p4d = p4d_page + p4d_index(addr);
55 pud_t *pud;
56
57 next = (addr & P4D_MASK) + P4D_SIZE;
58 if (next > end)
59 next = end;
60
61 if (p4d_present(*p4d)) {
62 pud = pud_offset(p4d, 0);
63 ident_pud_init(info, pud, addr, next);
64 continue;
65 }
66 pud = (pud_t *)info->alloc_pgt_page(info->context);
67 if (!pud)
68 return -ENOMEM;
69 ident_pud_init(info, pud, addr, next);
70 set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
71 }
72
73 return 0;
74}
75
48int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, 76int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
49 unsigned long pstart, unsigned long pend) 77 unsigned long pstart, unsigned long pend)
50{ 78{
@@ -55,27 +83,36 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
55 83
56 for (; addr < end; addr = next) { 84 for (; addr < end; addr = next) {
57 pgd_t *pgd = pgd_page + pgd_index(addr); 85 pgd_t *pgd = pgd_page + pgd_index(addr);
58 pud_t *pud; 86 p4d_t *p4d;
59 87
60 next = (addr & PGDIR_MASK) + PGDIR_SIZE; 88 next = (addr & PGDIR_MASK) + PGDIR_SIZE;
61 if (next > end) 89 if (next > end)
62 next = end; 90 next = end;
63 91
64 if (pgd_present(*pgd)) { 92 if (pgd_present(*pgd)) {
65 pud = pud_offset(pgd, 0); 93 p4d = p4d_offset(pgd, 0);
66 result = ident_pud_init(info, pud, addr, next); 94 result = ident_p4d_init(info, p4d, addr, next);
67 if (result) 95 if (result)
68 return result; 96 return result;
69 continue; 97 continue;
70 } 98 }
71 99
72 pud = (pud_t *)info->alloc_pgt_page(info->context); 100 p4d = (p4d_t *)info->alloc_pgt_page(info->context);
73 if (!pud) 101 if (!p4d)
74 return -ENOMEM; 102 return -ENOMEM;
75 result = ident_pud_init(info, pud, addr, next); 103 result = ident_p4d_init(info, p4d, addr, next);
76 if (result) 104 if (result)
77 return result; 105 return result;
78 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); 106 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
107 set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
108 } else {
109 /*
110 * With p4d folded, pgd is equal to p4d.
111 * The pgd entry has to point to the pud page table in this case.
112 */
113 pud_t *pud = pud_offset(p4d, 0);
114 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
115 }
79 } 116 }
80 117
81 return 0; 118 return 0;
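
The only subtle part of the conversion above is the last hunk: with CONFIG_X86_5LEVEL=n the p4d level is folded into the pgd, so p4d_offset(pgd, 0) simply returns the pgd and the freshly allocated table is really a pud table. A minimal sketch of that rule, under the same assumptions as the code above (the example_* name is hypothetical):

/* Hypothetical sketch of the folded-p4d rule used in the last hunk above. */
static void example_set_top_level(pgd_t *pgd, p4d_t *p4d)
{
	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
		/* real 5-level paging: the pgd entry points at the p4d page */
		set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
	} else {
		/* folded: p4d == pgd, so the pgd entry must point at the pud page */
		pud_t *pud = pud_offset(p4d, 0);

		set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
	}
}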
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 030bfed10a6c..f34d275ee201 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -56,8 +56,6 @@
56 56
57unsigned long highstart_pfn, highend_pfn; 57unsigned long highstart_pfn, highend_pfn;
58 58
59static noinline int do_test_wp_bit(void);
60
61bool __read_mostly __vmalloc_start_set = false; 59bool __read_mostly __vmalloc_start_set = false;
62 60
63/* 61/*
@@ -67,6 +65,7 @@ bool __read_mostly __vmalloc_start_set = false;
67 */ 65 */
68static pmd_t * __init one_md_table_init(pgd_t *pgd) 66static pmd_t * __init one_md_table_init(pgd_t *pgd)
69{ 67{
68 p4d_t *p4d;
70 pud_t *pud; 69 pud_t *pud;
71 pmd_t *pmd_table; 70 pmd_t *pmd_table;
72 71
@@ -75,13 +74,15 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
75 pmd_table = (pmd_t *)alloc_low_page(); 74 pmd_table = (pmd_t *)alloc_low_page();
76 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 75 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
77 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 76 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
78 pud = pud_offset(pgd, 0); 77 p4d = p4d_offset(pgd, 0);
78 pud = pud_offset(p4d, 0);
79 BUG_ON(pmd_table != pmd_offset(pud, 0)); 79 BUG_ON(pmd_table != pmd_offset(pud, 0));
80 80
81 return pmd_table; 81 return pmd_table;
82 } 82 }
83#endif 83#endif
84 pud = pud_offset(pgd, 0); 84 p4d = p4d_offset(pgd, 0);
85 pud = pud_offset(p4d, 0);
85 pmd_table = pmd_offset(pud, 0); 86 pmd_table = pmd_offset(pud, 0);
86 87
87 return pmd_table; 88 return pmd_table;
@@ -390,8 +391,11 @@ pte_t *kmap_pte;
390 391
391static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) 392static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
392{ 393{
393 return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), 394 pgd_t *pgd = pgd_offset_k(vaddr);
394 vaddr), vaddr), vaddr); 395 p4d_t *p4d = p4d_offset(pgd, vaddr);
396 pud_t *pud = pud_offset(p4d, vaddr);
397 pmd_t *pmd = pmd_offset(pud, vaddr);
398 return pte_offset_kernel(pmd, vaddr);
395} 399}
396 400
397static void __init kmap_init(void) 401static void __init kmap_init(void)
@@ -410,6 +414,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
410{ 414{
411 unsigned long vaddr; 415 unsigned long vaddr;
412 pgd_t *pgd; 416 pgd_t *pgd;
417 p4d_t *p4d;
413 pud_t *pud; 418 pud_t *pud;
414 pmd_t *pmd; 419 pmd_t *pmd;
415 pte_t *pte; 420 pte_t *pte;
@@ -418,7 +423,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
418 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); 423 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
419 424
420 pgd = swapper_pg_dir + pgd_index(vaddr); 425 pgd = swapper_pg_dir + pgd_index(vaddr);
421 pud = pud_offset(pgd, vaddr); 426 p4d = p4d_offset(pgd, vaddr);
427 pud = pud_offset(p4d, vaddr);
422 pmd = pmd_offset(pud, vaddr); 428 pmd = pmd_offset(pud, vaddr);
423 pte = pte_offset_kernel(pmd, vaddr); 429 pte = pte_offset_kernel(pmd, vaddr);
424 pkmap_page_table = pte; 430 pkmap_page_table = pte;
@@ -450,6 +456,7 @@ void __init native_pagetable_init(void)
450{ 456{
451 unsigned long pfn, va; 457 unsigned long pfn, va;
452 pgd_t *pgd, *base = swapper_pg_dir; 458 pgd_t *pgd, *base = swapper_pg_dir;
459 p4d_t *p4d;
453 pud_t *pud; 460 pud_t *pud;
454 pmd_t *pmd; 461 pmd_t *pmd;
455 pte_t *pte; 462 pte_t *pte;
@@ -469,7 +476,8 @@ void __init native_pagetable_init(void)
469 if (!pgd_present(*pgd)) 476 if (!pgd_present(*pgd))
470 break; 477 break;
471 478
472 pud = pud_offset(pgd, va); 479 p4d = p4d_offset(pgd, va);
480 pud = pud_offset(p4d, va);
473 pmd = pmd_offset(pud, va); 481 pmd = pmd_offset(pud, va);
474 if (!pmd_present(*pmd)) 482 if (!pmd_present(*pmd))
475 break; 483 break;
@@ -716,22 +724,20 @@ void __init paging_init(void)
716 */ 724 */
717static void __init test_wp_bit(void) 725static void __init test_wp_bit(void)
718{ 726{
719 int wp_works_ok; 727 char z = 0;
720 728
721 printk(KERN_INFO 729 printk(KERN_INFO "Checking if this processor honours the WP bit even in supervisor mode...");
722 "Checking if this processor honours the WP bit even in supervisor mode...");
723 730
724 /* Any page-aligned address will do, the test is non-destructive */ 731 __set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO);
725 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO);
726 wp_works_ok = do_test_wp_bit();
727 clear_fixmap(FIX_WP_TEST);
728 732
729 if (!wp_works_ok) { 733 if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) {
730 printk(KERN_CONT "No.\n"); 734 clear_fixmap(FIX_WP_TEST);
731 panic("Linux doesn't support CPUs with broken WP.");
732 } else {
733 printk(KERN_CONT "Ok.\n"); 735 printk(KERN_CONT "Ok.\n");
736 return;
734 } 737 }
738
739 printk(KERN_CONT "No.\n");
740 panic("Linux doesn't support CPUs with broken WP.");
735} 741}
736 742
737void __init mem_init(void) 743void __init mem_init(void)
@@ -841,30 +847,6 @@ int arch_remove_memory(u64 start, u64 size)
841#endif 847#endif
842#endif 848#endif
843 849
844/*
845 * This function cannot be __init, since exceptions don't work in that
846 * section. Put this after the callers, so that it cannot be inlined.
847 */
848static noinline int do_test_wp_bit(void)
849{
850 char tmp_reg;
851 int flag;
852
853 __asm__ __volatile__(
854 " movb %0, %1 \n"
855 "1: movb %1, %0 \n"
856 " xorl %2, %2 \n"
857 "2: \n"
858 _ASM_EXTABLE(1b,2b)
859 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
860 "=q" (tmp_reg),
861 "=r" (flag)
862 :"2" (1)
863 :"memory");
864
865 return flag;
866}
867
868int kernel_set_to_readonly __read_mostly; 850int kernel_set_to_readonly __read_mostly;
869 851
870void set_kernel_text_rw(void) 852void set_kernel_text_rw(void)
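
All of the 32-bit changes above follow one mechanical pattern: every pud_offset(pgd, ...) gains an intermediate p4d_offset() step. A sketch of the resulting five-step lookup, with presence checks added purely for illustration (the example_* name is hypothetical; assumes the usual <asm/pgtable.h> environment of this file):

/* Hypothetical lookup showing the pgd -> p4d -> pud -> pmd -> pte chain. */
static pte_t *example_lookup_pte(unsigned long vaddr)
{
	pgd_t *pgd = pgd_offset_k(vaddr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_none(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, vaddr);	/* a no-op cast when p4d is folded */
	if (p4d_none(*p4d))
		return NULL;
	pud = pud_offset(p4d, vaddr);
	if (pud_none(*pud))
		return NULL;
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, vaddr);
}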
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f6da869810a8..745e5e183169 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -97,28 +97,38 @@ void sync_global_pgds(unsigned long start, unsigned long end)
97 unsigned long address; 97 unsigned long address;
98 98
99 for (address = start; address <= end; address += PGDIR_SIZE) { 99 for (address = start; address <= end; address += PGDIR_SIZE) {
100 const pgd_t *pgd_ref = pgd_offset_k(address); 100 pgd_t *pgd_ref = pgd_offset_k(address);
101 const p4d_t *p4d_ref;
101 struct page *page; 102 struct page *page;
102 103
103 if (pgd_none(*pgd_ref)) 104 /*
 105 * With folded p4d, pgd_none() is always false, so we need to
 106 * handle synchronization at the p4d level.
107 */
108 BUILD_BUG_ON(pgd_none(*pgd_ref));
109 p4d_ref = p4d_offset(pgd_ref, address);
110
111 if (p4d_none(*p4d_ref))
104 continue; 112 continue;
105 113
106 spin_lock(&pgd_lock); 114 spin_lock(&pgd_lock);
107 list_for_each_entry(page, &pgd_list, lru) { 115 list_for_each_entry(page, &pgd_list, lru) {
108 pgd_t *pgd; 116 pgd_t *pgd;
117 p4d_t *p4d;
109 spinlock_t *pgt_lock; 118 spinlock_t *pgt_lock;
110 119
111 pgd = (pgd_t *)page_address(page) + pgd_index(address); 120 pgd = (pgd_t *)page_address(page) + pgd_index(address);
121 p4d = p4d_offset(pgd, address);
112 /* the pgt_lock only for Xen */ 122 /* the pgt_lock only for Xen */
113 pgt_lock = &pgd_page_get_mm(page)->page_table_lock; 123 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
114 spin_lock(pgt_lock); 124 spin_lock(pgt_lock);
115 125
116 if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) 126 if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
117 BUG_ON(pgd_page_vaddr(*pgd) 127 BUG_ON(p4d_page_vaddr(*p4d)
118 != pgd_page_vaddr(*pgd_ref)); 128 != p4d_page_vaddr(*p4d_ref));
119 129
120 if (pgd_none(*pgd)) 130 if (p4d_none(*p4d))
121 set_pgd(pgd, *pgd_ref); 131 set_p4d(p4d, *p4d_ref);
122 132
123 spin_unlock(pgt_lock); 133 spin_unlock(pgt_lock);
124 } 134 }
@@ -149,16 +159,28 @@ static __ref void *spp_getpage(void)
149 return ptr; 159 return ptr;
150} 160}
151 161
152static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) 162static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
153{ 163{
154 if (pgd_none(*pgd)) { 164 if (pgd_none(*pgd)) {
155 pud_t *pud = (pud_t *)spp_getpage(); 165 p4d_t *p4d = (p4d_t *)spp_getpage();
156 pgd_populate(&init_mm, pgd, pud); 166 pgd_populate(&init_mm, pgd, p4d);
157 if (pud != pud_offset(pgd, 0)) 167 if (p4d != p4d_offset(pgd, 0))
158 printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", 168 printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
159 pud, pud_offset(pgd, 0)); 169 p4d, p4d_offset(pgd, 0));
170 }
171 return p4d_offset(pgd, vaddr);
172}
173
174static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
175{
176 if (p4d_none(*p4d)) {
177 pud_t *pud = (pud_t *)spp_getpage();
178 p4d_populate(&init_mm, p4d, pud);
179 if (pud != pud_offset(p4d, 0))
180 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
181 pud, pud_offset(p4d, 0));
160 } 182 }
161 return pud_offset(pgd, vaddr); 183 return pud_offset(p4d, vaddr);
162} 184}
163 185
164static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) 186static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
@@ -167,7 +189,7 @@ static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
167 pmd_t *pmd = (pmd_t *) spp_getpage(); 189 pmd_t *pmd = (pmd_t *) spp_getpage();
168 pud_populate(&init_mm, pud, pmd); 190 pud_populate(&init_mm, pud, pmd);
169 if (pmd != pmd_offset(pud, 0)) 191 if (pmd != pmd_offset(pud, 0))
170 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 192 printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
171 pmd, pmd_offset(pud, 0)); 193 pmd, pmd_offset(pud, 0));
172 } 194 }
173 return pmd_offset(pud, vaddr); 195 return pmd_offset(pud, vaddr);
@@ -179,20 +201,15 @@ static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
179 pte_t *pte = (pte_t *) spp_getpage(); 201 pte_t *pte = (pte_t *) spp_getpage();
180 pmd_populate_kernel(&init_mm, pmd, pte); 202 pmd_populate_kernel(&init_mm, pmd, pte);
181 if (pte != pte_offset_kernel(pmd, 0)) 203 if (pte != pte_offset_kernel(pmd, 0))
182 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 204 printk(KERN_ERR "PAGETABLE BUG #03!\n");
183 } 205 }
184 return pte_offset_kernel(pmd, vaddr); 206 return pte_offset_kernel(pmd, vaddr);
185} 207}
186 208
187void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) 209static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
188{ 210{
189 pud_t *pud; 211 pmd_t *pmd = fill_pmd(pud, vaddr);
190 pmd_t *pmd; 212 pte_t *pte = fill_pte(pmd, vaddr);
191 pte_t *pte;
192
193 pud = pud_page + pud_index(vaddr);
194 pmd = fill_pmd(pud, vaddr);
195 pte = fill_pte(pmd, vaddr);
196 213
197 set_pte(pte, new_pte); 214 set_pte(pte, new_pte);
198 215
@@ -203,10 +220,25 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
203 __flush_tlb_one(vaddr); 220 __flush_tlb_one(vaddr);
204} 221}
205 222
223void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
224{
225 p4d_t *p4d = p4d_page + p4d_index(vaddr);
226 pud_t *pud = fill_pud(p4d, vaddr);
227
228 __set_pte_vaddr(pud, vaddr, new_pte);
229}
230
231void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
232{
233 pud_t *pud = pud_page + pud_index(vaddr);
234
235 __set_pte_vaddr(pud, vaddr, new_pte);
236}
237
206void set_pte_vaddr(unsigned long vaddr, pte_t pteval) 238void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
207{ 239{
208 pgd_t *pgd; 240 pgd_t *pgd;
209 pud_t *pud_page; 241 p4d_t *p4d_page;
210 242
211 pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); 243 pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
212 244
@@ -216,17 +248,20 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
216 "PGD FIXMAP MISSING, it should be setup in head.S!\n"); 248 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
217 return; 249 return;
218 } 250 }
219 pud_page = (pud_t*)pgd_page_vaddr(*pgd); 251
220 set_pte_vaddr_pud(pud_page, vaddr, pteval); 252 p4d_page = p4d_offset(pgd, 0);
253 set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
221} 254}
222 255
223pmd_t * __init populate_extra_pmd(unsigned long vaddr) 256pmd_t * __init populate_extra_pmd(unsigned long vaddr)
224{ 257{
225 pgd_t *pgd; 258 pgd_t *pgd;
259 p4d_t *p4d;
226 pud_t *pud; 260 pud_t *pud;
227 261
228 pgd = pgd_offset_k(vaddr); 262 pgd = pgd_offset_k(vaddr);
229 pud = fill_pud(pgd, vaddr); 263 p4d = fill_p4d(pgd, vaddr);
264 pud = fill_pud(p4d, vaddr);
230 return fill_pmd(pud, vaddr); 265 return fill_pmd(pud, vaddr);
231} 266}
232 267
@@ -245,6 +280,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
245 enum page_cache_mode cache) 280 enum page_cache_mode cache)
246{ 281{
247 pgd_t *pgd; 282 pgd_t *pgd;
283 p4d_t *p4d;
248 pud_t *pud; 284 pud_t *pud;
249 pmd_t *pmd; 285 pmd_t *pmd;
250 pgprot_t prot; 286 pgprot_t prot;
@@ -255,11 +291,17 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
255 for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { 291 for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
256 pgd = pgd_offset_k((unsigned long)__va(phys)); 292 pgd = pgd_offset_k((unsigned long)__va(phys));
257 if (pgd_none(*pgd)) { 293 if (pgd_none(*pgd)) {
294 p4d = (p4d_t *) spp_getpage();
295 set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
296 _PAGE_USER));
297 }
298 p4d = p4d_offset(pgd, (unsigned long)__va(phys));
299 if (p4d_none(*p4d)) {
258 pud = (pud_t *) spp_getpage(); 300 pud = (pud_t *) spp_getpage();
259 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | 301 set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
260 _PAGE_USER)); 302 _PAGE_USER));
261 } 303 }
262 pud = pud_offset(pgd, (unsigned long)__va(phys)); 304 pud = pud_offset(p4d, (unsigned long)__va(phys));
263 if (pud_none(*pud)) { 305 if (pud_none(*pud)) {
264 pmd = (pmd_t *) spp_getpage(); 306 pmd = (pmd_t *) spp_getpage();
265 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | 307 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
@@ -563,12 +605,15 @@ kernel_physical_mapping_init(unsigned long paddr_start,
563 605
564 for (; vaddr < vaddr_end; vaddr = vaddr_next) { 606 for (; vaddr < vaddr_end; vaddr = vaddr_next) {
565 pgd_t *pgd = pgd_offset_k(vaddr); 607 pgd_t *pgd = pgd_offset_k(vaddr);
608 p4d_t *p4d;
566 pud_t *pud; 609 pud_t *pud;
567 610
568 vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; 611 vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
569 612
570 if (pgd_val(*pgd)) { 613 BUILD_BUG_ON(pgd_none(*pgd));
571 pud = (pud_t *)pgd_page_vaddr(*pgd); 614 p4d = p4d_offset(pgd, vaddr);
615 if (p4d_val(*p4d)) {
616 pud = (pud_t *)p4d_page_vaddr(*p4d);
572 paddr_last = phys_pud_init(pud, __pa(vaddr), 617 paddr_last = phys_pud_init(pud, __pa(vaddr),
573 __pa(vaddr_end), 618 __pa(vaddr_end),
574 page_size_mask); 619 page_size_mask);
@@ -580,7 +625,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
580 page_size_mask); 625 page_size_mask);
581 626
582 spin_lock(&init_mm.page_table_lock); 627 spin_lock(&init_mm.page_table_lock);
583 pgd_populate(&init_mm, pgd, pud); 628 p4d_populate(&init_mm, p4d, pud);
584 spin_unlock(&init_mm.page_table_lock); 629 spin_unlock(&init_mm.page_table_lock);
585 pgd_changed = true; 630 pgd_changed = true;
586 } 631 }
@@ -726,6 +771,24 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
726 spin_unlock(&init_mm.page_table_lock); 771 spin_unlock(&init_mm.page_table_lock);
727} 772}
728 773
774static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
775{
776 pud_t *pud;
777 int i;
778
779 for (i = 0; i < PTRS_PER_PUD; i++) {
780 pud = pud_start + i;
781 if (!pud_none(*pud))
782 return;
783 }
784
785 /* free a pud table */
786 free_pagetable(p4d_page(*p4d), 0);
787 spin_lock(&init_mm.page_table_lock);
788 p4d_clear(p4d);
789 spin_unlock(&init_mm.page_table_lock);
790}
791
729static void __meminit 792static void __meminit
730remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, 793remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
731 bool direct) 794 bool direct)
@@ -899,7 +962,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
899 continue; 962 continue;
900 } 963 }
901 964
902 pmd_base = (pmd_t *)pud_page_vaddr(*pud); 965 pmd_base = pmd_offset(pud, 0);
903 remove_pmd_table(pmd_base, addr, next, direct); 966 remove_pmd_table(pmd_base, addr, next, direct);
904 free_pmd_table(pmd_base, pud); 967 free_pmd_table(pmd_base, pud);
905 } 968 }
@@ -908,6 +971,32 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
908 update_page_count(PG_LEVEL_1G, -pages); 971 update_page_count(PG_LEVEL_1G, -pages);
909} 972}
910 973
974static void __meminit
975remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
976 bool direct)
977{
978 unsigned long next, pages = 0;
979 pud_t *pud_base;
980 p4d_t *p4d;
981
982 p4d = p4d_start + p4d_index(addr);
983 for (; addr < end; addr = next, p4d++) {
984 next = p4d_addr_end(addr, end);
985
986 if (!p4d_present(*p4d))
987 continue;
988
989 BUILD_BUG_ON(p4d_large(*p4d));
990
991 pud_base = pud_offset(p4d, 0);
992 remove_pud_table(pud_base, addr, next, direct);
993 free_pud_table(pud_base, p4d);
994 }
995
996 if (direct)
997 update_page_count(PG_LEVEL_512G, -pages);
998}
999
911/* start and end are both virtual address. */ 1000/* start and end are both virtual address. */
912static void __meminit 1001static void __meminit
913remove_pagetable(unsigned long start, unsigned long end, bool direct) 1002remove_pagetable(unsigned long start, unsigned long end, bool direct)
@@ -915,7 +1004,7 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
915 unsigned long next; 1004 unsigned long next;
916 unsigned long addr; 1005 unsigned long addr;
917 pgd_t *pgd; 1006 pgd_t *pgd;
918 pud_t *pud; 1007 p4d_t *p4d;
919 1008
920 for (addr = start; addr < end; addr = next) { 1009 for (addr = start; addr < end; addr = next) {
921 next = pgd_addr_end(addr, end); 1010 next = pgd_addr_end(addr, end);
@@ -924,8 +1013,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
924 if (!pgd_present(*pgd)) 1013 if (!pgd_present(*pgd))
925 continue; 1014 continue;
926 1015
927 pud = (pud_t *)pgd_page_vaddr(*pgd); 1016 p4d = p4d_offset(pgd, 0);
928 remove_pud_table(pud, addr, next, direct); 1017 remove_p4d_table(p4d, addr, next, direct);
929 } 1018 }
930 1019
931 flush_tlb_all(); 1020 flush_tlb_all();
@@ -1090,6 +1179,7 @@ int kern_addr_valid(unsigned long addr)
1090{ 1179{
1091 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; 1180 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
1092 pgd_t *pgd; 1181 pgd_t *pgd;
1182 p4d_t *p4d;
1093 pud_t *pud; 1183 pud_t *pud;
1094 pmd_t *pmd; 1184 pmd_t *pmd;
1095 pte_t *pte; 1185 pte_t *pte;
@@ -1101,7 +1191,11 @@ int kern_addr_valid(unsigned long addr)
1101 if (pgd_none(*pgd)) 1191 if (pgd_none(*pgd))
1102 return 0; 1192 return 0;
1103 1193
1104 pud = pud_offset(pgd, addr); 1194 p4d = p4d_offset(pgd, addr);
1195 if (p4d_none(*p4d))
1196 return 0;
1197
1198 pud = pud_offset(p4d, addr);
1105 if (pud_none(*pud)) 1199 if (pud_none(*pud))
1106 return 0; 1200 return 0;
1107 1201
@@ -1158,6 +1252,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
1158 unsigned long addr; 1252 unsigned long addr;
1159 unsigned long next; 1253 unsigned long next;
1160 pgd_t *pgd; 1254 pgd_t *pgd;
1255 p4d_t *p4d;
1161 pud_t *pud; 1256 pud_t *pud;
1162 pmd_t *pmd; 1257 pmd_t *pmd;
1163 1258
@@ -1168,7 +1263,11 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
1168 if (!pgd) 1263 if (!pgd)
1169 return -ENOMEM; 1264 return -ENOMEM;
1170 1265
1171 pud = vmemmap_pud_populate(pgd, addr, node); 1266 p4d = vmemmap_p4d_populate(pgd, addr, node);
1267 if (!p4d)
1268 return -ENOMEM;
1269
1270 pud = vmemmap_pud_populate(p4d, addr, node);
1172 if (!pud) 1271 if (!pud)
1173 return -ENOMEM; 1272 return -ENOMEM;
1174 1273
@@ -1236,6 +1335,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
1236 unsigned long end = (unsigned long)(start_page + size); 1335 unsigned long end = (unsigned long)(start_page + size);
1237 unsigned long next; 1336 unsigned long next;
1238 pgd_t *pgd; 1337 pgd_t *pgd;
1338 p4d_t *p4d;
1239 pud_t *pud; 1339 pud_t *pud;
1240 pmd_t *pmd; 1340 pmd_t *pmd;
1241 unsigned int nr_pages; 1341 unsigned int nr_pages;
@@ -1251,7 +1351,14 @@ void register_page_bootmem_memmap(unsigned long section_nr,
1251 } 1351 }
1252 get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); 1352 get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
1253 1353
1254 pud = pud_offset(pgd, addr); 1354 p4d = p4d_offset(pgd, addr);
1355 if (p4d_none(*p4d)) {
1356 next = (addr + PAGE_SIZE) & PAGE_MASK;
1357 continue;
1358 }
1359 get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO);
1360
1361 pud = pud_offset(p4d, addr);
1255 if (pud_none(*pud)) { 1362 if (pud_none(*pud)) {
1256 next = (addr + PAGE_SIZE) & PAGE_MASK; 1363 next = (addr + PAGE_SIZE) & PAGE_MASK;
1257 continue; 1364 continue;
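
The fill_p4d()/fill_pud() split above keeps set_pte_vaddr() able to allocate any missing intermediate level on demand; mapping a single kernel page simply goes through one more step. A sketch using the static fill_*() helpers defined in this file (the example_* name is hypothetical):

/* Hypothetical sketch: map one kernel page via the fill_*() helpers above. */
static void example_map_one_page(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd = pgd_offset_k(vaddr);
	p4d_t *p4d = fill_p4d(pgd, vaddr);	/* allocates a p4d page if needed */
	pud_t *pud = fill_pud(p4d, vaddr);
	pmd_t *pmd = fill_pmd(pud, vaddr);
	pte_t *pte = fill_pte(pmd, vaddr);

	set_pte(pte, pteval);
	__flush_tlb_one(vaddr);
}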
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c43b6b33463a..e4f7b25df18e 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -426,7 +426,8 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
426 /* Don't assume we're using swapper_pg_dir at this point */ 426 /* Don't assume we're using swapper_pg_dir at this point */
427 pgd_t *base = __va(read_cr3()); 427 pgd_t *base = __va(read_cr3());
428 pgd_t *pgd = &base[pgd_index(addr)]; 428 pgd_t *pgd = &base[pgd_index(addr)];
429 pud_t *pud = pud_offset(pgd, addr); 429 p4d_t *p4d = p4d_offset(pgd, addr);
430 pud_t *pud = pud_offset(p4d, addr);
430 pmd_t *pmd = pmd_offset(pud, addr); 431 pmd_t *pmd = pmd_offset(pud, addr);
431 432
432 return pmd; 433 return pmd;
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index da92df32d0f1..0c7d8129bed6 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -34,8 +34,19 @@ static int __init map_range(struct range *range)
34static void __init clear_pgds(unsigned long start, 34static void __init clear_pgds(unsigned long start,
35 unsigned long end) 35 unsigned long end)
36{ 36{
37 for (; start < end; start += PGDIR_SIZE) 37 pgd_t *pgd;
38 pgd_clear(pgd_offset_k(start)); 38
39 for (; start < end; start += PGDIR_SIZE) {
40 pgd = pgd_offset_k(start);
41 /*
 42 * With folded p4d, pgd_clear() is a no-op, use p4d_clear()
43 * instead.
44 */
45 if (CONFIG_PGTABLE_LEVELS < 5)
46 p4d_clear(p4d_offset(pgd, start));
47 else
48 pgd_clear(pgd);
49 }
39} 50}
40 51
41static void __init kasan_map_early_shadow(pgd_t *pgd) 52static void __init kasan_map_early_shadow(pgd_t *pgd)
@@ -45,8 +56,18 @@ static void __init kasan_map_early_shadow(pgd_t *pgd)
45 unsigned long end = KASAN_SHADOW_END; 56 unsigned long end = KASAN_SHADOW_END;
46 57
47 for (i = pgd_index(start); start < end; i++) { 58 for (i = pgd_index(start); start < end; i++) {
48 pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) 59 switch (CONFIG_PGTABLE_LEVELS) {
49 | _KERNPG_TABLE); 60 case 4:
61 pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) |
62 _KERNPG_TABLE);
63 break;
64 case 5:
65 pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) |
66 _KERNPG_TABLE);
67 break;
68 default:
69 BUILD_BUG();
70 }
50 start += PGDIR_SIZE; 71 start += PGDIR_SIZE;
51 } 72 }
52} 73}
@@ -74,6 +95,7 @@ void __init kasan_early_init(void)
74 pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; 95 pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
75 pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; 96 pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
76 pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; 97 pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
98 p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
77 99
78 for (i = 0; i < PTRS_PER_PTE; i++) 100 for (i = 0; i < PTRS_PER_PTE; i++)
79 kasan_zero_pte[i] = __pte(pte_val); 101 kasan_zero_pte[i] = __pte(pte_val);
@@ -84,6 +106,9 @@ void __init kasan_early_init(void)
84 for (i = 0; i < PTRS_PER_PUD; i++) 106 for (i = 0; i < PTRS_PER_PUD; i++)
85 kasan_zero_pud[i] = __pud(pud_val); 107 kasan_zero_pud[i] = __pud(pud_val);
86 108
109 for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
110 kasan_zero_p4d[i] = __p4d(p4d_val);
111
87 kasan_map_early_shadow(early_level4_pgt); 112 kasan_map_early_shadow(early_level4_pgt);
88 kasan_map_early_shadow(init_level4_pgt); 113 kasan_map_early_shadow(init_level4_pgt);
89} 114}
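
The KASAN change has to special-case the folded configuration because pgd_none() and pgd_clear() degenerate to constants when CONFIG_PGTABLE_LEVELS < 5. A sketch of that dispatch (hypothetical example_* name):

/* Hypothetical: clear one top-level entry whether or not p4d is folded. */
static void example_clear_top_entry(unsigned long addr)
{
	pgd_t *pgd = pgd_offset_k(addr);

	if (CONFIG_PGTABLE_LEVELS < 5)
		p4d_clear(p4d_offset(pgd, addr));	/* pgd_clear() would be a no-op */
	else
		pgd_clear(pgd);
}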
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 7940166c799b..19ad095b41df 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -30,30 +30,44 @@
30#include <linux/limits.h> 30#include <linux/limits.h>
31#include <linux/sched/signal.h> 31#include <linux/sched/signal.h>
32#include <linux/sched/mm.h> 32#include <linux/sched/mm.h>
33#include <linux/compat.h>
33#include <asm/elf.h> 34#include <asm/elf.h>
34 35
35struct va_alignment __read_mostly va_align = { 36struct va_alignment __read_mostly va_align = {
36 .flags = -1, 37 .flags = -1,
37}; 38};
38 39
39static unsigned long stack_maxrandom_size(void) 40unsigned long tasksize_32bit(void)
41{
42 return IA32_PAGE_OFFSET;
43}
44
45unsigned long tasksize_64bit(void)
46{
47 return TASK_SIZE_MAX;
48}
49
50static unsigned long stack_maxrandom_size(unsigned long task_size)
40{ 51{
41 unsigned long max = 0; 52 unsigned long max = 0;
42 if ((current->flags & PF_RANDOMIZE) && 53 if ((current->flags & PF_RANDOMIZE) &&
43 !(current->personality & ADDR_NO_RANDOMIZE)) { 54 !(current->personality & ADDR_NO_RANDOMIZE)) {
44 max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT; 55 max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit());
56 max <<= PAGE_SHIFT;
45 } 57 }
46 58
47 return max; 59 return max;
48} 60}
49 61
50/* 62#ifdef CONFIG_COMPAT
51 * Top of mmap area (just below the process stack). 63# define mmap32_rnd_bits mmap_rnd_compat_bits
52 * 64# define mmap64_rnd_bits mmap_rnd_bits
53 * Leave an at least ~128 MB hole with possible stack randomization. 65#else
54 */ 66# define mmap32_rnd_bits mmap_rnd_bits
55#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size()) 67# define mmap64_rnd_bits mmap_rnd_bits
56#define MAX_GAP (TASK_SIZE/6*5) 68#endif
69
70#define SIZE_128M (128 * 1024 * 1024UL)
57 71
58static int mmap_is_legacy(void) 72static int mmap_is_legacy(void)
59{ 73{
@@ -66,54 +80,91 @@ static int mmap_is_legacy(void)
66 return sysctl_legacy_va_layout; 80 return sysctl_legacy_va_layout;
67} 81}
68 82
69unsigned long arch_mmap_rnd(void) 83static unsigned long arch_rnd(unsigned int rndbits)
70{ 84{
71 unsigned long rnd; 85 return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT;
72 86}
73 if (mmap_is_ia32())
74#ifdef CONFIG_COMPAT
75 rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
76#else
77 rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
78#endif
79 else
80 rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
81 87
82 return rnd << PAGE_SHIFT; 88unsigned long arch_mmap_rnd(void)
89{
90 if (!(current->flags & PF_RANDOMIZE))
91 return 0;
92 return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits);
83} 93}
84 94
85static unsigned long mmap_base(unsigned long rnd) 95static unsigned long mmap_base(unsigned long rnd, unsigned long task_size)
86{ 96{
87 unsigned long gap = rlimit(RLIMIT_STACK); 97 unsigned long gap = rlimit(RLIMIT_STACK);
98 unsigned long gap_min, gap_max;
99
100 /*
101 * Top of mmap area (just below the process stack).
102 * Leave an at least ~128 MB hole with possible stack randomization.
103 */
104 gap_min = SIZE_128M + stack_maxrandom_size(task_size);
105 gap_max = (task_size / 6) * 5;
88 106
89 if (gap < MIN_GAP) 107 if (gap < gap_min)
90 gap = MIN_GAP; 108 gap = gap_min;
91 else if (gap > MAX_GAP) 109 else if (gap > gap_max)
92 gap = MAX_GAP; 110 gap = gap_max;
111
112 return PAGE_ALIGN(task_size - gap - rnd);
113}
93 114
94 return PAGE_ALIGN(TASK_SIZE - gap - rnd); 115static unsigned long mmap_legacy_base(unsigned long rnd,
116 unsigned long task_size)
117{
118 return __TASK_UNMAPPED_BASE(task_size) + rnd;
95} 119}
96 120
97/* 121/*
98 * This function, called very early during the creation of a new 122 * This function, called very early during the creation of a new
99 * process VM image, sets up which VM layout function to use: 123 * process VM image, sets up which VM layout function to use:
100 */ 124 */
125static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
126 unsigned long random_factor, unsigned long task_size)
127{
128 *legacy_base = mmap_legacy_base(random_factor, task_size);
129 if (mmap_is_legacy())
130 *base = *legacy_base;
131 else
132 *base = mmap_base(random_factor, task_size);
133}
134
101void arch_pick_mmap_layout(struct mm_struct *mm) 135void arch_pick_mmap_layout(struct mm_struct *mm)
102{ 136{
103 unsigned long random_factor = 0UL; 137 if (mmap_is_legacy())
138 mm->get_unmapped_area = arch_get_unmapped_area;
139 else
140 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
104 141
105 if (current->flags & PF_RANDOMIZE) 142 arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
106 random_factor = arch_mmap_rnd(); 143 arch_rnd(mmap64_rnd_bits), tasksize_64bit());
144
145#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
146 /*
147 * The mmap syscall mapping base decision depends solely on the
148 * syscall type (64-bit or compat). This applies for 64bit
149 * applications and 32bit applications. The 64bit syscall uses
150 * mmap_base, the compat syscall uses mmap_compat_base.
151 */
152 arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
153 arch_rnd(mmap32_rnd_bits), tasksize_32bit());
154#endif
155}
107 156
108 mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor; 157unsigned long get_mmap_base(int is_legacy)
158{
159 struct mm_struct *mm = current->mm;
109 160
110 if (mmap_is_legacy()) { 161#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
111 mm->mmap_base = mm->mmap_legacy_base; 162 if (in_compat_syscall()) {
112 mm->get_unmapped_area = arch_get_unmapped_area; 163 return is_legacy ? mm->mmap_compat_legacy_base
113 } else { 164 : mm->mmap_compat_base;
114 mm->mmap_base = mmap_base(random_factor);
115 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
116 } 165 }
166#endif
167 return is_legacy ? mm->mmap_legacy_base : mm->mmap_base;
117} 168}
118 169
119const char *arch_vma_name(struct vm_area_struct *vma) 170const char *arch_vma_name(struct vm_area_struct *vma)
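
With the rework above each mm carries two sets of bases (the 64-bit ones and, under CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES, the compat ones), and get_unmapped_area paths select one with get_mmap_base(). The top-down base itself is the old MIN_GAP/MAX_GAP clamp, now parameterized by task size; a sketch restating it (the example_* name is hypothetical, clamp() is the generic kernel helper):

/* Hypothetical restatement of mmap_base() above using clamp(). */
static unsigned long example_mmap_top(unsigned long rnd, unsigned long task_size)
{
	unsigned long gap = rlimit(RLIMIT_STACK);

	gap = clamp(gap, SIZE_128M + stack_maxrandom_size(task_size),
		    task_size / 6 * 5);
	return PAGE_ALIGN(task_size - gap - rnd);
}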
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index cd44ae727df7..1c34b767c84c 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -526,15 +526,7 @@ int mpx_handle_bd_fault(void)
526 if (!kernel_managing_mpx_tables(current->mm)) 526 if (!kernel_managing_mpx_tables(current->mm))
527 return -EINVAL; 527 return -EINVAL;
528 528
529 if (do_mpx_bt_fault()) { 529 return do_mpx_bt_fault();
530 force_sig(SIGSEGV, current);
531 /*
532 * The force_sig() is essentially "handling" this
533 * exception, so we do not pass up the error
534 * from do_mpx_bt_fault().
535 */
536 }
537 return 0;
538} 530}
539 531
540/* 532/*
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f9d99535f233..25504d5aa816 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -201,7 +201,7 @@ static void __init alloc_node_data(int nid)
201 nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES, 201 nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
202 MEMBLOCK_ALLOC_ACCESSIBLE); 202 MEMBLOCK_ALLOC_ACCESSIBLE);
203 if (!nd_pa) { 203 if (!nd_pa) {
204 pr_err("Cannot find %zu bytes in node %d\n", 204 pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
205 nd_size, nid); 205 nd_size, nid);
206 return; 206 return;
207 } 207 }
@@ -225,7 +225,7 @@ static void __init alloc_node_data(int nid)
225 * numa_cleanup_meminfo - Cleanup a numa_meminfo 225 * numa_cleanup_meminfo - Cleanup a numa_meminfo
226 * @mi: numa_meminfo to clean up 226 * @mi: numa_meminfo to clean up
227 * 227 *
228 * Sanitize @mi by merging and removing unncessary memblks. Also check for 228 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
229 * conflicts and clear unused memblks. 229 * conflicts and clear unused memblks.
230 * 230 *
231 * RETURNS: 231 * RETURNS:
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a57e8e02f457..56b22fa504df 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -346,6 +346,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
346pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, 346pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
347 unsigned int *level) 347 unsigned int *level)
348{ 348{
349 p4d_t *p4d;
349 pud_t *pud; 350 pud_t *pud;
350 pmd_t *pmd; 351 pmd_t *pmd;
351 352
@@ -354,7 +355,15 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
354 if (pgd_none(*pgd)) 355 if (pgd_none(*pgd))
355 return NULL; 356 return NULL;
356 357
357 pud = pud_offset(pgd, address); 358 p4d = p4d_offset(pgd, address);
359 if (p4d_none(*p4d))
360 return NULL;
361
362 *level = PG_LEVEL_512G;
363 if (p4d_large(*p4d) || !p4d_present(*p4d))
364 return (pte_t *)p4d;
365
366 pud = pud_offset(p4d, address);
358 if (pud_none(*pud)) 367 if (pud_none(*pud))
359 return NULL; 368 return NULL;
360 369
@@ -406,13 +415,18 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
406pmd_t *lookup_pmd_address(unsigned long address) 415pmd_t *lookup_pmd_address(unsigned long address)
407{ 416{
408 pgd_t *pgd; 417 pgd_t *pgd;
418 p4d_t *p4d;
409 pud_t *pud; 419 pud_t *pud;
410 420
411 pgd = pgd_offset_k(address); 421 pgd = pgd_offset_k(address);
412 if (pgd_none(*pgd)) 422 if (pgd_none(*pgd))
413 return NULL; 423 return NULL;
414 424
415 pud = pud_offset(pgd, address); 425 p4d = p4d_offset(pgd, address);
426 if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d))
427 return NULL;
428
429 pud = pud_offset(p4d, address);
416 if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) 430 if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
417 return NULL; 431 return NULL;
418 432
@@ -477,11 +491,13 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
477 491
478 list_for_each_entry(page, &pgd_list, lru) { 492 list_for_each_entry(page, &pgd_list, lru) {
479 pgd_t *pgd; 493 pgd_t *pgd;
494 p4d_t *p4d;
480 pud_t *pud; 495 pud_t *pud;
481 pmd_t *pmd; 496 pmd_t *pmd;
482 497
483 pgd = (pgd_t *)page_address(page) + pgd_index(address); 498 pgd = (pgd_t *)page_address(page) + pgd_index(address);
484 pud = pud_offset(pgd, address); 499 p4d = p4d_offset(pgd, address);
500 pud = pud_offset(p4d, address);
485 pmd = pmd_offset(pud, address); 501 pmd = pmd_offset(pud, address);
486 set_pte_atomic((pte_t *)pmd, pte); 502 set_pte_atomic((pte_t *)pmd, pte);
487 } 503 }
@@ -836,9 +852,9 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
836 pud_clear(pud); 852 pud_clear(pud);
837} 853}
838 854
839static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) 855static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
840{ 856{
841 pud_t *pud = pud_offset(pgd, start); 857 pud_t *pud = pud_offset(p4d, start);
842 858
843 /* 859 /*
844 * Not on a GB page boundary? 860 * Not on a GB page boundary?
@@ -1004,8 +1020,8 @@ static long populate_pmd(struct cpa_data *cpa,
1004 return num_pages; 1020 return num_pages;
1005} 1021}
1006 1022
1007static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, 1023static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
1008 pgprot_t pgprot) 1024 pgprot_t pgprot)
1009{ 1025{
1010 pud_t *pud; 1026 pud_t *pud;
1011 unsigned long end; 1027 unsigned long end;
@@ -1026,7 +1042,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
1026 cur_pages = (pre_end - start) >> PAGE_SHIFT; 1042 cur_pages = (pre_end - start) >> PAGE_SHIFT;
1027 cur_pages = min_t(int, (int)cpa->numpages, cur_pages); 1043 cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
1028 1044
1029 pud = pud_offset(pgd, start); 1045 pud = pud_offset(p4d, start);
1030 1046
1031 /* 1047 /*
1032 * Need a PMD page? 1048 * Need a PMD page?
@@ -1047,7 +1063,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
1047 if (cpa->numpages == cur_pages) 1063 if (cpa->numpages == cur_pages)
1048 return cur_pages; 1064 return cur_pages;
1049 1065
1050 pud = pud_offset(pgd, start); 1066 pud = pud_offset(p4d, start);
1051 pud_pgprot = pgprot_4k_2_large(pgprot); 1067 pud_pgprot = pgprot_4k_2_large(pgprot);
1052 1068
1053 /* 1069 /*
@@ -1067,7 +1083,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
1067 if (start < end) { 1083 if (start < end) {
1068 long tmp; 1084 long tmp;
1069 1085
1070 pud = pud_offset(pgd, start); 1086 pud = pud_offset(p4d, start);
1071 if (pud_none(*pud)) 1087 if (pud_none(*pud))
1072 if (alloc_pmd_page(pud)) 1088 if (alloc_pmd_page(pud))
1073 return -1; 1089 return -1;
@@ -1090,33 +1106,43 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1090{ 1106{
1091 pgprot_t pgprot = __pgprot(_KERNPG_TABLE); 1107 pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
1092 pud_t *pud = NULL; /* shut up gcc */ 1108 pud_t *pud = NULL; /* shut up gcc */
1109 p4d_t *p4d;
1093 pgd_t *pgd_entry; 1110 pgd_t *pgd_entry;
1094 long ret; 1111 long ret;
1095 1112
1096 pgd_entry = cpa->pgd + pgd_index(addr); 1113 pgd_entry = cpa->pgd + pgd_index(addr);
1097 1114
1115 if (pgd_none(*pgd_entry)) {
1116 p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
1117 if (!p4d)
1118 return -1;
1119
1120 set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE));
1121 }
1122
1098 /* 1123 /*
1099 * Allocate a PUD page and hand it down for mapping. 1124 * Allocate a PUD page and hand it down for mapping.
1100 */ 1125 */
1101 if (pgd_none(*pgd_entry)) { 1126 p4d = p4d_offset(pgd_entry, addr);
1127 if (p4d_none(*p4d)) {
1102 pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 1128 pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
1103 if (!pud) 1129 if (!pud)
1104 return -1; 1130 return -1;
1105 1131
1106 set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); 1132 set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
1107 } 1133 }
1108 1134
1109 pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); 1135 pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
1110 pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); 1136 pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
1111 1137
1112 ret = populate_pud(cpa, addr, pgd_entry, pgprot); 1138 ret = populate_pud(cpa, addr, p4d, pgprot);
1113 if (ret < 0) { 1139 if (ret < 0) {
1114 /* 1140 /*
1115 * Leave the PUD page in place in case some other CPU or thread 1141 * Leave the PUD page in place in case some other CPU or thread
1116 * already found it, but remove any useless entries we just 1142 * already found it, but remove any useless entries we just
1117 * added to it. 1143 * added to it.
1118 */ 1144 */
1119 unmap_pud_range(pgd_entry, addr, 1145 unmap_pud_range(p4d, addr,
1120 addr + (cpa->numpages << PAGE_SHIFT)); 1146 addr + (cpa->numpages << PAGE_SHIFT));
1121 return ret; 1147 return ret;
1122 } 1148 }
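
lookup_address_in_pgd() can now also stop at the p4d level and report PG_LEVEL_512G, so callers that switch on the returned level need one more case. A sketch of a caller (hypothetical example_* name; lookup_address() is the existing wrapper around the function patched above):

/* Hypothetical caller: report at which level a kernel address is mapped. */
static int example_mapping_level(unsigned long addr)
{
	unsigned int level;
	pte_t *kpte = lookup_address(addr, &level);

	if (!kpte || pte_none(*kpte))
		return -1;
	return level;	/* PG_LEVEL_4K/2M/1G, or PG_LEVEL_512G from now on */
}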
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 6cbdff26bb96..508a708eb9a6 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -81,6 +81,14 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
81 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 81 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
82 tlb_remove_page(tlb, virt_to_page(pud)); 82 tlb_remove_page(tlb, virt_to_page(pud));
83} 83}
84
85#if CONFIG_PGTABLE_LEVELS > 4
86void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
87{
88 paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
89 tlb_remove_page(tlb, virt_to_page(p4d));
90}
91#endif /* CONFIG_PGTABLE_LEVELS > 4 */
84#endif /* CONFIG_PGTABLE_LEVELS > 3 */ 92#endif /* CONFIG_PGTABLE_LEVELS > 3 */
85#endif /* CONFIG_PGTABLE_LEVELS > 2 */ 93#endif /* CONFIG_PGTABLE_LEVELS > 2 */
86 94
@@ -120,7 +128,7 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
120 references from swapper_pg_dir. */ 128 references from swapper_pg_dir. */
121 if (CONFIG_PGTABLE_LEVELS == 2 || 129 if (CONFIG_PGTABLE_LEVELS == 2 ||
122 (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || 130 (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
123 CONFIG_PGTABLE_LEVELS == 4) { 131 CONFIG_PGTABLE_LEVELS >= 4) {
124 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, 132 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
125 swapper_pg_dir + KERNEL_PGD_BOUNDARY, 133 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
126 KERNEL_PGD_PTRS); 134 KERNEL_PGD_PTRS);
@@ -261,13 +269,15 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
261 269
262static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) 270static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
263{ 271{
272 p4d_t *p4d;
264 pud_t *pud; 273 pud_t *pud;
265 int i; 274 int i;
266 275
267 if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ 276 if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
268 return; 277 return;
269 278
270 pud = pud_offset(pgd, 0); 279 p4d = p4d_offset(pgd, 0);
280 pud = pud_offset(p4d, 0);
271 281
272 for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { 282 for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
273 pmd_t *pmd = pmds[i]; 283 pmd_t *pmd = pmds[i];
@@ -580,6 +590,28 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
580} 590}
581 591
582#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP 592#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
593#ifdef CONFIG_X86_5LEVEL
594/**
595 * p4d_set_huge - setup kernel P4D mapping
596 *
597 * No 512GB pages yet -- always return 0
598 */
599int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
600{
601 return 0;
602}
603
604/**
605 * p4d_clear_huge - clear kernel P4D mapping when it is set
606 *
607 * No 512GB pages yet -- always return 0
608 */
609int p4d_clear_huge(p4d_t *p4d)
610{
611 return 0;
612}
613#endif
614
583/** 615/**
584 * pud_set_huge - setup kernel PUD mapping 616 * pud_set_huge - setup kernel PUD mapping
585 * 617 *
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index de53c52551a5..b9bd5b8b14fa 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -26,6 +26,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20;
26void set_pte_vaddr(unsigned long vaddr, pte_t pteval) 26void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
27{ 27{
28 pgd_t *pgd; 28 pgd_t *pgd;
29 p4d_t *p4d;
29 pud_t *pud; 30 pud_t *pud;
30 pmd_t *pmd; 31 pmd_t *pmd;
31 pte_t *pte; 32 pte_t *pte;
@@ -35,7 +36,12 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
35 BUG(); 36 BUG();
36 return; 37 return;
37 } 38 }
38 pud = pud_offset(pgd, vaddr); 39 p4d = p4d_offset(pgd, vaddr);
40 if (p4d_none(*p4d)) {
41 BUG();
42 return;
43 }
44 pud = pud_offset(p4d, vaddr);
39 if (pud_none(*pud)) { 45 if (pud_none(*pud)) {
40 BUG(); 46 BUG();
41 return; 47 return;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index a7655f6caf7d..6e7bedf69af7 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -263,8 +263,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
263{ 263{
264 struct flush_tlb_info info; 264 struct flush_tlb_info info;
265 265
266 if (end == 0)
267 end = start + PAGE_SIZE;
268 info.flush_mm = mm; 266 info.flush_mm = mm;
269 info.flush_start = start; 267 info.flush_start = start;
270 info.flush_end = end; 268 info.flush_end = end;
@@ -289,23 +287,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
289 smp_call_function_many(cpumask, flush_tlb_func, &info, 1); 287 smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
290} 288}
291 289
292void flush_tlb_current_task(void)
293{
294 struct mm_struct *mm = current->mm;
295
296 preempt_disable();
297
298 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
299
300 /* This is an implicit full barrier that synchronizes with switch_mm. */
301 local_flush_tlb();
302
303 trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
304 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
305 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
306 preempt_enable();
307}
308
309/* 290/*
310 * See Documentation/x86/tlb.txt for details. We choose 33 291 * See Documentation/x86/tlb.txt for details. We choose 33
311 * because it is large enough to cover the vast majority (at 292 * because it is large enough to cover the vast majority (at
@@ -326,6 +307,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
326 unsigned long base_pages_to_flush = TLB_FLUSH_ALL; 307 unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
327 308
328 preempt_disable(); 309 preempt_disable();
310
311 if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
312 base_pages_to_flush = (end - start) >> PAGE_SHIFT;
313 if (base_pages_to_flush > tlb_single_page_flush_ceiling)
314 base_pages_to_flush = TLB_FLUSH_ALL;
315
329 if (current->active_mm != mm) { 316 if (current->active_mm != mm) {
330 /* Synchronize with switch_mm. */ 317 /* Synchronize with switch_mm. */
331 smp_mb(); 318 smp_mb();
@@ -342,15 +329,11 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
342 goto out; 329 goto out;
343 } 330 }
344 331
345 if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
346 base_pages_to_flush = (end - start) >> PAGE_SHIFT;
347
348 /* 332 /*
349 * Both branches below are implicit full barriers (MOV to CR or 333 * Both branches below are implicit full barriers (MOV to CR or
350 * INVLPG) that synchronize with switch_mm. 334 * INVLPG) that synchronize with switch_mm.
351 */ 335 */
352 if (base_pages_to_flush > tlb_single_page_flush_ceiling) { 336 if (base_pages_to_flush == TLB_FLUSH_ALL) {
353 base_pages_to_flush = TLB_FLUSH_ALL;
354 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); 337 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
355 local_flush_tlb(); 338 local_flush_tlb();
356 } else { 339 } else {
@@ -393,7 +376,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
393 } 376 }
394 377
395 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 378 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
396 flush_tlb_others(mm_cpumask(mm), mm, start, 0UL); 379 flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
397 380
398 preempt_enable(); 381 preempt_enable();
399} 382}
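
The flush_tlb_mm_range() change moves the range-size decision ahead of the local/remote split, so remote CPUs now see the same flush-all-vs-range choice as the local CPU, and flush_tlb_page() passes an explicit one-page range instead of the old end == 0 convention. A sketch of the caller-side contract (hypothetical example_* name):

/* Hypothetical caller: flushing a single page now means an explicit range. */
static void example_flush_one_page(struct vm_area_struct *vma, unsigned long addr)
{
	flush_tlb_mm_range(vma->vm_mm, addr, addr + PAGE_SIZE, vma->vm_flags);
}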
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index cef39b097649..3481268da3d0 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -68,7 +68,7 @@ pgd_t * __init efi_call_phys_prolog(void)
68 load_cr3(initial_page_table); 68 load_cr3(initial_page_table);
69 __flush_tlb_all(); 69 __flush_tlb_all();
70 70
71 gdt_descr.address = __pa(get_cpu_gdt_table(0)); 71 gdt_descr.address = get_cpu_gdt_paddr(0);
72 gdt_descr.size = GDT_SIZE - 1; 72 gdt_descr.size = GDT_SIZE - 1;
73 load_gdt(&gdt_descr); 73 load_gdt(&gdt_descr);
74 74
@@ -79,7 +79,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
79{ 79{
80 struct desc_ptr gdt_descr; 80 struct desc_ptr gdt_descr;
81 81
82 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); 82 gdt_descr.address = (unsigned long)get_cpu_gdt_rw(0);
83 gdt_descr.size = GDT_SIZE - 1; 83 gdt_descr.size = GDT_SIZE - 1;
84 load_gdt(&gdt_descr); 84 load_gdt(&gdt_descr);
85 85
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 642a8698ad61..c488625c9712 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -135,6 +135,7 @@ static pgd_t *efi_pgd;
135int __init efi_alloc_page_tables(void) 135int __init efi_alloc_page_tables(void)
136{ 136{
137 pgd_t *pgd; 137 pgd_t *pgd;
138 p4d_t *p4d;
138 pud_t *pud; 139 pud_t *pud;
139 gfp_t gfp_mask; 140 gfp_t gfp_mask;
140 141
@@ -147,15 +148,20 @@ int __init efi_alloc_page_tables(void)
147 return -ENOMEM; 148 return -ENOMEM;
148 149
149 pgd = efi_pgd + pgd_index(EFI_VA_END); 150 pgd = efi_pgd + pgd_index(EFI_VA_END);
151 p4d = p4d_alloc(&init_mm, pgd, EFI_VA_END);
152 if (!p4d) {
153 free_page((unsigned long)efi_pgd);
154 return -ENOMEM;
155 }
150 156
151 pud = pud_alloc_one(NULL, 0); 157 pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
152 if (!pud) { 158 if (!pud) {
159 if (CONFIG_PGTABLE_LEVELS > 4)
160 free_page((unsigned long) pgd_page_vaddr(*pgd));
153 free_page((unsigned long)efi_pgd); 161 free_page((unsigned long)efi_pgd);
154 return -ENOMEM; 162 return -ENOMEM;
155 } 163 }
156 164
157 pgd_populate(NULL, pgd, pud);
158
159 return 0; 165 return 0;
160} 166}
161 167
@@ -166,6 +172,7 @@ void efi_sync_low_kernel_mappings(void)
166{ 172{
167 unsigned num_entries; 173 unsigned num_entries;
168 pgd_t *pgd_k, *pgd_efi; 174 pgd_t *pgd_k, *pgd_efi;
175 p4d_t *p4d_k, *p4d_efi;
169 pud_t *pud_k, *pud_efi; 176 pud_t *pud_k, *pud_efi;
170 177
171 if (efi_enabled(EFI_OLD_MEMMAP)) 178 if (efi_enabled(EFI_OLD_MEMMAP))
@@ -190,23 +197,37 @@ void efi_sync_low_kernel_mappings(void)
190 memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries); 197 memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries);
191 198
192 /* 199 /*
200 * As with PGDs, we share all P4D entries apart from the one entry
201 * that covers the EFI runtime mapping space.
202 */
203 BUILD_BUG_ON(p4d_index(EFI_VA_END) != p4d_index(MODULES_END));
204 BUILD_BUG_ON((EFI_VA_START & P4D_MASK) != (EFI_VA_END & P4D_MASK));
205
206 pgd_efi = efi_pgd + pgd_index(EFI_VA_END);
207 pgd_k = pgd_offset_k(EFI_VA_END);
208 p4d_efi = p4d_offset(pgd_efi, 0);
209 p4d_k = p4d_offset(pgd_k, 0);
210
211 num_entries = p4d_index(EFI_VA_END);
212 memcpy(p4d_efi, p4d_k, sizeof(p4d_t) * num_entries);
213
214 /*
193 * We share all the PUD entries apart from those that map the 215 * We share all the PUD entries apart from those that map the
194 * EFI regions. Copy around them. 216 * EFI regions. Copy around them.
195 */ 217 */
196 BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0); 218 BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0);
197 BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0); 219 BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0);
198 220
199 pgd_efi = efi_pgd + pgd_index(EFI_VA_END); 221 p4d_efi = p4d_offset(pgd_efi, EFI_VA_END);
200 pud_efi = pud_offset(pgd_efi, 0); 222 p4d_k = p4d_offset(pgd_k, EFI_VA_END);
201 223 pud_efi = pud_offset(p4d_efi, 0);
202 pgd_k = pgd_offset_k(EFI_VA_END); 224 pud_k = pud_offset(p4d_k, 0);
203 pud_k = pud_offset(pgd_k, 0);
204 225
205 num_entries = pud_index(EFI_VA_END); 226 num_entries = pud_index(EFI_VA_END);
206 memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); 227 memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
207 228
208 pud_efi = pud_offset(pgd_efi, EFI_VA_START); 229 pud_efi = pud_offset(p4d_efi, EFI_VA_START);
209 pud_k = pud_offset(pgd_k, EFI_VA_START); 230 pud_k = pud_offset(p4d_k, EFI_VA_START);
210 231
211 num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START); 232 num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START);
212 memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); 233 memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
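
efi_alloc_page_tables() now relies on the generic p4d_alloc()/pud_alloc() allocators instead of hand-rolling a pud page, which keeps it correct for both 4- and 5-level configurations. A sketch of that idiom (hypothetical example_* name):

/* Hypothetical: let the generic allocators create any missing mid levels. */
static pud_t *example_get_pud(pgd_t *pgd, unsigned long addr)
{
	p4d_t *p4d = p4d_alloc(&init_mm, pgd, addr);

	if (!p4d)
		return NULL;
	return pud_alloc(&init_mm, p4d, addr);
}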
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 66ade16c7693..6b05a9219ea2 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -95,7 +95,7 @@ static void __save_processor_state(struct saved_context *ctxt)
95 * 'pmode_gdt' in wakeup_start. 95 * 'pmode_gdt' in wakeup_start.
96 */ 96 */
97 ctxt->gdt_desc.size = GDT_SIZE - 1; 97 ctxt->gdt_desc.size = GDT_SIZE - 1;
98 ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_table(smp_processor_id()); 98 ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_rw(smp_processor_id());
99 99
100 store_tr(ctxt->tr); 100 store_tr(ctxt->tr);
101 101
@@ -162,7 +162,7 @@ static void fix_processor_context(void)
162 int cpu = smp_processor_id(); 162 int cpu = smp_processor_id();
163 struct tss_struct *t = &per_cpu(cpu_tss, cpu); 163 struct tss_struct *t = &per_cpu(cpu_tss, cpu);
164#ifdef CONFIG_X86_64 164#ifdef CONFIG_X86_64
165 struct desc_struct *desc = get_cpu_gdt_table(cpu); 165 struct desc_struct *desc = get_cpu_gdt_rw(cpu);
166 tss_desc tss; 166 tss_desc tss;
167#endif 167#endif
168 set_tss_desc(cpu, t); /* 168 set_tss_desc(cpu, t); /*
@@ -183,6 +183,9 @@ static void fix_processor_context(void)
183 load_mm_ldt(current->active_mm); /* This does lldt */ 183 load_mm_ldt(current->active_mm); /* This does lldt */
184 184
185 fpu__resume_cpu(); 185 fpu__resume_cpu();
186
187 /* The processor is back on the direct GDT, load back the fixmap */
188 load_fixmap_gdt(cpu);
186} 189}
187 190
188/** 191/**
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index 9f14bd34581d..c35fdb585c68 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -32,6 +32,7 @@ pgd_t *resume_pg_dir;
32 */ 32 */
33static pmd_t *resume_one_md_table_init(pgd_t *pgd) 33static pmd_t *resume_one_md_table_init(pgd_t *pgd)
34{ 34{
35 p4d_t *p4d;
35 pud_t *pud; 36 pud_t *pud;
36 pmd_t *pmd_table; 37 pmd_t *pmd_table;
37 38
@@ -41,11 +42,13 @@ static pmd_t *resume_one_md_table_init(pgd_t *pgd)
41 return NULL; 42 return NULL;
42 43
43 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 44 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
44 pud = pud_offset(pgd, 0); 45 p4d = p4d_offset(pgd, 0);
46 pud = pud_offset(p4d, 0);
45 47
46 BUG_ON(pmd_table != pmd_offset(pud, 0)); 48 BUG_ON(pmd_table != pmd_offset(pud, 0));
47#else 49#else
48 pud = pud_offset(pgd, 0); 50 p4d = p4d_offset(pgd, 0);
51 pud = pud_offset(p4d, 0);
49 pmd_table = pmd_offset(pud, 0); 52 pmd_table = pmd_offset(pud, 0);
50#endif 53#endif
51 54
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 053801b022dd..6a61194ffd58 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -50,6 +50,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
50{ 50{
51 pmd_t *pmd; 51 pmd_t *pmd;
52 pud_t *pud; 52 pud_t *pud;
53 p4d_t *p4d;
53 54
54 /* 55 /*
55 * The new mapping only has to cover the page containing the image 56 * The new mapping only has to cover the page containing the image
@@ -64,6 +65,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
64 * the virtual address space after switching over to the original page 65 * the virtual address space after switching over to the original page
65 * tables used by the image kernel. 66 * tables used by the image kernel.
66 */ 67 */
68
69 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
70 p4d = (p4d_t *)get_safe_page(GFP_ATOMIC);
71 if (!p4d)
72 return -ENOMEM;
73 }
74
67 pud = (pud_t *)get_safe_page(GFP_ATOMIC); 75 pud = (pud_t *)get_safe_page(GFP_ATOMIC);
68 if (!pud) 76 if (!pud)
69 return -ENOMEM; 77 return -ENOMEM;
@@ -76,8 +84,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
76 __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); 84 __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC));
77 set_pud(pud + pud_index(restore_jump_address), 85 set_pud(pud + pud_index(restore_jump_address),
78 __pud(__pa(pmd) | _KERNPG_TABLE)); 86 __pud(__pa(pmd) | _KERNPG_TABLE));
79 set_pgd(pgd + pgd_index(restore_jump_address), 87 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
80 __pgd(__pa(pud) | _KERNPG_TABLE)); 88 set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE));
89 set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE));
90 } else {
91 /* No p4d for 4-level paging: point the pgd to the pud page table */
92 set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(pud) | _KERNPG_TABLE));
93 }
81 94
82 return 0; 95 return 0;
83} 96}
@@ -125,7 +138,10 @@ static int set_up_temporary_mappings(void)
125static int relocate_restore_code(void) 138static int relocate_restore_code(void)
126{ 139{
127 pgd_t *pgd; 140 pgd_t *pgd;
141 p4d_t *p4d;
128 pud_t *pud; 142 pud_t *pud;
143 pmd_t *pmd;
144 pte_t *pte;
129 145
130 relocated_restore_code = get_safe_page(GFP_ATOMIC); 146 relocated_restore_code = get_safe_page(GFP_ATOMIC);
131 if (!relocated_restore_code) 147 if (!relocated_restore_code)
@@ -135,22 +151,25 @@ static int relocate_restore_code(void)
135 151
136 /* Make the page containing the relocated code executable */ 152 /* Make the page containing the relocated code executable */
137 pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); 153 pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
138 pud = pud_offset(pgd, relocated_restore_code); 154 p4d = p4d_offset(pgd, relocated_restore_code);
155 if (p4d_large(*p4d)) {
156 set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
157 goto out;
158 }
159 pud = pud_offset(p4d, relocated_restore_code);
139 if (pud_large(*pud)) { 160 if (pud_large(*pud)) {
140 set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); 161 set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX));
141 } else { 162 goto out;
142 pmd_t *pmd = pmd_offset(pud, relocated_restore_code); 163 }
143 164 pmd = pmd_offset(pud, relocated_restore_code);
144 if (pmd_large(*pmd)) { 165 if (pmd_large(*pmd)) {
145 set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); 166 set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
146 } else { 167 goto out;
147 pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code);
148
149 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
150 }
151 } 168 }
169 pte = pte_offset_kernel(pmd, relocated_restore_code);
170 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
171out:
152 __flush_tlb_all(); 172 __flush_tlb_all();
153
154 return 0; 173 return 0;
155} 174}
156 175
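
The relocate_restore_code() rework above flattens the nested walk into a straight sequence: descend one level at a time and, as soon as the current entry is a large mapping, clear _PAGE_NX there and stop; otherwise continue down to the PTE. A condensed sketch of that control flow (hypothetical helper mirroring the hunk, assuming the entries along the path are present):

static void make_page_executable(unsigned long addr)
{
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (p4d_large(*p4d)) {			/* mapped at the p4d level */
		set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
		goto flush;
	}
	pud = pud_offset(p4d, addr);
	if (pud_large(*pud)) {			/* 1GB mapping */
		set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX));
		goto flush;
	}
	pmd = pmd_offset(pud, addr);
	if (pmd_large(*pmd)) {			/* 2MB mapping */
		set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
		goto flush;
	}
	pte = pte_offset_kernel(pmd, addr);	/* 4KB mapping */
	set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
flush:
	__flush_tlb_all();
}
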
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 504ec746b2e4..30822e8e64ac 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -711,7 +711,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
711 711
712 *shadow = t->tls_array[i]; 712 *shadow = t->tls_array[i];
713 713
714 gdt = get_cpu_gdt_table(cpu); 714 gdt = get_cpu_gdt_rw(cpu);
715 maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); 715 maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
716 mc = __xen_mc_entry(0); 716 mc = __xen_mc_entry(0);
717 717
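
The get_cpu_gdt_table() to get_cpu_gdt_rw() conversions here (and in xen/smp.c, lguest and pnpbios further down) follow from the GDT now also being exposed read-only through a fixmap (see load_fixmap_gdt() at the top of this section): any code that writes descriptors has to go through the writable alias, while the CPU may be running on the read-only view. A hedged sketch of the call-site pattern, assuming an already-built descriptor is being installed:

static void install_percpu_descriptor(int cpu, unsigned int entry,
				      struct desc_struct desc)
{
	struct desc_struct *gdt = get_cpu_gdt_rw(cpu);	/* writable alias of this CPU's GDT */

	gdt[entry] = desc;	/* the read-only fixmap view sees the update too */
}
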
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 1d68be6e3ff1..f226038a39ca 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -535,40 +535,41 @@ static pgd_t *xen_get_user_pgd(pgd_t *pgd)
535 return user_ptr; 535 return user_ptr;
536} 536}
537 537
538static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) 538static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
539{ 539{
540 struct mmu_update u; 540 struct mmu_update u;
541 541
542 u.ptr = virt_to_machine(ptr).maddr; 542 u.ptr = virt_to_machine(ptr).maddr;
543 u.val = pgd_val_ma(val); 543 u.val = p4d_val_ma(val);
544 xen_extend_mmu_update(&u); 544 xen_extend_mmu_update(&u);
545} 545}
546 546
547/* 547/*
548 * Raw hypercall-based set_pgd, intended for in early boot before 548 * Raw hypercall-based set_p4d, intended for in early boot before
549 * there's a page structure. This implies: 549 * there's a page structure. This implies:
550 * 1. The only existing pagetable is the kernel's 550 * 1. The only existing pagetable is the kernel's
551 * 2. It is always pinned 551 * 2. It is always pinned
552 * 3. It has no user pagetable attached to it 552 * 3. It has no user pagetable attached to it
553 */ 553 */
554static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) 554static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
555{ 555{
556 preempt_disable(); 556 preempt_disable();
557 557
558 xen_mc_batch(); 558 xen_mc_batch();
559 559
560 __xen_set_pgd_hyper(ptr, val); 560 __xen_set_p4d_hyper(ptr, val);
561 561
562 xen_mc_issue(PARAVIRT_LAZY_MMU); 562 xen_mc_issue(PARAVIRT_LAZY_MMU);
563 563
564 preempt_enable(); 564 preempt_enable();
565} 565}
566 566
567static void xen_set_pgd(pgd_t *ptr, pgd_t val) 567static void xen_set_p4d(p4d_t *ptr, p4d_t val)
568{ 568{
569 pgd_t *user_ptr = xen_get_user_pgd(ptr); 569 pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr);
570 pgd_t pgd_val;
570 571
571 trace_xen_mmu_set_pgd(ptr, user_ptr, val); 572 trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val);
572 573
573 /* If page is not pinned, we can just update the entry 574 /* If page is not pinned, we can just update the entry
574 directly */ 575 directly */
@@ -576,7 +577,8 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
576 *ptr = val; 577 *ptr = val;
577 if (user_ptr) { 578 if (user_ptr) {
578 WARN_ON(xen_page_pinned(user_ptr)); 579 WARN_ON(xen_page_pinned(user_ptr));
579 *user_ptr = val; 580 pgd_val.pgd = p4d_val_ma(val);
581 *user_ptr = pgd_val;
580 } 582 }
581 return; 583 return;
582 } 584 }
@@ -585,14 +587,72 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
585 user updates together. */ 587 user updates together. */
586 xen_mc_batch(); 588 xen_mc_batch();
587 589
588 __xen_set_pgd_hyper(ptr, val); 590 __xen_set_p4d_hyper(ptr, val);
589 if (user_ptr) 591 if (user_ptr)
590 __xen_set_pgd_hyper(user_ptr, val); 592 __xen_set_p4d_hyper((p4d_t *)user_ptr, val);
591 593
592 xen_mc_issue(PARAVIRT_LAZY_MMU); 594 xen_mc_issue(PARAVIRT_LAZY_MMU);
593} 595}
594#endif /* CONFIG_PGTABLE_LEVELS == 4 */ 596#endif /* CONFIG_PGTABLE_LEVELS == 4 */
595 597
598static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
599 int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
600 bool last, unsigned long limit)
601{
602 int i, nr, flush = 0;
603
604 nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;
605 for (i = 0; i < nr; i++) {
606 if (!pmd_none(pmd[i]))
607 flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE);
608 }
609 return flush;
610}
611
612static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
613 int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
614 bool last, unsigned long limit)
615{
616 int i, nr, flush = 0;
617
618 nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD;
619 for (i = 0; i < nr; i++) {
620 pmd_t *pmd;
621
622 if (pud_none(pud[i]))
623 continue;
624
625 pmd = pmd_offset(&pud[i], 0);
626 if (PTRS_PER_PMD > 1)
627 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
628 flush |= xen_pmd_walk(mm, pmd, func,
629 last && i == nr - 1, limit);
630 }
631 return flush;
632}
633
634static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
635 int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
636 bool last, unsigned long limit)
637{
638 int i, nr, flush = 0;
639
640 nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
641 for (i = 0; i < nr; i++) {
642 pud_t *pud;
643
644 if (p4d_none(p4d[i]))
645 continue;
646
647 pud = pud_offset(&p4d[i], 0);
648 if (PTRS_PER_PUD > 1)
649 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
650 flush |= xen_pud_walk(mm, pud, func,
651 last && i == nr - 1, limit);
652 }
653 return flush;
654}
655
596/* 656/*
597 * (Yet another) pagetable walker. This one is intended for pinning a 657 * (Yet another) pagetable walker. This one is intended for pinning a
598 * pagetable. This means that it walks a pagetable and calls the 658 * pagetable. This means that it walks a pagetable and calls the
@@ -613,10 +673,8 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
613 enum pt_level), 673 enum pt_level),
614 unsigned long limit) 674 unsigned long limit)
615{ 675{
616 int flush = 0; 676 int i, nr, flush = 0;
617 unsigned hole_low, hole_high; 677 unsigned hole_low, hole_high;
618 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
619 unsigned pgdidx, pudidx, pmdidx;
620 678
621 /* The limit is the last byte to be touched */ 679 /* The limit is the last byte to be touched */
622 limit--; 680 limit--;
@@ -633,65 +691,22 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
633 hole_low = pgd_index(USER_LIMIT); 691 hole_low = pgd_index(USER_LIMIT);
634 hole_high = pgd_index(PAGE_OFFSET); 692 hole_high = pgd_index(PAGE_OFFSET);
635 693
636 pgdidx_limit = pgd_index(limit); 694 nr = pgd_index(limit) + 1;
637#if PTRS_PER_PUD > 1 695 for (i = 0; i < nr; i++) {
638 pudidx_limit = pud_index(limit); 696 p4d_t *p4d;
639#else
640 pudidx_limit = 0;
641#endif
642#if PTRS_PER_PMD > 1
643 pmdidx_limit = pmd_index(limit);
644#else
645 pmdidx_limit = 0;
646#endif
647
648 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
649 pud_t *pud;
650 697
651 if (pgdidx >= hole_low && pgdidx < hole_high) 698 if (i >= hole_low && i < hole_high)
652 continue; 699 continue;
653 700
654 if (!pgd_val(pgd[pgdidx])) 701 if (pgd_none(pgd[i]))
655 continue; 702 continue;
656 703
657 pud = pud_offset(&pgd[pgdidx], 0); 704 p4d = p4d_offset(&pgd[i], 0);
658 705 if (PTRS_PER_P4D > 1)
659 if (PTRS_PER_PUD > 1) /* not folded */ 706 flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
660 flush |= (*func)(mm, virt_to_page(pud), PT_PUD); 707 flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
661
662 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
663 pmd_t *pmd;
664
665 if (pgdidx == pgdidx_limit &&
666 pudidx > pudidx_limit)
667 goto out;
668
669 if (pud_none(pud[pudidx]))
670 continue;
671
672 pmd = pmd_offset(&pud[pudidx], 0);
673
674 if (PTRS_PER_PMD > 1) /* not folded */
675 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
676
677 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
678 struct page *pte;
679
680 if (pgdidx == pgdidx_limit &&
681 pudidx == pudidx_limit &&
682 pmdidx > pmdidx_limit)
683 goto out;
684
685 if (pmd_none(pmd[pmdidx]))
686 continue;
687
688 pte = pmd_page(pmd[pmdidx]);
689 flush |= (*func)(mm, pte, PT_PTE);
690 }
691 }
692 } 708 }
693 709
694out:
695 /* Do the top level last, so that the callbacks can use it as 710 /* Do the top level last, so that the callbacks can use it as
696 a cue to do final things like tlb flushes. */ 711 a cue to do final things like tlb flushes. */
697 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); 712 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
@@ -1150,57 +1165,97 @@ static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
1150 xen_free_ro_pages(pa, PAGE_SIZE); 1165 xen_free_ro_pages(pa, PAGE_SIZE);
1151} 1166}
1152 1167
1168static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
1169{
1170 unsigned long pa;
1171 pte_t *pte_tbl;
1172 int i;
1173
1174 if (pmd_large(*pmd)) {
1175 pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
1176 xen_free_ro_pages(pa, PMD_SIZE);
1177 return;
1178 }
1179
1180 pte_tbl = pte_offset_kernel(pmd, 0);
1181 for (i = 0; i < PTRS_PER_PTE; i++) {
1182 if (pte_none(pte_tbl[i]))
1183 continue;
1184 pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
1185 xen_free_ro_pages(pa, PAGE_SIZE);
1186 }
1187 set_pmd(pmd, __pmd(0));
1188 xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
1189}
1190
1191static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
1192{
1193 unsigned long pa;
1194 pmd_t *pmd_tbl;
1195 int i;
1196
1197 if (pud_large(*pud)) {
1198 pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
1199 xen_free_ro_pages(pa, PUD_SIZE);
1200 return;
1201 }
1202
1203 pmd_tbl = pmd_offset(pud, 0);
1204 for (i = 0; i < PTRS_PER_PMD; i++) {
1205 if (pmd_none(pmd_tbl[i]))
1206 continue;
1207 xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
1208 }
1209 set_pud(pud, __pud(0));
1210 xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
1211}
1212
1213static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
1214{
1215 unsigned long pa;
1216 pud_t *pud_tbl;
1217 int i;
1218
1219 if (p4d_large(*p4d)) {
1220 pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
1221 xen_free_ro_pages(pa, P4D_SIZE);
1222 return;
1223 }
1224
1225 pud_tbl = pud_offset(p4d, 0);
1226 for (i = 0; i < PTRS_PER_PUD; i++) {
1227 if (pud_none(pud_tbl[i]))
1228 continue;
1229 xen_cleanmfnmap_pud(pud_tbl + i, unpin);
1230 }
1231 set_p4d(p4d, __p4d(0));
1232 xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
1233}
1234
1153/* 1235/*
1154 * Since it is well isolated we can (and since it is perhaps large we should) 1236 * Since it is well isolated we can (and since it is perhaps large we should)
1155 * also free the page tables mapping the initial P->M table. 1237 * also free the page tables mapping the initial P->M table.
1156 */ 1238 */
1157static void __init xen_cleanmfnmap(unsigned long vaddr) 1239static void __init xen_cleanmfnmap(unsigned long vaddr)
1158{ 1240{
1159 unsigned long va = vaddr & PMD_MASK; 1241 pgd_t *pgd;
1160 unsigned long pa; 1242 p4d_t *p4d;
1161 pgd_t *pgd = pgd_offset_k(va);
1162 pud_t *pud_page = pud_offset(pgd, 0);
1163 pud_t *pud;
1164 pmd_t *pmd;
1165 pte_t *pte;
1166 unsigned int i; 1243 unsigned int i;
1167 bool unpin; 1244 bool unpin;
1168 1245
1169 unpin = (vaddr == 2 * PGDIR_SIZE); 1246 unpin = (vaddr == 2 * PGDIR_SIZE);
1170 set_pgd(pgd, __pgd(0)); 1247 vaddr &= PMD_MASK;
1171 do { 1248 pgd = pgd_offset_k(vaddr);
1172 pud = pud_page + pud_index(va); 1249 p4d = p4d_offset(pgd, 0);
1173 if (pud_none(*pud)) { 1250 for (i = 0; i < PTRS_PER_P4D; i++) {
1174 va += PUD_SIZE; 1251 if (p4d_none(p4d[i]))
1175 } else if (pud_large(*pud)) { 1252 continue;
1176 pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; 1253 xen_cleanmfnmap_p4d(p4d + i, unpin);
1177 xen_free_ro_pages(pa, PUD_SIZE); 1254 }
1178 va += PUD_SIZE; 1255 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
1179 } else { 1256 set_pgd(pgd, __pgd(0));
1180 pmd = pmd_offset(pud, va); 1257 xen_cleanmfnmap_free_pgtbl(p4d, unpin);
1181 if (pmd_large(*pmd)) { 1258 }
1182 pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
1183 xen_free_ro_pages(pa, PMD_SIZE);
1184 } else if (!pmd_none(*pmd)) {
1185 pte = pte_offset_kernel(pmd, va);
1186 set_pmd(pmd, __pmd(0));
1187 for (i = 0; i < PTRS_PER_PTE; ++i) {
1188 if (pte_none(pte[i]))
1189 break;
1190 pa = pte_pfn(pte[i]) << PAGE_SHIFT;
1191 xen_free_ro_pages(pa, PAGE_SIZE);
1192 }
1193 xen_cleanmfnmap_free_pgtbl(pte, unpin);
1194 }
1195 va += PMD_SIZE;
1196 if (pmd_index(va))
1197 continue;
1198 set_pud(pud, __pud(0));
1199 xen_cleanmfnmap_free_pgtbl(pmd, unpin);
1200 }
1201
1202 } while (pud_index(va) || pmd_index(va));
1203 xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
1204} 1259}
1205 1260
1206static void __init xen_pagetable_p2m_free(void) 1261static void __init xen_pagetable_p2m_free(void)
@@ -1538,7 +1593,6 @@ static int xen_pgd_alloc(struct mm_struct *mm)
1538 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); 1593 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1539 } 1594 }
1540#endif 1595#endif
1541
1542 return ret; 1596 return ret;
1543} 1597}
1544 1598
@@ -1730,7 +1784,7 @@ static void xen_release_pmd(unsigned long pfn)
1730 xen_release_ptpage(pfn, PT_PMD); 1784 xen_release_ptpage(pfn, PT_PMD);
1731} 1785}
1732 1786
1733#if CONFIG_PGTABLE_LEVELS == 4 1787#if CONFIG_PGTABLE_LEVELS >= 4
1734static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) 1788static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1735{ 1789{
1736 xen_alloc_ptpage(mm, pfn, PT_PUD); 1790 xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -2071,21 +2125,27 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
2071 */ 2125 */
2072void __init xen_relocate_p2m(void) 2126void __init xen_relocate_p2m(void)
2073{ 2127{
2074 phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; 2128 phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys;
2075 unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; 2129 unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
2076 int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; 2130 int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d;
2077 pte_t *pt; 2131 pte_t *pt;
2078 pmd_t *pmd; 2132 pmd_t *pmd;
2079 pud_t *pud; 2133 pud_t *pud;
2134 p4d_t *p4d = NULL;
2080 pgd_t *pgd; 2135 pgd_t *pgd;
2081 unsigned long *new_p2m; 2136 unsigned long *new_p2m;
2137 int save_pud;
2082 2138
2083 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 2139 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
2084 n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; 2140 n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
2085 n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; 2141 n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
2086 n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; 2142 n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
2087 n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; 2143 n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
2088 n_frames = n_pte + n_pt + n_pmd + n_pud; 2144 if (PTRS_PER_P4D > 1)
2145 n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
2146 else
2147 n_p4d = 0;
2148 n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d;
2089 2149
2090 new_area = xen_find_free_area(PFN_PHYS(n_frames)); 2150 new_area = xen_find_free_area(PFN_PHYS(n_frames));
2091 if (!new_area) { 2151 if (!new_area) {
@@ -2101,55 +2161,76 @@ void __init xen_relocate_p2m(void)
2101 * To avoid any possible virtual address collision, just use 2161 * To avoid any possible virtual address collision, just use
2102 * 2 * PUD_SIZE for the new area. 2162 * 2 * PUD_SIZE for the new area.
2103 */ 2163 */
2104 pud_phys = new_area; 2164 p4d_phys = new_area;
2165 pud_phys = p4d_phys + PFN_PHYS(n_p4d);
2105 pmd_phys = pud_phys + PFN_PHYS(n_pud); 2166 pmd_phys = pud_phys + PFN_PHYS(n_pud);
2106 pt_phys = pmd_phys + PFN_PHYS(n_pmd); 2167 pt_phys = pmd_phys + PFN_PHYS(n_pmd);
2107 p2m_pfn = PFN_DOWN(pt_phys) + n_pt; 2168 p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
2108 2169
2109 pgd = __va(read_cr3()); 2170 pgd = __va(read_cr3());
2110 new_p2m = (unsigned long *)(2 * PGDIR_SIZE); 2171 new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
2111 for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { 2172 idx_p4d = 0;
2112 pud = early_memremap(pud_phys, PAGE_SIZE); 2173 save_pud = n_pud;
2113 clear_page(pud); 2174 do {
2114 for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); 2175 if (n_p4d > 0) {
2115 idx_pmd++) { 2176 p4d = early_memremap(p4d_phys, PAGE_SIZE);
2116 pmd = early_memremap(pmd_phys, PAGE_SIZE); 2177 clear_page(p4d);
2117 clear_page(pmd); 2178 n_pud = min(save_pud, PTRS_PER_P4D);
2118 for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); 2179 }
2119 idx_pt++) { 2180 for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
2120 pt = early_memremap(pt_phys, PAGE_SIZE); 2181 pud = early_memremap(pud_phys, PAGE_SIZE);
2121 clear_page(pt); 2182 clear_page(pud);
2122 for (idx_pte = 0; 2183 for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
2123 idx_pte < min(n_pte, PTRS_PER_PTE); 2184 idx_pmd++) {
2124 idx_pte++) { 2185 pmd = early_memremap(pmd_phys, PAGE_SIZE);
2125 set_pte(pt + idx_pte, 2186 clear_page(pmd);
2126 pfn_pte(p2m_pfn, PAGE_KERNEL)); 2187 for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
2127 p2m_pfn++; 2188 idx_pt++) {
2189 pt = early_memremap(pt_phys, PAGE_SIZE);
2190 clear_page(pt);
2191 for (idx_pte = 0;
2192 idx_pte < min(n_pte, PTRS_PER_PTE);
2193 idx_pte++) {
2194 set_pte(pt + idx_pte,
2195 pfn_pte(p2m_pfn, PAGE_KERNEL));
2196 p2m_pfn++;
2197 }
2198 n_pte -= PTRS_PER_PTE;
2199 early_memunmap(pt, PAGE_SIZE);
2200 make_lowmem_page_readonly(__va(pt_phys));
2201 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
2202 PFN_DOWN(pt_phys));
2203 set_pmd(pmd + idx_pt,
2204 __pmd(_PAGE_TABLE | pt_phys));
2205 pt_phys += PAGE_SIZE;
2128 } 2206 }
2129 n_pte -= PTRS_PER_PTE; 2207 n_pt -= PTRS_PER_PMD;
2130 early_memunmap(pt, PAGE_SIZE); 2208 early_memunmap(pmd, PAGE_SIZE);
2131 make_lowmem_page_readonly(__va(pt_phys)); 2209 make_lowmem_page_readonly(__va(pmd_phys));
2132 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, 2210 pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
2133 PFN_DOWN(pt_phys)); 2211 PFN_DOWN(pmd_phys));
2134 set_pmd(pmd + idx_pt, 2212 set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
2135 __pmd(_PAGE_TABLE | pt_phys)); 2213 pmd_phys += PAGE_SIZE;
2136 pt_phys += PAGE_SIZE;
2137 } 2214 }
2138 n_pt -= PTRS_PER_PMD; 2215 n_pmd -= PTRS_PER_PUD;
2139 early_memunmap(pmd, PAGE_SIZE); 2216 early_memunmap(pud, PAGE_SIZE);
2140 make_lowmem_page_readonly(__va(pmd_phys)); 2217 make_lowmem_page_readonly(__va(pud_phys));
2141 pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, 2218 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
2142 PFN_DOWN(pmd_phys)); 2219 if (n_p4d > 0)
2143 set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); 2220 set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys));
2144 pmd_phys += PAGE_SIZE; 2221 else
2222 set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
2223 pud_phys += PAGE_SIZE;
2145 } 2224 }
2146 n_pmd -= PTRS_PER_PUD; 2225 if (n_p4d > 0) {
2147 early_memunmap(pud, PAGE_SIZE); 2226 save_pud -= PTRS_PER_P4D;
2148 make_lowmem_page_readonly(__va(pud_phys)); 2227 early_memunmap(p4d, PAGE_SIZE);
2149 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); 2228 make_lowmem_page_readonly(__va(p4d_phys));
2150 set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); 2229 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys));
2151 pud_phys += PAGE_SIZE; 2230 set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys));
2152 } 2231 p4d_phys += PAGE_SIZE;
2232 }
2233 } while (++idx_p4d < n_p4d);
2153 2234
2154 /* Now copy the old p2m info to the new area. */ 2235 /* Now copy the old p2m info to the new area. */
2155 memcpy(new_p2m, xen_p2m_addr, size); 2236 memcpy(new_p2m, xen_p2m_addr, size);
@@ -2326,6 +2407,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
2326#endif 2407#endif
2327 case FIX_TEXT_POKE0: 2408 case FIX_TEXT_POKE0:
2328 case FIX_TEXT_POKE1: 2409 case FIX_TEXT_POKE1:
2410 case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
2329 /* All local page mappings */ 2411 /* All local page mappings */
2330 pte = pfn_pte(phys, prot); 2412 pte = pfn_pte(phys, prot);
2331 break; 2413 break;
@@ -2378,8 +2460,8 @@ static void __init xen_post_allocator_init(void)
2378 pv_mmu_ops.set_pte = xen_set_pte; 2460 pv_mmu_ops.set_pte = xen_set_pte;
2379 pv_mmu_ops.set_pmd = xen_set_pmd; 2461 pv_mmu_ops.set_pmd = xen_set_pmd;
2380 pv_mmu_ops.set_pud = xen_set_pud; 2462 pv_mmu_ops.set_pud = xen_set_pud;
2381#if CONFIG_PGTABLE_LEVELS == 4 2463#if CONFIG_PGTABLE_LEVELS >= 4
2382 pv_mmu_ops.set_pgd = xen_set_pgd; 2464 pv_mmu_ops.set_p4d = xen_set_p4d;
2383#endif 2465#endif
2384 2466
2385 /* This will work as long as patching hasn't happened yet 2467 /* This will work as long as patching hasn't happened yet
@@ -2388,7 +2470,7 @@ static void __init xen_post_allocator_init(void)
2388 pv_mmu_ops.alloc_pmd = xen_alloc_pmd; 2470 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2389 pv_mmu_ops.release_pte = xen_release_pte; 2471 pv_mmu_ops.release_pte = xen_release_pte;
2390 pv_mmu_ops.release_pmd = xen_release_pmd; 2472 pv_mmu_ops.release_pmd = xen_release_pmd;
2391#if CONFIG_PGTABLE_LEVELS == 4 2473#if CONFIG_PGTABLE_LEVELS >= 4
2392 pv_mmu_ops.alloc_pud = xen_alloc_pud; 2474 pv_mmu_ops.alloc_pud = xen_alloc_pud;
2393 pv_mmu_ops.release_pud = xen_release_pud; 2475 pv_mmu_ops.release_pud = xen_release_pud;
2394#endif 2476#endif
@@ -2454,10 +2536,10 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2454 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), 2536 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2455 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), 2537 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2456 2538
2457#if CONFIG_PGTABLE_LEVELS == 4 2539#if CONFIG_PGTABLE_LEVELS >= 4
2458 .pud_val = PV_CALLEE_SAVE(xen_pud_val), 2540 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2459 .make_pud = PV_CALLEE_SAVE(xen_make_pud), 2541 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
2460 .set_pgd = xen_set_pgd_hyper, 2542 .set_p4d = xen_set_p4d_hyper,
2461 2543
2462 .alloc_pud = xen_alloc_pmd_init, 2544 .alloc_pud = xen_alloc_pmd_init,
2463 .release_pud = xen_release_pmd_init, 2545 .release_pud = xen_release_pmd_init,
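
The rewritten __xen_pgd_walk() above replaces the three nested index/limit loops with one helper per level (xen_p4d_walk/xen_pud_walk/xen_pmd_walk); each helper walks its full table except on the branch that contains 'limit', where the 'last' flag truncates the loop at the limit's index. A stand-alone illustration of that bound (sketch only, not kernel code from the patch):

/* With limit such that pud_index(limit) == 7, the helper visits entries
 * 0..7 when 'last' is true (this branch contains the limit) and the full
 * 0..ptrs_per_level-1 otherwise. */
static unsigned int xen_walk_bound(bool last, unsigned int limit_idx,
				   unsigned int ptrs_per_level)
{
	return last ? limit_idx + 1 : ptrs_per_level;
}
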
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 73809bb951b4..3fe2b3292915 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -5,6 +5,7 @@
5 5
6enum pt_level { 6enum pt_level {
7 PT_PGD, 7 PT_PGD,
8 PT_P4D,
8 PT_PUD, 9 PT_PUD,
9 PT_PMD, 10 PT_PMD,
10 PT_PTE 11 PT_PTE
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 7ff2f1bfb7ec..eaa36162ed4a 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -392,7 +392,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
392 if (ctxt == NULL) 392 if (ctxt == NULL)
393 return -ENOMEM; 393 return -ENOMEM;
394 394
395 gdt = get_cpu_gdt_table(cpu); 395 gdt = get_cpu_gdt_rw(cpu);
396 396
397#ifdef CONFIG_X86_32 397#ifdef CONFIG_X86_32
398 ctxt->user_regs.fs = __KERNEL_PERCPU; 398 ctxt->user_regs.fs = __KERNEL_PERCPU;
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index 033f49b31fdc..cb0d742fa23f 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -43,6 +43,7 @@ static void dax_pmem_percpu_exit(void *data)
43 struct dax_pmem *dax_pmem = to_dax_pmem(ref); 43 struct dax_pmem *dax_pmem = to_dax_pmem(ref);
44 44
45 dev_dbg(dax_pmem->dev, "%s\n", __func__); 45 dev_dbg(dax_pmem->dev, "%s\n", __func__);
46 wait_for_completion(&dax_pmem->cmp);
46 percpu_ref_exit(ref); 47 percpu_ref_exit(ref);
47} 48}
48 49
@@ -53,7 +54,6 @@ static void dax_pmem_percpu_kill(void *data)
53 54
54 dev_dbg(dax_pmem->dev, "%s\n", __func__); 55 dev_dbg(dax_pmem->dev, "%s\n", __func__);
55 percpu_ref_kill(ref); 56 percpu_ref_kill(ref);
56 wait_for_completion(&dax_pmem->cmp);
57} 57}
58 58
59static int dax_pmem_probe(struct device *dev) 59static int dax_pmem_probe(struct device *dev)
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index d71f6323ac00..b4f79b923aea 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -504,7 +504,7 @@ void __init lguest_arch_host_init(void)
504 * byte, not the size, hence the "-1"). 504 * byte, not the size, hence the "-1").
505 */ 505 */
506 state->host_gdt_desc.size = GDT_SIZE-1; 506 state->host_gdt_desc.size = GDT_SIZE-1;
507 state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); 507 state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i);
508 508
509 /* 509 /*
510 * All CPUs on the Host use the same Interrupt Descriptor 510 * All CPUs on the Host use the same Interrupt Descriptor
@@ -554,8 +554,8 @@ void __init lguest_arch_host_init(void)
554 * The Host needs to be able to use the LGUEST segments on this 554 * The Host needs to be able to use the LGUEST segments on this
555 * CPU, too, so put them in the Host GDT. 555 * CPU, too, so put them in the Host GDT.
556 */ 556 */
557 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 557 get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
558 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 558 get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
559 } 559 }
560 560
561 /* 561 /*
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 5b536be5a12e..fbc640bf06b0 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -25,6 +25,7 @@
25#include <linux/badblocks.h> 25#include <linux/badblocks.h>
26#include <linux/memremap.h> 26#include <linux/memremap.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/blk-mq.h>
28#include <linux/pfn_t.h> 29#include <linux/pfn_t.h>
29#include <linux/slab.h> 30#include <linux/slab.h>
30#include <linux/pmem.h> 31#include <linux/pmem.h>
@@ -231,6 +232,11 @@ static void pmem_release_queue(void *q)
231 blk_cleanup_queue(q); 232 blk_cleanup_queue(q);
232} 233}
233 234
235static void pmem_freeze_queue(void *q)
236{
237 blk_freeze_queue_start(q);
238}
239
234static void pmem_release_disk(void *disk) 240static void pmem_release_disk(void *disk)
235{ 241{
236 del_gendisk(disk); 242 del_gendisk(disk);
@@ -284,6 +290,9 @@ static int pmem_attach_disk(struct device *dev,
284 if (!q) 290 if (!q)
285 return -ENOMEM; 291 return -ENOMEM;
286 292
293 if (devm_add_action_or_reset(dev, pmem_release_queue, q))
294 return -ENOMEM;
295
287 pmem->pfn_flags = PFN_DEV; 296 pmem->pfn_flags = PFN_DEV;
288 if (is_nd_pfn(dev)) { 297 if (is_nd_pfn(dev)) {
289 addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter, 298 addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
@@ -303,10 +312,10 @@ static int pmem_attach_disk(struct device *dev,
303 pmem->size, ARCH_MEMREMAP_PMEM); 312 pmem->size, ARCH_MEMREMAP_PMEM);
304 313
305 /* 314 /*
306 * At release time the queue must be dead before 315 * At release time the queue must be frozen before
307 * devm_memremap_pages is unwound 316 * devm_memremap_pages is unwound
308 */ 317 */
309 if (devm_add_action_or_reset(dev, pmem_release_queue, q)) 318 if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
310 return -ENOMEM; 319 return -ENOMEM;
311 320
312 if (IS_ERR(addr)) 321 if (IS_ERR(addr))
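
devm release actions run in reverse order of registration, so the pmem hunk above registers blk_cleanup_queue() first (it runs last) and the new freeze action after devm_memremap_pages(), giving an unwind order of: freeze the queue, tear down the page mapping, clean up the queue. An illustrative sketch of the registration side (not the actual probe path; pmem_release_queue/pmem_freeze_queue as in the hunk, most error handling elided):

static int pmem_setup_queue(struct device *dev, struct request_queue *q,
			    struct resource *res, struct percpu_ref *ref)
{
	void *addr;

	if (devm_add_action_or_reset(dev, pmem_release_queue, q))	/* runs last */
		return -ENOMEM;

	addr = devm_memremap_pages(dev, res, ref, NULL);		/* unwound second */
	if (IS_ERR(addr))
		return PTR_ERR(addr);

	if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))	/* runs first */
		return -ENOMEM;

	return 0;
}
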
diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c
index 438d4c72c7b3..ff563db025b3 100644
--- a/drivers/pnp/pnpbios/bioscalls.c
+++ b/drivers/pnp/pnpbios/bioscalls.c
@@ -54,7 +54,7 @@ __asm__(".text \n"
54 54
55#define Q2_SET_SEL(cpu, selname, address, size) \ 55#define Q2_SET_SEL(cpu, selname, address, size) \
56do { \ 56do { \
57 struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \ 57 struct desc_struct *gdt = get_cpu_gdt_rw((cpu)); \
58 set_desc_base(&gdt[(selname) >> 3], (u32)(address)); \ 58 set_desc_base(&gdt[(selname) >> 3], (u32)(address)); \
59 set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \ 59 set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \
60} while(0) 60} while(0)
@@ -95,8 +95,8 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3,
95 return PNP_FUNCTION_NOT_SUPPORTED; 95 return PNP_FUNCTION_NOT_SUPPORTED;
96 96
97 cpu = get_cpu(); 97 cpu = get_cpu();
98 save_desc_40 = get_cpu_gdt_table(cpu)[0x40 / 8]; 98 save_desc_40 = get_cpu_gdt_rw(cpu)[0x40 / 8];
99 get_cpu_gdt_table(cpu)[0x40 / 8] = bad_bios_desc; 99 get_cpu_gdt_rw(cpu)[0x40 / 8] = bad_bios_desc;
100 100
101 /* On some boxes IRQ's during PnP BIOS calls are deadly. */ 101 /* On some boxes IRQ's during PnP BIOS calls are deadly. */
102 spin_lock_irqsave(&pnp_bios_lock, flags); 102 spin_lock_irqsave(&pnp_bios_lock, flags);
@@ -134,7 +134,7 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3,
134 :"memory"); 134 :"memory");
135 spin_unlock_irqrestore(&pnp_bios_lock, flags); 135 spin_unlock_irqrestore(&pnp_bios_lock, flags);
136 136
137 get_cpu_gdt_table(cpu)[0x40 / 8] = save_desc_40; 137 get_cpu_gdt_rw(cpu)[0x40 / 8] = save_desc_40;
138 put_cpu(); 138 put_cpu();
139 139
140 /* If we get here and this is set then the PnP BIOS faulted on us. */ 140 /* If we get here and this is set then the PnP BIOS faulted on us. */
@@ -477,7 +477,7 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header)
477 pnp_bios_callpoint.segment = PNP_CS16; 477 pnp_bios_callpoint.segment = PNP_CS16;
478 478
479 for_each_possible_cpu(i) { 479 for_each_possible_cpu(i) {
480 struct desc_struct *gdt = get_cpu_gdt_table(i); 480 struct desc_struct *gdt = get_cpu_gdt_rw(i);
481 if (!gdt) 481 if (!gdt)
482 continue; 482 continue;
483 set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_CS32], 483 set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_CS32],
diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h
index cc5d9a1405df..41e5b6784b97 100644
--- a/include/asm-generic/mm_hooks.h
+++ b/include/asm-generic/mm_hooks.h
@@ -32,10 +32,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
32 /* by default, allow everything */ 32 /* by default, allow everything */
33 return true; 33 return true;
34} 34}
35
36static inline bool arch_pte_access_permitted(pte_t pte, bool write)
37{
38 /* by default, allow everything */
39 return true;
40}
41#endif /* _ASM_GENERIC_MM_HOOKS_H */ 35#endif /* _ASM_GENERIC_MM_HOOKS_H */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 1fad160f35de..7dfa767dc680 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -341,6 +341,31 @@ static inline int pte_unused(pte_t pte)
341} 341}
342#endif 342#endif
343 343
344#ifndef pte_access_permitted
345#define pte_access_permitted(pte, write) \
346 (pte_present(pte) && (!(write) || pte_write(pte)))
347#endif
348
349#ifndef pmd_access_permitted
350#define pmd_access_permitted(pmd, write) \
351 (pmd_present(pmd) && (!(write) || pmd_write(pmd)))
352#endif
353
354#ifndef pud_access_permitted
355#define pud_access_permitted(pud, write) \
356 (pud_present(pud) && (!(write) || pud_write(pud)))
357#endif
358
359#ifndef p4d_access_permitted
360#define p4d_access_permitted(p4d, write) \
361 (p4d_present(p4d) && (!(write) || p4d_write(p4d)))
362#endif
363
364#ifndef pgd_access_permitted
365#define pgd_access_permitted(pgd, write) \
366 (pgd_present(pgd) && (!(write) || pgd_write(pgd)))
367#endif
368
344#ifndef __HAVE_ARCH_PMD_SAME 369#ifndef __HAVE_ARCH_PMD_SAME
345#ifdef CONFIG_TRANSPARENT_HUGEPAGE 370#ifdef CONFIG_TRANSPARENT_HUGEPAGE
346static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) 371static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
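
These generic fallbacks replace the arch_pte_access_permitted() hook removed from mm_hooks.h above: an entry is usable on a fast path if it is present and, for a write, writable, unless the architecture overrides the macro with stricter checks. A minimal usage sketch in the style of the gup_pte_range() change further down:

static bool pte_ok_for_fast_gup(pte_t pte, bool write)
{
	/* Present, and writable when a write is requested; an architecture
	 * override of pte_access_permitted() can add further conditions. */
	return pte_access_permitted(pte, write);
}
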
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00a8fa7e366a..695da2a19b4c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -432,6 +432,10 @@ static inline int pud_devmap(pud_t pud)
432{ 432{
433 return 0; 433 return 0;
434} 434}
435static inline int pgd_devmap(pgd_t pgd)
436{
437 return 0;
438}
435#endif 439#endif
436 440
437/* 441/*
@@ -758,19 +762,11 @@ static inline enum zone_type page_zonenum(const struct page *page)
758} 762}
759 763
760#ifdef CONFIG_ZONE_DEVICE 764#ifdef CONFIG_ZONE_DEVICE
761void get_zone_device_page(struct page *page);
762void put_zone_device_page(struct page *page);
763static inline bool is_zone_device_page(const struct page *page) 765static inline bool is_zone_device_page(const struct page *page)
764{ 766{
765 return page_zonenum(page) == ZONE_DEVICE; 767 return page_zonenum(page) == ZONE_DEVICE;
766} 768}
767#else 769#else
768static inline void get_zone_device_page(struct page *page)
769{
770}
771static inline void put_zone_device_page(struct page *page)
772{
773}
774static inline bool is_zone_device_page(const struct page *page) 770static inline bool is_zone_device_page(const struct page *page)
775{ 771{
776 return false; 772 return false;
@@ -786,9 +782,6 @@ static inline void get_page(struct page *page)
786 */ 782 */
787 VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); 783 VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
788 page_ref_inc(page); 784 page_ref_inc(page);
789
790 if (unlikely(is_zone_device_page(page)))
791 get_zone_device_page(page);
792} 785}
793 786
794static inline void put_page(struct page *page) 787static inline void put_page(struct page *page)
@@ -797,9 +790,6 @@ static inline void put_page(struct page *page)
797 790
798 if (put_page_testzero(page)) 791 if (put_page_testzero(page))
799 __put_page(page); 792 __put_page(page);
800
801 if (unlikely(is_zone_device_page(page)))
802 put_zone_device_page(page);
803} 793}
804 794
805#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 795#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f60f45fe226f..45cdb27791a3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -367,6 +367,11 @@ struct mm_struct {
367#endif 367#endif
368 unsigned long mmap_base; /* base of mmap area */ 368 unsigned long mmap_base; /* base of mmap area */
369 unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ 369 unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
370#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
371 /* Base adresses for compatible mmap() */
372 unsigned long mmap_compat_base;
373 unsigned long mmap_compat_legacy_base;
374#endif
370 unsigned long task_size; /* size of task vm space */ 375 unsigned long task_size; /* size of task vm space */
371 unsigned long highest_vm_end; /* highest vma end address */ 376 unsigned long highest_vm_end; /* highest vma end address */
372 pgd_t * pgd; 377 pgd_t * pgd;
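
The new mmap_compat_base/mmap_compat_legacy_base fields let a 64-bit task hand out compat (below-4GB) mmap addresses when it performs a 32-bit syscall. A hedged sketch of how a selection helper might use them; the helper name is hypothetical and the real logic in this series lives in the x86 mmap code:

static unsigned long pick_mmap_base(struct mm_struct *mm, bool compat_syscall)
{
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
	if (compat_syscall)
		return mm->mmap_compat_base;	/* base kept below 4GB */
#endif
	return mm->mmap_base;			/* native base */
}
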
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 84943e8057ef..316a19f6b635 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -148,7 +148,7 @@ static inline int page_cache_get_speculative(struct page *page)
148 148
149#ifdef CONFIG_TINY_RCU 149#ifdef CONFIG_TINY_RCU
150# ifdef CONFIG_PREEMPT_COUNT 150# ifdef CONFIG_PREEMPT_COUNT
151 VM_BUG_ON(!in_atomic()); 151 VM_BUG_ON(!in_atomic() && !irqs_disabled());
152# endif 152# endif
153 /* 153 /*
154 * Preempt must be disabled here - we rely on rcu_read_lock doing 154 * Preempt must be disabled here - we rely on rcu_read_lock doing
@@ -186,7 +186,7 @@ static inline int page_cache_add_speculative(struct page *page, int count)
186 186
187#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) 187#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
188# ifdef CONFIG_PREEMPT_COUNT 188# ifdef CONFIG_PREEMPT_COUNT
189 VM_BUG_ON(!in_atomic()); 189 VM_BUG_ON(!in_atomic() && !irqs_disabled());
190# endif 190# endif
191 VM_BUG_ON_PAGE(page_count(page) == 0, page); 191 VM_BUG_ON_PAGE(page_count(page) == 0, page);
192 page_ref_add(page, count); 192 page_ref_add(page, count);
diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h
index bce990f5a35d..31acce9019a6 100644
--- a/include/trace/events/xen.h
+++ b/include/trace/events/xen.h
@@ -241,21 +241,21 @@ TRACE_EVENT(xen_mmu_set_pud,
241 (int)sizeof(pudval_t) * 2, (unsigned long long)__entry->pudval) 241 (int)sizeof(pudval_t) * 2, (unsigned long long)__entry->pudval)
242 ); 242 );
243 243
244TRACE_EVENT(xen_mmu_set_pgd, 244TRACE_EVENT(xen_mmu_set_p4d,
245 TP_PROTO(pgd_t *pgdp, pgd_t *user_pgdp, pgd_t pgdval), 245 TP_PROTO(p4d_t *p4dp, p4d_t *user_p4dp, p4d_t p4dval),
246 TP_ARGS(pgdp, user_pgdp, pgdval), 246 TP_ARGS(p4dp, user_p4dp, p4dval),
247 TP_STRUCT__entry( 247 TP_STRUCT__entry(
248 __field(pgd_t *, pgdp) 248 __field(p4d_t *, p4dp)
249 __field(pgd_t *, user_pgdp) 249 __field(p4d_t *, user_p4dp)
250 __field(pgdval_t, pgdval) 250 __field(p4dval_t, p4dval)
251 ), 251 ),
252 TP_fast_assign(__entry->pgdp = pgdp; 252 TP_fast_assign(__entry->p4dp = p4dp;
253 __entry->user_pgdp = user_pgdp; 253 __entry->user_p4dp = user_p4dp;
254 __entry->pgdval = pgdval.pgd), 254 __entry->p4dval = p4d_val(p4dval)),
255 TP_printk("pgdp %p user_pgdp %p pgdval %0*llx (raw %0*llx)", 255 TP_printk("p4dp %p user_p4dp %p p4dval %0*llx (raw %0*llx)",
256 __entry->pgdp, __entry->user_pgdp, 256 __entry->p4dp, __entry->user_p4dp,
257 (int)sizeof(pgdval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->pgdval)), 257 (int)sizeof(p4dval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->p4dval)),
258 (int)sizeof(pgdval_t) * 2, (unsigned long long)__entry->pgdval) 258 (int)sizeof(p4dval_t) * 2, (unsigned long long)__entry->p4dval)
259 ); 259 );
260 260
261TRACE_EVENT(xen_mmu_pud_clear, 261TRACE_EVENT(xen_mmu_pud_clear,
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 07e85e5229da..23a6483c3666 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -182,18 +182,6 @@ struct page_map {
182 struct vmem_altmap altmap; 182 struct vmem_altmap altmap;
183}; 183};
184 184
185void get_zone_device_page(struct page *page)
186{
187 percpu_ref_get(page->pgmap->ref);
188}
189EXPORT_SYMBOL(get_zone_device_page);
190
191void put_zone_device_page(struct page *page)
192{
193 put_dev_pagemap(page->pgmap);
194}
195EXPORT_SYMBOL(put_zone_device_page);
196
197static void pgmap_radix_release(struct resource *res) 185static void pgmap_radix_release(struct resource *res)
198{ 186{
199 resource_size_t key, align_start, align_size, align_end; 187 resource_size_t key, align_start, align_size, align_end;
@@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
237 struct resource *res = &page_map->res; 225 struct resource *res = &page_map->res;
238 resource_size_t align_start, align_size; 226 resource_size_t align_start, align_size;
239 struct dev_pagemap *pgmap = &page_map->pgmap; 227 struct dev_pagemap *pgmap = &page_map->pgmap;
228 unsigned long pfn;
229
230 for_each_device_pfn(pfn, page_map)
231 put_page(pfn_to_page(pfn));
240 232
241 if (percpu_ref_tryget_live(pgmap->ref)) { 233 if (percpu_ref_tryget_live(pgmap->ref)) {
242 dev_WARN(dev, "%s: page mapping is still live!\n", __func__); 234 dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
@@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
277 * 269 *
278 * Notes: 270 * Notes:
279 * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time 271 * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
280 * (or devm release event). 272 * (or devm release event). The expected order of events is that @ref has
273 * been through percpu_ref_kill() before devm_memremap_pages_release(). The
274 * wait for the completion of all references being dropped and
275 * percpu_ref_exit() must occur after devm_memremap_pages_release().
281 * 276 *
282 * 2/ @res is expected to be a host memory range that could feasibly be 277 * 2/ @res is expected to be a host memory range that could feasibly be
283 * treated as a "System RAM" range, i.e. not a device mmio range, but 278 * treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
379 */ 374 */
380 list_del(&page->lru); 375 list_del(&page->lru);
381 page->pgmap = pgmap; 376 page->pgmap = pgmap;
377 percpu_ref_get(ref);
382 } 378 }
383 devres_add(dev, page_map); 379 devres_add(dev, page_map);
384 return __va(res->start); 380 return __va(res->start);
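
Together with the mm.h and mm/swap.c hunks, this moves ZONE_DEVICE pages to a single-reference scheme: devm_memremap_pages() takes one percpu reference per page up front, and the final put_page() on a device page drops that reference via put_dev_pagemap() instead of freeing the page to the allocator. A condensed sketch of the release side (illustrative; the real code is the __put_page() hunk below):

static void release_page_reference(struct page *page)
{
	if (!put_page_testzero(page))
		return;

	if (is_zone_device_page(page)) {
		/* Device pages never return to the page allocator; the last
		 * put just drops the pgmap ref taken at memremap time. */
		put_dev_pagemap(page->pgmap);
		return;
	}

	__put_page(page);	/* normal pages are freed back to the allocator */
}
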
diff --git a/mm/gup.c b/mm/gup.c
index 04aa405350dc..527ec2c6cca3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1189,34 +1189,57 @@ struct page *get_dump_page(unsigned long addr)
1189 */ 1189 */
1190#ifdef CONFIG_HAVE_GENERIC_RCU_GUP 1190#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
1191 1191
1192#ifndef gup_get_pte
1193/*
1194 * We assume that the PTE can be read atomically. If this is not the case for
1195 * your architecture, please provide the helper.
1196 */
1197static inline pte_t gup_get_pte(pte_t *ptep)
1198{
1199 return READ_ONCE(*ptep);
1200}
1201#endif
1202
1203static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
1204{
1205 while ((*nr) - nr_start) {
1206 struct page *page = pages[--(*nr)];
1207
1208 ClearPageReferenced(page);
1209 put_page(page);
1210 }
1211}
1212
1192#ifdef __HAVE_ARCH_PTE_SPECIAL 1213#ifdef __HAVE_ARCH_PTE_SPECIAL
1193static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, 1214static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1194 int write, struct page **pages, int *nr) 1215 int write, struct page **pages, int *nr)
1195{ 1216{
1217 struct dev_pagemap *pgmap = NULL;
1218 int nr_start = *nr, ret = 0;
1196 pte_t *ptep, *ptem; 1219 pte_t *ptep, *ptem;
1197 int ret = 0;
1198 1220
1199 ptem = ptep = pte_offset_map(&pmd, addr); 1221 ptem = ptep = pte_offset_map(&pmd, addr);
1200 do { 1222 do {
1201 /* 1223 pte_t pte = gup_get_pte(ptep);
1202 * In the line below we are assuming that the pte can be read
1203 * atomically. If this is not the case for your architecture,
1204 * please wrap this in a helper function!
1205 *
1206 * for an example see gup_get_pte in arch/x86/mm/gup.c
1207 */
1208 pte_t pte = READ_ONCE(*ptep);
1209 struct page *head, *page; 1224 struct page *head, *page;
1210 1225
1211 /* 1226 /*
1212 * Similar to the PMD case below, NUMA hinting must take slow 1227 * Similar to the PMD case below, NUMA hinting must take slow
1213 * path using the pte_protnone check. 1228 * path using the pte_protnone check.
1214 */ 1229 */
1215 if (!pte_present(pte) || pte_special(pte) || 1230 if (pte_protnone(pte))
1216 pte_protnone(pte) || (write && !pte_write(pte)))
1217 goto pte_unmap; 1231 goto pte_unmap;
1218 1232
1219 if (!arch_pte_access_permitted(pte, write)) 1233 if (!pte_access_permitted(pte, write))
1234 goto pte_unmap;
1235
1236 if (pte_devmap(pte)) {
1237 pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
1238 if (unlikely(!pgmap)) {
1239 undo_dev_pagemap(nr, nr_start, pages);
1240 goto pte_unmap;
1241 }
1242 } else if (pte_special(pte))
1220 goto pte_unmap; 1243 goto pte_unmap;
1221 1244
1222 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 1245 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -1232,6 +1255,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1232 } 1255 }
1233 1256
1234 VM_BUG_ON_PAGE(compound_head(page) != head, page); 1257 VM_BUG_ON_PAGE(compound_head(page) != head, page);
1258
1259 put_dev_pagemap(pgmap);
1260 SetPageReferenced(page);
1235 pages[*nr] = page; 1261 pages[*nr] = page;
1236 (*nr)++; 1262 (*nr)++;
1237 1263
@@ -1261,15 +1287,76 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1261} 1287}
1262#endif /* __HAVE_ARCH_PTE_SPECIAL */ 1288#endif /* __HAVE_ARCH_PTE_SPECIAL */
1263 1289
1290#ifdef __HAVE_ARCH_PTE_DEVMAP
1291static int __gup_device_huge(unsigned long pfn, unsigned long addr,
1292 unsigned long end, struct page **pages, int *nr)
1293{
1294 int nr_start = *nr;
1295 struct dev_pagemap *pgmap = NULL;
1296
1297 do {
1298 struct page *page = pfn_to_page(pfn);
1299
1300 pgmap = get_dev_pagemap(pfn, pgmap);
1301 if (unlikely(!pgmap)) {
1302 undo_dev_pagemap(nr, nr_start, pages);
1303 return 0;
1304 }
1305 SetPageReferenced(page);
1306 pages[*nr] = page;
1307 get_page(page);
1308 put_dev_pagemap(pgmap);
1309 (*nr)++;
1310 pfn++;
1311 } while (addr += PAGE_SIZE, addr != end);
1312 return 1;
1313}
1314
1315static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
1316 unsigned long end, struct page **pages, int *nr)
1317{
1318 unsigned long fault_pfn;
1319
1320 fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1321 return __gup_device_huge(fault_pfn, addr, end, pages, nr);
1322}
1323
1324static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
1325 unsigned long end, struct page **pages, int *nr)
1326{
1327 unsigned long fault_pfn;
1328
1329 fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1330 return __gup_device_huge(fault_pfn, addr, end, pages, nr);
1331}
1332#else
1333static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
1334 unsigned long end, struct page **pages, int *nr)
1335{
1336 BUILD_BUG();
1337 return 0;
1338}
1339
1340static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
1341 unsigned long end, struct page **pages, int *nr)
1342{
1343 BUILD_BUG();
1344 return 0;
1345}
1346#endif
1347
1264static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, 1348static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1265 unsigned long end, int write, struct page **pages, int *nr) 1349 unsigned long end, int write, struct page **pages, int *nr)
1266{ 1350{
1267 struct page *head, *page; 1351 struct page *head, *page;
1268 int refs; 1352 int refs;
1269 1353
1270 if (write && !pmd_write(orig)) 1354 if (!pmd_access_permitted(orig, write))
1271 return 0; 1355 return 0;
1272 1356
1357 if (pmd_devmap(orig))
1358 return __gup_device_huge_pmd(orig, addr, end, pages, nr);
1359
1273 refs = 0; 1360 refs = 0;
1274 head = pmd_page(orig); 1361 head = pmd_page(orig);
1275 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 1362 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1293,6 +1380,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1293 return 0; 1380 return 0;
1294 } 1381 }
1295 1382
1383 SetPageReferenced(head);
1296 return 1; 1384 return 1;
1297} 1385}
1298 1386
@@ -1302,9 +1390,12 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1302 struct page *head, *page; 1390 struct page *head, *page;
1303 int refs; 1391 int refs;
1304 1392
1305 if (write && !pud_write(orig)) 1393 if (!pud_access_permitted(orig, write))
1306 return 0; 1394 return 0;
1307 1395
1396 if (pud_devmap(orig))
1397 return __gup_device_huge_pud(orig, addr, end, pages, nr);
1398
1308 refs = 0; 1399 refs = 0;
1309 head = pud_page(orig); 1400 head = pud_page(orig);
1310 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 1401 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@@ -1328,6 +1419,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1328 return 0; 1419 return 0;
1329 } 1420 }
1330 1421
1422 SetPageReferenced(head);
1331 return 1; 1423 return 1;
1332} 1424}
1333 1425
@@ -1338,9 +1430,10 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1338 int refs; 1430 int refs;
1339 struct page *head, *page; 1431 struct page *head, *page;
1340 1432
1341 if (write && !pgd_write(orig)) 1433 if (!pgd_access_permitted(orig, write))
1342 return 0; 1434 return 0;
1343 1435
1436 BUILD_BUG_ON(pgd_devmap(orig));
1344 refs = 0; 1437 refs = 0;
1345 head = pgd_page(orig); 1438 head = pgd_page(orig);
1346 page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); 1439 page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
@@ -1364,6 +1457,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1364 return 0; 1457 return 0;
1365 } 1458 }
1366 1459
1460 SetPageReferenced(head);
1367 return 1; 1461 return 1;
1368} 1462}
1369 1463
@@ -1520,6 +1614,21 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1520 return nr; 1614 return nr;
1521} 1615}
1522 1616
1617#ifndef gup_fast_permitted
1618/*
1619 * Check if it's allowed to use __get_user_pages_fast() for the range, or
1620 * we need to fall back to the slow version:
1621 */
1622bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
1623{
1624 unsigned long len, end;
1625
1626 len = (unsigned long) nr_pages << PAGE_SHIFT;
1627 end = start + len;
1628 return end >= start;
1629}
1630#endif
1631
1523/** 1632/**
1524 * get_user_pages_fast() - pin user pages in memory 1633 * get_user_pages_fast() - pin user pages in memory
1525 * @start: starting user address 1634 * @start: starting user address
@@ -1539,11 +1648,14 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1539int get_user_pages_fast(unsigned long start, int nr_pages, int write, 1648int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1540 struct page **pages) 1649 struct page **pages)
1541{ 1650{
1542 int nr, ret; 1651 int nr = 0, ret = 0;
1543 1652
1544 start &= PAGE_MASK; 1653 start &= PAGE_MASK;
1545 nr = __get_user_pages_fast(start, nr_pages, write, pages); 1654
1546 ret = nr; 1655 if (gup_fast_permitted(start, nr_pages, write)) {
1656 nr = __get_user_pages_fast(start, nr_pages, write, pages);
1657 ret = nr;
1658 }
1547 1659
1548 if (nr < nr_pages) { 1660 if (nr < nr_pages) {
1549 /* Try to get the remaining pages with get_user_pages */ 1661 /* Try to get the remaining pages with get_user_pages */
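
gup_fast_permitted() is the new opt-out hook that lets an architecture reject a range before the lockless walk starts; the generic fallback above only guards against address wrap-around. A hedged sketch of what an arch override might look like (the override would also define the gup_fast_permitted macro so the generic version is skipped; the TASK_SIZE_MAX bound here is an illustrative choice, not taken from this merge):

static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
				      int write)
{
	unsigned long len = (unsigned long)nr_pages << PAGE_SHIFT;
	unsigned long end = start + len;

	if (end < start)		/* wrapped around the address space */
		return false;
	return end <= TASK_SIZE_MAX;	/* never let the fast path walk kernel addresses */
}
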
diff --git a/mm/swap.c b/mm/swap.c
index 5dabf444d724..d8d9ee9e311a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -97,6 +97,16 @@ static void __put_compound_page(struct page *page)
97 97
98void __put_page(struct page *page) 98void __put_page(struct page *page)
99{ 99{
100 if (is_zone_device_page(page)) {
101 put_dev_pagemap(page->pgmap);
102
103 /*
104 * The page belongs to the device that created pgmap. Do
105 * not return it to page allocator.
106 */
107 return;
108 }
109
100 if (unlikely(PageCompound(page))) 110 if (unlikely(PageCompound(page)))
101 __put_compound_page(page); 111 __put_compound_page(page);
102 else 112 else
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
index f6121612e769..b9a22f18566a 100644
--- a/tools/testing/selftests/x86/ldt_gdt.c
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -409,6 +409,51 @@ static void *threadproc(void *ctx)
409 } 409 }
410} 410}
411 411
412#ifdef __i386__
413
414#ifndef SA_RESTORE
415#define SA_RESTORER 0x04000000
416#endif
417
418/*
419 * The UAPI header calls this 'struct sigaction', which conflicts with
420 * glibc. Sigh.
421 */
422struct fake_ksigaction {
423 void *handler; /* the real type is nasty */
424 unsigned long sa_flags;
425 void (*sa_restorer)(void);
426 unsigned char sigset[8];
427};
428
429static void fix_sa_restorer(int sig)
430{
431 struct fake_ksigaction ksa;
432
433 if (syscall(SYS_rt_sigaction, sig, NULL, &ksa, 8) == 0) {
434 /*
435 * glibc has a nasty bug: it sometimes writes garbage to
436 * sa_restorer. This interacts quite badly with anything
437 * that fiddles with SS because it can trigger legacy
438 * stack switching. Patch it up. See:
439 *
440 * https://sourceware.org/bugzilla/show_bug.cgi?id=21269
441 */
442 if (!(ksa.sa_flags & SA_RESTORER) && ksa.sa_restorer) {
443 ksa.sa_restorer = NULL;
444 if (syscall(SYS_rt_sigaction, sig, &ksa, NULL,
445 sizeof(ksa.sigset)) != 0)
446 err(1, "rt_sigaction");
447 }
448 }
449}
450#else
451static void fix_sa_restorer(int sig)
452{
453 /* 64-bit glibc works fine. */
454}
455#endif
456
412static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), 457static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
413 int flags) 458 int flags)
414{ 459{
@@ -420,6 +465,7 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
420 if (sigaction(sig, &sa, 0)) 465 if (sigaction(sig, &sa, 0))
421 err(1, "sigaction"); 466 err(1, "sigaction");
422 467
468 fix_sa_restorer(sig);
423} 469}
424 470
425static jmp_buf jmpbuf; 471static jmp_buf jmpbuf;
diff --git a/tools/testing/selftests/x86/mpx-mini-test.c b/tools/testing/selftests/x86/mpx-mini-test.c
index 616ee9673339..a8df159a8924 100644
--- a/tools/testing/selftests/x86/mpx-mini-test.c
+++ b/tools/testing/selftests/x86/mpx-mini-test.c
@@ -404,8 +404,6 @@ void handler(int signum, siginfo_t *si, void *vucontext)
404 dprintf2("info->si_lower: %p\n", __si_bounds_lower(si)); 404 dprintf2("info->si_lower: %p\n", __si_bounds_lower(si));
405 dprintf2("info->si_upper: %p\n", __si_bounds_upper(si)); 405 dprintf2("info->si_upper: %p\n", __si_bounds_upper(si));
406 406
407 check_siginfo_vs_shadow(si);
408
409 for (i = 0; i < 8; i++) 407 for (i = 0; i < 8; i++)
410 dprintf3("[%d]: %p\n", i, si_addr_ptr[i]); 408 dprintf3("[%d]: %p\n", i, si_addr_ptr[i]);
411 switch (br_reason) { 409 switch (br_reason) {
@@ -416,6 +414,9 @@ void handler(int signum, siginfo_t *si, void *vucontext)
416 exit(5); 414 exit(5);
417 case 1: /* #BR MPX bounds exception */ 415 case 1: /* #BR MPX bounds exception */
418 /* these are normal and we expect to see them */ 416 /* these are normal and we expect to see them */
417
418 check_siginfo_vs_shadow(si);
419
419 dprintf1("bounds exception (normal): status 0x%jx at %p si_addr: %p\n", 420 dprintf1("bounds exception (normal): status 0x%jx at %p si_addr: %p\n",
420 status, (void *)ip, si->si_addr); 421 status, (void *)ip, si->si_addr);
421 num_bnd_chk++; 422 num_bnd_chk++;