commit d3b5d35290d729a2518af00feca867385a1b08fa
tree 7b56c0863d59bc57f7c7dcf5d5665c56b05f1d1b
parent aa2a4b6569d5b10491b606a86e574dff3852597a
parent 71389703839ebe9cb426c72d5f0bd549592e583c
author Linus Torvalds <torvalds@linux-foundation.org> 2017-05-02 02:54:56 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2017-05-02 02:54:56 -0400
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
"The main x86 MM changes in this cycle were:
- continued native kernel PCID support preparation patches to the TLB
flushing code (Andy Lutomirski)
- various fixes related to 32-bit compat syscalls returning addresses
over 4GB in applications launched from 64-bit binaries - motivated by
C/R frameworks such as Virtuozzo (Dmitry Safonov)
- continued Intel 5-level paging enablement: in particular the
conversion of x86 GUP to the generic GUP code. (Kirill A. Shutemov)
- x86/mpx ABI corner case fixes/enhancements (Joerg Roedel)
- ... plus misc updates, fixes and cleanups"
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (62 commits)
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash
x86/mm: Fix flush_tlb_page() on Xen
x86/mm: Make flush_tlb_mm_range() more predictable
x86/mm: Remove flush_tlb() and flush_tlb_current_task()
x86/vm86/32: Switch to flush_tlb_mm_range() in mark_screen_rdonly()
x86/mm/64: Fix crash in remove_pagetable()
Revert "x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation"
x86/boot/e820: Remove a redundant self assignment
x86/mm: Fix dump pagetables for 4 levels of page tables
x86/mpx, selftests: Only check bounds-vs-shadow when we keep shadow
x86/mpx: Correctly report do_mpx_bt_fault() failures to user-space
Revert "x86/mm/numa: Remove numa_nodemask_from_meminfo()"
x86/espfix: Add support for 5-level paging
x86/kasan: Extend KASAN to support 5-level paging
x86/mm: Add basic defines/helpers for CONFIG_X86_5LEVEL=y
x86/paravirt: Add 5-level support to the paravirt code
x86/mm: Define virtual memory map for 5-level paging
x86/asm: Remove __VIRTUAL_MASK_SHIFT==47 assert
x86/boot: Detect 5-level paging support
x86/mm/numa: Remove numa_nodemask_from_meminfo()
...
93 files changed, 1845 insertions, 711 deletions
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 5724092db811..b0798e281aa6 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -4,7 +4,7 @@
 Virtual memory map with 4 level page tables:
 
 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
-hole caused by [48:63] sign extension
+hole caused by [47:63] sign extension
 ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor
 ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
 ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
@@ -19,16 +19,43 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
+ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
+ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
+
+Virtual memory map with 5 level page tables:
+
+0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm
+hole caused by [56:63] sign extension
+ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
+ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
+ff90000000000000 - ff91ffffffffffff (=49 bits) hole
+ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
+ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
+ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
+... unused hole ...
+ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB)
+... unused hole ...
+ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
+... unused hole ...
+ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
+... unused hole ...
+ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
 ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
 ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
+Architecture defines a 64-bit virtual address. Implementations can support
+less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
+through to the most-significant implemented bit are set to either all ones
+or all zero. This causes hole between user space and kernel addresses.
+
 The direct mapping covers all memory in the system up to the highest
 memory address (this means in some cases it can also include PCI memory
 holes).
 
-vmalloc space is lazily synchronized into the different PML4 pages of
-the processes using the page fault handler, with init_level4_pgt as
+vmalloc space is lazily synchronized into the different PML4/PML5 pages of
+the processes using the page fault handler, with init_top_pgt as
 reference.
 
 Current X86-64 implementations support up to 46 bits of address space (64 TB),
@@ -39,6 +66,9 @@ memory window (this size is arbitrary, it can be raised later if needed).
 The mappings are not part of any other kernel PGD and are only available
 during EFI runtime calls.
 
+The module mapping space size changes based on the CONFIG requirements for the
+following fixmap section.
+
 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
 physical memory, vmalloc/ioremap space and virtual memory map are randomized.
 Their order is preserved but their base will be offset early at boot time.
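The "hole caused by [47:63] sign extension" wording above is the canonical
address rule: bits 63 down to the most-significant implemented bit must all
be copies of each other. A minimal user-space sketch (illustrative only, not
part of the patch) of that rule for 48-bit (4-level) and 57-bit (5-level)
virtual addresses:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * An address is canonical when bits va_bits-1..63 all match;
 * va_bits is 48 with 4-level paging and 57 with 5-level paging.
 */
static bool is_canonical(uint64_t addr, unsigned int va_bits)
{
	unsigned int shift = 64 - va_bits;

	/* Sign-extend from bit va_bits-1; canonical addresses are unchanged. */
	return (uint64_t)((int64_t)(addr << shift) >> shift) == addr;
}

int main(void)
{
	printf("%d\n", is_canonical(0x00007fffffffffffULL, 48)); /* 1: top of user space */
	printf("%d\n", is_canonical(0x0000800000000000ULL, 48)); /* 0: inside the hole */
	printf("%d\n", is_canonical(0xffff800000000000ULL, 48)); /* 1: first kernel address */
	printf("%d\n", is_canonical(0x0100000000000000ULL, 57)); /* 0: in the 5-level hole */
	return 0;
}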
diff --git a/arch/Kconfig b/arch/Kconfig
index cd211a14a88f..c4d6833aacd9 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -700,6 +700,13 @@ config ARCH_MMAP_RND_COMPAT_BITS
 	  This value can be changed after boot using the
 	  /proc/sys/vm/mmap_rnd_compat_bits tunable
 
+config HAVE_ARCH_COMPAT_MMAP_BASES
+	bool
+	help
+	  This allows 64bit applications to invoke 32-bit mmap() syscall
+	  and vice-versa 32-bit applications to call 64-bit mmap().
+	  Required for applications doing different bitness syscalls.
+
 config HAVE_COPY_THREAD_TLS
 	bool
 	help
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index b9e3f0aca261..ecf9885ab660 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -163,11 +163,5 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	/* by default, allow everything */
 	return true;
 }
-
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
-	/* by default, allow everything */
-	return true;
-}
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 6e31d87fb669..fa2bf69be182 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -156,10 +156,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	/* by default, allow everything */
 	return true;
 }
-
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
-	/* by default, allow everything */
-	return true;
-}
 #endif /* __S390_MMU_CONTEXT_H */
diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h
index 94ac2739918c..b668e351fd6c 100644
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -37,12 +37,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	return true;
 }
 
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
-	/* by default, allow everything */
-	return true;
-}
-
 /*
  * end asm-generic/mm_hooks.h functions
  */
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index 62dfc644c908..59b06b48f27d 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -103,10 +103,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	/* by default, allow everything */
 	return true;
 }
-
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
-	/* by default, allow everything */
-	return true;
-}
 #endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2b899858532a..8d4f87e5bba3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -105,6 +105,7 @@ config X86
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_ARCH_MMAP_RND_BITS		if MMU
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if MMU && COMPAT
+	select HAVE_ARCH_COMPAT_MMAP_BASES	if MMU && COMPAT
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
@@ -289,6 +290,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 config KASAN_SHADOW_OFFSET
 	hex
 	depends on KASAN
+	default 0xdff8000000000000 if X86_5LEVEL
 	default 0xdffffc0000000000
 
 config HAVE_INTEL_TXT
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 4ad7d70e8739..8f0c4c9fc904 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -44,6 +44,15 @@ static const u32 req_flags[NCAPINTS] =
 	0, /* REQUIRED_MASK5 not implemented in this file */
 	REQUIRED_MASK6,
 	0, /* REQUIRED_MASK7 not implemented in this file */
+	0, /* REQUIRED_MASK8 not implemented in this file */
+	0, /* REQUIRED_MASK9 not implemented in this file */
+	0, /* REQUIRED_MASK10 not implemented in this file */
+	0, /* REQUIRED_MASK11 not implemented in this file */
+	0, /* REQUIRED_MASK12 not implemented in this file */
+	0, /* REQUIRED_MASK13 not implemented in this file */
+	0, /* REQUIRED_MASK14 not implemented in this file */
+	0, /* REQUIRED_MASK15 not implemented in this file */
+	REQUIRED_MASK16,
 };
 
 #define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index 6687ab953257..9e77c23c2422 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -70,16 +70,19 @@ int has_eflag(unsigned long mask)
 # define EBX_REG "=b"
 #endif
 
-static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d)
+static inline void cpuid_count(u32 id, u32 count,
+			       u32 *a, u32 *b, u32 *c, u32 *d)
 {
 	asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t"
 		     "cpuid \n\t"
 		     ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t"
 		    : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
-		    : "a" (id)
+		    : "a" (id), "c" (count)
 	);
 }
 
+#define cpuid(id, a, b, c, d) cpuid_count(id, 0, a, b, c, d)
+
 void get_cpuflags(void)
 {
 	u32 max_intel_level, max_amd_level;
@@ -108,6 +111,11 @@ void get_cpuflags(void)
 			cpu.model += ((tfms >> 16) & 0xf) << 4;
 	}
 
+	if (max_intel_level >= 0x00000007) {
+		cpuid_count(0x00000007, 0, &ignored, &ignored,
+			    &cpu.flags[16], &ignored);
+	}
+
 	cpuid(0x80000000, &max_amd_level, &ignored, &ignored,
 	      &ignored);
 
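The cpuid_count() change above exists so the boot code can read CPUID
sub-leaves; leaf 7, sub-leaf 0 is where cpu.flags[16] comes from, and its
ECX bit 16 is LA57 (5-level paging), which "x86/boot: Detect 5-level paging
support" keys off. A hedged user-space equivalent for 64-bit builds (the
EBX save/restore dance in the boot code is only needed for 32-bit PIC and
is skipped here):

#include <stdint.h>
#include <stdio.h>

static void cpuid_count(uint32_t leaf, uint32_t subleaf,
			uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	asm volatile("cpuid"
		     : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
		     : "a" (leaf), "c" (subleaf));
}

int main(void)
{
	uint32_t a, b, c, d;

	cpuid_count(0, 0, &a, &b, &c, &d);	/* EAX = highest basic leaf */
	if (a >= 7) {
		cpuid_count(7, 0, &a, &b, &c, &d);
		printf("LA57 (5-level paging): %s\n",
		       (c & (1u << 16)) ? "supported" : "not supported");
	}
	return 0;
}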
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d2b2a2948ffe..607d72c4a485 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -265,12 +265,9 @@ return_from_SYSCALL_64:
 	 *
 	 * If width of "canonical tail" ever becomes variable, this will need
 	 * to be updated to remain correct on both old and new CPUs.
+	 *
+	 * Change top 16 bits to be the sign-extension of 47th bit
 	 */
-	.ifne __VIRTUAL_MASK_SHIFT - 47
-	.error "virtual address width changed -- SYSRET checks need update"
-	.endif
-
-	/* Change top 16 bits to be the sign-extension of 47th bit */
 	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
 	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
 
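The removed .ifne assert is no longer needed because the shl/sar pair
hard-codes nothing: shifting left and then arithmetic-shifting right by
64 - (__VIRTUAL_MASK_SHIFT + 1) replaces the upper bits with copies of the
top implemented bit, so the same two instructions stay correct when
__VIRTUAL_MASK_SHIFT becomes 56. The same trick in C, as an illustrative
sketch rather than kernel code:

#include <stdint.h>
#include <stdio.h>

static uint64_t force_canonical(uint64_t addr, unsigned int virtual_mask_shift)
{
	unsigned int shift = 64 - (virtual_mask_shift + 1);

	/* shl then sar: the top bits become copies of bit virtual_mask_shift. */
	return (uint64_t)((int64_t)(addr << shift) >> shift);
}

int main(void)
{
	/* 0x0000800000000000 is non-canonical with 48-bit VAs; the shift
	 * pair sign-extends bit 47 and yields 0xffff800000000000. */
	printf("%016llx\n", (unsigned long long)
	       force_canonical(0x0000800000000000ULL, 47));
	return 0;
}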
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index faf80fdeeacc..139ad7726e10 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -361,7 +361,7 @@ static void vgetcpu_cpu_init(void *arg)
 	d.p = 1;		/* Present */
 	d.d = 1;		/* 32-bit */
 
-	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
+	write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
 }
 
 static int vgetcpu_online(unsigned int cpu)
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 1548ca92ad3f..d0a21b12dd58 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -4,6 +4,7 @@
 #include <asm/desc_defs.h>
 #include <asm/ldt.h>
 #include <asm/mmu.h>
+#include <asm/fixmap.h>
 
 #include <linux/smp.h>
 #include <linux/percpu.h>
@@ -45,11 +46,43 @@ struct gdt_page {
 
 DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
 
-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+/* Provide the original GDT */
+static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu)
 {
 	return per_cpu(gdt_page, cpu).gdt;
 }
 
+/* Provide the current original GDT */
+static inline struct desc_struct *get_current_gdt_rw(void)
+{
+	return this_cpu_ptr(&gdt_page)->gdt;
+}
+
+/* Get the fixmap index for a specific processor */
+static inline unsigned int get_cpu_gdt_ro_index(int cpu)
+{
+	return FIX_GDT_REMAP_BEGIN + cpu;
+}
+
+/* Provide the fixmap address of the remapped GDT */
+static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
+{
+	unsigned int idx = get_cpu_gdt_ro_index(cpu);
+	return (struct desc_struct *)__fix_to_virt(idx);
+}
+
+/* Provide the current read-only GDT */
+static inline struct desc_struct *get_current_gdt_ro(void)
+{
+	return get_cpu_gdt_ro(smp_processor_id());
+}
+
+/* Provide the physical address of the GDT page. */
+static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu)
+{
+	return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu));
+}
+
 #ifdef CONFIG_X86_64
 
 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
@@ -174,7 +207,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned t
 
 static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
 {
-	struct desc_struct *d = get_cpu_gdt_table(cpu);
+	struct desc_struct *d = get_cpu_gdt_rw(cpu);
 	tss_desc tss;
 
 	set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
@@ -194,22 +227,90 @@ static inline void native_set_ldt(const void *addr, unsigned int entries)
 
 		set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
 				      entries * LDT_ENTRY_SIZE - 1);
-		write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
+		write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_LDT,
 				&ldt, DESC_LDT);
 		asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
 	}
 }
 
+static inline void native_load_gdt(const struct desc_ptr *dtr)
+{
+	asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static inline void native_load_idt(const struct desc_ptr *dtr)
+{
+	asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static inline void native_store_gdt(struct desc_ptr *dtr)
+{
+	asm volatile("sgdt %0":"=m" (*dtr));
+}
+
+static inline void native_store_idt(struct desc_ptr *dtr)
+{
+	asm volatile("sidt %0":"=m" (*dtr));
+}
+
+/*
+ * The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is
+ * a read-only remapping. To prevent a page fault, the GDT is switched to the
+ * original writeable version when needed.
+ */
+#ifdef CONFIG_X86_64
+static inline void native_load_tr_desc(void)
+{
+	struct desc_ptr gdt;
+	int cpu = raw_smp_processor_id();
+	bool restore = 0;
+	struct desc_struct *fixmap_gdt;
+
+	native_store_gdt(&gdt);
+	fixmap_gdt = get_cpu_gdt_ro(cpu);
+
+	/*
+	 * If the current GDT is the read-only fixmap, swap to the original
+	 * writeable version. Swap back at the end.
+	 */
+	if (gdt.address == (unsigned long)fixmap_gdt) {
+		load_direct_gdt(cpu);
+		restore = 1;
+	}
+	asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+	if (restore)
+		load_fixmap_gdt(cpu);
+}
+#else
 static inline void native_load_tr_desc(void)
 {
 	asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
 }
+#endif
+
+static inline unsigned long native_store_tr(void)
+{
+	unsigned long tr;
+
+	asm volatile("str %0":"=r" (tr));
+
+	return tr;
+}
+
+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+	struct desc_struct *gdt = get_cpu_gdt_rw(cpu);
+	unsigned int i;
+
+	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+		gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
+}
 
 DECLARE_PER_CPU(bool, __tss_limit_invalid);
 
 static inline void force_reload_TR(void)
 {
-	struct desc_struct *d = get_cpu_gdt_table(smp_processor_id());
+	struct desc_struct *d = get_current_gdt_rw();
 	tss_desc tss;
 
 	memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc));
@@ -257,44 +358,6 @@ static inline void invalidate_tss_limit(void)
 	this_cpu_write(__tss_limit_invalid, true);
 }
 
-static inline void native_load_gdt(const struct desc_ptr *dtr)
-{
-	asm volatile("lgdt %0"::"m" (*dtr));
-}
-
-static inline void native_load_idt(const struct desc_ptr *dtr)
-{
-	asm volatile("lidt %0"::"m" (*dtr));
-}
-
-static inline void native_store_gdt(struct desc_ptr *dtr)
-{
-	asm volatile("sgdt %0":"=m" (*dtr));
-}
-
-static inline void native_store_idt(struct desc_ptr *dtr)
-{
-	asm volatile("sidt %0":"=m" (*dtr));
-}
-
-static inline unsigned long native_store_tr(void)
-{
-	unsigned long tr;
-
-	asm volatile("str %0":"=r" (tr));
-
-	return tr;
-}
-
-static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-	unsigned int i;
-
-	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
-		gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
-}
-
 /* This intentionally ignores lm, since 32-bit apps don't have that field. */
 #define LDT_empty(info)					\
 	((info)->base_addr == 0 &&			\
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 85599ad4d024..5dff775af7cd 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -36,6 +36,12 @@
 # define DISABLE_OSPKE	(1<<(X86_FEATURE_OSPKE & 31))
 #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
 
+#ifdef CONFIG_X86_5LEVEL
+# define DISABLE_LA57	0
+#else
+# define DISABLE_LA57	(1<<(X86_FEATURE_LA57 & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -55,7 +61,7 @@
 #define DISABLED_MASK13	0
 #define DISABLED_MASK14	0
 #define DISABLED_MASK15	0
-#define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE)
+#define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57)
 #define DISABLED_MASK17	0
 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
 
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 3762536619f8..e8ab9a46bc68 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -293,8 +293,23 @@ do {								\
 	}							\
 } while (0)
 
+/*
+ * True on X86_32 or when emulating IA32 on X86_64
+ */
+static inline int mmap_is_ia32(void)
+{
+	return IS_ENABLED(CONFIG_X86_32) ||
+	       (IS_ENABLED(CONFIG_COMPAT) &&
+		test_thread_flag(TIF_ADDR32));
+}
+
+extern unsigned long tasksize_32bit(void);
+extern unsigned long tasksize_64bit(void);
+extern unsigned long get_mmap_base(int is_legacy);
+
 #ifdef CONFIG_X86_32
 
+#define __STACK_RND_MASK(is32bit) (0x7ff)
 #define STACK_RND_MASK (0x7ff)
 
 #define ARCH_DLINFO		ARCH_DLINFO_IA32
@@ -304,7 +319,8 @@ do {								\
 #else /* CONFIG_X86_32 */
 
 /* 1GB for 64bit, 8MB for 32bit */
-#define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 0x7ff : 0x3fffff)
+#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff)
+#define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32())
 
 #define ARCH_DLINFO							\
 do {									\
@@ -348,16 +364,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
 					      int uses_interp);
 #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
 
-/*
- * True on X86_32 or when emulating IA32 on X86_64
- */
-static inline int mmap_is_ia32(void)
-{
-	return IS_ENABLED(CONFIG_X86_32) ||
-	       (IS_ENABLED(CONFIG_COMPAT) &&
-		test_thread_flag(TIF_ADDR32));
-}
-
 /* Do not change the values. See get_align_mask() */
 enum align_flags {
 	ALIGN_VA_32	= BIT(0),
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 8554f960e21b..b65155cc3760 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -100,6 +100,10 @@ enum fixed_addresses {
 #ifdef CONFIG_X86_INTEL_MID
 	FIX_LNW_VRTC,
 #endif
+	/* Fixmap entries to remap the GDTs, one per processor. */
+	FIX_GDT_REMAP_BEGIN,
+	FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
+
 	__end_of_permanent_fixed_addresses,
 
 	/*
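For context on where these new slots land: the generic __fix_to_virt(idx)
is FIXADDR_TOP - (idx << PAGE_SHIFT), so every CPU's remapped GDT gets its
own page, descending from the top of the fixmap. A standalone sketch with
an illustrative FIXADDR_TOP value and a hypothetical slot index (the real
values depend on the configuration):

#include <stdio.h>

#define PAGE_SHIFT		12
#define FIXADDR_TOP		0xffffffffff5ff000UL	/* illustrative only */
#define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))

int main(void)
{
	unsigned long fix_gdt_remap_begin = 100;	/* hypothetical index */

	for (int cpu = 0; cpu < 4; cpu++)
		printf("CPU%d read-only GDT at %#lx\n", cpu,
		       __fix_to_virt(fix_gdt_remap_begin + cpu));
	return 0;
}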
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index 1410b567ecde..f527b02a0ee3 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -11,9 +11,12 @@
  * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT
  */
 #define KASAN_SHADOW_START      (KASAN_SHADOW_OFFSET + \
-					(0xffff800000000000ULL >> 3))
-/* 47 bits for kernel address -> (47 - 3) bits for shadow */
-#define KASAN_SHADOW_END        (KASAN_SHADOW_START + (1ULL << (47 - 3)))
+					((-1UL << __VIRTUAL_MASK_SHIFT) >> 3))
+/*
+ * 47 bits for kernel address -> (47 - 3) bits for shadow
+ * 56 bits for kernel address -> (56 - 3) bits for shadow
+ */
+#define KASAN_SHADOW_END        (KASAN_SHADOW_START + (1ULL << (__VIRTUAL_MASK_SHIFT - 3)))
 
 #ifndef __ASSEMBLY__
 
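The generalized formula is easy to sanity-check against the memory maps in
mm.txt above: each KASAN shadow byte covers eight bytes of address space,
so the shadow of the kernel half starts at KASAN_SHADOW_OFFSET plus the
kernel range base shifted right by three. A small sketch using the
KASAN_SHADOW_OFFSET defaults from the Kconfig hunk earlier:

#include <stdint.h>
#include <stdio.h>

static void kasan_range(unsigned int va_shift, uint64_t shadow_offset)
{
	/* Mirrors KASAN_SHADOW_START/END above, with >> 3 for the 8:1 scale. */
	uint64_t start = shadow_offset + ((~0ULL << va_shift) >> 3);
	uint64_t end = start + (1ULL << (va_shift - 3));

	printf("%u-bit: shadow %016llx - %016llx\n", va_shift,
	       (unsigned long long)start, (unsigned long long)(end - 1));
}

int main(void)
{
	kasan_range(47, 0xdffffc0000000000ULL);	/* 4-level: ffffec... - fffffb... */
	kasan_range(56, 0xdff8000000000000ULL);	/* 5-level: ffd8...   - fff7...   */
	return 0;
}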
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 282630e4c6ea..70ef205489f0 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -164,6 +164,7 @@ struct kimage_arch {
 };
 #else
 struct kimage_arch {
+	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 306c7e12af55..68b329d77b3a 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -268,8 +268,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	return __pkru_allows_pkey(vma_pkey(vma), write);
 }
 
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
-	return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
-}
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 9215e0527647..3f5f08b010d0 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -36,7 +36,12 @@
  * hypervisor to fit.  Choosing 16 slots here is arbitrary, but it's
  * what Xen requires.
  */
+#ifdef CONFIG_X86_5LEVEL
+#define __PAGE_OFFSET_BASE      _AC(0xff10000000000000, UL)
+#else
 #define __PAGE_OFFSET_BASE      _AC(0xffff880000000000, UL)
+#endif
+
 #ifdef CONFIG_RANDOMIZE_MEMORY
 #define __PAGE_OFFSET           page_offset_base
 #else
@@ -46,8 +51,13 @@
 #define __START_KERNEL_map	_AC(0xffffffff80000000, UL)
 
 /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
+#ifdef CONFIG_X86_5LEVEL
+#define __PHYSICAL_MASK_SHIFT	52
+#define __VIRTUAL_MASK_SHIFT	56
+#else
 #define __PHYSICAL_MASK_SHIFT	46
 #define __VIRTUAL_MASK_SHIFT	47
+#endif
 
 /*
  * Kernel image size is limited to 1GiB due to the fixmap living in the
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 0489884fdc44..55fa56fe4e45 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -357,6 +357,16 @@ static inline void paravirt_release_pud(unsigned long pfn)
 	PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
 }
 
+static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)
+{
+	PVOP_VCALL2(pv_mmu_ops.alloc_p4d, mm, pfn);
+}
+
+static inline void paravirt_release_p4d(unsigned long pfn)
+{
+	PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn);
+}
+
 static inline void pte_update(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep)
 {
@@ -536,7 +546,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
 		PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
 			    val);
 }
-#if CONFIG_PGTABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS >= 4
 static inline pud_t __pud(pudval_t val)
 {
 	pudval_t ret;
@@ -565,26 +575,54 @@ static inline pudval_t pud_val(pud_t pud)
 	return ret;
 }
 
-static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+static inline void pud_clear(pud_t *pudp)
 {
-	pgdval_t val = native_pgd_val(pgd);
-
-	if (sizeof(pgdval_t) > sizeof(long))
-		PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp,
+	set_pud(pudp, __pud(0));
+}
+
+static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
+{
+	p4dval_t val = native_p4d_val(p4d);
+
+	if (sizeof(p4dval_t) > sizeof(long))
+		PVOP_VCALL3(pv_mmu_ops.set_p4d, p4dp,
 			    val, (u64)val >> 32);
 	else
-		PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp,
+		PVOP_VCALL2(pv_mmu_ops.set_p4d, p4dp,
 			    val);
 }
 
+#if CONFIG_PGTABLE_LEVELS >= 5
+
+static inline p4d_t __p4d(p4dval_t val)
+{
+	p4dval_t ret = PVOP_CALLEE1(p4dval_t, pv_mmu_ops.make_p4d, val);
+
+	return (p4d_t) { ret };
+}
+
+static inline p4dval_t p4d_val(p4d_t p4d)
+{
+	return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d);
+}
+
+static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	pgdval_t val = native_pgd_val(pgd);
+
+	PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val);
+}
+
 static inline void pgd_clear(pgd_t *pgdp)
 {
 	set_pgd(pgdp, __pgd(0));
 }
 
-static inline void pud_clear(pud_t *pudp)
+#endif  /* CONFIG_PGTABLE_LEVELS == 5 */
+
+static inline void p4d_clear(p4d_t *p4dp)
 {
-	set_pud(pudp, __pud(0));
+	set_p4d(p4dp, __p4d(0));
 }
 
 #endif	/* CONFIG_PGTABLE_LEVELS == 4 */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b060f962d581..7465d6fe336f 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -238,9 +238,11 @@ struct pv_mmu_ops {
 	void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
 	void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
 	void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
+	void (*alloc_p4d)(struct mm_struct *mm, unsigned long pfn);
 	void (*release_pte)(unsigned long pfn);
 	void (*release_pmd)(unsigned long pfn);
 	void (*release_pud)(unsigned long pfn);
+	void (*release_p4d)(unsigned long pfn);
 
 	/* Pagetable manipulation functions */
 	void (*set_pte)(pte_t *ptep, pte_t pteval);
@@ -279,12 +281,21 @@ struct pv_mmu_ops {
 	struct paravirt_callee_save pmd_val;
 	struct paravirt_callee_save make_pmd;
 
-#if CONFIG_PGTABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS >= 4
 	struct paravirt_callee_save pud_val;
 	struct paravirt_callee_save make_pud;
 
-	void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
-#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
+	void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval);
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+	struct paravirt_callee_save p4d_val;
+	struct paravirt_callee_save make_p4d;
+
+	void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval);
+#endif	/* CONFIG_PGTABLE_LEVELS >= 5 */
+
+#endif	/* CONFIG_PGTABLE_LEVELS >= 4 */
+
 #endif	/* CONFIG_PGTABLE_LEVELS >= 3 */
 
 	struct pv_lazy_ops lazy_mode;
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index b6d425999f99..b2d0cd8288aa 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -17,9 +17,11 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {
 static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
 					    unsigned long start, unsigned long count) {}
 static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) {}
 static inline void paravirt_release_pte(unsigned long pfn) {}
 static inline void paravirt_release_pmd(unsigned long pfn) {}
 static inline void paravirt_release_pud(unsigned long pfn) {}
+static inline void paravirt_release_p4d(unsigned long pfn) {}
 #endif
 
 /*
@@ -121,10 +123,10 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 #endif	/* CONFIG_X86_PAE */
 
 #if CONFIG_PGTABLE_LEVELS > 3
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
 {
 	paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
-	set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+	set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
 }
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -150,6 +152,37 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 	___pud_free_tlb(tlb, pud);
 }
 
+#if CONFIG_PGTABLE_LEVELS > 4
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
+{
+	paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
+	set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
+}
+
+static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+	gfp_t gfp = GFP_KERNEL_ACCOUNT;
+
+	if (mm == &init_mm)
+		gfp &= ~__GFP_ACCOUNT;
+	return (p4d_t *)get_zeroed_page(gfp);
+}
+
+static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
+{
+	BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
+	free_page((unsigned long)p4d);
+}
+
+extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
+
+static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
+				  unsigned long address)
+{
+	___p4d_free_tlb(tlb, p4d);
+}
+
+#endif	/* CONFIG_PGTABLE_LEVELS > 4 */
 #endif	/* CONFIG_PGTABLE_LEVELS > 3 */
 #endif	/* CONFIG_PGTABLE_LEVELS > 2 */
 
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index 392576433e77..373ab1de909f 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -7,6 +7,7 @@
 typedef unsigned long	pteval_t;
 typedef unsigned long	pmdval_t;
 typedef unsigned long	pudval_t;
+typedef unsigned long	p4dval_t;
 typedef unsigned long	pgdval_t;
 typedef unsigned long	pgprotval_t;
 
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index bcc89625ebe5..b8a4341faafa 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -7,6 +7,7 @@
 typedef u64	pteval_t;
 typedef u64	pmdval_t;
 typedef u64	pudval_t;
+typedef u64	p4dval_t;
 typedef u64	pgdval_t;
 typedef u64	pgprotval_t;
 
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2197e5322df9..f5af95a0c6b8 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -51,11 +51,19 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
 
 #define set_pmd(pmdp, pmd)		native_set_pmd(pmdp, pmd)
 
-#ifndef __PAGETABLE_PUD_FOLDED
+#ifndef __PAGETABLE_P4D_FOLDED
 #define set_pgd(pgdp, pgd)		native_set_pgd(pgdp, pgd)
 #define pgd_clear(pgd)			native_pgd_clear(pgd)
 #endif
 
+#ifndef set_p4d
+# define set_p4d(p4dp, p4d)		native_set_p4d(p4dp, p4d)
+#endif
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define p4d_clear(p4d)			native_p4d_clear(p4d)
+#endif
+
 #ifndef set_pud
 # define set_pud(pudp, pud)		native_set_pud(pudp, pud)
 #endif
@@ -72,6 +80,11 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
 #define pgd_val(x)	native_pgd_val(x)
 #define __pgd(x)	native_make_pgd(x)
 
+#ifndef __PAGETABLE_P4D_FOLDED
+#define p4d_val(x)	native_p4d_val(x)
+#define __p4d(x)	native_make_p4d(x)
+#endif
+
 #ifndef __PAGETABLE_PUD_FOLDED
 #define pud_val(x)	native_pud_val(x)
 #define __pud(x)	native_make_pud(x)
@@ -177,6 +190,17 @@ static inline unsigned long pud_pfn(pud_t pud)
 	return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT;
 }
 
+static inline unsigned long p4d_pfn(p4d_t p4d)
+{
+	return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
+}
+
+static inline int p4d_large(p4d_t p4d)
+{
+	/* No 512 GiB pages yet */
+	return 0;
+}
+
 #define pte_page(pte)	pfn_to_page(pte_pfn(pte))
 
 static inline int pmd_large(pmd_t pte)
@@ -536,6 +560,7 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 #define pte_pgprot(x) __pgprot(pte_flags(x))
 #define pmd_pgprot(x) __pgprot(pmd_flags(x))
 #define pud_pgprot(x) __pgprot(pud_flags(x))
+#define p4d_pgprot(x) __pgprot(p4d_flags(x))
 
 #define canon_pgprot(p) __pgprot(massage_pgprot(p))
 
@@ -585,6 +610,7 @@ pte_t *populate_extra_pte(unsigned long vaddr);
 #include <linux/mm_types.h>
 #include <linux/mmdebug.h>
 #include <linux/log2.h>
+#include <asm/fixmap.h>
 
 static inline int pte_none(pte_t pte)
 {
@@ -768,7 +794,52 @@ static inline int pud_large(pud_t pud)
 }
 #endif	/* CONFIG_PGTABLE_LEVELS > 2 */
 
+static inline unsigned long pud_index(unsigned long address)
+{
+	return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+}
+
 #if CONFIG_PGTABLE_LEVELS > 3
+static inline int p4d_none(p4d_t p4d)
+{
+	return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
+}
+
+static inline int p4d_present(p4d_t p4d)
+{
+	return p4d_flags(p4d) & _PAGE_PRESENT;
+}
+
+static inline unsigned long p4d_page_vaddr(p4d_t p4d)
+{
+	return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define p4d_page(p4d)		\
+	pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT)
+
+/* Find an entry in the third-level page table.. */
+static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
+{
+	return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address);
+}
+
+static inline int p4d_bad(p4d_t p4d)
+{
+	return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+}
+#endif  /* CONFIG_PGTABLE_LEVELS > 3 */
+
+static inline unsigned long p4d_index(unsigned long address)
+{
+	return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1);
+}
+
+#if CONFIG_PGTABLE_LEVELS > 4
 static inline int pgd_present(pgd_t pgd)
 {
 	return pgd_flags(pgd) & _PAGE_PRESENT;
@@ -786,14 +857,9 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
 #define pgd_page(pgd)		pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
 
 /* to find an entry in a page-table-directory. */
-static inline unsigned long pud_index(unsigned long address)
-{
-	return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
-}
-
-static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
+static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 {
-	return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address);
+	return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
 }
 
 static inline int pgd_bad(pgd_t pgd)
@@ -811,7 +877,7 @@ static inline int pgd_none(pgd_t pgd)
 	 */
 	return !native_pgd_val(pgd);
 }
-#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
+#endif	/* CONFIG_PGTABLE_LEVELS > 4 */
 
 #endif	/* __ASSEMBLY__ */
 
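Taken together, the helpers above implement one step each of a software
page walk, with p4d_offset() newly slotted between the pgd and pud levels.
A toy user-space model of the resulting 5-level walk follows; the table
layout and the single present bit are made up for illustration, and none of
this is kernel API:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ENTRIES	512
#define PRESENT	0x1ULL

typedef uint64_t table_t[ENTRIES];

/* pgd, p4d, pud, pmd, pte shifts, as in pgtable_64_types.h for 5-level. */
static const int shifts[5] = { 48, 39, 30, 21, 12 };

static table_t *new_table(void)
{
	/* Page-aligned like real page tables, so low bits are free for flags. */
	table_t *t = aligned_alloc(4096, sizeof(table_t));

	memset(t, 0, sizeof(table_t));
	return t;
}

int main(void)
{
	table_t *pgd = new_table(), *t = pgd;
	uint64_t va = 0x00ffee12345678abULL;	/* arbitrary 56-bit address */
	uint64_t pa_page = 0xabcde000ULL;	/* pretend physical page */

	/* Populate one pgd -> p4d -> pud -> pmd -> pte path for va. */
	for (int l = 0; l < 4; l++) {
		table_t *next = new_table();

		(*t)[(va >> shifts[l]) & (ENTRIES - 1)] =
			(uint64_t)(uintptr_t)next | PRESENT;
		t = next;
	}
	(*t)[(va >> shifts[4]) & (ENTRIES - 1)] = pa_page | PRESENT;

	/* Walk it back down: one more level than 4-level paging. */
	t = pgd;
	for (int l = 0; l < 4; l++) {
		uint64_t e = (*t)[(va >> shifts[l]) & (ENTRIES - 1)];

		t = (table_t *)(uintptr_t)(e & ~0xfffULL);
	}
	uint64_t pte = (*t)[(va >> shifts[4]) & (ENTRIES - 1)];

	printf("va %#llx -> pa %#llx\n", (unsigned long long)va,
	       (unsigned long long)((pte & ~0xfffULL) | (va & 0xfff)));
	return 0;
}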
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index fbc73360aea0..bfab55675c16 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -14,7 +14,6 @@
  */
 #ifndef __ASSEMBLY__
 #include <asm/processor.h>
-#include <asm/fixmap.h>
 #include <linux/threads.h>
 #include <asm/paravirt.h>
 
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 73c7ccc38912..9991224f6238 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -35,15 +35,22 @@ extern void paging_init(void);
 #define pud_ERROR(e)					\
 	pr_err("%s:%d: bad pud %p(%016lx)\n",		\
 	       __FILE__, __LINE__, &(e), pud_val(e))
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+#define p4d_ERROR(e)					\
+	pr_err("%s:%d: bad p4d %p(%016lx)\n",		\
+	       __FILE__, __LINE__, &(e), p4d_val(e))
+#endif
+
 #define pgd_ERROR(e)					\
 	pr_err("%s:%d: bad pgd %p(%016lx)\n",		\
 	       __FILE__, __LINE__, &(e), pgd_val(e))
 
 struct mm_struct;
 
+void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
 void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
 
-
 static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
 				    pte_t *ptep)
 {
@@ -121,6 +128,20 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
 #endif
 }
 
+static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
+{
+	*p4dp = p4d;
+}
+
+static inline void native_p4d_clear(p4d_t *p4d)
+{
+#ifdef CONFIG_X86_5LEVEL
+	native_set_p4d(p4d, native_make_p4d(0));
+#else
+	native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)});
+#endif
+}
+
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
 	*pgdp = pgd;
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 3a264200c62f..06470da156ba 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h | |||
@@ -13,6 +13,7 @@ | |||
13 | typedef unsigned long pteval_t; | 13 | typedef unsigned long pteval_t; |
14 | typedef unsigned long pmdval_t; | 14 | typedef unsigned long pmdval_t; |
15 | typedef unsigned long pudval_t; | 15 | typedef unsigned long pudval_t; |
16 | typedef unsigned long p4dval_t; | ||
16 | typedef unsigned long pgdval_t; | 17 | typedef unsigned long pgdval_t; |
17 | typedef unsigned long pgprotval_t; | 18 | typedef unsigned long pgprotval_t; |
18 | 19 | ||
@@ -22,12 +23,32 @@ typedef struct { pteval_t pte; } pte_t; | |||
22 | 23 | ||
23 | #define SHARED_KERNEL_PMD 0 | 24 | #define SHARED_KERNEL_PMD 0 |
24 | 25 | ||
26 | #ifdef CONFIG_X86_5LEVEL | ||
27 | |||
28 | /* | ||
29 | * PGDIR_SHIFT determines what a top-level page table entry can map | ||
30 | */ | ||
31 | #define PGDIR_SHIFT 48 | ||
32 | #define PTRS_PER_PGD 512 | ||
33 | |||
34 | /* | ||
35 | * 4th level page in 5-level paging case | ||
36 | */ | ||
37 | #define P4D_SHIFT 39 | ||
38 | #define PTRS_PER_P4D 512 | ||
39 | #define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) | ||
40 | #define P4D_MASK (~(P4D_SIZE - 1)) | ||
41 | |||
42 | #else /* CONFIG_X86_5LEVEL */ | ||
43 | |||
25 | /* | 44 | /* |
26 | * PGDIR_SHIFT determines what a top-level page table entry can map | 45 | * PGDIR_SHIFT determines what a top-level page table entry can map |
27 | */ | 46 | */ |
28 | #define PGDIR_SHIFT 39 | 47 | #define PGDIR_SHIFT 39 |
29 | #define PTRS_PER_PGD 512 | 48 | #define PTRS_PER_PGD 512 |
30 | 49 | ||
50 | #endif /* CONFIG_X86_5LEVEL */ | ||
51 | |||
31 | /* | 52 | /* |
32 | * 3rd level page | 53 | * 3rd level page |
33 | */ | 54 | */ |
@@ -55,9 +76,15 @@ typedef struct { pteval_t pte; } pte_t; | |||
55 | 76 | ||
56 | /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ | 77 | /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ |
57 | #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) | 78 | #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) |
79 | #ifdef CONFIG_X86_5LEVEL | ||
80 | #define VMALLOC_SIZE_TB _AC(16384, UL) | ||
81 | #define __VMALLOC_BASE _AC(0xff92000000000000, UL) | ||
82 | #define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) | ||
83 | #else | ||
58 | #define VMALLOC_SIZE_TB _AC(32, UL) | 84 | #define VMALLOC_SIZE_TB _AC(32, UL) |
59 | #define __VMALLOC_BASE _AC(0xffffc90000000000, UL) | 85 | #define __VMALLOC_BASE _AC(0xffffc90000000000, UL) |
60 | #define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) | 86 | #define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) |
87 | #endif | ||
61 | #ifdef CONFIG_RANDOMIZE_MEMORY | 88 | #ifdef CONFIG_RANDOMIZE_MEMORY |
62 | #define VMALLOC_START vmalloc_base | 89 | #define VMALLOC_START vmalloc_base |
63 | #define VMEMMAP_START vmemmap_base | 90 | #define VMEMMAP_START vmemmap_base |
@@ -67,10 +94,11 @@ typedef struct { pteval_t pte; } pte_t; | |||
67 | #endif /* CONFIG_RANDOMIZE_MEMORY */ | 94 | #endif /* CONFIG_RANDOMIZE_MEMORY */ |
68 | #define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) | 95 | #define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) |
69 | #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) | 96 | #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) |
70 | #define MODULES_END _AC(0xffffffffff000000, UL) | 97 | /* The module section ends with the start of the fixmap */ |
98 | #define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) | ||
71 | #define MODULES_LEN (MODULES_END - MODULES_VADDR) | 99 | #define MODULES_LEN (MODULES_END - MODULES_VADDR) |
72 | #define ESPFIX_PGD_ENTRY _AC(-2, UL) | 100 | #define ESPFIX_PGD_ENTRY _AC(-2, UL) |
73 | #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT) | 101 | #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) |
74 | #define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) | 102 | #define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) |
75 | #define EFI_VA_END (-68 * (_AC(1, UL) << 30)) | 103 | #define EFI_VA_END (-68 * (_AC(1, UL) << 30)) |
76 | 104 | ||
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 62484333673d..bf9638e1ee42 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
@@ -272,9 +272,28 @@ static inline pgdval_t pgd_flags(pgd_t pgd) | |||
272 | return native_pgd_val(pgd) & PTE_FLAGS_MASK; | 272 | return native_pgd_val(pgd) & PTE_FLAGS_MASK; |
273 | } | 273 | } |
274 | 274 | ||
275 | #if CONFIG_PGTABLE_LEVELS > 3 | 275 | #if CONFIG_PGTABLE_LEVELS > 4 |
276 | #include <asm-generic/5level-fixup.h> | 276 | typedef struct { p4dval_t p4d; } p4d_t; |
277 | |||
278 | static inline p4d_t native_make_p4d(pudval_t val) | ||
279 | { | ||
280 | return (p4d_t) { val }; | ||
281 | } | ||
282 | |||
283 | static inline p4dval_t native_p4d_val(p4d_t p4d) | ||
284 | { | ||
285 | return p4d.p4d; | ||
286 | } | ||
287 | #else | ||
288 | #include <asm-generic/pgtable-nop4d.h> | ||
289 | |||
290 | static inline p4dval_t native_p4d_val(p4d_t p4d) | ||
291 | { | ||
292 | return native_pgd_val(p4d.pgd); | ||
293 | } | ||
294 | #endif | ||
277 | 295 | ||
296 | #if CONFIG_PGTABLE_LEVELS > 3 | ||
278 | typedef struct { pudval_t pud; } pud_t; | 297 | typedef struct { pudval_t pud; } pud_t; |
279 | 298 | ||
280 | static inline pud_t native_make_pud(pmdval_t val) | 299 | static inline pud_t native_make_pud(pmdval_t val) |
@@ -287,12 +306,11 @@ static inline pudval_t native_pud_val(pud_t pud) | |||
287 | return pud.pud; | 306 | return pud.pud; |
288 | } | 307 | } |
289 | #else | 308 | #else |
290 | #define __ARCH_USE_5LEVEL_HACK | ||
291 | #include <asm-generic/pgtable-nopud.h> | 309 | #include <asm-generic/pgtable-nopud.h> |
292 | 310 | ||
293 | static inline pudval_t native_pud_val(pud_t pud) | 311 | static inline pudval_t native_pud_val(pud_t pud) |
294 | { | 312 | { |
295 | return native_pgd_val(pud.pgd); | 313 | return native_pgd_val(pud.p4d.pgd); |
296 | } | 314 | } |
297 | #endif | 315 | #endif |
298 | 316 | ||
@@ -309,15 +327,30 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) | |||
309 | return pmd.pmd; | 327 | return pmd.pmd; |
310 | } | 328 | } |
311 | #else | 329 | #else |
312 | #define __ARCH_USE_5LEVEL_HACK | ||
313 | #include <asm-generic/pgtable-nopmd.h> | 330 | #include <asm-generic/pgtable-nopmd.h> |
314 | 331 | ||
315 | static inline pmdval_t native_pmd_val(pmd_t pmd) | 332 | static inline pmdval_t native_pmd_val(pmd_t pmd) |
316 | { | 333 | { |
317 | return native_pgd_val(pmd.pud.pgd); | 334 | return native_pgd_val(pmd.pud.p4d.pgd); |
318 | } | 335 | } |
319 | #endif | 336 | #endif |
320 | 337 | ||
338 | static inline p4dval_t p4d_pfn_mask(p4d_t p4d) | ||
339 | { | ||
340 | /* No 512 GiB huge pages yet */ | ||
341 | return PTE_PFN_MASK; | ||
342 | } | ||
343 | |||
344 | static inline p4dval_t p4d_flags_mask(p4d_t p4d) | ||
345 | { | ||
346 | return ~p4d_pfn_mask(p4d); | ||
347 | } | ||
348 | |||
349 | static inline p4dval_t p4d_flags(p4d_t p4d) | ||
350 | { | ||
351 | return native_p4d_val(p4d) & p4d_flags_mask(p4d); | ||
352 | } | ||
353 | |||
321 | static inline pudval_t pud_pfn_mask(pud_t pud) | 354 | static inline pudval_t pud_pfn_mask(pud_t pud) |
322 | { | 355 | { |
323 | if (native_pud_val(pud) & _PAGE_PSE) | 356 | if (native_pud_val(pud) & _PAGE_PSE) |
@@ -461,6 +494,7 @@ enum pg_level { | |||
461 | PG_LEVEL_4K, | 494 | PG_LEVEL_4K, |
462 | PG_LEVEL_2M, | 495 | PG_LEVEL_2M, |
463 | PG_LEVEL_1G, | 496 | PG_LEVEL_1G, |
497 | PG_LEVEL_512G, | ||
464 | PG_LEVEL_NUM | 498 | PG_LEVEL_NUM |
465 | }; | 499 | }; |
466 | 500 | ||
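The new p4d accessors follow the same pfn/flags split as the other levels: one 64-bit entry holds both a physical frame number and attribute bits, separated by PTE_PFN_MASK. A small standalone sketch; the mask value below is an assumption matching the usual x86-64 layout (frame number in bits 12..51):

#include <stdio.h>
#include <stdint.h>

/* Assumed mask: physical frame number lives in bits 12..51 on x86-64. */
#define PTE_PFN_MASK 0x000ffffffffff000ull

int main(void)
{
    uint64_t val = 0x1234000ull | 0x67;  /* frame bits | _PAGE_TABLE-style flags */
    printf("pfn bits:  %#llx\n", (unsigned long long)(val & PTE_PFN_MASK));
    printf("flag bits: %#llx\n", (unsigned long long)(val & ~PTE_PFN_MASK));
    return 0;
}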
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 78defd0aa220..3cada998a402 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h | |||
@@ -709,6 +709,8 @@ extern struct desc_ptr early_gdt_descr; | |||
709 | 709 | ||
710 | extern void cpu_set_gdt(int); | 710 | extern void cpu_set_gdt(int); |
711 | extern void switch_to_new_gdt(int); | 711 | extern void switch_to_new_gdt(int); |
712 | extern void load_direct_gdt(int); | ||
713 | extern void load_fixmap_gdt(int); | ||
712 | extern void load_percpu_segment(int); | 714 | extern void load_percpu_segment(int); |
713 | extern void cpu_init(void); | 715 | extern void cpu_init(void); |
714 | 716 | ||
@@ -790,6 +792,7 @@ static inline void spin_lock_prefetch(const void *x) | |||
790 | /* | 792 | /* |
791 | * User space process size: 3GB (default). | 793 | * User space process size: 3GB (default). |
792 | */ | 794 | */ |
795 | #define IA32_PAGE_OFFSET PAGE_OFFSET | ||
793 | #define TASK_SIZE PAGE_OFFSET | 796 | #define TASK_SIZE PAGE_OFFSET |
794 | #define TASK_SIZE_MAX TASK_SIZE | 797 | #define TASK_SIZE_MAX TASK_SIZE |
795 | #define STACK_TOP TASK_SIZE | 798 | #define STACK_TOP TASK_SIZE |
@@ -866,7 +869,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, | |||
866 | * This decides where the kernel will search for a free chunk of vm | 869 | * This decides where the kernel will search for a free chunk of vm |
867 | * space during mmap's. | 870 | * space during mmap's. |
868 | */ | 871 | */ |
869 | #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) | 872 | #define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) |
873 | #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE) | ||
870 | 874 | ||
871 | #define KSTK_EIP(task) (task_pt_regs(task)->ip) | 875 | #define KSTK_EIP(task) (task_pt_regs(task)->ip) |
872 | 876 | ||
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index fac9a5c0abe9..d91ba04dd007 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h | |||
@@ -53,6 +53,12 @@ | |||
53 | # define NEED_MOVBE 0 | 53 | # define NEED_MOVBE 0 |
54 | #endif | 54 | #endif |
55 | 55 | ||
56 | #ifdef CONFIG_X86_5LEVEL | ||
57 | # define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31)) | ||
58 | #else | ||
59 | # define NEED_LA57 0 | ||
60 | #endif | ||
61 | |||
56 | #ifdef CONFIG_X86_64 | 62 | #ifdef CONFIG_X86_64 |
57 | #ifdef CONFIG_PARAVIRT | 63 | #ifdef CONFIG_PARAVIRT |
58 | /* Paravirtualized systems may not have PSE or PGE available */ | 64 | /* Paravirtualized systems may not have PSE or PGE available */ |
@@ -98,7 +104,7 @@ | |||
98 | #define REQUIRED_MASK13 0 | 104 | #define REQUIRED_MASK13 0 |
99 | #define REQUIRED_MASK14 0 | 105 | #define REQUIRED_MASK14 0 |
100 | #define REQUIRED_MASK15 0 | 106 | #define REQUIRED_MASK15 0 |
101 | #define REQUIRED_MASK16 0 | 107 | #define REQUIRED_MASK16 (NEED_LA57) |
102 | #define REQUIRED_MASK17 0 | 108 | #define REQUIRED_MASK17 0 |
103 | #define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) | 109 | #define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) |
104 | 110 | ||
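REQUIRED_MASK16 works because each cpufeature word holds 32 bits: `feature >> 5` names the word and `1 << (feature & 31)` names the bit inside it. A quick check of that arithmetic, assuming X86_FEATURE_LA57 is bit 16 of word 16 (consistent with it landing in REQUIRED_MASK16 above):

#include <stdio.h>

/* Assumption: X86_FEATURE_LA57 is feature number 16*32 + 16. */
#define X86_FEATURE_LA57 (16*32 + 16)

int main(void)
{
    printf("word %d, in-word mask %#x\n",
           X86_FEATURE_LA57 >> 5, 1u << (X86_FEATURE_LA57 & 31));
    return 0;
}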
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 4517d6b93188..1f5bee2c202f 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h | |||
@@ -26,8 +26,13 @@ | |||
26 | # endif | 26 | # endif |
27 | #else /* CONFIG_X86_32 */ | 27 | #else /* CONFIG_X86_32 */ |
28 | # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ | 28 | # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ |
29 | # define MAX_PHYSADDR_BITS 44 | 29 | # ifdef CONFIG_X86_5LEVEL |
30 | # define MAX_PHYSMEM_BITS 46 | 30 | # define MAX_PHYSADDR_BITS 52 |
31 | # define MAX_PHYSMEM_BITS 52 | ||
32 | # else | ||
33 | # define MAX_PHYSADDR_BITS 44 | ||
34 | # define MAX_PHYSMEM_BITS 46 | ||
35 | # endif | ||
31 | #endif | 36 | #endif |
32 | 37 | ||
33 | #endif /* CONFIG_SPARSEMEM */ | 38 | #endif /* CONFIG_SPARSEMEM */ |
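The bump from 46 to 52 physical bits is the difference between 64 TiB and 4 PiB of addressable RAM, which is why the direct map and vmemmap regions had to move in the pgtable_64_types.h hunk above. The arithmetic, for a 64-bit host:

#include <stdio.h>

int main(void)
{
    /* Requires 64-bit unsigned long for the shifts below. */
    printf("MAX_PHYSMEM_BITS 46 -> %lu TiB\n", (1UL << 46) >> 40);
    printf("MAX_PHYSMEM_BITS 52 -> %lu PiB\n", (1UL << 52) >> 50);
    return 0;
}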
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 58505f01962f..dcbd9bcce714 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h | |||
@@ -87,7 +87,7 @@ static inline void setup_stack_canary_segment(int cpu) | |||
87 | { | 87 | { |
88 | #ifdef CONFIG_X86_32 | 88 | #ifdef CONFIG_X86_32 |
89 | unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu); | 89 | unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu); |
90 | struct desc_struct *gdt_table = get_cpu_gdt_table(cpu); | 90 | struct desc_struct *gdt_table = get_cpu_gdt_rw(cpu); |
91 | struct desc_struct desc; | 91 | struct desc_struct desc; |
92 | 92 | ||
93 | desc = gdt_table[GDT_ENTRY_STACK_CANARY]; | 93 | desc = gdt_table[GDT_ENTRY_STACK_CANARY]; |
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 75d002bdb3f3..6ed9ea469b48 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h | |||
@@ -215,7 +215,6 @@ static inline void __flush_tlb_one(unsigned long addr) | |||
215 | /* | 215 | /* |
216 | * TLB flushing: | 216 | * TLB flushing: |
217 | * | 217 | * |
218 | * - flush_tlb() flushes the current mm struct TLBs | ||
219 | * - flush_tlb_all() flushes all processes TLBs | 218 | * - flush_tlb_all() flushes all processes TLBs |
220 | * - flush_tlb_mm(mm) flushes the specified mm context TLB's | 219 | * - flush_tlb_mm(mm) flushes the specified mm context TLB's |
221 | * - flush_tlb_page(vma, vmaddr) flushes one page | 220 | * - flush_tlb_page(vma, vmaddr) flushes one page |
@@ -247,11 +246,6 @@ static inline void flush_tlb_all(void) | |||
247 | __flush_tlb_all(); | 246 | __flush_tlb_all(); |
248 | } | 247 | } |
249 | 248 | ||
250 | static inline void flush_tlb(void) | ||
251 | { | ||
252 | __flush_tlb_up(); | ||
253 | } | ||
254 | |||
255 | static inline void local_flush_tlb(void) | 249 | static inline void local_flush_tlb(void) |
256 | { | 250 | { |
257 | __flush_tlb_up(); | 251 | __flush_tlb_up(); |
@@ -313,14 +307,11 @@ static inline void flush_tlb_kernel_range(unsigned long start, | |||
313 | flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) | 307 | flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) |
314 | 308 | ||
315 | extern void flush_tlb_all(void); | 309 | extern void flush_tlb_all(void); |
316 | extern void flush_tlb_current_task(void); | ||
317 | extern void flush_tlb_page(struct vm_area_struct *, unsigned long); | 310 | extern void flush_tlb_page(struct vm_area_struct *, unsigned long); |
318 | extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, | 311 | extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, |
319 | unsigned long end, unsigned long vmflag); | 312 | unsigned long end, unsigned long vmflag); |
320 | extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); | 313 | extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); |
321 | 314 | ||
322 | #define flush_tlb() flush_tlb_current_task() | ||
323 | |||
324 | void native_flush_tlb_others(const struct cpumask *cpumask, | 315 | void native_flush_tlb_others(const struct cpumask *cpumask, |
325 | struct mm_struct *mm, | 316 | struct mm_struct *mm, |
326 | unsigned long start, unsigned long end); | 317 | unsigned long start, unsigned long end); |
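With flush_tlb() gone, callers must name the mm and the virtual range explicitly, which lets the implementation flush per-page rather than wipe the whole TLB. A hedged sketch of the calling convention, with a printing stub in place of the real kernel function:

#include <stdio.h>

struct mm_struct { int dummy; };

/* Stub standing in for the kernel function of the same signature;
 * the real one chooses per-page vs. full flushes and IPIs other CPUs. */
static void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                               unsigned long end, unsigned long vmflag)
{
    (void)mm; (void)vmflag;
    printf("flush %#lx..%#lx\n", start, end);
}

int main(void)
{
    struct mm_struct mm = { 0 };
    /* Mirrors the mark_screen_rdonly() conversion later in this diff:
     * 32 pages starting at the 0xA0000 VGA window. */
    flush_tlb_mm_range(&mm, 0xA0000, 0xA0000 + 32 * 4096, 0UL);
    return 0;
}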
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 64c5e745ebad..8a5a02b1dfba 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h | |||
@@ -280,13 +280,17 @@ static inline pte_t __pte_ma(pteval_t x) | |||
280 | 280 | ||
281 | #define pmd_val_ma(v) ((v).pmd) | 281 | #define pmd_val_ma(v) ((v).pmd) |
282 | #ifdef __PAGETABLE_PUD_FOLDED | 282 | #ifdef __PAGETABLE_PUD_FOLDED |
283 | #define pud_val_ma(v) ((v).pgd.pgd) | 283 | #define pud_val_ma(v) ((v).p4d.pgd.pgd) |
284 | #else | 284 | #else |
285 | #define pud_val_ma(v) ((v).pud) | 285 | #define pud_val_ma(v) ((v).pud) |
286 | #endif | 286 | #endif |
287 | #define __pmd_ma(x) ((pmd_t) { (x) } ) | 287 | #define __pmd_ma(x) ((pmd_t) { (x) } ) |
288 | 288 | ||
289 | #define pgd_val_ma(x) ((x).pgd) | 289 | #ifdef __PAGETABLE_P4D_FOLDED |
290 | #define p4d_val_ma(x) ((x).pgd.pgd) | ||
291 | #else | ||
292 | #define p4d_val_ma(x) ((x).p4d) | ||
293 | #endif | ||
290 | 294 | ||
291 | void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); | 295 | void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); |
292 | 296 | ||
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 48587335ede8..ed014814ea35 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c | |||
@@ -101,7 +101,7 @@ int x86_acpi_suspend_lowlevel(void) | |||
101 | #ifdef CONFIG_SMP | 101 | #ifdef CONFIG_SMP |
102 | initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); | 102 | initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); |
103 | early_gdt_descr.address = | 103 | early_gdt_descr.address = |
104 | (unsigned long)get_cpu_gdt_table(smp_processor_id()); | 104 | (unsigned long)get_cpu_gdt_rw(smp_processor_id()); |
105 | initial_gs = per_cpu_offset(smp_processor_id()); | 105 | initial_gs = per_cpu_offset(smp_processor_id()); |
106 | #endif | 106 | #endif |
107 | initial_code = (unsigned long)wakeup_long64; | 107 | initial_code = (unsigned long)wakeup_long64; |
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5a414545e8a3..446b0d3d4932 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c | |||
@@ -609,7 +609,7 @@ static long __apm_bios_call(void *_call) | |||
609 | 609 | ||
610 | cpu = get_cpu(); | 610 | cpu = get_cpu(); |
611 | BUG_ON(cpu != 0); | 611 | BUG_ON(cpu != 0); |
612 | gdt = get_cpu_gdt_table(cpu); | 612 | gdt = get_cpu_gdt_rw(cpu); |
613 | save_desc_40 = gdt[0x40 / 8]; | 613 | save_desc_40 = gdt[0x40 / 8]; |
614 | gdt[0x40 / 8] = bad_bios_desc; | 614 | gdt[0x40 / 8] = bad_bios_desc; |
615 | 615 | ||
@@ -685,7 +685,7 @@ static long __apm_bios_call_simple(void *_call) | |||
685 | 685 | ||
686 | cpu = get_cpu(); | 686 | cpu = get_cpu(); |
687 | BUG_ON(cpu != 0); | 687 | BUG_ON(cpu != 0); |
688 | gdt = get_cpu_gdt_table(cpu); | 688 | gdt = get_cpu_gdt_rw(cpu); |
689 | save_desc_40 = gdt[0x40 / 8]; | 689 | save_desc_40 = gdt[0x40 / 8]; |
690 | gdt[0x40 / 8] = bad_bios_desc; | 690 | gdt[0x40 / 8] = bad_bios_desc; |
691 | 691 | ||
@@ -2352,7 +2352,7 @@ static int __init apm_init(void) | |||
2352 | * Note we only set APM segments on CPU zero, since we pin the APM | 2352 | * Note we only set APM segments on CPU zero, since we pin the APM |
2353 | * code to that CPU. | 2353 | * code to that CPU. |
2354 | */ | 2354 | */ |
2355 | gdt = get_cpu_gdt_table(0); | 2355 | gdt = get_cpu_gdt_rw(0); |
2356 | set_desc_base(&gdt[APM_CS >> 3], | 2356 | set_desc_base(&gdt[APM_CS >> 3], |
2357 | (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); | 2357 | (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); |
2358 | set_desc_base(&gdt[APM_CS_16 >> 3], | 2358 | set_desc_base(&gdt[APM_CS_16 >> 3], |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 58094a1f9e9d..8ee32119144d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -448,19 +448,60 @@ void load_percpu_segment(int cpu) | |||
448 | load_stack_canary_segment(); | 448 | load_stack_canary_segment(); |
449 | } | 449 | } |
450 | 450 | ||
451 | /* Setup the fixmap mapping only once per-processor */ | ||
452 | static inline void setup_fixmap_gdt(int cpu) | ||
453 | { | ||
454 | #ifdef CONFIG_X86_64 | ||
455 | /* On 64-bit systems, we use a read-only fixmap GDT. */ | ||
456 | pgprot_t prot = PAGE_KERNEL_RO; | ||
457 | #else | ||
458 | /* | ||
459 | * On native 32-bit systems, the GDT cannot be read-only because | ||
460 | * our double fault handler uses a task gate, and entering through | ||
461 | * a task gate needs to change an available TSS to busy. If the GDT | ||
462 | * is read-only, that will triple fault. | ||
463 | * | ||
464 | * On Xen PV, the GDT must be read-only because the hypervisor requires | ||
465 | * it. | ||
466 | */ | ||
467 | pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? | ||
468 | PAGE_KERNEL_RO : PAGE_KERNEL; | ||
469 | #endif | ||
470 | |||
471 | __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); | ||
472 | } | ||
473 | |||
474 | /* Load the original GDT from the per-cpu structure */ | ||
475 | void load_direct_gdt(int cpu) | ||
476 | { | ||
477 | struct desc_ptr gdt_descr; | ||
478 | |||
479 | gdt_descr.address = (long)get_cpu_gdt_rw(cpu); | ||
480 | gdt_descr.size = GDT_SIZE - 1; | ||
481 | load_gdt(&gdt_descr); | ||
482 | } | ||
483 | EXPORT_SYMBOL_GPL(load_direct_gdt); | ||
484 | |||
485 | /* Load a fixmap remapping of the per-cpu GDT */ | ||
486 | void load_fixmap_gdt(int cpu) | ||
487 | { | ||
488 | struct desc_ptr gdt_descr; | ||
489 | |||
490 | gdt_descr.address = (long)get_cpu_gdt_ro(cpu); | ||
491 | gdt_descr.size = GDT_SIZE - 1; | ||
492 | load_gdt(&gdt_descr); | ||
493 | } | ||
494 | EXPORT_SYMBOL_GPL(load_fixmap_gdt); | ||
495 | |||
451 | /* | 496 | /* |
452 | * Current gdt points %fs at the "master" per-cpu area: after this, | 497 | * Current gdt points %fs at the "master" per-cpu area: after this, |
453 | * it's on the real one. | 498 | * it's on the real one. |
454 | */ | 499 | */ |
455 | void switch_to_new_gdt(int cpu) | 500 | void switch_to_new_gdt(int cpu) |
456 | { | 501 | { |
457 | struct desc_ptr gdt_descr; | 502 | /* Load the original GDT */ |
458 | 503 | load_direct_gdt(cpu); | |
459 | gdt_descr.address = (long)get_cpu_gdt_table(cpu); | ||
460 | gdt_descr.size = GDT_SIZE - 1; | ||
461 | load_gdt(&gdt_descr); | ||
462 | /* Reload the per-cpu base */ | 504 | /* Reload the per-cpu base */ |
463 | |||
464 | load_percpu_segment(cpu); | 505 | load_percpu_segment(cpu); |
465 | } | 506 | } |
466 | 507 | ||
@@ -1526,6 +1567,9 @@ void cpu_init(void) | |||
1526 | 1567 | ||
1527 | if (is_uv_system()) | 1568 | if (is_uv_system()) |
1528 | uv_cpu_init(); | 1569 | uv_cpu_init(); |
1570 | |||
1571 | setup_fixmap_gdt(cpu); | ||
1572 | load_fixmap_gdt(cpu); | ||
1529 | } | 1573 | } |
1530 | 1574 | ||
1531 | #else | 1575 | #else |
@@ -1581,6 +1625,9 @@ void cpu_init(void) | |||
1581 | dbg_restore_debug_regs(); | 1625 | dbg_restore_debug_regs(); |
1582 | 1626 | ||
1583 | fpu__init_cpu(); | 1627 | fpu__init_cpu(); |
1628 | |||
1629 | setup_fixmap_gdt(cpu); | ||
1630 | load_fixmap_gdt(cpu); | ||
1584 | } | 1631 | } |
1585 | #endif | 1632 | #endif |
1586 | 1633 | ||
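load_direct_gdt() and load_fixmap_gdt() differ only in which alias of the same per-cpu GDT page the GDTR ends up pointing at: the writable direct-map address or the read-only fixmap address. The descriptor they build is the usual base/limit pair; a standalone sketch, with GDT_ENTRIES as an assumption and the struct left unpacked for illustration:

#include <stdio.h>
#include <stdint.h>

/* Unpacked stand-in for the kernel's desc_ptr (the real one is packed). */
struct desc_ptr {
    uint16_t size;          /* limit = size in bytes - 1, per lgdt */
    unsigned long address;  /* linear base: rw alias or ro fixmap alias */
};

int main(void)
{
    enum { GDT_ENTRIES = 32, ENTRY_BYTES = 8 };  /* GDT_ENTRIES is assumed */
    struct desc_ptr gdt_descr = {
        .size = GDT_ENTRIES * ENTRY_BYTES - 1,   /* GDT_SIZE - 1 above */
        .address = 0,  /* would be get_cpu_gdt_rw(cpu) or get_cpu_gdt_ro(cpu) */
    };
    printf("lgdt limit=%u base=%#lx\n", gdt_descr.size, gdt_descr.address);
    return 0;
}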
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 6e9b26fa6d05..d78a586ba8dc 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -270,7 +270,6 @@ int __init e820__update_table(struct e820_table *table) | |||
270 | if (table->nr_entries < 2) | 270 | if (table->nr_entries < 2) |
271 | return -1; | 271 | return -1; |
272 | 272 | ||
273 | table->nr_entries = table->nr_entries; | ||
274 | BUG_ON(table->nr_entries > max_nr_entries); | 273 | BUG_ON(table->nr_entries > max_nr_entries); |
275 | 274 | ||
276 | /* Bail out if we find any unreasonable addresses in the map: */ | 275 | /* Bail out if we find any unreasonable addresses in the map: */ |
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 04f89caef9c4..8e598a1ad986 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c | |||
@@ -50,11 +50,11 @@ | |||
50 | #define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) | 50 | #define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) |
51 | 51 | ||
52 | /* There is address space for how many espfix pages? */ | 52 | /* There is address space for how many espfix pages? */ |
53 | #define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16)) | 53 | #define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16)) |
54 | 54 | ||
55 | #define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) | 55 | #define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) |
56 | #if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS | 56 | #if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS |
57 | # error "Need more than one PGD for the ESPFIX hack" | 57 | # error "Need more virtual address space for the ESPFIX hack" |
58 | #endif | 58 | #endif |
59 | 59 | ||
60 | #define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) | 60 | #define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) |
@@ -121,11 +121,13 @@ static void init_espfix_random(void) | |||
121 | 121 | ||
122 | void __init init_espfix_bsp(void) | 122 | void __init init_espfix_bsp(void) |
123 | { | 123 | { |
124 | pgd_t *pgd_p; | 124 | pgd_t *pgd; |
125 | p4d_t *p4d; | ||
125 | 126 | ||
126 | /* Install the espfix pud into the kernel page directory */ | 127 | /* Install the espfix pud into the kernel page directory */ |
127 | pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; | 128 | pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; |
128 | pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); | 129 | p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); |
130 | p4d_populate(&init_mm, p4d, espfix_pud_page); | ||
129 | 131 | ||
130 | /* Randomize the locations */ | 132 | /* Randomize the locations */ |
131 | init_espfix_random(); | 133 | init_espfix_random(); |
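ESPFIX_PAGE_SPACE shrinks its exponent from PGDIR_SHIFT to P4D_SHIFT because the espfix area now hangs off a single p4d entry rather than a pgd entry; the CPU budget stays comfortable either way. The arithmetic, assuming a 64-byte espfix stack slot (that define sits outside the shown hunk):

#include <stdio.h>

int main(void)
{
    /* Shifts from the hunk above; the 64-byte stack slot is an assumption. */
    unsigned long page_space = 1UL << (39 /*P4D_SHIFT*/ - 12 /*PAGE_SHIFT*/ - 16);
    unsigned long stacks_per_page = 4096 / 64;
    printf("espfix pages: %lu, ESPFIX_MAX_CPUS: %lu\n",
           page_space, page_space * stacks_per_page);
    return 0;
}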
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 469b23d6acc2..5f43cec296c5 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c | |||
@@ -103,6 +103,7 @@ static void machine_kexec_page_table_set_one( | |||
103 | pgd_t *pgd, pmd_t *pmd, pte_t *pte, | 103 | pgd_t *pgd, pmd_t *pmd, pte_t *pte, |
104 | unsigned long vaddr, unsigned long paddr) | 104 | unsigned long vaddr, unsigned long paddr) |
105 | { | 105 | { |
106 | p4d_t *p4d; | ||
106 | pud_t *pud; | 107 | pud_t *pud; |
107 | 108 | ||
108 | pgd += pgd_index(vaddr); | 109 | pgd += pgd_index(vaddr); |
@@ -110,7 +111,8 @@ static void machine_kexec_page_table_set_one( | |||
110 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) | 111 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) |
111 | set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); | 112 | set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); |
112 | #endif | 113 | #endif |
113 | pud = pud_offset(pgd, vaddr); | 114 | p4d = p4d_offset(pgd, vaddr); |
115 | pud = pud_offset(p4d, vaddr); | ||
114 | pmd = pmd_offset(pud, vaddr); | 116 | pmd = pmd_offset(pud, vaddr); |
115 | if (!(pmd_val(*pmd) & _PAGE_PRESENT)) | 117 | if (!(pmd_val(*pmd) & _PAGE_PRESENT)) |
116 | set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); | 118 | set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); |
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 857cdbd02867..085c3b300d32 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c | |||
@@ -36,6 +36,7 @@ static struct kexec_file_ops *kexec_file_loaders[] = { | |||
36 | 36 | ||
37 | static void free_transition_pgtable(struct kimage *image) | 37 | static void free_transition_pgtable(struct kimage *image) |
38 | { | 38 | { |
39 | free_page((unsigned long)image->arch.p4d); | ||
39 | free_page((unsigned long)image->arch.pud); | 40 | free_page((unsigned long)image->arch.pud); |
40 | free_page((unsigned long)image->arch.pmd); | 41 | free_page((unsigned long)image->arch.pmd); |
41 | free_page((unsigned long)image->arch.pte); | 42 | free_page((unsigned long)image->arch.pte); |
@@ -43,6 +44,7 @@ static void free_transition_pgtable(struct kimage *image) | |||
43 | 44 | ||
44 | static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) | 45 | static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) |
45 | { | 46 | { |
47 | p4d_t *p4d; | ||
46 | pud_t *pud; | 48 | pud_t *pud; |
47 | pmd_t *pmd; | 49 | pmd_t *pmd; |
48 | pte_t *pte; | 50 | pte_t *pte; |
@@ -53,13 +55,21 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) | |||
53 | paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); | 55 | paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); |
54 | pgd += pgd_index(vaddr); | 56 | pgd += pgd_index(vaddr); |
55 | if (!pgd_present(*pgd)) { | 57 | if (!pgd_present(*pgd)) { |
58 | p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); | ||
59 | if (!p4d) | ||
60 | goto err; | ||
61 | image->arch.p4d = p4d; | ||
62 | set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); | ||
63 | } | ||
64 | p4d = p4d_offset(pgd, vaddr); | ||
65 | if (!p4d_present(*p4d)) { | ||
56 | pud = (pud_t *)get_zeroed_page(GFP_KERNEL); | 66 | pud = (pud_t *)get_zeroed_page(GFP_KERNEL); |
57 | if (!pud) | 67 | if (!pud) |
58 | goto err; | 68 | goto err; |
59 | image->arch.pud = pud; | 69 | image->arch.pud = pud; |
60 | set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); | 70 | set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); |
61 | } | 71 | } |
62 | pud = pud_offset(pgd, vaddr); | 72 | pud = pud_offset(p4d, vaddr); |
63 | if (!pud_present(*pud)) { | 73 | if (!pud_present(*pud)) { |
64 | pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); | 74 | pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); |
65 | if (!pmd) | 75 | if (!pmd) |
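init_transition_pgtable() now repeats the same allocate-if-absent step once more for the p4d level: test the entry, allocate a zeroed page on demand, stash it for free_transition_pgtable(), then descend. A toy userspace version of one step of that walk:

#include <stdio.h>
#include <stdlib.h>

#define ENTRIES 8  /* toy table width; the real tables have 512 slots */

int main(void)
{
    void *pgd[ENTRIES] = { 0 };
    unsigned idx = 3;

    if (!pgd[idx]) {                                 /* !pgd_present(*pgd) */
        void *p4d = calloc(ENTRIES, sizeof(void *)); /* get_zeroed_page() */
        if (!p4d)
            return 1;
        pgd[idx] = p4d;               /* set_pgd(pgd, __pa(p4d) | flags) */
    }
    printf("p4d table for slot %u at %p\n", idx, pgd[idx]);
    free(pgd[idx]);
    return 0;
}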
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 4797e87b0fb6..3586996fc50d 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
@@ -405,9 +405,11 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { | |||
405 | .alloc_pte = paravirt_nop, | 405 | .alloc_pte = paravirt_nop, |
406 | .alloc_pmd = paravirt_nop, | 406 | .alloc_pmd = paravirt_nop, |
407 | .alloc_pud = paravirt_nop, | 407 | .alloc_pud = paravirt_nop, |
408 | .alloc_p4d = paravirt_nop, | ||
408 | .release_pte = paravirt_nop, | 409 | .release_pte = paravirt_nop, |
409 | .release_pmd = paravirt_nop, | 410 | .release_pmd = paravirt_nop, |
410 | .release_pud = paravirt_nop, | 411 | .release_pud = paravirt_nop, |
412 | .release_p4d = paravirt_nop, | ||
411 | 413 | ||
412 | .set_pte = native_set_pte, | 414 | .set_pte = native_set_pte, |
413 | .set_pte_at = native_set_pte_at, | 415 | .set_pte_at = native_set_pte_at, |
@@ -430,12 +432,19 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { | |||
430 | .pmd_val = PTE_IDENT, | 432 | .pmd_val = PTE_IDENT, |
431 | .make_pmd = PTE_IDENT, | 433 | .make_pmd = PTE_IDENT, |
432 | 434 | ||
433 | #if CONFIG_PGTABLE_LEVELS == 4 | 435 | #if CONFIG_PGTABLE_LEVELS >= 4 |
434 | .pud_val = PTE_IDENT, | 436 | .pud_val = PTE_IDENT, |
435 | .make_pud = PTE_IDENT, | 437 | .make_pud = PTE_IDENT, |
436 | 438 | ||
439 | .set_p4d = native_set_p4d, | ||
440 | |||
441 | #if CONFIG_PGTABLE_LEVELS >= 5 | ||
442 | .p4d_val = PTE_IDENT, | ||
443 | .make_p4d = PTE_IDENT, | ||
444 | |||
437 | .set_pgd = native_set_pgd, | 445 | .set_pgd = native_set_pgd, |
438 | #endif | 446 | #endif /* CONFIG_PGTABLE_LEVELS >= 5 */ |
447 | #endif /* CONFIG_PGTABLE_LEVELS >= 4 */ | ||
439 | #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ | 448 | #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ |
440 | 449 | ||
441 | .pte_val = PTE_IDENT, | 450 | .pte_val = PTE_IDENT, |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ea1a6180bf39..825a1e47cf3e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -53,6 +53,11 @@ | |||
53 | #include <asm/xen/hypervisor.h> | 53 | #include <asm/xen/hypervisor.h> |
54 | #include <asm/vdso.h> | 54 | #include <asm/vdso.h> |
55 | #include <asm/intel_rdt.h> | 55 | #include <asm/intel_rdt.h> |
56 | #include <asm/unistd.h> | ||
57 | #ifdef CONFIG_IA32_EMULATION | ||
58 | /* Not included via unistd.h */ | ||
59 | #include <asm/unistd_32_ia32.h> | ||
60 | #endif | ||
56 | 61 | ||
57 | __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); | 62 | __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); |
58 | 63 | ||
@@ -494,6 +499,8 @@ void set_personality_64bit(void) | |||
494 | clear_thread_flag(TIF_IA32); | 499 | clear_thread_flag(TIF_IA32); |
495 | clear_thread_flag(TIF_ADDR32); | 500 | clear_thread_flag(TIF_ADDR32); |
496 | clear_thread_flag(TIF_X32); | 501 | clear_thread_flag(TIF_X32); |
502 | /* Pretend that this comes from a 64bit execve */ | ||
503 | task_pt_regs(current)->orig_ax = __NR_execve; | ||
497 | 504 | ||
498 | /* Ensure the corresponding mm is not marked. */ | 505 | /* Ensure the corresponding mm is not marked. */ |
499 | if (current->mm) | 506 | if (current->mm) |
@@ -506,32 +513,50 @@ void set_personality_64bit(void) | |||
506 | current->personality &= ~READ_IMPLIES_EXEC; | 513 | current->personality &= ~READ_IMPLIES_EXEC; |
507 | } | 514 | } |
508 | 515 | ||
509 | void set_personality_ia32(bool x32) | 516 | static void __set_personality_x32(void) |
510 | { | 517 | { |
511 | /* inherit personality from parent */ | 518 | #ifdef CONFIG_X86_X32 |
519 | clear_thread_flag(TIF_IA32); | ||
520 | set_thread_flag(TIF_X32); | ||
521 | if (current->mm) | ||
522 | current->mm->context.ia32_compat = TIF_X32; | ||
523 | current->personality &= ~READ_IMPLIES_EXEC; | ||
524 | /* | ||
525 | * in_compat_syscall() uses the presence of the x32 syscall bit | ||
526 | * flag to determine compat status. The x86 mmap() code relies on | ||
527 | * the syscall bitness so set x32 syscall bit right here to make | ||
528 | * in_compat_syscall() work during exec(). | ||
529 | * | ||
530 | * Pretend to come from a x32 execve. | ||
531 | */ | ||
532 | task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; | ||
533 | current->thread.status &= ~TS_COMPAT; | ||
534 | #endif | ||
535 | } | ||
512 | 536 | ||
537 | static void __set_personality_ia32(void) | ||
538 | { | ||
539 | #ifdef CONFIG_IA32_EMULATION | ||
540 | set_thread_flag(TIF_IA32); | ||
541 | clear_thread_flag(TIF_X32); | ||
542 | if (current->mm) | ||
543 | current->mm->context.ia32_compat = TIF_IA32; | ||
544 | current->personality |= force_personality32; | ||
545 | /* Prepare the first "return" to user space */ | ||
546 | task_pt_regs(current)->orig_ax = __NR_ia32_execve; | ||
547 | current->thread.status |= TS_COMPAT; | ||
548 | #endif | ||
549 | } | ||
550 | |||
551 | void set_personality_ia32(bool x32) | ||
552 | { | ||
513 | /* Make sure to be in 32bit mode */ | 553 | /* Make sure to be in 32bit mode */ |
514 | set_thread_flag(TIF_ADDR32); | 554 | set_thread_flag(TIF_ADDR32); |
515 | 555 | ||
516 | /* Mark the associated mm as containing 32-bit tasks. */ | 556 | if (x32) |
517 | if (x32) { | 557 | __set_personality_x32(); |
518 | clear_thread_flag(TIF_IA32); | 558 | else |
519 | set_thread_flag(TIF_X32); | 559 | __set_personality_ia32(); |
520 | if (current->mm) | ||
521 | current->mm->context.ia32_compat = TIF_X32; | ||
522 | current->personality &= ~READ_IMPLIES_EXEC; | ||
523 | /* in_compat_syscall() uses the presence of the x32 | ||
524 | syscall bit flag to determine compat status */ | ||
525 | current->thread.status &= ~TS_COMPAT; | ||
526 | } else { | ||
527 | set_thread_flag(TIF_IA32); | ||
528 | clear_thread_flag(TIF_X32); | ||
529 | if (current->mm) | ||
530 | current->mm->context.ia32_compat = TIF_IA32; | ||
531 | current->personality |= force_personality32; | ||
532 | /* Prepare the first "return" to user space */ | ||
533 | current->thread.status |= TS_COMPAT; | ||
534 | } | ||
535 | } | 560 | } |
536 | EXPORT_SYMBOL_GPL(set_personality_ia32); | 561 | EXPORT_SYMBOL_GPL(set_personality_ia32); |
537 | 562 | ||
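The refactored personality helpers seed orig_ax so that in_compat_syscall() gives the right answer during exec(), before any real syscall has happened; for x32 that means an execve number with the x32 marker bit set. The bit trick, with values that are assumptions for illustration (0x40000000 as the conventional __X32_SYSCALL_BIT, 520 as the x32 execve number):

#include <stdio.h>

/* Both values below are assumptions, not taken from this diff. */
#define __X32_SYSCALL_BIT 0x40000000u
#define __NR_x32_execve   520u

int main(void)
{
    unsigned int orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
    printf("orig_ax=%#x, looks like an x32 syscall: %d\n",
           orig_ax, !!(orig_ax & __X32_SYSCALL_BIT));
    return 0;
}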
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0b4d3c686b1e..603a1669a2ec 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -1225,21 +1225,6 @@ void __init setup_arch(char **cmdline_p) | |||
1225 | 1225 | ||
1226 | kasan_init(); | 1226 | kasan_init(); |
1227 | 1227 | ||
1228 | #ifdef CONFIG_X86_32 | ||
1229 | /* sync back kernel address range */ | ||
1230 | clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, | ||
1231 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
1232 | KERNEL_PGD_PTRS); | ||
1233 | |||
1234 | /* | ||
1235 | * sync back low identity map too. It is used for example | ||
1236 | * in the 32-bit EFI stub. | ||
1237 | */ | ||
1238 | clone_pgd_range(initial_page_table, | ||
1239 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
1240 | min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); | ||
1241 | #endif | ||
1242 | |||
1243 | tboot_probe(); | 1228 | tboot_probe(); |
1244 | 1229 | ||
1245 | map_vsyscall(); | 1230 | map_vsyscall(); |
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9820d6d977c6..bb1e8cc0bc84 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c | |||
@@ -160,7 +160,7 @@ static inline void setup_percpu_segment(int cpu) | |||
160 | pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, | 160 | pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, |
161 | 0x2 | DESCTYPE_S, 0x8); | 161 | 0x2 | DESCTYPE_S, 0x8); |
162 | gdt.s = 1; | 162 | gdt.s = 1; |
163 | write_gdt_entry(get_cpu_gdt_table(cpu), | 163 | write_gdt_entry(get_cpu_gdt_rw(cpu), |
164 | GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); | 164 | GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); |
165 | #endif | 165 | #endif |
166 | } | 166 | } |
@@ -288,4 +288,25 @@ void __init setup_per_cpu_areas(void) | |||
288 | 288 | ||
289 | /* Setup cpu initialized, callin, callout masks */ | 289 | /* Setup cpu initialized, callin, callout masks */ |
290 | setup_cpu_local_masks(); | 290 | setup_cpu_local_masks(); |
291 | |||
292 | #ifdef CONFIG_X86_32 | ||
293 | /* | ||
294 | * Sync back kernel address range. We want to make sure that | ||
295 | * all kernel mappings, including percpu mappings, are available | ||
296 | * in the smpboot asm. We can't reliably pick up percpu | ||
297 | * mappings using vmalloc_fault(), because exception dispatch | ||
298 | * needs percpu data. | ||
299 | */ | ||
300 | clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, | ||
301 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
302 | KERNEL_PGD_PTRS); | ||
303 | |||
304 | /* | ||
305 | * sync back low identity map too. It is used for example | ||
306 | * in the 32-bit EFI stub. | ||
307 | */ | ||
308 | clone_pgd_range(initial_page_table, | ||
309 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
310 | min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); | ||
311 | #endif | ||
291 | } | 312 | } |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bd1f1ad35284..f04479a8f74f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -983,7 +983,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) | |||
983 | unsigned long timeout; | 983 | unsigned long timeout; |
984 | 984 | ||
985 | idle->thread.sp = (unsigned long)task_pt_regs(idle); | 985 | idle->thread.sp = (unsigned long)task_pt_regs(idle); |
986 | early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); | 986 | early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); |
987 | initial_code = (unsigned long)start_secondary; | 987 | initial_code = (unsigned long)start_secondary; |
988 | initial_stack = idle->thread.sp; | 988 | initial_stack = idle->thread.sp; |
989 | 989 | ||
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 50215a4b9347..207b8f2582c7 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c | |||
@@ -17,6 +17,8 @@ | |||
17 | #include <linux/uaccess.h> | 17 | #include <linux/uaccess.h> |
18 | #include <linux/elf.h> | 18 | #include <linux/elf.h> |
19 | 19 | ||
20 | #include <asm/elf.h> | ||
21 | #include <asm/compat.h> | ||
20 | #include <asm/ia32.h> | 22 | #include <asm/ia32.h> |
21 | #include <asm/syscalls.h> | 23 | #include <asm/syscalls.h> |
22 | 24 | ||
@@ -101,7 +103,7 @@ out: | |||
101 | static void find_start_end(unsigned long flags, unsigned long *begin, | 103 | static void find_start_end(unsigned long flags, unsigned long *begin, |
102 | unsigned long *end) | 104 | unsigned long *end) |
103 | { | 105 | { |
104 | if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) { | 106 | if (!in_compat_syscall() && (flags & MAP_32BIT)) { |
105 | /* This is usually needed to map code in small | 107 | /* This is usually needed to map code in small |

106 | model, so it needs to be in the first 31bit. Limit | 108 | model, so it needs to be in the first 31bit. Limit |
107 | it to that. This means we need to move the | 109 | it to that. This means we need to move the |
@@ -114,10 +116,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin, | |||
114 | if (current->flags & PF_RANDOMIZE) { | 116 | if (current->flags & PF_RANDOMIZE) { |
115 | *begin = randomize_page(*begin, 0x02000000); | 117 | *begin = randomize_page(*begin, 0x02000000); |
116 | } | 118 | } |
117 | } else { | 119 | return; |
118 | *begin = current->mm->mmap_legacy_base; | ||
119 | *end = TASK_SIZE; | ||
120 | } | 120 | } |
121 | |||
122 | *begin = get_mmap_base(1); | ||
123 | *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit(); | ||
121 | } | 124 | } |
122 | 125 | ||
123 | unsigned long | 126 | unsigned long |
@@ -176,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
176 | return addr; | 179 | return addr; |
177 | 180 | ||
178 | /* for MAP_32BIT mappings we force the legacy mmap base */ | 181 | /* for MAP_32BIT mappings we force the legacy mmap base */ |
179 | if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) | 182 | if (!in_compat_syscall() && (flags & MAP_32BIT)) |
180 | goto bottomup; | 183 | goto bottomup; |
181 | 184 | ||
182 | /* requesting a specific address */ | 185 | /* requesting a specific address */ |
@@ -191,7 +194,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
191 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; | 194 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
192 | info.length = len; | 195 | info.length = len; |
193 | info.low_limit = PAGE_SIZE; | 196 | info.low_limit = PAGE_SIZE; |
194 | info.high_limit = mm->mmap_base; | 197 | info.high_limit = get_mmap_base(0); |
195 | info.align_mask = 0; | 198 | info.align_mask = 0; |
196 | info.align_offset = pgoff << PAGE_SHIFT; | 199 | info.align_offset = pgoff << PAGE_SHIFT; |
197 | if (filp) { | 200 | if (filp) { |
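find_start_end() and the topdown path now key off in_compat_syscall() and fetch their base via get_mmap_base(), so a 32-bit syscall issued from a 64-bit task searches a 32-bit window. A hedged sketch of what that selector plausibly does; the field names are assumptions, not the kernel's implementation:

#include <stdio.h>

/* Assumed shape only: sketches get_mmap_base()'s legacy/topdown split. */
struct fake_mm {
    unsigned long mmap_legacy_base;  /* bottom-up searches start here */
    unsigned long mmap_base;         /* top-down searches end here    */
};

static unsigned long get_mmap_base(const struct fake_mm *mm, int legacy)
{
    return legacy ? mm->mmap_legacy_base : mm->mmap_base;
}

int main(void)
{
    struct fake_mm mm = { 0x2aaaaaaaa000ul, 0x7ffff7fff000ul };
    printf("find_start_end base: %#lx, topdown limit: %#lx\n",
           get_mmap_base(&mm, 1), get_mmap_base(&mm, 0));
    return 0;
}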
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index ccccd335ae01..d4c8011a2293 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c | |||
@@ -118,12 +118,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, | |||
118 | pgprot_t prot) | 118 | pgprot_t prot) |
119 | { | 119 | { |
120 | pgd_t *pgd; | 120 | pgd_t *pgd; |
121 | p4d_t *p4d; | ||
121 | pud_t *pud; | 122 | pud_t *pud; |
122 | pmd_t *pmd; | 123 | pmd_t *pmd; |
123 | pte_t *pte; | 124 | pte_t *pte; |
124 | 125 | ||
125 | pgd = pgd_offset(&tboot_mm, vaddr); | 126 | pgd = pgd_offset(&tboot_mm, vaddr); |
126 | pud = pud_alloc(&tboot_mm, pgd, vaddr); | 127 | p4d = p4d_alloc(&tboot_mm, pgd, vaddr); |
128 | if (!p4d) | ||
129 | return -1; | ||
130 | pud = pud_alloc(&tboot_mm, p4d, vaddr); | ||
127 | if (!pud) | 131 | if (!pud) |
128 | return -1; | 132 | return -1; |
129 | pmd = pmd_alloc(&tboot_mm, pud, vaddr); | 133 | pmd = pmd_alloc(&tboot_mm, pud, vaddr); |
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 6c8934406dc9..dcd699baea1b 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c | |||
@@ -92,10 +92,17 @@ static void set_tls_desc(struct task_struct *p, int idx, | |||
92 | cpu = get_cpu(); | 92 | cpu = get_cpu(); |
93 | 93 | ||
94 | while (n-- > 0) { | 94 | while (n-- > 0) { |
95 | if (LDT_empty(info) || LDT_zero(info)) | 95 | if (LDT_empty(info) || LDT_zero(info)) { |
96 | desc->a = desc->b = 0; | 96 | desc->a = desc->b = 0; |
97 | else | 97 | } else { |
98 | fill_ldt(desc, info); | 98 | fill_ldt(desc, info); |
99 | |||
100 | /* | ||
101 | * Always set the accessed bit so that the CPU | ||
102 | * doesn't try to write to the (read-only) GDT. | ||
103 | */ | ||
104 | desc->type |= 1; | ||
105 | } | ||
99 | ++info; | 106 | ++info; |
100 | ++desc; | 107 | ++desc; |
101 | } | 108 | } |
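The TLS fix pre-sets bit 0 of the descriptor type, the accessed bit, because once the GDT is mapped read-only the CPU's own attempt to set that bit on first use of the segment would fault. A one-line illustration of the bit in question:

#include <stdio.h>

int main(void)
{
    unsigned int type = 0x2;  /* data segment: writable, not yet accessed */
    type |= 1;                /* the accessed bit set_tls_desc() now forces */
    printf("type=%#x accessed=%u\n", type, type & 1);
    return 0;
}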
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 23ee89ce59a9..7924a5356c8a 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c | |||
@@ -164,6 +164,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) | |||
164 | struct vm_area_struct *vma; | 164 | struct vm_area_struct *vma; |
165 | spinlock_t *ptl; | 165 | spinlock_t *ptl; |
166 | pgd_t *pgd; | 166 | pgd_t *pgd; |
167 | p4d_t *p4d; | ||
167 | pud_t *pud; | 168 | pud_t *pud; |
168 | pmd_t *pmd; | 169 | pmd_t *pmd; |
169 | pte_t *pte; | 170 | pte_t *pte; |
@@ -173,7 +174,10 @@ static void mark_screen_rdonly(struct mm_struct *mm) | |||
173 | pgd = pgd_offset(mm, 0xA0000); | 174 | pgd = pgd_offset(mm, 0xA0000); |
174 | if (pgd_none_or_clear_bad(pgd)) | 175 | if (pgd_none_or_clear_bad(pgd)) |
175 | goto out; | 176 | goto out; |
176 | pud = pud_offset(pgd, 0xA0000); | 177 | p4d = p4d_offset(pgd, 0xA0000); |
178 | if (p4d_none_or_clear_bad(p4d)) | ||
179 | goto out; | ||
180 | pud = pud_offset(p4d, 0xA0000); | ||
177 | if (pud_none_or_clear_bad(pud)) | 181 | if (pud_none_or_clear_bad(pud)) |
178 | goto out; | 182 | goto out; |
179 | pmd = pmd_offset(pud, 0xA0000); | 183 | pmd = pmd_offset(pud, 0xA0000); |
@@ -193,7 +197,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) | |||
193 | pte_unmap_unlock(pte, ptl); | 197 | pte_unmap_unlock(pte, ptl); |
194 | out: | 198 | out: |
195 | up_write(&mm->mmap_sem); | 199 | up_write(&mm->mmap_sem); |
196 | flush_tlb(); | 200 | flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL); |
197 | } | 201 | } |
198 | 202 | ||
199 | 203 | ||
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5fba70646c32..5f48f62b8dc2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -741,7 +741,6 @@ static int svm_hardware_enable(void) | |||
741 | 741 | ||
742 | struct svm_cpu_data *sd; | 742 | struct svm_cpu_data *sd; |
743 | uint64_t efer; | 743 | uint64_t efer; |
744 | struct desc_ptr gdt_descr; | ||
745 | struct desc_struct *gdt; | 744 | struct desc_struct *gdt; |
746 | int me = raw_smp_processor_id(); | 745 | int me = raw_smp_processor_id(); |
747 | 746 | ||
@@ -763,8 +762,7 @@ static int svm_hardware_enable(void) | |||
763 | sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; | 762 | sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; |
764 | sd->next_asid = sd->max_asid + 1; | 763 | sd->next_asid = sd->max_asid + 1; |
765 | 764 | ||
766 | native_store_gdt(&gdt_descr); | 765 | gdt = get_current_gdt_rw(); |
767 | gdt = (struct desc_struct *)gdt_descr.address; | ||
768 | sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); | 766 | sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); |
769 | 767 | ||
770 | wrmsrl(MSR_EFER, efer | EFER_SVME); | 768 | wrmsrl(MSR_EFER, efer | EFER_SVME); |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 259e9b28ccf8..1a471e5f963f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -935,7 +935,6 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | |||
935 | * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. | 935 | * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. |
936 | */ | 936 | */ |
937 | static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); | 937 | static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); |
938 | static DEFINE_PER_CPU(struct desc_ptr, host_gdt); | ||
939 | 938 | ||
940 | /* | 939 | /* |
941 | * We maintain a per-CPU linked-list of vCPU, so in wakeup_handler() we | 940 | * We maintain a per-CPU linked-list of vCPU, so in wakeup_handler() we |
@@ -2057,14 +2056,13 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) | |||
2057 | */ | 2056 | */ |
2058 | static unsigned long segment_base(u16 selector) | 2057 | static unsigned long segment_base(u16 selector) |
2059 | { | 2058 | { |
2060 | struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); | ||
2061 | struct desc_struct *table; | 2059 | struct desc_struct *table; |
2062 | unsigned long v; | 2060 | unsigned long v; |
2063 | 2061 | ||
2064 | if (!(selector & ~SEGMENT_RPL_MASK)) | 2062 | if (!(selector & ~SEGMENT_RPL_MASK)) |
2065 | return 0; | 2063 | return 0; |
2066 | 2064 | ||
2067 | table = (struct desc_struct *)gdt->address; | 2065 | table = get_current_gdt_ro(); |
2068 | 2066 | ||
2069 | if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { | 2067 | if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { |
2070 | u16 ldt_selector = kvm_read_ldt(); | 2068 | u16 ldt_selector = kvm_read_ldt(); |
@@ -2169,7 +2167,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) | |||
2169 | #endif | 2167 | #endif |
2170 | if (vmx->host_state.msr_host_bndcfgs) | 2168 | if (vmx->host_state.msr_host_bndcfgs) |
2171 | wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); | 2169 | wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); |
2172 | load_gdt(this_cpu_ptr(&host_gdt)); | 2170 | load_fixmap_gdt(raw_smp_processor_id()); |
2173 | } | 2171 | } |
2174 | 2172 | ||
2175 | static void vmx_load_host_state(struct vcpu_vmx *vmx) | 2173 | static void vmx_load_host_state(struct vcpu_vmx *vmx) |
@@ -2271,7 +2269,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
2271 | } | 2269 | } |
2272 | 2270 | ||
2273 | if (!already_loaded) { | 2271 | if (!already_loaded) { |
2274 | struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); | 2272 | void *gdt = get_current_gdt_ro(); |
2275 | unsigned long sysenter_esp; | 2273 | unsigned long sysenter_esp; |
2276 | 2274 | ||
2277 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | 2275 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); |
@@ -2282,7 +2280,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
2282 | */ | 2280 | */ |
2283 | vmcs_writel(HOST_TR_BASE, | 2281 | vmcs_writel(HOST_TR_BASE, |
2284 | (unsigned long)this_cpu_ptr(&cpu_tss)); | 2282 | (unsigned long)this_cpu_ptr(&cpu_tss)); |
2285 | vmcs_writel(HOST_GDTR_BASE, gdt->address); | 2283 | vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ |
2286 | 2284 | ||
2287 | /* | 2285 | /* |
2288 | * VM exits change the host TR limit to 0x67 after a VM | 2286 | * VM exits change the host TR limit to 0x67 after a VM |
@@ -3471,8 +3469,6 @@ static int hardware_enable(void) | |||
3471 | ept_sync_global(); | 3469 | ept_sync_global(); |
3472 | } | 3470 | } |
3473 | 3471 | ||
3474 | native_store_gdt(this_cpu_ptr(&host_gdt)); | ||
3475 | |||
3476 | return 0; | 3472 | return 0; |
3477 | } | 3473 | } |
3478 | 3474 | ||
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 58b5bee7ea27..bce6990b1d81 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c | |||
@@ -110,7 +110,8 @@ static struct addr_marker address_markers[] = { | |||
110 | #define PTE_LEVEL_MULT (PAGE_SIZE) | 110 | #define PTE_LEVEL_MULT (PAGE_SIZE) |
111 | #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) | 111 | #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) |
112 | #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) | 112 | #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) |
113 | #define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) | 113 | #define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) |
114 | #define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT) | ||
114 | 115 | ||
115 | #define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \ | 116 | #define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \ |
116 | ({ \ | 117 | ({ \ |
@@ -286,14 +287,13 @@ static void note_page(struct seq_file *m, struct pg_state *st, | |||
286 | } | 287 | } |
287 | } | 288 | } |
288 | 289 | ||
289 | static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, | 290 | static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P) |
290 | unsigned long P) | ||
291 | { | 291 | { |
292 | int i; | 292 | int i; |
293 | pte_t *start; | 293 | pte_t *start; |
294 | pgprotval_t prot; | 294 | pgprotval_t prot; |
295 | 295 | ||
296 | start = (pte_t *) pmd_page_vaddr(addr); | 296 | start = (pte_t *)pmd_page_vaddr(addr); |
297 | for (i = 0; i < PTRS_PER_PTE; i++) { | 297 | for (i = 0; i < PTRS_PER_PTE; i++) { |
298 | prot = pte_flags(*start); | 298 | prot = pte_flags(*start); |
299 | st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); | 299 | st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); |
@@ -304,14 +304,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, | |||
304 | 304 | ||
305 | #if PTRS_PER_PMD > 1 | 305 | #if PTRS_PER_PMD > 1 |
306 | 306 | ||
307 | static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, | 307 | static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) |
308 | unsigned long P) | ||
309 | { | 308 | { |
310 | int i; | 309 | int i; |
311 | pmd_t *start; | 310 | pmd_t *start; |
312 | pgprotval_t prot; | 311 | pgprotval_t prot; |
313 | 312 | ||
314 | start = (pmd_t *) pud_page_vaddr(addr); | 313 | start = (pmd_t *)pud_page_vaddr(addr); |
315 | for (i = 0; i < PTRS_PER_PMD; i++) { | 314 | for (i = 0; i < PTRS_PER_PMD; i++) { |
316 | st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); | 315 | st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); |
317 | if (!pmd_none(*start)) { | 316 | if (!pmd_none(*start)) { |
@@ -347,15 +346,14 @@ static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx) | |||
347 | return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); | 346 | return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); |
348 | } | 347 | } |
349 | 348 | ||
350 | static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, | 349 | static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) |
351 | unsigned long P) | ||
352 | { | 350 | { |
353 | int i; | 351 | int i; |
354 | pud_t *start; | 352 | pud_t *start; |
355 | pgprotval_t prot; | 353 | pgprotval_t prot; |
356 | pud_t *prev_pud = NULL; | 354 | pud_t *prev_pud = NULL; |
357 | 355 | ||
358 | start = (pud_t *) pgd_page_vaddr(addr); | 356 | start = (pud_t *)p4d_page_vaddr(addr); |
359 | 357 | ||
360 | for (i = 0; i < PTRS_PER_PUD; i++) { | 358 | for (i = 0; i < PTRS_PER_PUD; i++) { |
361 | st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); | 359 | st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); |
@@ -377,9 +375,42 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, | |||
377 | } | 375 | } |
378 | 376 | ||
379 | #else | 377 | #else |
380 | #define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p) | 378 | #define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p) |
381 | #define pgd_large(a) pud_large(__pud(pgd_val(a))) | 379 | #define p4d_large(a) pud_large(__pud(p4d_val(a))) |
382 | #define pgd_none(a) pud_none(__pud(pgd_val(a))) | 380 | #define p4d_none(a) pud_none(__pud(p4d_val(a))) |
381 | #endif | ||
382 | |||
383 | #if PTRS_PER_P4D > 1 | ||
384 | |||
385 | static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) | ||
386 | { | ||
387 | int i; | ||
388 | p4d_t *start; | ||
389 | pgprotval_t prot; | ||
390 | |||
391 | start = (p4d_t *)pgd_page_vaddr(addr); | ||
392 | |||
393 | for (i = 0; i < PTRS_PER_P4D; i++) { | ||
394 | st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); | ||
395 | if (!p4d_none(*start)) { | ||
396 | if (p4d_large(*start) || !p4d_present(*start)) { | ||
397 | prot = p4d_flags(*start); | ||
398 | note_page(m, st, __pgprot(prot), 2); | ||
399 | } else { | ||
400 | walk_pud_level(m, st, *start, | ||
401 | P + i * P4D_LEVEL_MULT); | ||
402 | } | ||
403 | } else | ||
404 | note_page(m, st, __pgprot(0), 2); | ||
405 | |||
406 | start++; | ||
407 | } | ||
408 | } | ||
409 | |||
410 | #else | ||
411 | #define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p) | ||
412 | #define pgd_large(a) p4d_large(__p4d(pgd_val(a))) | ||
413 | #define pgd_none(a) p4d_none(__p4d(pgd_val(a))) | ||
383 | #endif | 414 | #endif |
384 | 415 | ||
385 | static inline bool is_hypervisor_range(int idx) | 416 | static inline bool is_hypervisor_range(int idx) |
@@ -424,7 +455,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, | |||
424 | prot = pgd_flags(*start); | 455 | prot = pgd_flags(*start); |
425 | note_page(m, &st, __pgprot(prot), 1); | 456 | note_page(m, &st, __pgprot(prot), 1); |
426 | } else { | 457 | } else { |
427 | walk_pud_level(m, &st, *start, | 458 | walk_p4d_level(m, &st, *start, |
428 | i * PGD_LEVEL_MULT); | 459 | i * PGD_LEVEL_MULT); |
429 | } | 460 | } |
430 | } else | 461 | } else |
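The dumper's *_LEVEL_MULT chain simply multiplies up the span one entry covers at each level; inserting P4D_LEVEL_MULT keeps PGD_LEVEL_MULT correct whether PTRS_PER_P4D is 512 or 1. Rebuilding the chain with 512-entry tables:

#include <stdio.h>

int main(void)
{
    /* Span covered by one entry at each level (64-bit host). */
    unsigned long pte = 1UL << 12;   /* PTE_LEVEL_MULT = PAGE_SIZE */
    unsigned long pmd = 512 * pte;   /* PMD_LEVEL_MULT             */
    unsigned long pud = 512 * pmd;   /* PUD_LEVEL_MULT             */
    unsigned long p4d = 512 * pud;   /* P4D_LEVEL_MULT             */
    unsigned long pgd = 512 * p4d;   /* PGD_LEVEL_MULT             */
    printf("pmd=%lu MiB, pud=%lu GiB, p4d=%lu GiB, pgd=%lu TiB\n",
           pmd >> 20, pud >> 30, p4d >> 30, pgd >> 40);
    return 0;
}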
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 428e31763cb9..8ad91a01cbc8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -253,6 +253,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) | |||
253 | { | 253 | { |
254 | unsigned index = pgd_index(address); | 254 | unsigned index = pgd_index(address); |
255 | pgd_t *pgd_k; | 255 | pgd_t *pgd_k; |
256 | p4d_t *p4d, *p4d_k; | ||
256 | pud_t *pud, *pud_k; | 257 | pud_t *pud, *pud_k; |
257 | pmd_t *pmd, *pmd_k; | 258 | pmd_t *pmd, *pmd_k; |
258 | 259 | ||
@@ -265,10 +266,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) | |||
265 | /* | 266 | /* |
266 | * set_pgd(pgd, *pgd_k); here would be useless on PAE | 267 | * set_pgd(pgd, *pgd_k); here would be useless on PAE |
267 | * and redundant with the set_pmd() on non-PAE. As would | 268 | * and redundant with the set_pmd() on non-PAE. As would |
268 | * set_pud. | 269 | * set_p4d/set_pud. |
269 | */ | 270 | */ |
270 | pud = pud_offset(pgd, address); | 271 | p4d = p4d_offset(pgd, address); |
271 | pud_k = pud_offset(pgd_k, address); | 272 | p4d_k = p4d_offset(pgd_k, address); |
273 | if (!p4d_present(*p4d_k)) | ||
274 | return NULL; | ||
275 | |||
276 | pud = pud_offset(p4d, address); | ||
277 | pud_k = pud_offset(p4d_k, address); | ||
272 | if (!pud_present(*pud_k)) | 278 | if (!pud_present(*pud_k)) |
273 | return NULL; | 279 | return NULL; |
274 | 280 | ||
@@ -384,6 +390,8 @@ static void dump_pagetable(unsigned long address) | |||
384 | { | 390 | { |
385 | pgd_t *base = __va(read_cr3()); | 391 | pgd_t *base = __va(read_cr3()); |
386 | pgd_t *pgd = &base[pgd_index(address)]; | 392 | pgd_t *pgd = &base[pgd_index(address)]; |
393 | p4d_t *p4d; | ||
394 | pud_t *pud; | ||
387 | pmd_t *pmd; | 395 | pmd_t *pmd; |
388 | pte_t *pte; | 396 | pte_t *pte; |
389 | 397 | ||
@@ -392,7 +400,9 @@ static void dump_pagetable(unsigned long address) | |||
392 | if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) | 400 | if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) |
393 | goto out; | 401 | goto out; |
394 | #endif | 402 | #endif |
395 | pmd = pmd_offset(pud_offset(pgd, address), address); | 403 | p4d = p4d_offset(pgd, address); |
404 | pud = pud_offset(p4d, address); | ||
405 | pmd = pmd_offset(pud, address); | ||
396 | printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); | 406 | printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); |
397 | 407 | ||
398 | /* | 408 | /* |
@@ -425,6 +435,7 @@ void vmalloc_sync_all(void) | |||
425 | static noinline int vmalloc_fault(unsigned long address) | 435 | static noinline int vmalloc_fault(unsigned long address) |
426 | { | 436 | { |
427 | pgd_t *pgd, *pgd_ref; | 437 | pgd_t *pgd, *pgd_ref; |
438 | p4d_t *p4d, *p4d_ref; | ||
428 | pud_t *pud, *pud_ref; | 439 | pud_t *pud, *pud_ref; |
429 | pmd_t *pmd, *pmd_ref; | 440 | pmd_t *pmd, *pmd_ref; |
430 | pte_t *pte, *pte_ref; | 441 | pte_t *pte, *pte_ref; |
@@ -448,17 +459,37 @@ static noinline int vmalloc_fault(unsigned long address) | |||
448 | if (pgd_none(*pgd)) { | 459 | if (pgd_none(*pgd)) { |
449 | set_pgd(pgd, *pgd_ref); | 460 | set_pgd(pgd, *pgd_ref); |
450 | arch_flush_lazy_mmu_mode(); | 461 | arch_flush_lazy_mmu_mode(); |
451 | } else { | 462 | } else if (CONFIG_PGTABLE_LEVELS > 4) { |
463 | /* | ||
464 | * With folded p4d, pgd_none() is always false, so the pgd may | ||
465 | * point to an empty page table entry and pgd_page_vaddr() | ||
466 | * will return garbage. | ||
467 | * | ||
468 | * We will do the correct sanity check on the p4d level. | ||
469 | */ | ||
452 | BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | 470 | BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); |
453 | } | 471 | } |
454 | 472 | ||
473 | /* With 4-level paging, copying happens on the p4d level. */ | ||
474 | p4d = p4d_offset(pgd, address); | ||
475 | p4d_ref = p4d_offset(pgd_ref, address); | ||
476 | if (p4d_none(*p4d_ref)) | ||
477 | return -1; | ||
478 | |||
479 | if (p4d_none(*p4d)) { | ||
480 | set_p4d(p4d, *p4d_ref); | ||
481 | arch_flush_lazy_mmu_mode(); | ||
482 | } else { | ||
483 | BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref)); | ||
484 | } | ||
485 | |||
455 | /* | 486 | /* |
456 | * Below here mismatches are bugs because these lower tables | 487 | * Below here mismatches are bugs because these lower tables |
457 | * are shared: | 488 | * are shared: |
458 | */ | 489 | */ |
459 | 490 | ||
460 | pud = pud_offset(pgd, address); | 491 | pud = pud_offset(p4d, address); |
461 | pud_ref = pud_offset(pgd_ref, address); | 492 | pud_ref = pud_offset(p4d_ref, address); |
462 | if (pud_none(*pud_ref)) | 493 | if (pud_none(*pud_ref)) |
463 | return -1; | 494 | return -1; |
464 | 495 | ||
@@ -526,6 +557,7 @@ static void dump_pagetable(unsigned long address) | |||
526 | { | 557 | { |
527 | pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); | 558 | pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); |
528 | pgd_t *pgd = base + pgd_index(address); | 559 | pgd_t *pgd = base + pgd_index(address); |
560 | p4d_t *p4d; | ||
529 | pud_t *pud; | 561 | pud_t *pud; |
530 | pmd_t *pmd; | 562 | pmd_t *pmd; |
531 | pte_t *pte; | 563 | pte_t *pte; |
@@ -538,7 +570,15 @@ static void dump_pagetable(unsigned long address) | |||
538 | if (!pgd_present(*pgd)) | 570 | if (!pgd_present(*pgd)) |
539 | goto out; | 571 | goto out; |
540 | 572 | ||
541 | pud = pud_offset(pgd, address); | 573 | p4d = p4d_offset(pgd, address); |
574 | if (bad_address(p4d)) | ||
575 | goto bad; | ||
576 | |||
577 | printk("P4D %lx ", p4d_val(*p4d)); | ||
578 | if (!p4d_present(*p4d) || p4d_large(*p4d)) | ||
579 | goto out; | ||
580 | |||
581 | pud = pud_offset(p4d, address); | ||
542 | if (bad_address(pud)) | 582 | if (bad_address(pud)) |
543 | goto bad; | 583 | goto bad; |
544 | 584 | ||
@@ -1082,6 +1122,7 @@ static noinline int | |||
1082 | spurious_fault(unsigned long error_code, unsigned long address) | 1122 | spurious_fault(unsigned long error_code, unsigned long address) |
1083 | { | 1123 | { |
1084 | pgd_t *pgd; | 1124 | pgd_t *pgd; |
1125 | p4d_t *p4d; | ||
1085 | pud_t *pud; | 1126 | pud_t *pud; |
1086 | pmd_t *pmd; | 1127 | pmd_t *pmd; |
1087 | pte_t *pte; | 1128 | pte_t *pte; |
@@ -1104,7 +1145,14 @@ spurious_fault(unsigned long error_code, unsigned long address) | |||
1104 | if (!pgd_present(*pgd)) | 1145 | if (!pgd_present(*pgd)) |
1105 | return 0; | 1146 | return 0; |
1106 | 1147 | ||
1107 | pud = pud_offset(pgd, address); | 1148 | p4d = p4d_offset(pgd, address); |
1149 | if (!p4d_present(*p4d)) | ||
1150 | return 0; | ||
1151 | |||
1152 | if (p4d_large(*p4d)) | ||
1153 | return spurious_fault_check(error_code, (pte_t *) p4d); | ||
1154 | |||
1155 | pud = pud_offset(p4d, address); | ||
1108 | if (!pud_present(*pud)) | 1156 | if (!pud_present(*pud)) |
1109 | return 0; | 1157 | return 0; |
1110 | 1158 | ||
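
Every fault.c hunk above follows the same mechanical shape: each former pgd-to-pud step gains an intermediate p4d step, with a presence check at the new level. As a minimal sketch of the resulting five-level descent (kernel-style accessors, no locking or huge-page handling; the helper name walk_to_pte is illustrative, not from the patch):

	static pte_t *walk_to_pte(pgd_t *pgd_base, unsigned long addr)
	{
		pgd_t *pgd = pgd_base + pgd_index(addr);
		p4d_t *p4d;
		pud_t *pud;
		pmd_t *pmd;

		if (pgd_none(*pgd))
			return NULL;
		p4d = p4d_offset(pgd, addr);	/* re-types the pgd entry when p4d is folded */
		if (p4d_none(*p4d))
			return NULL;
		pud = pud_offset(p4d, addr);
		if (pud_none(*pud))
			return NULL;
		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd))
			return NULL;
		return pte_offset_kernel(pmd, addr);
	}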
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 1f3b6ef105cd..456dfdfd2249 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c | |||
@@ -76,9 +76,9 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) | |||
76 | } | 76 | } |
77 | 77 | ||
78 | /* | 78 | /* |
79 | * 'pteval' can come from a pte, pmd or pud. We only check | 79 | * 'pteval' can come from a pte, pmd, pud or p4d. We only check |
80 | * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the | 80 | * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the |
81 | * same value on all 3 types. | 81 | * same value on all 4 types. |
82 | */ | 82 | */ |
83 | static inline int pte_allows_gup(unsigned long pteval, int write) | 83 | static inline int pte_allows_gup(unsigned long pteval, int write) |
84 | { | 84 | { |
@@ -295,13 +295,13 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, | |||
295 | return 1; | 295 | return 1; |
296 | } | 296 | } |
297 | 297 | ||
298 | static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, | 298 | static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, |
299 | int write, struct page **pages, int *nr) | 299 | int write, struct page **pages, int *nr) |
300 | { | 300 | { |
301 | unsigned long next; | 301 | unsigned long next; |
302 | pud_t *pudp; | 302 | pud_t *pudp; |
303 | 303 | ||
304 | pudp = pud_offset(&pgd, addr); | 304 | pudp = pud_offset(&p4d, addr); |
305 | do { | 305 | do { |
306 | pud_t pud = *pudp; | 306 | pud_t pud = *pudp; |
307 | 307 | ||
@@ -320,6 +320,27 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, | |||
320 | return 1; | 320 | return 1; |
321 | } | 321 | } |
322 | 322 | ||
323 | static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, | ||
324 | int write, struct page **pages, int *nr) | ||
325 | { | ||
326 | unsigned long next; | ||
327 | p4d_t *p4dp; | ||
328 | |||
329 | p4dp = p4d_offset(&pgd, addr); | ||
330 | do { | ||
331 | p4d_t p4d = *p4dp; | ||
332 | |||
333 | next = p4d_addr_end(addr, end); | ||
334 | if (p4d_none(p4d)) | ||
335 | return 0; | ||
336 | BUILD_BUG_ON(p4d_large(p4d)); | ||
337 | if (!gup_pud_range(p4d, addr, next, write, pages, nr)) | ||
338 | return 0; | ||
339 | } while (p4dp++, addr = next, addr != end); | ||
340 | |||
341 | return 1; | ||
342 | } | ||
343 | |||
323 | /* | 344 | /* |
324 | * Like get_user_pages_fast() except its IRQ-safe in that it won't fall | 345 | * Like get_user_pages_fast() except its IRQ-safe in that it won't fall |
325 | * back to the regular GUP. | 346 | * back to the regular GUP. |
@@ -368,7 +389,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
368 | next = pgd_addr_end(addr, end); | 389 | next = pgd_addr_end(addr, end); |
369 | if (pgd_none(pgd)) | 390 | if (pgd_none(pgd)) |
370 | break; | 391 | break; |
371 | if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) | 392 | if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) |
372 | break; | 393 | break; |
373 | } while (pgdp++, addr = next, addr != end); | 394 | } while (pgdp++, addr = next, addr != end); |
374 | local_irq_restore(flags); | 395 | local_irq_restore(flags); |
@@ -440,7 +461,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
440 | next = pgd_addr_end(addr, end); | 461 | next = pgd_addr_end(addr, end); |
441 | if (pgd_none(pgd)) | 462 | if (pgd_none(pgd)) |
442 | goto slow; | 463 | goto slow; |
443 | if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) | 464 | if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) |
444 | goto slow; | 465 | goto slow; |
445 | } while (pgdp++, addr = next, addr != end); | 466 | } while (pgdp++, addr = next, addr != end); |
446 | local_irq_enable(); | 467 | local_irq_enable(); |
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index c5066a260803..302f43fd9c28 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c | |||
@@ -12,10 +12,12 @@ | |||
12 | #include <linux/pagemap.h> | 12 | #include <linux/pagemap.h> |
13 | #include <linux/err.h> | 13 | #include <linux/err.h> |
14 | #include <linux/sysctl.h> | 14 | #include <linux/sysctl.h> |
15 | #include <linux/compat.h> | ||
15 | #include <asm/mman.h> | 16 | #include <asm/mman.h> |
16 | #include <asm/tlb.h> | 17 | #include <asm/tlb.h> |
17 | #include <asm/tlbflush.h> | 18 | #include <asm/tlbflush.h> |
18 | #include <asm/pgalloc.h> | 19 | #include <asm/pgalloc.h> |
20 | #include <asm/elf.h> | ||
19 | 21 | ||
20 | #if 0 /* This is just for testing */ | 22 | #if 0 /* This is just for testing */ |
21 | struct page * | 23 | struct page * |
@@ -82,8 +84,9 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, | |||
82 | 84 | ||
83 | info.flags = 0; | 85 | info.flags = 0; |
84 | info.length = len; | 86 | info.length = len; |
85 | info.low_limit = current->mm->mmap_legacy_base; | 87 | info.low_limit = get_mmap_base(1); |
86 | info.high_limit = TASK_SIZE; | 88 | info.high_limit = in_compat_syscall() ? |
89 | tasksize_32bit() : tasksize_64bit(); | ||
87 | info.align_mask = PAGE_MASK & ~huge_page_mask(h); | 90 | info.align_mask = PAGE_MASK & ~huge_page_mask(h); |
88 | info.align_offset = 0; | 91 | info.align_offset = 0; |
89 | return vm_unmapped_area(&info); | 92 | return vm_unmapped_area(&info); |
@@ -100,7 +103,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, | |||
100 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; | 103 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
101 | info.length = len; | 104 | info.length = len; |
102 | info.low_limit = PAGE_SIZE; | 105 | info.low_limit = PAGE_SIZE; |
103 | info.high_limit = current->mm->mmap_base; | 106 | info.high_limit = get_mmap_base(0); |
104 | info.align_mask = PAGE_MASK & ~huge_page_mask(h); | 107 | info.align_mask = PAGE_MASK & ~huge_page_mask(h); |
105 | info.align_offset = 0; | 108 | info.align_offset = 0; |
106 | addr = vm_unmapped_area(&info); | 109 | addr = vm_unmapped_area(&info); |
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 4473cb4f8b90..04210a29dd60 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c | |||
@@ -45,6 +45,34 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, | |||
45 | return 0; | 45 | return 0; |
46 | } | 46 | } |
47 | 47 | ||
48 | static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, | ||
49 | unsigned long addr, unsigned long end) | ||
50 | { | ||
51 | unsigned long next; | ||
52 | |||
53 | for (; addr < end; addr = next) { | ||
54 | p4d_t *p4d = p4d_page + p4d_index(addr); | ||
55 | pud_t *pud; | ||
56 | |||
57 | next = (addr & P4D_MASK) + P4D_SIZE; | ||
58 | if (next > end) | ||
59 | next = end; | ||
60 | |||
61 | if (p4d_present(*p4d)) { | ||
62 | pud = pud_offset(p4d, 0); | ||
63 | ident_pud_init(info, pud, addr, next); | ||
64 | continue; | ||
65 | } | ||
66 | pud = (pud_t *)info->alloc_pgt_page(info->context); | ||
67 | if (!pud) | ||
68 | return -ENOMEM; | ||
69 | ident_pud_init(info, pud, addr, next); | ||
70 | set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); | ||
71 | } | ||
72 | |||
73 | return 0; | ||
74 | } | ||
75 | |||
48 | int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, | 76 | int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, |
49 | unsigned long pstart, unsigned long pend) | 77 | unsigned long pstart, unsigned long pend) |
50 | { | 78 | { |
@@ -55,27 +83,36 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, | |||
55 | 83 | ||
56 | for (; addr < end; addr = next) { | 84 | for (; addr < end; addr = next) { |
57 | pgd_t *pgd = pgd_page + pgd_index(addr); | 85 | pgd_t *pgd = pgd_page + pgd_index(addr); |
58 | pud_t *pud; | 86 | p4d_t *p4d; |
59 | 87 | ||
60 | next = (addr & PGDIR_MASK) + PGDIR_SIZE; | 88 | next = (addr & PGDIR_MASK) + PGDIR_SIZE; |
61 | if (next > end) | 89 | if (next > end) |
62 | next = end; | 90 | next = end; |
63 | 91 | ||
64 | if (pgd_present(*pgd)) { | 92 | if (pgd_present(*pgd)) { |
65 | pud = pud_offset(pgd, 0); | 93 | p4d = p4d_offset(pgd, 0); |
66 | result = ident_pud_init(info, pud, addr, next); | 94 | result = ident_p4d_init(info, p4d, addr, next); |
67 | if (result) | 95 | if (result) |
68 | return result; | 96 | return result; |
69 | continue; | 97 | continue; |
70 | } | 98 | } |
71 | 99 | ||
72 | pud = (pud_t *)info->alloc_pgt_page(info->context); | 100 | p4d = (p4d_t *)info->alloc_pgt_page(info->context); |
73 | if (!pud) | 101 | if (!p4d) |
74 | return -ENOMEM; | 102 | return -ENOMEM; |
75 | result = ident_pud_init(info, pud, addr, next); | 103 | result = ident_p4d_init(info, p4d, addr, next); |
76 | if (result) | 104 | if (result) |
77 | return result; | 105 | return result; |
78 | set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); | 106 | if (IS_ENABLED(CONFIG_X86_5LEVEL)) { |
107 | set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); | ||
108 | } else { | ||
109 | /* | ||
110 | * With p4d folded, pgd is equal to p4d. | ||
111 | * The pgd entry has to point to the pud page table in this case. | ||
112 | */ | ||
113 | pud_t *pud = pud_offset(p4d, 0); | ||
114 | set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); | ||
115 | } | ||
79 | } | 116 | } |
80 | 117 | ||
81 | return 0; | 118 | return 0; |
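
The IS_ENABLED(CONFIG_X86_5LEVEL) branch at the end of kernel_ident_mapping_init() encodes the folding rule: with a real p4d level the pgd entry points at the p4d page, but with p4d folded the pgd and p4d occupy the same slot, so the pgd entry must point one level down, at the pud page. Schematically (a sketch, not text from the patch):

	CONFIG_X86_5LEVEL=y:  pgd entry -> p4d page -> pud page -> pmd -> pte
	p4d folded:           pgd entry -> pud page -> pmd -> pte
	                      (p4d_offset(pgd, 0) merely re-types the pgd entry,
	                       and pud_offset(p4d, 0) yields the pud page to install)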
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 030bfed10a6c..f34d275ee201 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
@@ -56,8 +56,6 @@ | |||
56 | 56 | ||
57 | unsigned long highstart_pfn, highend_pfn; | 57 | unsigned long highstart_pfn, highend_pfn; |
58 | 58 | ||
59 | static noinline int do_test_wp_bit(void); | ||
60 | |||
61 | bool __read_mostly __vmalloc_start_set = false; | 59 | bool __read_mostly __vmalloc_start_set = false; |
62 | 60 | ||
63 | /* | 61 | /* |
@@ -67,6 +65,7 @@ bool __read_mostly __vmalloc_start_set = false; | |||
67 | */ | 65 | */ |
68 | static pmd_t * __init one_md_table_init(pgd_t *pgd) | 66 | static pmd_t * __init one_md_table_init(pgd_t *pgd) |
69 | { | 67 | { |
68 | p4d_t *p4d; | ||
70 | pud_t *pud; | 69 | pud_t *pud; |
71 | pmd_t *pmd_table; | 70 | pmd_t *pmd_table; |
72 | 71 | ||
@@ -75,13 +74,15 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) | |||
75 | pmd_table = (pmd_t *)alloc_low_page(); | 74 | pmd_table = (pmd_t *)alloc_low_page(); |
76 | paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); | 75 | paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); |
77 | set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | 76 | set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); |
78 | pud = pud_offset(pgd, 0); | 77 | p4d = p4d_offset(pgd, 0); |
78 | pud = pud_offset(p4d, 0); | ||
79 | BUG_ON(pmd_table != pmd_offset(pud, 0)); | 79 | BUG_ON(pmd_table != pmd_offset(pud, 0)); |
80 | 80 | ||
81 | return pmd_table; | 81 | return pmd_table; |
82 | } | 82 | } |
83 | #endif | 83 | #endif |
84 | pud = pud_offset(pgd, 0); | 84 | p4d = p4d_offset(pgd, 0); |
85 | pud = pud_offset(p4d, 0); | ||
85 | pmd_table = pmd_offset(pud, 0); | 86 | pmd_table = pmd_offset(pud, 0); |
86 | 87 | ||
87 | return pmd_table; | 88 | return pmd_table; |
@@ -390,8 +391,11 @@ pte_t *kmap_pte; | |||
390 | 391 | ||
391 | static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) | 392 | static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) |
392 | { | 393 | { |
393 | return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), | 394 | pgd_t *pgd = pgd_offset_k(vaddr); |
394 | vaddr), vaddr), vaddr); | 395 | p4d_t *p4d = p4d_offset(pgd, vaddr); |
396 | pud_t *pud = pud_offset(p4d, vaddr); | ||
397 | pmd_t *pmd = pmd_offset(pud, vaddr); | ||
398 | return pte_offset_kernel(pmd, vaddr); | ||
395 | } | 399 | } |
396 | 400 | ||
397 | static void __init kmap_init(void) | 401 | static void __init kmap_init(void) |
@@ -410,6 +414,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) | |||
410 | { | 414 | { |
411 | unsigned long vaddr; | 415 | unsigned long vaddr; |
412 | pgd_t *pgd; | 416 | pgd_t *pgd; |
417 | p4d_t *p4d; | ||
413 | pud_t *pud; | 418 | pud_t *pud; |
414 | pmd_t *pmd; | 419 | pmd_t *pmd; |
415 | pte_t *pte; | 420 | pte_t *pte; |
@@ -418,7 +423,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) | |||
418 | page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); | 423 | page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); |
419 | 424 | ||
420 | pgd = swapper_pg_dir + pgd_index(vaddr); | 425 | pgd = swapper_pg_dir + pgd_index(vaddr); |
421 | pud = pud_offset(pgd, vaddr); | 426 | p4d = p4d_offset(pgd, vaddr); |
427 | pud = pud_offset(p4d, vaddr); | ||
422 | pmd = pmd_offset(pud, vaddr); | 428 | pmd = pmd_offset(pud, vaddr); |
423 | pte = pte_offset_kernel(pmd, vaddr); | 429 | pte = pte_offset_kernel(pmd, vaddr); |
424 | pkmap_page_table = pte; | 430 | pkmap_page_table = pte; |
@@ -450,6 +456,7 @@ void __init native_pagetable_init(void) | |||
450 | { | 456 | { |
451 | unsigned long pfn, va; | 457 | unsigned long pfn, va; |
452 | pgd_t *pgd, *base = swapper_pg_dir; | 458 | pgd_t *pgd, *base = swapper_pg_dir; |
459 | p4d_t *p4d; | ||
453 | pud_t *pud; | 460 | pud_t *pud; |
454 | pmd_t *pmd; | 461 | pmd_t *pmd; |
455 | pte_t *pte; | 462 | pte_t *pte; |
@@ -469,7 +476,8 @@ void __init native_pagetable_init(void) | |||
469 | if (!pgd_present(*pgd)) | 476 | if (!pgd_present(*pgd)) |
470 | break; | 477 | break; |
471 | 478 | ||
472 | pud = pud_offset(pgd, va); | 479 | p4d = p4d_offset(pgd, va); |
480 | pud = pud_offset(p4d, va); | ||
473 | pmd = pmd_offset(pud, va); | 481 | pmd = pmd_offset(pud, va); |
474 | if (!pmd_present(*pmd)) | 482 | if (!pmd_present(*pmd)) |
475 | break; | 483 | break; |
@@ -716,22 +724,20 @@ void __init paging_init(void) | |||
716 | */ | 724 | */ |
717 | static void __init test_wp_bit(void) | 725 | static void __init test_wp_bit(void) |
718 | { | 726 | { |
719 | int wp_works_ok; | 727 | char z = 0; |
720 | 728 | ||
721 | printk(KERN_INFO | 729 | printk(KERN_INFO "Checking if this processor honours the WP bit even in supervisor mode..."); |
722 | "Checking if this processor honours the WP bit even in supervisor mode..."); | ||
723 | 730 | ||
724 | /* Any page-aligned address will do, the test is non-destructive */ | 731 | __set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO); |
725 | __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO); | ||
726 | wp_works_ok = do_test_wp_bit(); | ||
727 | clear_fixmap(FIX_WP_TEST); | ||
728 | 732 | ||
729 | if (!wp_works_ok) { | 733 | if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) { |
730 | printk(KERN_CONT "No.\n"); | 734 | clear_fixmap(FIX_WP_TEST); |
731 | panic("Linux doesn't support CPUs with broken WP."); | ||
732 | } else { | ||
733 | printk(KERN_CONT "Ok.\n"); | 735 | printk(KERN_CONT "Ok.\n"); |
736 | return; | ||
734 | } | 737 | } |
738 | |||
739 | printk(KERN_CONT "No.\n"); | ||
740 | panic("Linux doesn't support CPUs with broken WP."); | ||
735 | } | 741 | } |
736 | 742 | ||
737 | void __init mem_init(void) | 743 | void __init mem_init(void) |
@@ -841,30 +847,6 @@ int arch_remove_memory(u64 start, u64 size) | |||
841 | #endif | 847 | #endif |
842 | #endif | 848 | #endif |
843 | 849 | ||
844 | /* | ||
845 | * This function cannot be __init, since exceptions don't work in that | ||
846 | * section. Put this after the callers, so that it cannot be inlined. | ||
847 | */ | ||
848 | static noinline int do_test_wp_bit(void) | ||
849 | { | ||
850 | char tmp_reg; | ||
851 | int flag; | ||
852 | |||
853 | __asm__ __volatile__( | ||
854 | " movb %0, %1 \n" | ||
855 | "1: movb %1, %0 \n" | ||
856 | " xorl %2, %2 \n" | ||
857 | "2: \n" | ||
858 | _ASM_EXTABLE(1b,2b) | ||
859 | :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), | ||
860 | "=q" (tmp_reg), | ||
861 | "=r" (flag) | ||
862 | :"2" (1) | ||
863 | :"memory"); | ||
864 | |||
865 | return flag; | ||
866 | } | ||
867 | |||
868 | int kernel_set_to_readonly __read_mostly; | 850 | int kernel_set_to_readonly __read_mostly; |
869 | 851 | ||
870 | void set_kernel_text_rw(void) | 852 | void set_kernel_text_rw(void) |
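
The test_wp_bit() rewrite above replaces the open-coded exception-table asm with probe_kernel_write(), which already performs a fault-tolerant write and returns a nonzero error when the write traps. It also retargets the probe from swapper_pg_dir to empty_zero_page, which is harmless to scribble a zero byte into even on a CPU whose WP bit is broken. A compressed sketch of the new check (assuming the uaccess API of this kernel era, where probe_kernel_write() returns 0 on success):

	char z = 0;

	__set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO);
	/* A nonzero return means the write faulted: WP is honoured in ring 0. */
	if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1))
		printk(KERN_CONT "Ok.\n");
	else
		panic("Linux doesn't support CPUs with broken WP.");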
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f6da869810a8..745e5e183169 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -97,28 +97,38 @@ void sync_global_pgds(unsigned long start, unsigned long end) | |||
97 | unsigned long address; | 97 | unsigned long address; |
98 | 98 | ||
99 | for (address = start; address <= end; address += PGDIR_SIZE) { | 99 | for (address = start; address <= end; address += PGDIR_SIZE) { |
100 | const pgd_t *pgd_ref = pgd_offset_k(address); | 100 | pgd_t *pgd_ref = pgd_offset_k(address); |
101 | const p4d_t *p4d_ref; | ||
101 | struct page *page; | 102 | struct page *page; |
102 | 103 | ||
103 | if (pgd_none(*pgd_ref)) | 104 | /* |
105 | * With folded p4d, pgd_none() is always false, so we need to | ||
106 | * handle synchronization at the p4d level. | ||
107 | */ | ||
108 | BUILD_BUG_ON(pgd_none(*pgd_ref)); | ||
109 | p4d_ref = p4d_offset(pgd_ref, address); | ||
110 | |||
111 | if (p4d_none(*p4d_ref)) | ||
104 | continue; | 112 | continue; |
105 | 113 | ||
106 | spin_lock(&pgd_lock); | 114 | spin_lock(&pgd_lock); |
107 | list_for_each_entry(page, &pgd_list, lru) { | 115 | list_for_each_entry(page, &pgd_list, lru) { |
108 | pgd_t *pgd; | 116 | pgd_t *pgd; |
117 | p4d_t *p4d; | ||
109 | spinlock_t *pgt_lock; | 118 | spinlock_t *pgt_lock; |
110 | 119 | ||
111 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | 120 | pgd = (pgd_t *)page_address(page) + pgd_index(address); |
121 | p4d = p4d_offset(pgd, address); | ||
112 | /* the pgt_lock only for Xen */ | 122 | /* the pgt_lock only for Xen */ |
113 | pgt_lock = &pgd_page_get_mm(page)->page_table_lock; | 123 | pgt_lock = &pgd_page_get_mm(page)->page_table_lock; |
114 | spin_lock(pgt_lock); | 124 | spin_lock(pgt_lock); |
115 | 125 | ||
116 | if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) | 126 | if (!p4d_none(*p4d_ref) && !p4d_none(*p4d)) |
117 | BUG_ON(pgd_page_vaddr(*pgd) | 127 | BUG_ON(p4d_page_vaddr(*p4d) |
118 | != pgd_page_vaddr(*pgd_ref)); | 128 | != p4d_page_vaddr(*p4d_ref)); |
119 | 129 | ||
120 | if (pgd_none(*pgd)) | 130 | if (p4d_none(*p4d)) |
121 | set_pgd(pgd, *pgd_ref); | 131 | set_p4d(p4d, *p4d_ref); |
122 | 132 | ||
123 | spin_unlock(pgt_lock); | 133 | spin_unlock(pgt_lock); |
124 | } | 134 | } |
@@ -149,16 +159,28 @@ static __ref void *spp_getpage(void) | |||
149 | return ptr; | 159 | return ptr; |
150 | } | 160 | } |
151 | 161 | ||
152 | static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) | 162 | static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr) |
153 | { | 163 | { |
154 | if (pgd_none(*pgd)) { | 164 | if (pgd_none(*pgd)) { |
155 | pud_t *pud = (pud_t *)spp_getpage(); | 165 | p4d_t *p4d = (p4d_t *)spp_getpage(); |
156 | pgd_populate(&init_mm, pgd, pud); | 166 | pgd_populate(&init_mm, pgd, p4d); |
157 | if (pud != pud_offset(pgd, 0)) | 167 | if (p4d != p4d_offset(pgd, 0)) |
158 | printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", | 168 | printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", |
159 | pud, pud_offset(pgd, 0)); | 169 | p4d, p4d_offset(pgd, 0)); |
170 | } | ||
171 | return p4d_offset(pgd, vaddr); | ||
172 | } | ||
173 | |||
174 | static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr) | ||
175 | { | ||
176 | if (p4d_none(*p4d)) { | ||
177 | pud_t *pud = (pud_t *)spp_getpage(); | ||
178 | p4d_populate(&init_mm, p4d, pud); | ||
179 | if (pud != pud_offset(p4d, 0)) | ||
180 | printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", | ||
181 | pud, pud_offset(p4d, 0)); | ||
160 | } | 182 | } |
161 | return pud_offset(pgd, vaddr); | 183 | return pud_offset(p4d, vaddr); |
162 | } | 184 | } |
163 | 185 | ||
164 | static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) | 186 | static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) |
@@ -167,7 +189,7 @@ static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) | |||
167 | pmd_t *pmd = (pmd_t *) spp_getpage(); | 189 | pmd_t *pmd = (pmd_t *) spp_getpage(); |
168 | pud_populate(&init_mm, pud, pmd); | 190 | pud_populate(&init_mm, pud, pmd); |
169 | if (pmd != pmd_offset(pud, 0)) | 191 | if (pmd != pmd_offset(pud, 0)) |
170 | printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", | 192 | printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n", |
171 | pmd, pmd_offset(pud, 0)); | 193 | pmd, pmd_offset(pud, 0)); |
172 | } | 194 | } |
173 | return pmd_offset(pud, vaddr); | 195 | return pmd_offset(pud, vaddr); |
@@ -179,20 +201,15 @@ static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) | |||
179 | pte_t *pte = (pte_t *) spp_getpage(); | 201 | pte_t *pte = (pte_t *) spp_getpage(); |
180 | pmd_populate_kernel(&init_mm, pmd, pte); | 202 | pmd_populate_kernel(&init_mm, pmd, pte); |
181 | if (pte != pte_offset_kernel(pmd, 0)) | 203 | if (pte != pte_offset_kernel(pmd, 0)) |
182 | printk(KERN_ERR "PAGETABLE BUG #02!\n"); | 204 | printk(KERN_ERR "PAGETABLE BUG #03!\n"); |
183 | } | 205 | } |
184 | return pte_offset_kernel(pmd, vaddr); | 206 | return pte_offset_kernel(pmd, vaddr); |
185 | } | 207 | } |
186 | 208 | ||
187 | void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) | 209 | static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte) |
188 | { | 210 | { |
189 | pud_t *pud; | 211 | pmd_t *pmd = fill_pmd(pud, vaddr); |
190 | pmd_t *pmd; | 212 | pte_t *pte = fill_pte(pmd, vaddr); |
191 | pte_t *pte; | ||
192 | |||
193 | pud = pud_page + pud_index(vaddr); | ||
194 | pmd = fill_pmd(pud, vaddr); | ||
195 | pte = fill_pte(pmd, vaddr); | ||
196 | 213 | ||
197 | set_pte(pte, new_pte); | 214 | set_pte(pte, new_pte); |
198 | 215 | ||
@@ -203,10 +220,25 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) | |||
203 | __flush_tlb_one(vaddr); | 220 | __flush_tlb_one(vaddr); |
204 | } | 221 | } |
205 | 222 | ||
223 | void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte) | ||
224 | { | ||
225 | p4d_t *p4d = p4d_page + p4d_index(vaddr); | ||
226 | pud_t *pud = fill_pud(p4d, vaddr); | ||
227 | |||
228 | __set_pte_vaddr(pud, vaddr, new_pte); | ||
229 | } | ||
230 | |||
231 | void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) | ||
232 | { | ||
233 | pud_t *pud = pud_page + pud_index(vaddr); | ||
234 | |||
235 | __set_pte_vaddr(pud, vaddr, new_pte); | ||
236 | } | ||
237 | |||
206 | void set_pte_vaddr(unsigned long vaddr, pte_t pteval) | 238 | void set_pte_vaddr(unsigned long vaddr, pte_t pteval) |
207 | { | 239 | { |
208 | pgd_t *pgd; | 240 | pgd_t *pgd; |
209 | pud_t *pud_page; | 241 | p4d_t *p4d_page; |
210 | 242 | ||
211 | pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); | 243 | pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); |
212 | 244 | ||
@@ -216,17 +248,20 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) | |||
216 | "PGD FIXMAP MISSING, it should be setup in head.S!\n"); | 248 | "PGD FIXMAP MISSING, it should be setup in head.S!\n"); |
217 | return; | 249 | return; |
218 | } | 250 | } |
219 | pud_page = (pud_t*)pgd_page_vaddr(*pgd); | 251 | |
220 | set_pte_vaddr_pud(pud_page, vaddr, pteval); | 252 | p4d_page = p4d_offset(pgd, 0); |
253 | set_pte_vaddr_p4d(p4d_page, vaddr, pteval); | ||
221 | } | 254 | } |
222 | 255 | ||
223 | pmd_t * __init populate_extra_pmd(unsigned long vaddr) | 256 | pmd_t * __init populate_extra_pmd(unsigned long vaddr) |
224 | { | 257 | { |
225 | pgd_t *pgd; | 258 | pgd_t *pgd; |
259 | p4d_t *p4d; | ||
226 | pud_t *pud; | 260 | pud_t *pud; |
227 | 261 | ||
228 | pgd = pgd_offset_k(vaddr); | 262 | pgd = pgd_offset_k(vaddr); |
229 | pud = fill_pud(pgd, vaddr); | 263 | p4d = fill_p4d(pgd, vaddr); |
264 | pud = fill_pud(p4d, vaddr); | ||
230 | return fill_pmd(pud, vaddr); | 265 | return fill_pmd(pud, vaddr); |
231 | } | 266 | } |
232 | 267 | ||
@@ -245,6 +280,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, | |||
245 | enum page_cache_mode cache) | 280 | enum page_cache_mode cache) |
246 | { | 281 | { |
247 | pgd_t *pgd; | 282 | pgd_t *pgd; |
283 | p4d_t *p4d; | ||
248 | pud_t *pud; | 284 | pud_t *pud; |
249 | pmd_t *pmd; | 285 | pmd_t *pmd; |
250 | pgprot_t prot; | 286 | pgprot_t prot; |
@@ -255,11 +291,17 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, | |||
255 | for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { | 291 | for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { |
256 | pgd = pgd_offset_k((unsigned long)__va(phys)); | 292 | pgd = pgd_offset_k((unsigned long)__va(phys)); |
257 | if (pgd_none(*pgd)) { | 293 | if (pgd_none(*pgd)) { |
294 | p4d = (p4d_t *) spp_getpage(); | ||
295 | set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE | | ||
296 | _PAGE_USER)); | ||
297 | } | ||
298 | p4d = p4d_offset(pgd, (unsigned long)__va(phys)); | ||
299 | if (p4d_none(*p4d)) { | ||
258 | pud = (pud_t *) spp_getpage(); | 300 | pud = (pud_t *) spp_getpage(); |
259 | set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | | 301 | set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE | |
260 | _PAGE_USER)); | 302 | _PAGE_USER)); |
261 | } | 303 | } |
262 | pud = pud_offset(pgd, (unsigned long)__va(phys)); | 304 | pud = pud_offset(p4d, (unsigned long)__va(phys)); |
263 | if (pud_none(*pud)) { | 305 | if (pud_none(*pud)) { |
264 | pmd = (pmd_t *) spp_getpage(); | 306 | pmd = (pmd_t *) spp_getpage(); |
265 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | | 307 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | |
@@ -563,12 +605,15 @@ kernel_physical_mapping_init(unsigned long paddr_start, | |||
563 | 605 | ||
564 | for (; vaddr < vaddr_end; vaddr = vaddr_next) { | 606 | for (; vaddr < vaddr_end; vaddr = vaddr_next) { |
565 | pgd_t *pgd = pgd_offset_k(vaddr); | 607 | pgd_t *pgd = pgd_offset_k(vaddr); |
608 | p4d_t *p4d; | ||
566 | pud_t *pud; | 609 | pud_t *pud; |
567 | 610 | ||
568 | vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; | 611 | vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; |
569 | 612 | ||
570 | if (pgd_val(*pgd)) { | 613 | BUILD_BUG_ON(pgd_none(*pgd)); |
571 | pud = (pud_t *)pgd_page_vaddr(*pgd); | 614 | p4d = p4d_offset(pgd, vaddr); |
615 | if (p4d_val(*p4d)) { | ||
616 | pud = (pud_t *)p4d_page_vaddr(*p4d); | ||
572 | paddr_last = phys_pud_init(pud, __pa(vaddr), | 617 | paddr_last = phys_pud_init(pud, __pa(vaddr), |
573 | __pa(vaddr_end), | 618 | __pa(vaddr_end), |
574 | page_size_mask); | 619 | page_size_mask); |
@@ -580,7 +625,7 @@ kernel_physical_mapping_init(unsigned long paddr_start, | |||
580 | page_size_mask); | 625 | page_size_mask); |
581 | 626 | ||
582 | spin_lock(&init_mm.page_table_lock); | 627 | spin_lock(&init_mm.page_table_lock); |
583 | pgd_populate(&init_mm, pgd, pud); | 628 | p4d_populate(&init_mm, p4d, pud); |
584 | spin_unlock(&init_mm.page_table_lock); | 629 | spin_unlock(&init_mm.page_table_lock); |
585 | pgd_changed = true; | 630 | pgd_changed = true; |
586 | } | 631 | } |
@@ -726,6 +771,24 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) | |||
726 | spin_unlock(&init_mm.page_table_lock); | 771 | spin_unlock(&init_mm.page_table_lock); |
727 | } | 772 | } |
728 | 773 | ||
774 | static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) | ||
775 | { | ||
776 | pud_t *pud; | ||
777 | int i; | ||
778 | |||
779 | for (i = 0; i < PTRS_PER_PUD; i++) { | ||
780 | pud = pud_start + i; | ||
781 | if (!pud_none(*pud)) | ||
782 | return; | ||
783 | } | ||
784 | |||
785 | /* free a pud table */ | ||
786 | free_pagetable(p4d_page(*p4d), 0); | ||
787 | spin_lock(&init_mm.page_table_lock); | ||
788 | p4d_clear(p4d); | ||
789 | spin_unlock(&init_mm.page_table_lock); | ||
790 | } | ||
791 | |||
729 | static void __meminit | 792 | static void __meminit |
730 | remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, | 793 | remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, |
731 | bool direct) | 794 | bool direct) |
@@ -899,7 +962,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, | |||
899 | continue; | 962 | continue; |
900 | } | 963 | } |
901 | 964 | ||
902 | pmd_base = (pmd_t *)pud_page_vaddr(*pud); | 965 | pmd_base = pmd_offset(pud, 0); |
903 | remove_pmd_table(pmd_base, addr, next, direct); | 966 | remove_pmd_table(pmd_base, addr, next, direct); |
904 | free_pmd_table(pmd_base, pud); | 967 | free_pmd_table(pmd_base, pud); |
905 | } | 968 | } |
@@ -908,6 +971,32 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, | |||
908 | update_page_count(PG_LEVEL_1G, -pages); | 971 | update_page_count(PG_LEVEL_1G, -pages); |
909 | } | 972 | } |
910 | 973 | ||
974 | static void __meminit | ||
975 | remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, | ||
976 | bool direct) | ||
977 | { | ||
978 | unsigned long next, pages = 0; | ||
979 | pud_t *pud_base; | ||
980 | p4d_t *p4d; | ||
981 | |||
982 | p4d = p4d_start + p4d_index(addr); | ||
983 | for (; addr < end; addr = next, p4d++) { | ||
984 | next = p4d_addr_end(addr, end); | ||
985 | |||
986 | if (!p4d_present(*p4d)) | ||
987 | continue; | ||
988 | |||
989 | BUILD_BUG_ON(p4d_large(*p4d)); | ||
990 | |||
991 | pud_base = pud_offset(p4d, 0); | ||
992 | remove_pud_table(pud_base, addr, next, direct); | ||
993 | free_pud_table(pud_base, p4d); | ||
994 | } | ||
995 | |||
996 | if (direct) | ||
997 | update_page_count(PG_LEVEL_512G, -pages); | ||
998 | } | ||
999 | |||
911 | /* start and end are both virtual address. */ | 1000 | /* start and end are both virtual address. */ |
912 | static void __meminit | 1001 | static void __meminit |
913 | remove_pagetable(unsigned long start, unsigned long end, bool direct) | 1002 | remove_pagetable(unsigned long start, unsigned long end, bool direct) |
@@ -915,7 +1004,7 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) | |||
915 | unsigned long next; | 1004 | unsigned long next; |
916 | unsigned long addr; | 1005 | unsigned long addr; |
917 | pgd_t *pgd; | 1006 | pgd_t *pgd; |
918 | pud_t *pud; | 1007 | p4d_t *p4d; |
919 | 1008 | ||
920 | for (addr = start; addr < end; addr = next) { | 1009 | for (addr = start; addr < end; addr = next) { |
921 | next = pgd_addr_end(addr, end); | 1010 | next = pgd_addr_end(addr, end); |
@@ -924,8 +1013,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) | |||
924 | if (!pgd_present(*pgd)) | 1013 | if (!pgd_present(*pgd)) |
925 | continue; | 1014 | continue; |
926 | 1015 | ||
927 | pud = (pud_t *)pgd_page_vaddr(*pgd); | 1016 | p4d = p4d_offset(pgd, 0); |
928 | remove_pud_table(pud, addr, next, direct); | 1017 | remove_p4d_table(p4d, addr, next, direct); |
929 | } | 1018 | } |
930 | 1019 | ||
931 | flush_tlb_all(); | 1020 | flush_tlb_all(); |
@@ -1090,6 +1179,7 @@ int kern_addr_valid(unsigned long addr) | |||
1090 | { | 1179 | { |
1091 | unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; | 1180 | unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; |
1092 | pgd_t *pgd; | 1181 | pgd_t *pgd; |
1182 | p4d_t *p4d; | ||
1093 | pud_t *pud; | 1183 | pud_t *pud; |
1094 | pmd_t *pmd; | 1184 | pmd_t *pmd; |
1095 | pte_t *pte; | 1185 | pte_t *pte; |
@@ -1101,7 +1191,11 @@ int kern_addr_valid(unsigned long addr) | |||
1101 | if (pgd_none(*pgd)) | 1191 | if (pgd_none(*pgd)) |
1102 | return 0; | 1192 | return 0; |
1103 | 1193 | ||
1104 | pud = pud_offset(pgd, addr); | 1194 | p4d = p4d_offset(pgd, addr); |
1195 | if (p4d_none(*p4d)) | ||
1196 | return 0; | ||
1197 | |||
1198 | pud = pud_offset(p4d, addr); | ||
1105 | if (pud_none(*pud)) | 1199 | if (pud_none(*pud)) |
1106 | return 0; | 1200 | return 0; |
1107 | 1201 | ||
@@ -1158,6 +1252,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, | |||
1158 | unsigned long addr; | 1252 | unsigned long addr; |
1159 | unsigned long next; | 1253 | unsigned long next; |
1160 | pgd_t *pgd; | 1254 | pgd_t *pgd; |
1255 | p4d_t *p4d; | ||
1161 | pud_t *pud; | 1256 | pud_t *pud; |
1162 | pmd_t *pmd; | 1257 | pmd_t *pmd; |
1163 | 1258 | ||
@@ -1168,7 +1263,11 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, | |||
1168 | if (!pgd) | 1263 | if (!pgd) |
1169 | return -ENOMEM; | 1264 | return -ENOMEM; |
1170 | 1265 | ||
1171 | pud = vmemmap_pud_populate(pgd, addr, node); | 1266 | p4d = vmemmap_p4d_populate(pgd, addr, node); |
1267 | if (!p4d) | ||
1268 | return -ENOMEM; | ||
1269 | |||
1270 | pud = vmemmap_pud_populate(p4d, addr, node); | ||
1172 | if (!pud) | 1271 | if (!pud) |
1173 | return -ENOMEM; | 1272 | return -ENOMEM; |
1174 | 1273 | ||
@@ -1236,6 +1335,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, | |||
1236 | unsigned long end = (unsigned long)(start_page + size); | 1335 | unsigned long end = (unsigned long)(start_page + size); |
1237 | unsigned long next; | 1336 | unsigned long next; |
1238 | pgd_t *pgd; | 1337 | pgd_t *pgd; |
1338 | p4d_t *p4d; | ||
1239 | pud_t *pud; | 1339 | pud_t *pud; |
1240 | pmd_t *pmd; | 1340 | pmd_t *pmd; |
1241 | unsigned int nr_pages; | 1341 | unsigned int nr_pages; |
@@ -1251,7 +1351,14 @@ void register_page_bootmem_memmap(unsigned long section_nr, | |||
1251 | } | 1351 | } |
1252 | get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); | 1352 | get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); |
1253 | 1353 | ||
1254 | pud = pud_offset(pgd, addr); | 1354 | p4d = p4d_offset(pgd, addr); |
1355 | if (p4d_none(*p4d)) { | ||
1356 | next = (addr + PAGE_SIZE) & PAGE_MASK; | ||
1357 | continue; | ||
1358 | } | ||
1359 | get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO); | ||
1360 | |||
1361 | pud = pud_offset(p4d, addr); | ||
1255 | if (pud_none(*pud)) { | 1362 | if (pud_none(*pud)) { |
1256 | next = (addr + PAGE_SIZE) & PAGE_MASK; | 1363 | next = (addr + PAGE_SIZE) & PAGE_MASK; |
1257 | continue; | 1364 | continue; |
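
One subtlety in the sync_global_pgds() hunk: BUILD_BUG_ON(pgd_none(*pgd_ref)) only compiles because, with the p4d level folded, pgd_none() expands to a compile-time constant 0 — a pgd entry can never be "none", since the pointer actually lives in the p4d entry sharing the same slot. A sketch of the folded definitions this relies on (simplified from include/asm-generic/pgtable-nop4d.h, which is not part of this diff):

	/* Folded p4d: exactly one p4d entry, stored in the pgd slot itself. */
	static inline int pgd_none(pgd_t pgd)		{ return 0; }
	static inline int pgd_present(pgd_t pgd)	{ return 1; }
	static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
	{
		return (p4d_t *)pgd;	/* pure re-typing, no extra dereference */
	}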
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index c43b6b33463a..e4f7b25df18e 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
@@ -426,7 +426,8 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) | |||
426 | /* Don't assume we're using swapper_pg_dir at this point */ | 426 | /* Don't assume we're using swapper_pg_dir at this point */ |
427 | pgd_t *base = __va(read_cr3()); | 427 | pgd_t *base = __va(read_cr3()); |
428 | pgd_t *pgd = &base[pgd_index(addr)]; | 428 | pgd_t *pgd = &base[pgd_index(addr)]; |
429 | pud_t *pud = pud_offset(pgd, addr); | 429 | p4d_t *p4d = p4d_offset(pgd, addr); |
430 | pud_t *pud = pud_offset(p4d, addr); | ||
430 | pmd_t *pmd = pmd_offset(pud, addr); | 431 | pmd_t *pmd = pmd_offset(pud, addr); |
431 | 432 | ||
432 | return pmd; | 433 | return pmd; |
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index da92df32d0f1..0c7d8129bed6 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c | |||
@@ -34,8 +34,19 @@ static int __init map_range(struct range *range) | |||
34 | static void __init clear_pgds(unsigned long start, | 34 | static void __init clear_pgds(unsigned long start, |
35 | unsigned long end) | 35 | unsigned long end) |
36 | { | 36 | { |
37 | for (; start < end; start += PGDIR_SIZE) | 37 | pgd_t *pgd; |
38 | pgd_clear(pgd_offset_k(start)); | 38 | |
39 | for (; start < end; start += PGDIR_SIZE) { | ||
40 | pgd = pgd_offset_k(start); | ||
41 | /* | ||
42 | * With folded p4d, pgd_clear() is nop, use p4d_clear() | ||
43 | * instead. | ||
44 | */ | ||
45 | if (CONFIG_PGTABLE_LEVELS < 5) | ||
46 | p4d_clear(p4d_offset(pgd, start)); | ||
47 | else | ||
48 | pgd_clear(pgd); | ||
49 | } | ||
39 | } | 50 | } |
40 | 51 | ||
41 | static void __init kasan_map_early_shadow(pgd_t *pgd) | 52 | static void __init kasan_map_early_shadow(pgd_t *pgd) |
@@ -45,8 +56,18 @@ static void __init kasan_map_early_shadow(pgd_t *pgd) | |||
45 | unsigned long end = KASAN_SHADOW_END; | 56 | unsigned long end = KASAN_SHADOW_END; |
46 | 57 | ||
47 | for (i = pgd_index(start); start < end; i++) { | 58 | for (i = pgd_index(start); start < end; i++) { |
48 | pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) | 59 | switch (CONFIG_PGTABLE_LEVELS) { |
49 | | _KERNPG_TABLE); | 60 | case 4: |
61 | pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) | | ||
62 | _KERNPG_TABLE); | ||
63 | break; | ||
64 | case 5: | ||
65 | pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) | | ||
66 | _KERNPG_TABLE); | ||
67 | break; | ||
68 | default: | ||
69 | BUILD_BUG(); | ||
70 | } | ||
50 | start += PGDIR_SIZE; | 71 | start += PGDIR_SIZE; |
51 | } | 72 | } |
52 | } | 73 | } |
@@ -74,6 +95,7 @@ void __init kasan_early_init(void) | |||
74 | pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; | 95 | pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; |
75 | pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; | 96 | pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; |
76 | pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; | 97 | pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; |
98 | p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; | ||
77 | 99 | ||
78 | for (i = 0; i < PTRS_PER_PTE; i++) | 100 | for (i = 0; i < PTRS_PER_PTE; i++) |
79 | kasan_zero_pte[i] = __pte(pte_val); | 101 | kasan_zero_pte[i] = __pte(pte_val); |
@@ -84,6 +106,9 @@ void __init kasan_early_init(void) | |||
84 | for (i = 0; i < PTRS_PER_PUD; i++) | 106 | for (i = 0; i < PTRS_PER_PUD; i++) |
85 | kasan_zero_pud[i] = __pud(pud_val); | 107 | kasan_zero_pud[i] = __pud(pud_val); |
86 | 108 | ||
109 | for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) | ||
110 | kasan_zero_p4d[i] = __p4d(p4d_val); | ||
111 | |||
87 | kasan_map_early_shadow(early_level4_pgt); | 112 | kasan_map_early_shadow(early_level4_pgt); |
88 | kasan_map_early_shadow(init_level4_pgt); | 113 | kasan_map_early_shadow(init_level4_pgt); |
89 | } | 114 | } |
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 7940166c799b..19ad095b41df 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c | |||
@@ -30,30 +30,44 @@ | |||
30 | #include <linux/limits.h> | 30 | #include <linux/limits.h> |
31 | #include <linux/sched/signal.h> | 31 | #include <linux/sched/signal.h> |
32 | #include <linux/sched/mm.h> | 32 | #include <linux/sched/mm.h> |
33 | #include <linux/compat.h> | ||
33 | #include <asm/elf.h> | 34 | #include <asm/elf.h> |
34 | 35 | ||
35 | struct va_alignment __read_mostly va_align = { | 36 | struct va_alignment __read_mostly va_align = { |
36 | .flags = -1, | 37 | .flags = -1, |
37 | }; | 38 | }; |
38 | 39 | ||
39 | static unsigned long stack_maxrandom_size(void) | 40 | unsigned long tasksize_32bit(void) |
41 | { | ||
42 | return IA32_PAGE_OFFSET; | ||
43 | } | ||
44 | |||
45 | unsigned long tasksize_64bit(void) | ||
46 | { | ||
47 | return TASK_SIZE_MAX; | ||
48 | } | ||
49 | |||
50 | static unsigned long stack_maxrandom_size(unsigned long task_size) | ||
40 | { | 51 | { |
41 | unsigned long max = 0; | 52 | unsigned long max = 0; |
42 | if ((current->flags & PF_RANDOMIZE) && | 53 | if ((current->flags & PF_RANDOMIZE) && |
43 | !(current->personality & ADDR_NO_RANDOMIZE)) { | 54 | !(current->personality & ADDR_NO_RANDOMIZE)) { |
44 | max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT; | 55 | max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); |
56 | max <<= PAGE_SHIFT; | ||
45 | } | 57 | } |
46 | 58 | ||
47 | return max; | 59 | return max; |
48 | } | 60 | } |
49 | 61 | ||
50 | /* | 62 | #ifdef CONFIG_COMPAT |
51 | * Top of mmap area (just below the process stack). | 63 | # define mmap32_rnd_bits mmap_rnd_compat_bits |
52 | * | 64 | # define mmap64_rnd_bits mmap_rnd_bits |
53 | * Leave an at least ~128 MB hole with possible stack randomization. | 65 | #else |
54 | */ | 66 | # define mmap32_rnd_bits mmap_rnd_bits |
55 | #define MIN_GAP (128*1024*1024UL + stack_maxrandom_size()) | 67 | # define mmap64_rnd_bits mmap_rnd_bits |
56 | #define MAX_GAP (TASK_SIZE/6*5) | 68 | #endif |
69 | |||
70 | #define SIZE_128M (128 * 1024 * 1024UL) | ||
57 | 71 | ||
58 | static int mmap_is_legacy(void) | 72 | static int mmap_is_legacy(void) |
59 | { | 73 | { |
@@ -66,54 +80,91 @@ static int mmap_is_legacy(void) | |||
66 | return sysctl_legacy_va_layout; | 80 | return sysctl_legacy_va_layout; |
67 | } | 81 | } |
68 | 82 | ||
69 | unsigned long arch_mmap_rnd(void) | 83 | static unsigned long arch_rnd(unsigned int rndbits) |
70 | { | 84 | { |
71 | unsigned long rnd; | 85 | return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT; |
72 | 86 | } | |
73 | if (mmap_is_ia32()) | ||
74 | #ifdef CONFIG_COMPAT | ||
75 | rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); | ||
76 | #else | ||
77 | rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); | ||
78 | #endif | ||
79 | else | ||
80 | rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); | ||
81 | 87 | ||
82 | return rnd << PAGE_SHIFT; | 88 | unsigned long arch_mmap_rnd(void) |
89 | { | ||
90 | if (!(current->flags & PF_RANDOMIZE)) | ||
91 | return 0; | ||
92 | return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits); | ||
83 | } | 93 | } |
84 | 94 | ||
85 | static unsigned long mmap_base(unsigned long rnd) | 95 | static unsigned long mmap_base(unsigned long rnd, unsigned long task_size) |
86 | { | 96 | { |
87 | unsigned long gap = rlimit(RLIMIT_STACK); | 97 | unsigned long gap = rlimit(RLIMIT_STACK); |
98 | unsigned long gap_min, gap_max; | ||
99 | |||
100 | /* | ||
101 | * Top of mmap area (just below the process stack). | ||
102 | * Leave at least a ~128 MB hole, allowing for stack randomization. | ||
103 | */ | ||
104 | gap_min = SIZE_128M + stack_maxrandom_size(task_size); | ||
105 | gap_max = (task_size / 6) * 5; | ||
88 | 106 | ||
89 | if (gap < MIN_GAP) | 107 | if (gap < gap_min) |
90 | gap = MIN_GAP; | 108 | gap = gap_min; |
91 | else if (gap > MAX_GAP) | 109 | else if (gap > gap_max) |
92 | gap = MAX_GAP; | 110 | gap = gap_max; |
111 | |||
112 | return PAGE_ALIGN(task_size - gap - rnd); | ||
113 | } | ||
93 | 114 | ||
94 | return PAGE_ALIGN(TASK_SIZE - gap - rnd); | 115 | static unsigned long mmap_legacy_base(unsigned long rnd, |
116 | unsigned long task_size) | ||
117 | { | ||
118 | return __TASK_UNMAPPED_BASE(task_size) + rnd; | ||
95 | } | 119 | } |
96 | 120 | ||
97 | /* | 121 | /* |
98 | * This function, called very early during the creation of a new | 122 | * This function, called very early during the creation of a new |
99 | * process VM image, sets up which VM layout function to use: | 123 | * process VM image, sets up which VM layout function to use: |
100 | */ | 124 | */ |
125 | static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, | ||
126 | unsigned long random_factor, unsigned long task_size) | ||
127 | { | ||
128 | *legacy_base = mmap_legacy_base(random_factor, task_size); | ||
129 | if (mmap_is_legacy()) | ||
130 | *base = *legacy_base; | ||
131 | else | ||
132 | *base = mmap_base(random_factor, task_size); | ||
133 | } | ||
134 | |||
101 | void arch_pick_mmap_layout(struct mm_struct *mm) | 135 | void arch_pick_mmap_layout(struct mm_struct *mm) |
102 | { | 136 | { |
103 | unsigned long random_factor = 0UL; | 137 | if (mmap_is_legacy()) |
138 | mm->get_unmapped_area = arch_get_unmapped_area; | ||
139 | else | ||
140 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; | ||
104 | 141 | ||
105 | if (current->flags & PF_RANDOMIZE) | 142 | arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, |
106 | random_factor = arch_mmap_rnd(); | 143 | arch_rnd(mmap64_rnd_bits), tasksize_64bit()); |
144 | |||
145 | #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES | ||
146 | /* | ||
147 | * The mmap syscall mapping base decision depends solely on the | ||
148 | * syscall type (64-bit or compat). This applies for 64bit | ||
149 | * applications and 32bit applications. The 64bit syscall uses | ||
150 | * mmap_base, the compat syscall uses mmap_compat_base. | ||
151 | */ | ||
152 | arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, | ||
153 | arch_rnd(mmap32_rnd_bits), tasksize_32bit()); | ||
154 | #endif | ||
155 | } | ||
107 | 156 | ||
108 | mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor; | 157 | unsigned long get_mmap_base(int is_legacy) |
158 | { | ||
159 | struct mm_struct *mm = current->mm; | ||
109 | 160 | ||
110 | if (mmap_is_legacy()) { | 161 | #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES |
111 | mm->mmap_base = mm->mmap_legacy_base; | 162 | if (in_compat_syscall()) { |
112 | mm->get_unmapped_area = arch_get_unmapped_area; | 163 | return is_legacy ? mm->mmap_compat_legacy_base |
113 | } else { | 164 | : mm->mmap_compat_base; |
114 | mm->mmap_base = mmap_base(random_factor); | ||
115 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; | ||
116 | } | 165 | } |
166 | #endif | ||
167 | return is_legacy ? mm->mmap_legacy_base : mm->mmap_base; | ||
117 | } | 168 | } |
118 | 169 | ||
119 | const char *arch_vma_name(struct vm_area_struct *vma) | 170 | const char *arch_vma_name(struct vm_area_struct *vma) |
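
Taken together, the mmap.c changes make the randomized bases a property of the syscall ABI rather than of the task: arch_pick_mmap_layout() now always computes the 64-bit base pair and, under CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES, a second 32-bit pair, while get_mmap_base() selects between them per syscall. The gap arithmetic itself is unchanged apart from being parameterized by task_size; as a sketch of what mmap_base() computes (using clamp() from <linux/kernel.h> for brevity):

	unsigned long gap = rlimit(RLIMIT_STACK);
	unsigned long base;

	/* Keep at least ~128 MB plus stack-randomization headroom below the
	 * stack, and never let the gap exceed 5/6 of the task size. */
	gap = clamp(gap, SIZE_128M + stack_maxrandom_size(task_size),
		    task_size / 6 * 5);
	base = PAGE_ALIGN(task_size - gap - rnd);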
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index cd44ae727df7..1c34b767c84c 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c | |||
@@ -526,15 +526,7 @@ int mpx_handle_bd_fault(void) | |||
526 | if (!kernel_managing_mpx_tables(current->mm)) | 526 | if (!kernel_managing_mpx_tables(current->mm)) |
527 | return -EINVAL; | 527 | return -EINVAL; |
528 | 528 | ||
529 | if (do_mpx_bt_fault()) { | 529 | return do_mpx_bt_fault(); |
530 | force_sig(SIGSEGV, current); | ||
531 | /* | ||
532 | * The force_sig() is essentially "handling" this | ||
533 | * exception, so we do not pass up the error | ||
534 | * from do_mpx_bt_fault(). | ||
535 | */ | ||
536 | } | ||
537 | return 0; | ||
538 | } | 530 | } |
539 | 531 | ||
540 | /* | 532 | /* |
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index f9d99535f233..25504d5aa816 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c | |||
@@ -201,7 +201,7 @@ static void __init alloc_node_data(int nid) | |||
201 | nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES, | 201 | nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES, |
202 | MEMBLOCK_ALLOC_ACCESSIBLE); | 202 | MEMBLOCK_ALLOC_ACCESSIBLE); |
203 | if (!nd_pa) { | 203 | if (!nd_pa) { |
204 | pr_err("Cannot find %zu bytes in node %d\n", | 204 | pr_err("Cannot find %zu bytes in any node (initial node: %d)\n", |
205 | nd_size, nid); | 205 | nd_size, nid); |
206 | return; | 206 | return; |
207 | } | 207 | } |
@@ -225,7 +225,7 @@ static void __init alloc_node_data(int nid) | |||
225 | * numa_cleanup_meminfo - Cleanup a numa_meminfo | 225 | * numa_cleanup_meminfo - Cleanup a numa_meminfo |
226 | * @mi: numa_meminfo to clean up | 226 | * @mi: numa_meminfo to clean up |
227 | * | 227 | * |
228 | * Sanitize @mi by merging and removing unncessary memblks. Also check for | 228 | * Sanitize @mi by merging and removing unnecessary memblks. Also check for |
229 | * conflicts and clear unused memblks. | 229 | * conflicts and clear unused memblks. |
230 | * | 230 | * |
231 | * RETURNS: | 231 | * RETURNS: |
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a57e8e02f457..56b22fa504df 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -346,6 +346,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, | |||
346 | pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, | 346 | pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, |
347 | unsigned int *level) | 347 | unsigned int *level) |
348 | { | 348 | { |
349 | p4d_t *p4d; | ||
349 | pud_t *pud; | 350 | pud_t *pud; |
350 | pmd_t *pmd; | 351 | pmd_t *pmd; |
351 | 352 | ||
@@ -354,7 +355,15 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, | |||
354 | if (pgd_none(*pgd)) | 355 | if (pgd_none(*pgd)) |
355 | return NULL; | 356 | return NULL; |
356 | 357 | ||
357 | pud = pud_offset(pgd, address); | 358 | p4d = p4d_offset(pgd, address); |
359 | if (p4d_none(*p4d)) | ||
360 | return NULL; | ||
361 | |||
362 | *level = PG_LEVEL_512G; | ||
363 | if (p4d_large(*p4d) || !p4d_present(*p4d)) | ||
364 | return (pte_t *)p4d; | ||
365 | |||
366 | pud = pud_offset(p4d, address); | ||
358 | if (pud_none(*pud)) | 367 | if (pud_none(*pud)) |
359 | return NULL; | 368 | return NULL; |
360 | 369 | ||
@@ -406,13 +415,18 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, | |||
406 | pmd_t *lookup_pmd_address(unsigned long address) | 415 | pmd_t *lookup_pmd_address(unsigned long address) |
407 | { | 416 | { |
408 | pgd_t *pgd; | 417 | pgd_t *pgd; |
418 | p4d_t *p4d; | ||
409 | pud_t *pud; | 419 | pud_t *pud; |
410 | 420 | ||
411 | pgd = pgd_offset_k(address); | 421 | pgd = pgd_offset_k(address); |
412 | if (pgd_none(*pgd)) | 422 | if (pgd_none(*pgd)) |
413 | return NULL; | 423 | return NULL; |
414 | 424 | ||
415 | pud = pud_offset(pgd, address); | 425 | p4d = p4d_offset(pgd, address); |
426 | if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d)) | ||
427 | return NULL; | ||
428 | |||
429 | pud = pud_offset(p4d, address); | ||
416 | if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) | 430 | if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) |
417 | return NULL; | 431 | return NULL; |
418 | 432 | ||
@@ -477,11 +491,13 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) | |||
477 | 491 | ||
478 | list_for_each_entry(page, &pgd_list, lru) { | 492 | list_for_each_entry(page, &pgd_list, lru) { |
479 | pgd_t *pgd; | 493 | pgd_t *pgd; |
494 | p4d_t *p4d; | ||
480 | pud_t *pud; | 495 | pud_t *pud; |
481 | pmd_t *pmd; | 496 | pmd_t *pmd; |
482 | 497 | ||
483 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | 498 | pgd = (pgd_t *)page_address(page) + pgd_index(address); |
484 | pud = pud_offset(pgd, address); | 499 | p4d = p4d_offset(pgd, address); |
500 | pud = pud_offset(p4d, address); | ||
485 | pmd = pmd_offset(pud, address); | 501 | pmd = pmd_offset(pud, address); |
486 | set_pte_atomic((pte_t *)pmd, pte); | 502 | set_pte_atomic((pte_t *)pmd, pte); |
487 | } | 503 | } |
@@ -836,9 +852,9 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) | |||
836 | pud_clear(pud); | 852 | pud_clear(pud); |
837 | } | 853 | } |
838 | 854 | ||
839 | static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) | 855 | static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) |
840 | { | 856 | { |
841 | pud_t *pud = pud_offset(pgd, start); | 857 | pud_t *pud = pud_offset(p4d, start); |
842 | 858 | ||
843 | /* | 859 | /* |
844 | * Not on a GB page boundary? | 860 | * Not on a GB page boundary? |
@@ -1004,8 +1020,8 @@ static long populate_pmd(struct cpa_data *cpa, | |||
1004 | return num_pages; | 1020 | return num_pages; |
1005 | } | 1021 | } |
1006 | 1022 | ||
1007 | static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, | 1023 | static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, |
1008 | pgprot_t pgprot) | 1024 | pgprot_t pgprot) |
1009 | { | 1025 | { |
1010 | pud_t *pud; | 1026 | pud_t *pud; |
1011 | unsigned long end; | 1027 | unsigned long end; |
@@ -1026,7 +1042,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, | |||
1026 | cur_pages = (pre_end - start) >> PAGE_SHIFT; | 1042 | cur_pages = (pre_end - start) >> PAGE_SHIFT; |
1027 | cur_pages = min_t(int, (int)cpa->numpages, cur_pages); | 1043 | cur_pages = min_t(int, (int)cpa->numpages, cur_pages); |
1028 | 1044 | ||
1029 | pud = pud_offset(pgd, start); | 1045 | pud = pud_offset(p4d, start); |
1030 | 1046 | ||
1031 | /* | 1047 | /* |
1032 | * Need a PMD page? | 1048 | * Need a PMD page? |
@@ -1047,7 +1063,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, | |||
1047 | if (cpa->numpages == cur_pages) | 1063 | if (cpa->numpages == cur_pages) |
1048 | return cur_pages; | 1064 | return cur_pages; |
1049 | 1065 | ||
1050 | pud = pud_offset(pgd, start); | 1066 | pud = pud_offset(p4d, start); |
1051 | pud_pgprot = pgprot_4k_2_large(pgprot); | 1067 | pud_pgprot = pgprot_4k_2_large(pgprot); |
1052 | 1068 | ||
1053 | /* | 1069 | /* |
@@ -1067,7 +1083,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, | |||
1067 | if (start < end) { | 1083 | if (start < end) { |
1068 | long tmp; | 1084 | long tmp; |
1069 | 1085 | ||
1070 | pud = pud_offset(pgd, start); | 1086 | pud = pud_offset(p4d, start); |
1071 | if (pud_none(*pud)) | 1087 | if (pud_none(*pud)) |
1072 | if (alloc_pmd_page(pud)) | 1088 | if (alloc_pmd_page(pud)) |
1073 | return -1; | 1089 | return -1; |
@@ -1090,33 +1106,43 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) | |||
1090 | { | 1106 | { |
1091 | pgprot_t pgprot = __pgprot(_KERNPG_TABLE); | 1107 | pgprot_t pgprot = __pgprot(_KERNPG_TABLE); |
1092 | pud_t *pud = NULL; /* shut up gcc */ | 1108 | pud_t *pud = NULL; /* shut up gcc */ |
1109 | p4d_t *p4d; | ||
1093 | pgd_t *pgd_entry; | 1110 | pgd_t *pgd_entry; |
1094 | long ret; | 1111 | long ret; |
1095 | 1112 | ||
1096 | pgd_entry = cpa->pgd + pgd_index(addr); | 1113 | pgd_entry = cpa->pgd + pgd_index(addr); |
1097 | 1114 | ||
1115 | if (pgd_none(*pgd_entry)) { | ||
1116 | p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); | ||
1117 | if (!p4d) | ||
1118 | return -1; | ||
1119 | |||
1120 | set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); | ||
1121 | } | ||
1122 | |||
1098 | /* | 1123 | /* |
1099 | * Allocate a PUD page and hand it down for mapping. | 1124 | * Allocate a PUD page and hand it down for mapping. |
1100 | */ | 1125 | */ |
1101 | if (pgd_none(*pgd_entry)) { | 1126 | p4d = p4d_offset(pgd_entry, addr); |
1127 | if (p4d_none(*p4d)) { | ||
1102 | pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); | 1128 | pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); |
1103 | if (!pud) | 1129 | if (!pud) |
1104 | return -1; | 1130 | return -1; |
1105 | 1131 | ||
1106 | set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); | 1132 | set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); |
1107 | } | 1133 | } |
1108 | 1134 | ||
1109 | pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); | 1135 | pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); |
1110 | pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); | 1136 | pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); |
1111 | 1137 | ||
1112 | ret = populate_pud(cpa, addr, pgd_entry, pgprot); | 1138 | ret = populate_pud(cpa, addr, p4d, pgprot); |
1113 | if (ret < 0) { | 1139 | if (ret < 0) { |
1114 | /* | 1140 | /* |
1115 | * Leave the PUD page in place in case some other CPU or thread | 1141 | * Leave the PUD page in place in case some other CPU or thread |
1116 | * already found it, but remove any useless entries we just | 1142 | * already found it, but remove any useless entries we just |
1117 | * added to it. | 1143 | * added to it. |
1118 | */ | 1144 | */ |
1119 | unmap_pud_range(pgd_entry, addr, | 1145 | unmap_pud_range(p4d, addr, |
1120 | addr + (cpa->numpages << PAGE_SHIFT)); | 1146 | addr + (cpa->numpages << PAGE_SHIFT)); |
1121 | return ret; | 1147 | return ret; |
1122 | } | 1148 | } |
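
Note that with CONFIG_PGTABLE_LEVELS == 4 the p4d level is folded: pgd_none() is constant-false, so the new allocation branch in populate_pgd() compiles away, and p4d_offset() degenerates to a cast, letting the same source serve 4- and 5-level kernels. The folded helpers look roughly like this (in the style of include/asm-generic/pgtable-nop4d.h):

    static inline int pgd_none(pgd_t pgd)
    {
            return 0;               /* a folded top level is never empty */
    }

    static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
    {
            return (p4d_t *)pgd;    /* the pgd entry doubles as the p4d */
    }
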
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6cbdff26bb96..508a708eb9a6 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -81,6 +81,14 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) | |||
81 | paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); | 81 | paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); |
82 | tlb_remove_page(tlb, virt_to_page(pud)); | 82 | tlb_remove_page(tlb, virt_to_page(pud)); |
83 | } | 83 | } |
84 | |||
85 | #if CONFIG_PGTABLE_LEVELS > 4 | ||
86 | void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) | ||
87 | { | ||
88 | paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); | ||
89 | tlb_remove_page(tlb, virt_to_page(p4d)); | ||
90 | } | ||
91 | #endif /* CONFIG_PGTABLE_LEVELS > 4 */ | ||
84 | #endif /* CONFIG_PGTABLE_LEVELS > 3 */ | 92 | #endif /* CONFIG_PGTABLE_LEVELS > 3 */ |
85 | #endif /* CONFIG_PGTABLE_LEVELS > 2 */ | 93 | #endif /* CONFIG_PGTABLE_LEVELS > 2 */ |
86 | 94 | ||
@@ -120,7 +128,7 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) | |||
120 | references from swapper_pg_dir. */ | 128 | references from swapper_pg_dir. */ |
121 | if (CONFIG_PGTABLE_LEVELS == 2 || | 129 | if (CONFIG_PGTABLE_LEVELS == 2 || |
122 | (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || | 130 | (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || |
123 | CONFIG_PGTABLE_LEVELS == 4) { | 131 | CONFIG_PGTABLE_LEVELS >= 4) { |
124 | clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, | 132 | clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, |
125 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | 133 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, |
126 | KERNEL_PGD_PTRS); | 134 | KERNEL_PGD_PTRS); |
@@ -261,13 +269,15 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) | |||
261 | 269 | ||
262 | static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) | 270 | static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) |
263 | { | 271 | { |
272 | p4d_t *p4d; | ||
264 | pud_t *pud; | 273 | pud_t *pud; |
265 | int i; | 274 | int i; |
266 | 275 | ||
267 | if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ | 276 | if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ |
268 | return; | 277 | return; |
269 | 278 | ||
270 | pud = pud_offset(pgd, 0); | 279 | p4d = p4d_offset(pgd, 0); |
280 | pud = pud_offset(p4d, 0); | ||
271 | 281 | ||
272 | for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { | 282 | for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { |
273 | pmd_t *pmd = pmds[i]; | 283 | pmd_t *pmd = pmds[i]; |
@@ -580,6 +590,28 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, | |||
580 | } | 590 | } |
581 | 591 | ||
582 | #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP | 592 | #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP |
593 | #ifdef CONFIG_X86_5LEVEL | ||
594 | /** | ||
595 | * p4d_set_huge - setup kernel P4D mapping | ||
596 | * | ||
597 | * No 512GB pages yet -- always return 0 | ||
598 | */ | ||
599 | int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) | ||
600 | { | ||
601 | return 0; | ||
602 | } | ||
603 | |||
604 | /** | ||
605 | * p4d_clear_huge - clear kernel P4D mapping when it is set | ||
606 | * | ||
607 | * No 512GB pages yet -- always return 0 | ||
608 | */ | ||
609 | int p4d_clear_huge(p4d_t *p4d) | ||
610 | { | ||
611 | return 0; | ||
612 | } | ||
613 | #endif | ||
614 | |||
583 | /** | 615 | /** |
584 | * pud_set_huge - setup kernel PUD mapping | 616 | * pud_set_huge - setup kernel PUD mapping |
585 | * | 617 | * |
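
___p4d_free_tlb() mirrors the pud variant one #if level up: release the paravirt hold on the page, then feed it to the mmu_gather batch. The generic wrapper that dispatches to it is approximately the following (include/asm-generic/tlb.h; the exact range-adjust call is an assumption about the 4.12-era form):

    #ifndef p4d_free_tlb
    #define p4d_free_tlb(tlb, p4dp, address)                        \
            do {                                                    \
                    __tlb_adjust_range(tlb, address, PAGE_SIZE);    \
                    __p4d_free_tlb(tlb, p4dp, address);             \
            } while (0)
    #endif
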
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index de53c52551a5..b9bd5b8b14fa 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c | |||
@@ -26,6 +26,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; | |||
26 | void set_pte_vaddr(unsigned long vaddr, pte_t pteval) | 26 | void set_pte_vaddr(unsigned long vaddr, pte_t pteval) |
27 | { | 27 | { |
28 | pgd_t *pgd; | 28 | pgd_t *pgd; |
29 | p4d_t *p4d; | ||
29 | pud_t *pud; | 30 | pud_t *pud; |
30 | pmd_t *pmd; | 31 | pmd_t *pmd; |
31 | pte_t *pte; | 32 | pte_t *pte; |
@@ -35,7 +36,12 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) | |||
35 | BUG(); | 36 | BUG(); |
36 | return; | 37 | return; |
37 | } | 38 | } |
38 | pud = pud_offset(pgd, vaddr); | 39 | p4d = p4d_offset(pgd, vaddr); |
40 | if (p4d_none(*p4d)) { | ||
41 | BUG(); | ||
42 | return; | ||
43 | } | ||
44 | pud = pud_offset(p4d, vaddr); | ||
39 | if (pud_none(*pud)) { | 45 | if (pud_none(*pud)) { |
40 | BUG(); | 46 | BUG(); |
41 | return; | 47 | return; |
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index a7655f6caf7d..6e7bedf69af7 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
@@ -263,8 +263,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, | |||
263 | { | 263 | { |
264 | struct flush_tlb_info info; | 264 | struct flush_tlb_info info; |
265 | 265 | ||
266 | if (end == 0) | ||
267 | end = start + PAGE_SIZE; | ||
268 | info.flush_mm = mm; | 266 | info.flush_mm = mm; |
269 | info.flush_start = start; | 267 | info.flush_start = start; |
270 | info.flush_end = end; | 268 | info.flush_end = end; |
@@ -289,23 +287,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, | |||
289 | smp_call_function_many(cpumask, flush_tlb_func, &info, 1); | 287 | smp_call_function_many(cpumask, flush_tlb_func, &info, 1); |
290 | } | 288 | } |
291 | 289 | ||
292 | void flush_tlb_current_task(void) | ||
293 | { | ||
294 | struct mm_struct *mm = current->mm; | ||
295 | |||
296 | preempt_disable(); | ||
297 | |||
298 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); | ||
299 | |||
300 | /* This is an implicit full barrier that synchronizes with switch_mm. */ | ||
301 | local_flush_tlb(); | ||
302 | |||
303 | trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); | ||
304 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) | ||
305 | flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); | ||
306 | preempt_enable(); | ||
307 | } | ||
308 | |||
309 | /* | 290 | /* |
310 | * See Documentation/x86/tlb.txt for details. We choose 33 | 291 | * See Documentation/x86/tlb.txt for details. We choose 33 |
311 | * because it is large enough to cover the vast majority (at | 292 | * because it is large enough to cover the vast majority (at |
@@ -326,6 +307,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, | |||
326 | unsigned long base_pages_to_flush = TLB_FLUSH_ALL; | 307 | unsigned long base_pages_to_flush = TLB_FLUSH_ALL; |
327 | 308 | ||
328 | preempt_disable(); | 309 | preempt_disable(); |
310 | |||
311 | if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) | ||
312 | base_pages_to_flush = (end - start) >> PAGE_SHIFT; | ||
313 | if (base_pages_to_flush > tlb_single_page_flush_ceiling) | ||
314 | base_pages_to_flush = TLB_FLUSH_ALL; | ||
315 | |||
329 | if (current->active_mm != mm) { | 316 | if (current->active_mm != mm) { |
330 | /* Synchronize with switch_mm. */ | 317 | /* Synchronize with switch_mm. */ |
331 | smp_mb(); | 318 | smp_mb(); |
@@ -342,15 +329,11 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, | |||
342 | goto out; | 329 | goto out; |
343 | } | 330 | } |
344 | 331 | ||
345 | if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) | ||
346 | base_pages_to_flush = (end - start) >> PAGE_SHIFT; | ||
347 | |||
348 | /* | 332 | /* |
349 | * Both branches below are implicit full barriers (MOV to CR or | 333 | * Both branches below are implicit full barriers (MOV to CR or |
350 | * INVLPG) that synchronize with switch_mm. | 334 | * INVLPG) that synchronize with switch_mm. |
351 | */ | 335 | */ |
352 | if (base_pages_to_flush > tlb_single_page_flush_ceiling) { | 336 | if (base_pages_to_flush == TLB_FLUSH_ALL) { |
353 | base_pages_to_flush = TLB_FLUSH_ALL; | ||
354 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); | 337 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); |
355 | local_flush_tlb(); | 338 | local_flush_tlb(); |
356 | } else { | 339 | } else { |
@@ -393,7 +376,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) | |||
393 | } | 376 | } |
394 | 377 | ||
395 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) | 378 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) |
396 | flush_tlb_others(mm_cpumask(mm), mm, start, 0UL); | 379 | flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE); |
397 | 380 | ||
398 | preempt_enable(); | 381 | preempt_enable(); |
399 | } | 382 | } |
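
Two behavioural changes fall out of the tlb.c hunks. First, flush_tlb_mm_range() now decides between a ranged and a full flush once, before any early-out path, so the local flush and the flush_tlb_others() IPI can no longer disagree on what gets flushed. Second, flush_tlb_others() drops its end == 0 special case, so every caller passes an explicit range, as flush_tlb_page() now does:

    /* one page: pass a real [start, end) range, not a magic 0 */
    flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
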
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index cef39b097649..3481268da3d0 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c | |||
@@ -68,7 +68,7 @@ pgd_t * __init efi_call_phys_prolog(void) | |||
68 | load_cr3(initial_page_table); | 68 | load_cr3(initial_page_table); |
69 | __flush_tlb_all(); | 69 | __flush_tlb_all(); |
70 | 70 | ||
71 | gdt_descr.address = __pa(get_cpu_gdt_table(0)); | 71 | gdt_descr.address = get_cpu_gdt_paddr(0); |
72 | gdt_descr.size = GDT_SIZE - 1; | 72 | gdt_descr.size = GDT_SIZE - 1; |
73 | load_gdt(&gdt_descr); | 73 | load_gdt(&gdt_descr); |
74 | 74 | ||
@@ -79,7 +79,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) | |||
79 | { | 79 | { |
80 | struct desc_ptr gdt_descr; | 80 | struct desc_ptr gdt_descr; |
81 | 81 | ||
82 | gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); | 82 | gdt_descr.address = (unsigned long)get_cpu_gdt_rw(0); |
83 | gdt_descr.size = GDT_SIZE - 1; | 83 | gdt_descr.size = GDT_SIZE - 1; |
84 | load_gdt(&gdt_descr); | 84 | load_gdt(&gdt_descr); |
85 | 85 | ||
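
The get_cpu_gdt_table() -> get_cpu_gdt_rw()/get_cpu_gdt_paddr() conversions here and in the files below come from the fixmap-GDT work in this merge: each CPU's GDT now has a writable direct-map alias for updates and a read-only fixmap alias the CPU normally runs on. Intended usage, sketched with helpers from that series (tls_desc is a placeholder descriptor for illustration):

    struct desc_struct *gdt = get_cpu_gdt_rw(cpu);  /* writable alias */

    write_gdt_entry(gdt, GDT_ENTRY_TLS_MIN, &tls_desc, DESC_TLS);

    load_direct_gdt(cpu);   /* run on the writable alias, or ...      */
    load_fixmap_gdt(cpu);   /* ... on the read-only alias (the norm)  */
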
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 642a8698ad61..c488625c9712 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c | |||
@@ -135,6 +135,7 @@ static pgd_t *efi_pgd; | |||
135 | int __init efi_alloc_page_tables(void) | 135 | int __init efi_alloc_page_tables(void) |
136 | { | 136 | { |
137 | pgd_t *pgd; | 137 | pgd_t *pgd; |
138 | p4d_t *p4d; | ||
138 | pud_t *pud; | 139 | pud_t *pud; |
139 | gfp_t gfp_mask; | 140 | gfp_t gfp_mask; |
140 | 141 | ||
@@ -147,15 +148,20 @@ int __init efi_alloc_page_tables(void) | |||
147 | return -ENOMEM; | 148 | return -ENOMEM; |
148 | 149 | ||
149 | pgd = efi_pgd + pgd_index(EFI_VA_END); | 150 | pgd = efi_pgd + pgd_index(EFI_VA_END); |
151 | p4d = p4d_alloc(&init_mm, pgd, EFI_VA_END); | ||
152 | if (!p4d) { | ||
153 | free_page((unsigned long)efi_pgd); | ||
154 | return -ENOMEM; | ||
155 | } | ||
150 | 156 | ||
151 | pud = pud_alloc_one(NULL, 0); | 157 | pud = pud_alloc(&init_mm, p4d, EFI_VA_END); |
152 | if (!pud) { | 158 | if (!pud) { |
159 | if (CONFIG_PGTABLE_LEVELS > 4) | ||
160 | free_page((unsigned long) pgd_page_vaddr(*pgd)); | ||
153 | free_page((unsigned long)efi_pgd); | 161 | free_page((unsigned long)efi_pgd); |
154 | return -ENOMEM; | 162 | return -ENOMEM; |
155 | } | 163 | } |
156 | 164 | ||
157 | pgd_populate(NULL, pgd, pud); | ||
158 | |||
159 | return 0; | 165 | return 0; |
160 | } | 166 | } |
161 | 167 | ||
@@ -166,6 +172,7 @@ void efi_sync_low_kernel_mappings(void) | |||
166 | { | 172 | { |
167 | unsigned num_entries; | 173 | unsigned num_entries; |
168 | pgd_t *pgd_k, *pgd_efi; | 174 | pgd_t *pgd_k, *pgd_efi; |
175 | p4d_t *p4d_k, *p4d_efi; | ||
169 | pud_t *pud_k, *pud_efi; | 176 | pud_t *pud_k, *pud_efi; |
170 | 177 | ||
171 | if (efi_enabled(EFI_OLD_MEMMAP)) | 178 | if (efi_enabled(EFI_OLD_MEMMAP)) |
@@ -190,23 +197,37 @@ void efi_sync_low_kernel_mappings(void) | |||
190 | memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries); | 197 | memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries); |
191 | 198 | ||
192 | /* | 199 | /* |
200 | * As with PGDs, we share all P4D entries apart from the one entry | ||
201 | * that covers the EFI runtime mapping space. | ||
202 | */ | ||
203 | BUILD_BUG_ON(p4d_index(EFI_VA_END) != p4d_index(MODULES_END)); | ||
204 | BUILD_BUG_ON((EFI_VA_START & P4D_MASK) != (EFI_VA_END & P4D_MASK)); | ||
205 | |||
206 | pgd_efi = efi_pgd + pgd_index(EFI_VA_END); | ||
207 | pgd_k = pgd_offset_k(EFI_VA_END); | ||
208 | p4d_efi = p4d_offset(pgd_efi, 0); | ||
209 | p4d_k = p4d_offset(pgd_k, 0); | ||
210 | |||
211 | num_entries = p4d_index(EFI_VA_END); | ||
212 | memcpy(p4d_efi, p4d_k, sizeof(p4d_t) * num_entries); | ||
213 | |||
214 | /* | ||
193 | * We share all the PUD entries apart from those that map the | 215 | * We share all the PUD entries apart from those that map the |
194 | * EFI regions. Copy around them. | 216 | * EFI regions. Copy around them. |
195 | */ | 217 | */ |
196 | BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0); | 218 | BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0); |
197 | BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0); | 219 | BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0); |
198 | 220 | ||
199 | pgd_efi = efi_pgd + pgd_index(EFI_VA_END); | 221 | p4d_efi = p4d_offset(pgd_efi, EFI_VA_END); |
200 | pud_efi = pud_offset(pgd_efi, 0); | 222 | p4d_k = p4d_offset(pgd_k, EFI_VA_END); |
201 | 223 | pud_efi = pud_offset(p4d_efi, 0); | |
202 | pgd_k = pgd_offset_k(EFI_VA_END); | 224 | pud_k = pud_offset(p4d_k, 0); |
203 | pud_k = pud_offset(pgd_k, 0); | ||
204 | 225 | ||
205 | num_entries = pud_index(EFI_VA_END); | 226 | num_entries = pud_index(EFI_VA_END); |
206 | memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); | 227 | memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); |
207 | 228 | ||
208 | pud_efi = pud_offset(pgd_efi, EFI_VA_START); | 229 | pud_efi = pud_offset(p4d_efi, EFI_VA_START); |
209 | pud_k = pud_offset(pgd_k, EFI_VA_START); | 230 | pud_k = pud_offset(p4d_k, EFI_VA_START); |
210 | 231 | ||
211 | num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START); | 232 | num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START); |
212 | memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); | 233 | memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); |
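
efi_alloc_page_tables() moves from an unconditional pud_alloc_one() + pgd_populate() to p4d_alloc()/pud_alloc(), which allocate only when the level is actually missing. On a 4-level kernel p4d_alloc() allocates nothing and simply recasts the pgd entry, which is why the error path frees the intermediate page only when CONFIG_PGTABLE_LEVELS > 4. p4d_alloc() is defined along these lines (include/linux/mm.h, simplified):

    static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
                                   unsigned long address)
    {
            return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ?
                    NULL : p4d_offset(pgd, address);
    }
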
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 66ade16c7693..6b05a9219ea2 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c | |||
@@ -95,7 +95,7 @@ static void __save_processor_state(struct saved_context *ctxt) | |||
95 | * 'pmode_gdt' in wakeup_start. | 95 | * 'pmode_gdt' in wakeup_start. |
96 | */ | 96 | */ |
97 | ctxt->gdt_desc.size = GDT_SIZE - 1; | 97 | ctxt->gdt_desc.size = GDT_SIZE - 1; |
98 | ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_table(smp_processor_id()); | 98 | ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_rw(smp_processor_id()); |
99 | 99 | ||
100 | store_tr(ctxt->tr); | 100 | store_tr(ctxt->tr); |
101 | 101 | ||
@@ -162,7 +162,7 @@ static void fix_processor_context(void) | |||
162 | int cpu = smp_processor_id(); | 162 | int cpu = smp_processor_id(); |
163 | struct tss_struct *t = &per_cpu(cpu_tss, cpu); | 163 | struct tss_struct *t = &per_cpu(cpu_tss, cpu); |
164 | #ifdef CONFIG_X86_64 | 164 | #ifdef CONFIG_X86_64 |
165 | struct desc_struct *desc = get_cpu_gdt_table(cpu); | 165 | struct desc_struct *desc = get_cpu_gdt_rw(cpu); |
166 | tss_desc tss; | 166 | tss_desc tss; |
167 | #endif | 167 | #endif |
168 | set_tss_desc(cpu, t); /* | 168 | set_tss_desc(cpu, t); /* |
@@ -183,6 +183,9 @@ static void fix_processor_context(void) | |||
183 | load_mm_ldt(current->active_mm); /* This does lldt */ | 183 | load_mm_ldt(current->active_mm); /* This does lldt */ |
184 | 184 | ||
185 | fpu__resume_cpu(); | 185 | fpu__resume_cpu(); |
186 | |||
187 | /* The processor is back on the direct GDT, load back the fixmap */ | ||
188 | load_fixmap_gdt(cpu); | ||
186 | } | 189 | } |
187 | 190 | ||
188 | /** | 191 | /** |
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 9f14bd34581d..c35fdb585c68 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c | |||
@@ -32,6 +32,7 @@ pgd_t *resume_pg_dir; | |||
32 | */ | 32 | */ |
33 | static pmd_t *resume_one_md_table_init(pgd_t *pgd) | 33 | static pmd_t *resume_one_md_table_init(pgd_t *pgd) |
34 | { | 34 | { |
35 | p4d_t *p4d; | ||
35 | pud_t *pud; | 36 | pud_t *pud; |
36 | pmd_t *pmd_table; | 37 | pmd_t *pmd_table; |
37 | 38 | ||
@@ -41,11 +42,13 @@ static pmd_t *resume_one_md_table_init(pgd_t *pgd) | |||
41 | return NULL; | 42 | return NULL; |
42 | 43 | ||
43 | set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | 44 | set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); |
44 | pud = pud_offset(pgd, 0); | 45 | p4d = p4d_offset(pgd, 0); |
46 | pud = pud_offset(p4d, 0); | ||
45 | 47 | ||
46 | BUG_ON(pmd_table != pmd_offset(pud, 0)); | 48 | BUG_ON(pmd_table != pmd_offset(pud, 0)); |
47 | #else | 49 | #else |
48 | pud = pud_offset(pgd, 0); | 50 | p4d = p4d_offset(pgd, 0); |
51 | pud = pud_offset(p4d, 0); | ||
49 | pmd_table = pmd_offset(pud, 0); | 52 | pmd_table = pmd_offset(pud, 0); |
50 | #endif | 53 | #endif |
51 | 54 | ||
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 053801b022dd..6a61194ffd58 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c | |||
@@ -50,6 +50,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) | |||
50 | { | 50 | { |
51 | pmd_t *pmd; | 51 | pmd_t *pmd; |
52 | pud_t *pud; | 52 | pud_t *pud; |
53 | p4d_t *p4d; | ||
53 | 54 | ||
54 | /* | 55 | /* |
55 | * The new mapping only has to cover the page containing the image | 56 | * The new mapping only has to cover the page containing the image |
@@ -64,6 +65,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) | |||
64 | * the virtual address space after switching over to the original page | 65 | * the virtual address space after switching over to the original page |
65 | * tables used by the image kernel. | 66 | * tables used by the image kernel. |
66 | */ | 67 | */ |
68 | |||
69 | if (IS_ENABLED(CONFIG_X86_5LEVEL)) { | ||
70 | p4d = (p4d_t *)get_safe_page(GFP_ATOMIC); | ||
71 | if (!p4d) | ||
72 | return -ENOMEM; | ||
73 | } | ||
74 | |||
67 | pud = (pud_t *)get_safe_page(GFP_ATOMIC); | 75 | pud = (pud_t *)get_safe_page(GFP_ATOMIC); |
68 | if (!pud) | 76 | if (!pud) |
69 | return -ENOMEM; | 77 | return -ENOMEM; |
@@ -76,8 +84,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) | |||
76 | __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); | 84 | __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); |
77 | set_pud(pud + pud_index(restore_jump_address), | 85 | set_pud(pud + pud_index(restore_jump_address), |
78 | __pud(__pa(pmd) | _KERNPG_TABLE)); | 86 | __pud(__pa(pmd) | _KERNPG_TABLE)); |
79 | set_pgd(pgd + pgd_index(restore_jump_address), | 87 | if (IS_ENABLED(CONFIG_X86_5LEVEL)) { |
80 | __pgd(__pa(pud) | _KERNPG_TABLE)); | 88 | set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE)); |
89 | set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE)); | ||
90 | } else { | ||
91 | /* No p4d for 4-level paging: point the pgd to the pud page table */ | ||
92 | set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(pud) | _KERNPG_TABLE)); | ||
93 | } | ||
81 | 94 | ||
82 | return 0; | 95 | return 0; |
83 | } | 96 | } |
@@ -125,7 +138,10 @@ static int set_up_temporary_mappings(void) | |||
125 | static int relocate_restore_code(void) | 138 | static int relocate_restore_code(void) |
126 | { | 139 | { |
127 | pgd_t *pgd; | 140 | pgd_t *pgd; |
141 | p4d_t *p4d; | ||
128 | pud_t *pud; | 142 | pud_t *pud; |
143 | pmd_t *pmd; | ||
144 | pte_t *pte; | ||
129 | 145 | ||
130 | relocated_restore_code = get_safe_page(GFP_ATOMIC); | 146 | relocated_restore_code = get_safe_page(GFP_ATOMIC); |
131 | if (!relocated_restore_code) | 147 | if (!relocated_restore_code) |
@@ -135,22 +151,25 @@ static int relocate_restore_code(void) | |||
135 | 151 | ||
136 | /* Make the page containing the relocated code executable */ | 152 | /* Make the page containing the relocated code executable */ |
137 | pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); | 153 | pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); |
138 | pud = pud_offset(pgd, relocated_restore_code); | 154 | p4d = p4d_offset(pgd, relocated_restore_code); |
155 | if (p4d_large(*p4d)) { | ||
156 | set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); | ||
157 | goto out; | ||
158 | } | ||
159 | pud = pud_offset(p4d, relocated_restore_code); | ||
139 | if (pud_large(*pud)) { | 160 | if (pud_large(*pud)) { |
140 | set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); | 161 | set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); |
141 | } else { | 162 | goto out; |
142 | pmd_t *pmd = pmd_offset(pud, relocated_restore_code); | 163 | } |
143 | 164 | pmd = pmd_offset(pud, relocated_restore_code); | |
144 | if (pmd_large(*pmd)) { | 165 | if (pmd_large(*pmd)) { |
145 | set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); | 166 | set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); |
146 | } else { | 167 | goto out; |
147 | pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code); | ||
148 | |||
149 | set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); | ||
150 | } | ||
151 | } | 168 | } |
169 | pte = pte_offset_kernel(pmd, relocated_restore_code); | ||
170 | set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); | ||
171 | out: | ||
152 | __flush_tlb_all(); | 172 | __flush_tlb_all(); |
153 | |||
154 | return 0; | 173 | return 0; |
155 | } | 174 | } |
156 | 175 | ||
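
relocate_restore_code() flattens its nested else branches into a test-and-goto per level: the _PAGE_NX bit has to be cleared on the entry where the translation actually terminates, since entries below a large mapping are never consulted by the hardware. Condensed shape of the descent, with the reasoning as comments:

    p4d = p4d_offset(pgd, addr);
    if (p4d_large(*p4d)) {          /* translation ends at a 512GB entry */
            set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
            goto out;               /* lower levels would never be read */
    }
    pud = pud_offset(p4d, addr);    /* otherwise keep walking down */
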
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 504ec746b2e4..30822e8e64ac 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -711,7 +711,7 @@ static void load_TLS_descriptor(struct thread_struct *t, | |||
711 | 711 | ||
712 | *shadow = t->tls_array[i]; | 712 | *shadow = t->tls_array[i]; |
713 | 713 | ||
714 | gdt = get_cpu_gdt_table(cpu); | 714 | gdt = get_cpu_gdt_rw(cpu); |
715 | maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); | 715 | maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); |
716 | mc = __xen_mc_entry(0); | 716 | mc = __xen_mc_entry(0); |
717 | 717 | ||
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 1d68be6e3ff1..f226038a39ca 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -535,40 +535,41 @@ static pgd_t *xen_get_user_pgd(pgd_t *pgd) | |||
535 | return user_ptr; | 535 | return user_ptr; |
536 | } | 536 | } |
537 | 537 | ||
538 | static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) | 538 | static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) |
539 | { | 539 | { |
540 | struct mmu_update u; | 540 | struct mmu_update u; |
541 | 541 | ||
542 | u.ptr = virt_to_machine(ptr).maddr; | 542 | u.ptr = virt_to_machine(ptr).maddr; |
543 | u.val = pgd_val_ma(val); | 543 | u.val = p4d_val_ma(val); |
544 | xen_extend_mmu_update(&u); | 544 | xen_extend_mmu_update(&u); |
545 | } | 545 | } |
546 | 546 | ||
547 | /* | 547 | /* |
548 | * Raw hypercall-based set_pgd, intended for use in early boot before | 548 | * Raw hypercall-based set_p4d, intended for use in early boot before |
549 | * there's a page structure. This implies: | 549 | * there's a page structure. This implies: |
550 | * 1. The only existing pagetable is the kernel's | 550 | * 1. The only existing pagetable is the kernel's |
551 | * 2. It is always pinned | 551 | * 2. It is always pinned |
552 | * 3. It has no user pagetable attached to it | 552 | * 3. It has no user pagetable attached to it |
553 | */ | 553 | */ |
554 | static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) | 554 | static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) |
555 | { | 555 | { |
556 | preempt_disable(); | 556 | preempt_disable(); |
557 | 557 | ||
558 | xen_mc_batch(); | 558 | xen_mc_batch(); |
559 | 559 | ||
560 | __xen_set_pgd_hyper(ptr, val); | 560 | __xen_set_p4d_hyper(ptr, val); |
561 | 561 | ||
562 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 562 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
563 | 563 | ||
564 | preempt_enable(); | 564 | preempt_enable(); |
565 | } | 565 | } |
566 | 566 | ||
567 | static void xen_set_pgd(pgd_t *ptr, pgd_t val) | 567 | static void xen_set_p4d(p4d_t *ptr, p4d_t val) |
568 | { | 568 | { |
569 | pgd_t *user_ptr = xen_get_user_pgd(ptr); | 569 | pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr); |
570 | pgd_t pgd_val; | ||
570 | 571 | ||
571 | trace_xen_mmu_set_pgd(ptr, user_ptr, val); | 572 | trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val); |
572 | 573 | ||
573 | /* If page is not pinned, we can just update the entry | 574 | /* If page is not pinned, we can just update the entry |
574 | directly */ | 575 | directly */ |
@@ -576,7 +577,8 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) | |||
576 | *ptr = val; | 577 | *ptr = val; |
577 | if (user_ptr) { | 578 | if (user_ptr) { |
578 | WARN_ON(xen_page_pinned(user_ptr)); | 579 | WARN_ON(xen_page_pinned(user_ptr)); |
579 | *user_ptr = val; | 580 | pgd_val.pgd = p4d_val_ma(val); |
581 | *user_ptr = pgd_val; | ||
580 | } | 582 | } |
581 | return; | 583 | return; |
582 | } | 584 | } |
@@ -585,14 +587,72 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) | |||
585 | user updates together. */ | 587 | user updates together. */ |
586 | xen_mc_batch(); | 588 | xen_mc_batch(); |
587 | 589 | ||
588 | __xen_set_pgd_hyper(ptr, val); | 590 | __xen_set_p4d_hyper(ptr, val); |
589 | if (user_ptr) | 591 | if (user_ptr) |
590 | __xen_set_pgd_hyper(user_ptr, val); | 592 | __xen_set_p4d_hyper((p4d_t *)user_ptr, val); |
591 | 593 | ||
592 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 594 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
593 | } | 595 | } |
594 | #endif /* CONFIG_PGTABLE_LEVELS == 4 */ | 596 | #endif /* CONFIG_PGTABLE_LEVELS == 4 */ |
595 | 597 | ||
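
One subtlety in xen_set_p4d(): on 4-level kernels the p4d is the top level, but Xen's user-mode pagetable still stores pgd_t entries, so the value is re-boxed into a pgd_t before being written to the user copy; that is all the new pgd_val local does:

    pgd_t pgd_val;

    pgd_val.pgd = p4d_val_ma(val);  /* same bits, pgd_t container */
    *user_ptr = pgd_val;
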
598 | static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, | ||
599 | int (*func)(struct mm_struct *mm, struct page *, enum pt_level), | ||
600 | bool last, unsigned long limit) | ||
601 | { | ||
602 | int i, nr, flush = 0; | ||
603 | |||
604 | nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD; | ||
605 | for (i = 0; i < nr; i++) { | ||
606 | if (!pmd_none(pmd[i])) | ||
607 | flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE); | ||
608 | } | ||
609 | return flush; | ||
610 | } | ||
611 | |||
612 | static int xen_pud_walk(struct mm_struct *mm, pud_t *pud, | ||
613 | int (*func)(struct mm_struct *mm, struct page *, enum pt_level), | ||
614 | bool last, unsigned long limit) | ||
615 | { | ||
616 | int i, nr, flush = 0; | ||
617 | |||
618 | nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD; | ||
619 | for (i = 0; i < nr; i++) { | ||
620 | pmd_t *pmd; | ||
621 | |||
622 | if (pud_none(pud[i])) | ||
623 | continue; | ||
624 | |||
625 | pmd = pmd_offset(&pud[i], 0); | ||
626 | if (PTRS_PER_PMD > 1) | ||
627 | flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); | ||
628 | flush |= xen_pmd_walk(mm, pmd, func, | ||
629 | last && i == nr - 1, limit); | ||
630 | } | ||
631 | return flush; | ||
632 | } | ||
633 | |||
634 | static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, | ||
635 | int (*func)(struct mm_struct *mm, struct page *, enum pt_level), | ||
636 | bool last, unsigned long limit) | ||
637 | { | ||
638 | int i, nr, flush = 0; | ||
639 | |||
640 | nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D; | ||
641 | for (i = 0; i < nr; i++) { | ||
642 | pud_t *pud; | ||
643 | |||
644 | if (p4d_none(p4d[i])) | ||
645 | continue; | ||
646 | |||
647 | pud = pud_offset(&p4d[i], 0); | ||
648 | if (PTRS_PER_PUD > 1) | ||
649 | flush |= (*func)(mm, virt_to_page(pud), PT_PUD); | ||
650 | flush |= xen_pud_walk(mm, pud, func, | ||
651 | last && i == nr - 1, limit); | ||
652 | } | ||
653 | return flush; | ||
654 | } | ||
655 | |||
596 | /* | 656 | /* |
597 | * (Yet another) pagetable walker. This one is intended for pinning a | 657 | * (Yet another) pagetable walker. This one is intended for pinning a |
598 | * pagetable. This means that it walks a pagetable and calls the | 658 | * pagetable. This means that it walks a pagetable and calls the |
@@ -613,10 +673,8 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, | |||
613 | enum pt_level), | 673 | enum pt_level), |
614 | unsigned long limit) | 674 | unsigned long limit) |
615 | { | 675 | { |
616 | int flush = 0; | 676 | int i, nr, flush = 0; |
617 | unsigned hole_low, hole_high; | 677 | unsigned hole_low, hole_high; |
618 | unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; | ||
619 | unsigned pgdidx, pudidx, pmdidx; | ||
620 | 678 | ||
621 | /* The limit is the last byte to be touched */ | 679 | /* The limit is the last byte to be touched */ |
622 | limit--; | 680 | limit--; |
@@ -633,65 +691,22 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, | |||
633 | hole_low = pgd_index(USER_LIMIT); | 691 | hole_low = pgd_index(USER_LIMIT); |
634 | hole_high = pgd_index(PAGE_OFFSET); | 692 | hole_high = pgd_index(PAGE_OFFSET); |
635 | 693 | ||
636 | pgdidx_limit = pgd_index(limit); | 694 | nr = pgd_index(limit) + 1; |
637 | #if PTRS_PER_PUD > 1 | 695 | for (i = 0; i < nr; i++) { |
638 | pudidx_limit = pud_index(limit); | 696 | p4d_t *p4d; |
639 | #else | ||
640 | pudidx_limit = 0; | ||
641 | #endif | ||
642 | #if PTRS_PER_PMD > 1 | ||
643 | pmdidx_limit = pmd_index(limit); | ||
644 | #else | ||
645 | pmdidx_limit = 0; | ||
646 | #endif | ||
647 | |||
648 | for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { | ||
649 | pud_t *pud; | ||
650 | 697 | ||
651 | if (pgdidx >= hole_low && pgdidx < hole_high) | 698 | if (i >= hole_low && i < hole_high) |
652 | continue; | 699 | continue; |
653 | 700 | ||
654 | if (!pgd_val(pgd[pgdidx])) | 701 | if (pgd_none(pgd[i])) |
655 | continue; | 702 | continue; |
656 | 703 | ||
657 | pud = pud_offset(&pgd[pgdidx], 0); | 704 | p4d = p4d_offset(&pgd[i], 0); |
658 | 705 | if (PTRS_PER_P4D > 1) | |
659 | if (PTRS_PER_PUD > 1) /* not folded */ | 706 | flush |= (*func)(mm, virt_to_page(p4d), PT_P4D); |
660 | flush |= (*func)(mm, virt_to_page(pud), PT_PUD); | 707 | flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); |
661 | |||
662 | for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { | ||
663 | pmd_t *pmd; | ||
664 | |||
665 | if (pgdidx == pgdidx_limit && | ||
666 | pudidx > pudidx_limit) | ||
667 | goto out; | ||
668 | |||
669 | if (pud_none(pud[pudidx])) | ||
670 | continue; | ||
671 | |||
672 | pmd = pmd_offset(&pud[pudidx], 0); | ||
673 | |||
674 | if (PTRS_PER_PMD > 1) /* not folded */ | ||
675 | flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); | ||
676 | |||
677 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { | ||
678 | struct page *pte; | ||
679 | |||
680 | if (pgdidx == pgdidx_limit && | ||
681 | pudidx == pudidx_limit && | ||
682 | pmdidx > pmdidx_limit) | ||
683 | goto out; | ||
684 | |||
685 | if (pmd_none(pmd[pmdidx])) | ||
686 | continue; | ||
687 | |||
688 | pte = pmd_page(pmd[pmdidx]); | ||
689 | flush |= (*func)(mm, pte, PT_PTE); | ||
690 | } | ||
691 | } | ||
692 | } | 708 | } |
693 | 709 | ||
694 | out: | ||
695 | /* Do the top level last, so that the callbacks can use it as | 710 | /* Do the top level last, so that the callbacks can use it as |
696 | a cue to do final things like tlb flushes. */ | 711 | a cue to do final things like tlb flushes. */ |
697 | flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); | 712 | flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); |
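
The rewritten walker delegates one helper to each level, and every helper bounds itself the same way: only the right-most entry of the parent passes last == true downward, so 'limit' clips exactly one subtree per level instead of the old goto-based cutoffs. The shared idiom, annotated (pud level shown):

    nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD; /* clip the right edge */
    for (i = 0; i < nr; i++) {
            if (pud_none(pud[i]))
                    continue;
            flush |= xen_pmd_walk(mm, pmd_offset(&pud[i], 0), func,
                                  last && i == nr - 1, limit);
    }
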
@@ -1150,57 +1165,97 @@ static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) | |||
1150 | xen_free_ro_pages(pa, PAGE_SIZE); | 1165 | xen_free_ro_pages(pa, PAGE_SIZE); |
1151 | } | 1166 | } |
1152 | 1167 | ||
1168 | static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin) | ||
1169 | { | ||
1170 | unsigned long pa; | ||
1171 | pte_t *pte_tbl; | ||
1172 | int i; | ||
1173 | |||
1174 | if (pmd_large(*pmd)) { | ||
1175 | pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; | ||
1176 | xen_free_ro_pages(pa, PMD_SIZE); | ||
1177 | return; | ||
1178 | } | ||
1179 | |||
1180 | pte_tbl = pte_offset_kernel(pmd, 0); | ||
1181 | for (i = 0; i < PTRS_PER_PTE; i++) { | ||
1182 | if (pte_none(pte_tbl[i])) | ||
1183 | continue; | ||
1184 | pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT; | ||
1185 | xen_free_ro_pages(pa, PAGE_SIZE); | ||
1186 | } | ||
1187 | set_pmd(pmd, __pmd(0)); | ||
1188 | xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin); | ||
1189 | } | ||
1190 | |||
1191 | static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin) | ||
1192 | { | ||
1193 | unsigned long pa; | ||
1194 | pmd_t *pmd_tbl; | ||
1195 | int i; | ||
1196 | |||
1197 | if (pud_large(*pud)) { | ||
1198 | pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; | ||
1199 | xen_free_ro_pages(pa, PUD_SIZE); | ||
1200 | return; | ||
1201 | } | ||
1202 | |||
1203 | pmd_tbl = pmd_offset(pud, 0); | ||
1204 | for (i = 0; i < PTRS_PER_PMD; i++) { | ||
1205 | if (pmd_none(pmd_tbl[i])) | ||
1206 | continue; | ||
1207 | xen_cleanmfnmap_pmd(pmd_tbl + i, unpin); | ||
1208 | } | ||
1209 | set_pud(pud, __pud(0)); | ||
1210 | xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin); | ||
1211 | } | ||
1212 | |||
1213 | static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin) | ||
1214 | { | ||
1215 | unsigned long pa; | ||
1216 | pud_t *pud_tbl; | ||
1217 | int i; | ||
1218 | |||
1219 | if (p4d_large(*p4d)) { | ||
1220 | pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK; | ||
1221 | xen_free_ro_pages(pa, P4D_SIZE); | ||
1222 | return; | ||
1223 | } | ||
1224 | |||
1225 | pud_tbl = pud_offset(p4d, 0); | ||
1226 | for (i = 0; i < PTRS_PER_PUD; i++) { | ||
1227 | if (pud_none(pud_tbl[i])) | ||
1228 | continue; | ||
1229 | xen_cleanmfnmap_pud(pud_tbl + i, unpin); | ||
1230 | } | ||
1231 | set_p4d(p4d, __p4d(0)); | ||
1232 | xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin); | ||
1233 | } | ||
1234 | |||
1153 | /* | 1235 | /* |
1154 | * Since it is well isolated we can (and since it is perhaps large we should) | 1236 | * Since it is well isolated we can (and since it is perhaps large we should) |
1155 | * also free the page tables mapping the initial P->M table. | 1237 | * also free the page tables mapping the initial P->M table. |
1156 | */ | 1238 | */ |
1157 | static void __init xen_cleanmfnmap(unsigned long vaddr) | 1239 | static void __init xen_cleanmfnmap(unsigned long vaddr) |
1158 | { | 1240 | { |
1159 | unsigned long va = vaddr & PMD_MASK; | 1241 | pgd_t *pgd; |
1160 | unsigned long pa; | 1242 | p4d_t *p4d; |
1161 | pgd_t *pgd = pgd_offset_k(va); | ||
1162 | pud_t *pud_page = pud_offset(pgd, 0); | ||
1163 | pud_t *pud; | ||
1164 | pmd_t *pmd; | ||
1165 | pte_t *pte; | ||
1166 | unsigned int i; | 1243 | unsigned int i; |
1167 | bool unpin; | 1244 | bool unpin; |
1168 | 1245 | ||
1169 | unpin = (vaddr == 2 * PGDIR_SIZE); | 1246 | unpin = (vaddr == 2 * PGDIR_SIZE); |
1170 | set_pgd(pgd, __pgd(0)); | 1247 | vaddr &= PMD_MASK; |
1171 | do { | 1248 | pgd = pgd_offset_k(vaddr); |
1172 | pud = pud_page + pud_index(va); | 1249 | p4d = p4d_offset(pgd, 0); |
1173 | if (pud_none(*pud)) { | 1250 | for (i = 0; i < PTRS_PER_P4D; i++) { |
1174 | va += PUD_SIZE; | 1251 | if (p4d_none(p4d[i])) |
1175 | } else if (pud_large(*pud)) { | 1252 | continue; |
1176 | pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; | 1253 | xen_cleanmfnmap_p4d(p4d + i, unpin); |
1177 | xen_free_ro_pages(pa, PUD_SIZE); | 1254 | } |
1178 | va += PUD_SIZE; | 1255 | if (IS_ENABLED(CONFIG_X86_5LEVEL)) { |
1179 | } else { | 1256 | set_pgd(pgd, __pgd(0)); |
1180 | pmd = pmd_offset(pud, va); | 1257 | xen_cleanmfnmap_free_pgtbl(p4d, unpin); |
1181 | if (pmd_large(*pmd)) { | 1258 | } |
1182 | pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; | ||
1183 | xen_free_ro_pages(pa, PMD_SIZE); | ||
1184 | } else if (!pmd_none(*pmd)) { | ||
1185 | pte = pte_offset_kernel(pmd, va); | ||
1186 | set_pmd(pmd, __pmd(0)); | ||
1187 | for (i = 0; i < PTRS_PER_PTE; ++i) { | ||
1188 | if (pte_none(pte[i])) | ||
1189 | break; | ||
1190 | pa = pte_pfn(pte[i]) << PAGE_SHIFT; | ||
1191 | xen_free_ro_pages(pa, PAGE_SIZE); | ||
1192 | } | ||
1193 | xen_cleanmfnmap_free_pgtbl(pte, unpin); | ||
1194 | } | ||
1195 | va += PMD_SIZE; | ||
1196 | if (pmd_index(va)) | ||
1197 | continue; | ||
1198 | set_pud(pud, __pud(0)); | ||
1199 | xen_cleanmfnmap_free_pgtbl(pmd, unpin); | ||
1200 | } | ||
1201 | |||
1202 | } while (pud_index(va) || pmd_index(va)); | ||
1203 | xen_cleanmfnmap_free_pgtbl(pud_page, unpin); | ||
1204 | } | 1259 | } |
1205 | 1260 | ||
1206 | static void __init xen_pagetable_p2m_free(void) | 1261 | static void __init xen_pagetable_p2m_free(void) |
@@ -1538,7 +1593,6 @@ static int xen_pgd_alloc(struct mm_struct *mm) | |||
1538 | BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); | 1593 | BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); |
1539 | } | 1594 | } |
1540 | #endif | 1595 | #endif |
1541 | |||
1542 | return ret; | 1596 | return ret; |
1543 | } | 1597 | } |
1544 | 1598 | ||
@@ -1730,7 +1784,7 @@ static void xen_release_pmd(unsigned long pfn) | |||
1730 | xen_release_ptpage(pfn, PT_PMD); | 1784 | xen_release_ptpage(pfn, PT_PMD); |
1731 | } | 1785 | } |
1732 | 1786 | ||
1733 | #if CONFIG_PGTABLE_LEVELS == 4 | 1787 | #if CONFIG_PGTABLE_LEVELS >= 4 |
1734 | static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) | 1788 | static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) |
1735 | { | 1789 | { |
1736 | xen_alloc_ptpage(mm, pfn, PT_PUD); | 1790 | xen_alloc_ptpage(mm, pfn, PT_PUD); |
@@ -2071,21 +2125,27 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) | |||
2071 | */ | 2125 | */ |
2072 | void __init xen_relocate_p2m(void) | 2126 | void __init xen_relocate_p2m(void) |
2073 | { | 2127 | { |
2074 | phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; | 2128 | phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys; |
2075 | unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; | 2129 | unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; |
2076 | int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; | 2130 | int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d; |
2077 | pte_t *pt; | 2131 | pte_t *pt; |
2078 | pmd_t *pmd; | 2132 | pmd_t *pmd; |
2079 | pud_t *pud; | 2133 | pud_t *pud; |
2134 | p4d_t *p4d = NULL; | ||
2080 | pgd_t *pgd; | 2135 | pgd_t *pgd; |
2081 | unsigned long *new_p2m; | 2136 | unsigned long *new_p2m; |
2137 | int save_pud; | ||
2082 | 2138 | ||
2083 | size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); | 2139 | size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); |
2084 | n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; | 2140 | n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; |
2085 | n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; | 2141 | n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; |
2086 | n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; | 2142 | n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; |
2087 | n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; | 2143 | n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT; |
2088 | n_frames = n_pte + n_pt + n_pmd + n_pud; | 2144 | if (PTRS_PER_P4D > 1) |
2145 | n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; | ||
2146 | else | ||
2147 | n_p4d = 0; | ||
2148 | n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d; | ||
2089 | 2149 | ||
2090 | new_area = xen_find_free_area(PFN_PHYS(n_frames)); | 2150 | new_area = xen_find_free_area(PFN_PHYS(n_frames)); |
2091 | if (!new_area) { | 2151 | if (!new_area) { |
@@ -2101,55 +2161,76 @@ void __init xen_relocate_p2m(void) | |||
2101 | * To avoid any possible virtual address collision, just use | 2161 | * To avoid any possible virtual address collision, just use |
2102 | * 2 * PUD_SIZE for the new area. | 2162 | * 2 * PUD_SIZE for the new area. |
2103 | */ | 2163 | */ |
2104 | pud_phys = new_area; | 2164 | p4d_phys = new_area; |
2165 | pud_phys = p4d_phys + PFN_PHYS(n_p4d); | ||
2105 | pmd_phys = pud_phys + PFN_PHYS(n_pud); | 2166 | pmd_phys = pud_phys + PFN_PHYS(n_pud); |
2106 | pt_phys = pmd_phys + PFN_PHYS(n_pmd); | 2167 | pt_phys = pmd_phys + PFN_PHYS(n_pmd); |
2107 | p2m_pfn = PFN_DOWN(pt_phys) + n_pt; | 2168 | p2m_pfn = PFN_DOWN(pt_phys) + n_pt; |
2108 | 2169 | ||
2109 | pgd = __va(read_cr3()); | 2170 | pgd = __va(read_cr3()); |
2110 | new_p2m = (unsigned long *)(2 * PGDIR_SIZE); | 2171 | new_p2m = (unsigned long *)(2 * PGDIR_SIZE); |
2111 | for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { | 2172 | idx_p4d = 0; |
2112 | pud = early_memremap(pud_phys, PAGE_SIZE); | 2173 | save_pud = n_pud; |
2113 | clear_page(pud); | 2174 | do { |
2114 | for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); | 2175 | if (n_p4d > 0) { |
2115 | idx_pmd++) { | 2176 | p4d = early_memremap(p4d_phys, PAGE_SIZE); |
2116 | pmd = early_memremap(pmd_phys, PAGE_SIZE); | 2177 | clear_page(p4d); |
2117 | clear_page(pmd); | 2178 | n_pud = min(save_pud, PTRS_PER_P4D); |
2118 | for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); | 2179 | } |
2119 | idx_pt++) { | 2180 | for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { |
2120 | pt = early_memremap(pt_phys, PAGE_SIZE); | 2181 | pud = early_memremap(pud_phys, PAGE_SIZE); |
2121 | clear_page(pt); | 2182 | clear_page(pud); |
2122 | for (idx_pte = 0; | 2183 | for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); |
2123 | idx_pte < min(n_pte, PTRS_PER_PTE); | 2184 | idx_pmd++) { |
2124 | idx_pte++) { | 2185 | pmd = early_memremap(pmd_phys, PAGE_SIZE); |
2125 | set_pte(pt + idx_pte, | 2186 | clear_page(pmd); |
2126 | pfn_pte(p2m_pfn, PAGE_KERNEL)); | 2187 | for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); |
2127 | p2m_pfn++; | 2188 | idx_pt++) { |
2189 | pt = early_memremap(pt_phys, PAGE_SIZE); | ||
2190 | clear_page(pt); | ||
2191 | for (idx_pte = 0; | ||
2192 | idx_pte < min(n_pte, PTRS_PER_PTE); | ||
2193 | idx_pte++) { | ||
2194 | set_pte(pt + idx_pte, | ||
2195 | pfn_pte(p2m_pfn, PAGE_KERNEL)); | ||
2196 | p2m_pfn++; | ||
2197 | } | ||
2198 | n_pte -= PTRS_PER_PTE; | ||
2199 | early_memunmap(pt, PAGE_SIZE); | ||
2200 | make_lowmem_page_readonly(__va(pt_phys)); | ||
2201 | pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, | ||
2202 | PFN_DOWN(pt_phys)); | ||
2203 | set_pmd(pmd + idx_pt, | ||
2204 | __pmd(_PAGE_TABLE | pt_phys)); | ||
2205 | pt_phys += PAGE_SIZE; | ||
2128 | } | 2206 | } |
2129 | n_pte -= PTRS_PER_PTE; | 2207 | n_pt -= PTRS_PER_PMD; |
2130 | early_memunmap(pt, PAGE_SIZE); | 2208 | early_memunmap(pmd, PAGE_SIZE); |
2131 | make_lowmem_page_readonly(__va(pt_phys)); | 2209 | make_lowmem_page_readonly(__va(pmd_phys)); |
2132 | pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, | 2210 | pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, |
2133 | PFN_DOWN(pt_phys)); | 2211 | PFN_DOWN(pmd_phys)); |
2134 | set_pmd(pmd + idx_pt, | 2212 | set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); |
2135 | __pmd(_PAGE_TABLE | pt_phys)); | 2213 | pmd_phys += PAGE_SIZE; |
2136 | pt_phys += PAGE_SIZE; | ||
2137 | } | 2214 | } |
2138 | n_pt -= PTRS_PER_PMD; | 2215 | n_pmd -= PTRS_PER_PUD; |
2139 | early_memunmap(pmd, PAGE_SIZE); | 2216 | early_memunmap(pud, PAGE_SIZE); |
2140 | make_lowmem_page_readonly(__va(pmd_phys)); | 2217 | make_lowmem_page_readonly(__va(pud_phys)); |
2141 | pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, | 2218 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); |
2142 | PFN_DOWN(pmd_phys)); | 2219 | if (n_p4d > 0) |
2143 | set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); | 2220 | set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys)); |
2144 | pmd_phys += PAGE_SIZE; | 2221 | else |
2222 | set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); | ||
2223 | pud_phys += PAGE_SIZE; | ||
2145 | } | 2224 | } |
2146 | n_pmd -= PTRS_PER_PUD; | 2225 | if (n_p4d > 0) { |
2147 | early_memunmap(pud, PAGE_SIZE); | 2226 | save_pud -= PTRS_PER_P4D; |
2148 | make_lowmem_page_readonly(__va(pud_phys)); | 2227 | early_memunmap(p4d, PAGE_SIZE); |
2149 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); | 2228 | make_lowmem_page_readonly(__va(p4d_phys)); |
2150 | set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); | 2229 | pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys)); |
2151 | pud_phys += PAGE_SIZE; | 2230 | set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys)); |
2152 | } | 2231 | p4d_phys += PAGE_SIZE; |
2232 | } | ||
2233 | } while (++idx_p4d < n_p4d); | ||
2153 | 2234 | ||
2154 | /* Now copy the old p2m info to the new area. */ | 2235 | /* Now copy the old p2m info to the new area. */ |
2155 | memcpy(new_p2m, xen_p2m_addr, size); | 2236 | memcpy(new_p2m, xen_p2m_addr, size); |
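
The frame accounting above sizes each table level by the span of the level above it. A worked example for a hypothetical 4GB p2m array on a CONFIG_X86_5LEVEL=y kernel (P4D_SHIFT = 39, PGDIR_SHIFT = 48):

    /* size = 1ULL << 32 (4GB of p2m data)
     *   n_pte = roundup(size, PAGE_SIZE)  >> PAGE_SHIFT  = 1048576 data pages
     *   n_pt  = roundup(size, PMD_SIZE)   >> PMD_SHIFT   =    2048 PTE tables
     *   n_pmd = roundup(size, PUD_SIZE)   >> PUD_SHIFT   =       4 PMD tables
     *   n_pud = roundup(size, P4D_SIZE)   >> P4D_SHIFT   =       1 PUD table
     *   n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT =       1 P4D table
     */
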
@@ -2326,6 +2407,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) | |||
2326 | #endif | 2407 | #endif |
2327 | case FIX_TEXT_POKE0: | 2408 | case FIX_TEXT_POKE0: |
2328 | case FIX_TEXT_POKE1: | 2409 | case FIX_TEXT_POKE1: |
2410 | case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: | ||
2329 | /* All local page mappings */ | 2411 | /* All local page mappings */ |
2330 | pte = pfn_pte(phys, prot); | 2412 | pte = pfn_pte(phys, prot); |
2331 | break; | 2413 | break; |
@@ -2378,8 +2460,8 @@ static void __init xen_post_allocator_init(void) | |||
2378 | pv_mmu_ops.set_pte = xen_set_pte; | 2460 | pv_mmu_ops.set_pte = xen_set_pte; |
2379 | pv_mmu_ops.set_pmd = xen_set_pmd; | 2461 | pv_mmu_ops.set_pmd = xen_set_pmd; |
2380 | pv_mmu_ops.set_pud = xen_set_pud; | 2462 | pv_mmu_ops.set_pud = xen_set_pud; |
2381 | #if CONFIG_PGTABLE_LEVELS == 4 | 2463 | #if CONFIG_PGTABLE_LEVELS >= 4 |
2382 | pv_mmu_ops.set_pgd = xen_set_pgd; | 2464 | pv_mmu_ops.set_p4d = xen_set_p4d; |
2383 | #endif | 2465 | #endif |
2384 | 2466 | ||
2385 | /* This will work as long as patching hasn't happened yet | 2467 | /* This will work as long as patching hasn't happened yet |
@@ -2388,7 +2470,7 @@ static void __init xen_post_allocator_init(void) | |||
2388 | pv_mmu_ops.alloc_pmd = xen_alloc_pmd; | 2470 | pv_mmu_ops.alloc_pmd = xen_alloc_pmd; |
2389 | pv_mmu_ops.release_pte = xen_release_pte; | 2471 | pv_mmu_ops.release_pte = xen_release_pte; |
2390 | pv_mmu_ops.release_pmd = xen_release_pmd; | 2472 | pv_mmu_ops.release_pmd = xen_release_pmd; |
2391 | #if CONFIG_PGTABLE_LEVELS == 4 | 2473 | #if CONFIG_PGTABLE_LEVELS >= 4 |
2392 | pv_mmu_ops.alloc_pud = xen_alloc_pud; | 2474 | pv_mmu_ops.alloc_pud = xen_alloc_pud; |
2393 | pv_mmu_ops.release_pud = xen_release_pud; | 2475 | pv_mmu_ops.release_pud = xen_release_pud; |
2394 | #endif | 2476 | #endif |
@@ -2454,10 +2536,10 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { | |||
2454 | .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), | 2536 | .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), |
2455 | .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), | 2537 | .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), |
2456 | 2538 | ||
2457 | #if CONFIG_PGTABLE_LEVELS == 4 | 2539 | #if CONFIG_PGTABLE_LEVELS >= 4 |
2458 | .pud_val = PV_CALLEE_SAVE(xen_pud_val), | 2540 | .pud_val = PV_CALLEE_SAVE(xen_pud_val), |
2459 | .make_pud = PV_CALLEE_SAVE(xen_make_pud), | 2541 | .make_pud = PV_CALLEE_SAVE(xen_make_pud), |
2460 | .set_pgd = xen_set_pgd_hyper, | 2542 | .set_p4d = xen_set_p4d_hyper, |
2461 | 2543 | ||
2462 | .alloc_pud = xen_alloc_pmd_init, | 2544 | .alloc_pud = xen_alloc_pmd_init, |
2463 | .release_pud = xen_release_pmd_init, | 2545 | .release_pud = xen_release_pmd_init, |
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 73809bb951b4..3fe2b3292915 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h | |||
@@ -5,6 +5,7 @@ | |||
5 | 5 | ||
6 | enum pt_level { | 6 | enum pt_level { |
7 | PT_PGD, | 7 | PT_PGD, |
8 | PT_P4D, | ||
8 | PT_PUD, | 9 | PT_PUD, |
9 | PT_PMD, | 10 | PT_PMD, |
10 | PT_PTE | 11 | PT_PTE |
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 7ff2f1bfb7ec..eaa36162ed4a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c | |||
@@ -392,7 +392,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) | |||
392 | if (ctxt == NULL) | 392 | if (ctxt == NULL) |
393 | return -ENOMEM; | 393 | return -ENOMEM; |
394 | 394 | ||
395 | gdt = get_cpu_gdt_table(cpu); | 395 | gdt = get_cpu_gdt_rw(cpu); |
396 | 396 | ||
397 | #ifdef CONFIG_X86_32 | 397 | #ifdef CONFIG_X86_32 |
398 | ctxt->user_regs.fs = __KERNEL_PERCPU; | 398 | ctxt->user_regs.fs = __KERNEL_PERCPU; |
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 033f49b31fdc..cb0d742fa23f 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c | |||
@@ -43,6 +43,7 @@ static void dax_pmem_percpu_exit(void *data) | |||
43 | struct dax_pmem *dax_pmem = to_dax_pmem(ref); | 43 | struct dax_pmem *dax_pmem = to_dax_pmem(ref); |
44 | 44 | ||
45 | dev_dbg(dax_pmem->dev, "%s\n", __func__); | 45 | dev_dbg(dax_pmem->dev, "%s\n", __func__); |
46 | wait_for_completion(&dax_pmem->cmp); | ||
46 | percpu_ref_exit(ref); | 47 | percpu_ref_exit(ref); |
47 | } | 48 | } |
48 | 49 | ||
@@ -53,7 +54,6 @@ static void dax_pmem_percpu_kill(void *data) | |||
53 | 54 | ||
54 | dev_dbg(dax_pmem->dev, "%s\n", __func__); | 55 | dev_dbg(dax_pmem->dev, "%s\n", __func__); |
55 | percpu_ref_kill(ref); | 56 | percpu_ref_kill(ref); |
56 | wait_for_completion(&dax_pmem->cmp); | ||
57 | } | 57 | } |
58 | 58 | ||
59 | static int dax_pmem_probe(struct device *dev) | 59 | static int dax_pmem_probe(struct device *dev) |
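
The dax/pmem hunk is an ordering fix: the wait for the percpu_ref to drain moves from the kill action to the exit action. devm actions run in reverse registration order, so the exit action runs last and percpu_ref_exit() is no longer reached while page references are still outstanding. Probe-side registration, roughly (names from this file):

    percpu_ref_init(&dax_pmem->ref, dax_pmem_percpu_release, 0, GFP_KERNEL);

    /* registered first => runs last on teardown: wait, then exit */
    devm_add_action_or_reset(dev, dax_pmem_percpu_exit, &dax_pmem->ref);

    /* ... devm_memremap_pages(...) unwinds between the two ... */

    /* registered last => runs first on teardown: kill the ref */
    devm_add_action_or_reset(dev, dax_pmem_percpu_kill, &dax_pmem->ref);
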
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index d71f6323ac00..b4f79b923aea 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c | |||
@@ -504,7 +504,7 @@ void __init lguest_arch_host_init(void) | |||
504 | * byte, not the size, hence the "-1"). | 504 | * byte, not the size, hence the "-1"). |
505 | */ | 505 | */ |
506 | state->host_gdt_desc.size = GDT_SIZE-1; | 506 | state->host_gdt_desc.size = GDT_SIZE-1; |
507 | state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); | 507 | state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i); |
508 | 508 | ||
509 | /* | 509 | /* |
510 | * All CPUs on the Host use the same Interrupt Descriptor | 510 | * All CPUs on the Host use the same Interrupt Descriptor |
@@ -554,8 +554,8 @@ void __init lguest_arch_host_init(void) | |||
554 | * The Host needs to be able to use the LGUEST segments on this | 554 | * The Host needs to be able to use the LGUEST segments on this |
555 | * CPU, too, so put them in the Host GDT. | 555 | * CPU, too, so put them in the Host GDT. |
556 | */ | 556 | */ |
557 | get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; | 557 | get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; |
558 | get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; | 558 | get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; |
559 | } | 559 | } |
560 | 560 | ||
561 | /* | 561 | /* |
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 5b536be5a12e..fbc640bf06b0 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/badblocks.h> | 25 | #include <linux/badblocks.h> |
26 | #include <linux/memremap.h> | 26 | #include <linux/memremap.h> |
27 | #include <linux/vmalloc.h> | 27 | #include <linux/vmalloc.h> |
28 | #include <linux/blk-mq.h> | ||
28 | #include <linux/pfn_t.h> | 29 | #include <linux/pfn_t.h> |
29 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
30 | #include <linux/pmem.h> | 31 | #include <linux/pmem.h> |
@@ -231,6 +232,11 @@ static void pmem_release_queue(void *q) | |||
231 | blk_cleanup_queue(q); | 232 | blk_cleanup_queue(q); |
232 | } | 233 | } |
233 | 234 | ||
235 | static void pmem_freeze_queue(void *q) | ||
236 | { | ||
237 | blk_freeze_queue_start(q); | ||
238 | } | ||
239 | |||
234 | static void pmem_release_disk(void *disk) | 240 | static void pmem_release_disk(void *disk) |
235 | { | 241 | { |
236 | del_gendisk(disk); | 242 | del_gendisk(disk); |
@@ -284,6 +290,9 @@ static int pmem_attach_disk(struct device *dev, | |||
284 | if (!q) | 290 | if (!q) |
285 | return -ENOMEM; | 291 | return -ENOMEM; |
286 | 292 | ||
293 | if (devm_add_action_or_reset(dev, pmem_release_queue, q)) | ||
294 | return -ENOMEM; | ||
295 | |||
287 | pmem->pfn_flags = PFN_DEV; | 296 | pmem->pfn_flags = PFN_DEV; |
288 | if (is_nd_pfn(dev)) { | 297 | if (is_nd_pfn(dev)) { |
289 | addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter, | 298 | addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter, |
@@ -303,10 +312,10 @@ static int pmem_attach_disk(struct device *dev, | |||
303 | pmem->size, ARCH_MEMREMAP_PMEM); | 312 | pmem->size, ARCH_MEMREMAP_PMEM); |
304 | 313 | ||
305 | /* | 314 | /* |
306 | * At release time the queue must be dead before | 315 | * At release time the queue must be frozen before |
307 | * devm_memremap_pages is unwound | 316 | * devm_memremap_pages is unwound |
308 | */ | 317 | */ |
309 | if (devm_add_action_or_reset(dev, pmem_release_queue, q)) | 318 | if (devm_add_action_or_reset(dev, pmem_freeze_queue, q)) |
310 | return -ENOMEM; | 319 | return -ENOMEM; |
311 | 320 | ||
312 | if (IS_ERR(addr)) | 321 | if (IS_ERR(addr)) |
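
Same theme on the block side: pmem_release_queue() is now registered immediately after queue allocation (so it unwinds last), and the new pmem_freeze_queue() is registered after devm_memremap_pages() (so it unwinds first). Net teardown order, matching the updated comment:

    /* devm unwind, reverse of registration:
     *   1. pmem_freeze_queue()   - blk_freeze_queue_start(): no new I/O
     *   2. devm_memremap_pages() teardown - pmem pages disappear
     *   3. pmem_release_queue()  - blk_cleanup_queue()
     */
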
diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c index 438d4c72c7b3..ff563db025b3 100644 --- a/drivers/pnp/pnpbios/bioscalls.c +++ b/drivers/pnp/pnpbios/bioscalls.c | |||
@@ -54,7 +54,7 @@ __asm__(".text \n" | |||
54 | 54 | ||
55 | #define Q2_SET_SEL(cpu, selname, address, size) \ | 55 | #define Q2_SET_SEL(cpu, selname, address, size) \ |
56 | do { \ | 56 | do { \ |
57 | struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \ | 57 | struct desc_struct *gdt = get_cpu_gdt_rw((cpu)); \ |
58 | set_desc_base(&gdt[(selname) >> 3], (u32)(address)); \ | 58 | set_desc_base(&gdt[(selname) >> 3], (u32)(address)); \ |
59 | set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \ | 59 | set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \ |
60 | } while(0) | 60 | } while(0) |
@@ -95,8 +95,8 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3, | |||
95 | return PNP_FUNCTION_NOT_SUPPORTED; | 95 | return PNP_FUNCTION_NOT_SUPPORTED; |
96 | 96 | ||
97 | cpu = get_cpu(); | 97 | cpu = get_cpu(); |
98 | save_desc_40 = get_cpu_gdt_table(cpu)[0x40 / 8]; | 98 | save_desc_40 = get_cpu_gdt_rw(cpu)[0x40 / 8]; |
99 | get_cpu_gdt_table(cpu)[0x40 / 8] = bad_bios_desc; | 99 | get_cpu_gdt_rw(cpu)[0x40 / 8] = bad_bios_desc; |
100 | 100 | ||
101 | /* On some boxes IRQ's during PnP BIOS calls are deadly. */ | 101 | /* On some boxes IRQ's during PnP BIOS calls are deadly. */ |
102 | spin_lock_irqsave(&pnp_bios_lock, flags); | 102 | spin_lock_irqsave(&pnp_bios_lock, flags); |
@@ -134,7 +134,7 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3, | |||
134 | :"memory"); | 134 | :"memory"); |
135 | spin_unlock_irqrestore(&pnp_bios_lock, flags); | 135 | spin_unlock_irqrestore(&pnp_bios_lock, flags); |
136 | 136 | ||
137 | get_cpu_gdt_table(cpu)[0x40 / 8] = save_desc_40; | 137 | get_cpu_gdt_rw(cpu)[0x40 / 8] = save_desc_40; |
138 | put_cpu(); | 138 | put_cpu(); |
139 | 139 | ||
140 | /* If we get here and this is set then the PnP BIOS faulted on us. */ | 140 | /* If we get here and this is set then the PnP BIOS faulted on us. */ |
@@ -477,7 +477,7 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header) | |||
477 | pnp_bios_callpoint.segment = PNP_CS16; | 477 | pnp_bios_callpoint.segment = PNP_CS16; |
478 | 478 | ||
479 | for_each_possible_cpu(i) { | 479 | for_each_possible_cpu(i) { |
480 | struct desc_struct *gdt = get_cpu_gdt_table(i); | 480 | struct desc_struct *gdt = get_cpu_gdt_rw(i); |
481 | if (!gdt) | 481 | if (!gdt) |
482 | continue; | 482 | continue; |
483 | set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_CS32], | 483 | set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_CS32], |
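The get_cpu_gdt_table() -> get_cpu_gdt_rw() rename here (and in the lguest hunk above) comes from this series remapping each CPU's GDT read-only in the fixmap; code that must patch descriptors now asks for the writable alias explicitly. A hedged sketch of the split (accessor signatures assumed from arch/x86/include/asm/desc.h in this series):

    /* get_cpu_gdt_ro(cpu): read-only fixmap view the CPU loads via LGDT.
     * get_cpu_gdt_rw(cpu): writable kernel alias for descriptor updates. */
    static void patch_gdt_entry(unsigned int cpu, unsigned int idx,
                                struct desc_struct d)
    {
        struct desc_struct *gdt = get_cpu_gdt_rw(cpu);

        gdt[idx] = d;   /* writing through the RO view would fault */
    }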
diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h index cc5d9a1405df..41e5b6784b97 100644 --- a/include/asm-generic/mm_hooks.h +++ b/include/asm-generic/mm_hooks.h | |||
@@ -32,10 +32,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, | |||
32 | /* by default, allow everything */ | 32 | /* by default, allow everything */ |
33 | return true; | 33 | return true; |
34 | } | 34 | } |
35 | |||
36 | static inline bool arch_pte_access_permitted(pte_t pte, bool write) | ||
37 | { | ||
38 | /* by default, allow everything */ | ||
39 | return true; | ||
40 | } | ||
41 | #endif /* _ASM_GENERIC_MM_HOOKS_H */ | 35 | #endif /* _ASM_GENERIC_MM_HOOKS_H */ |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 1fad160f35de..7dfa767dc680 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -341,6 +341,31 @@ static inline int pte_unused(pte_t pte) | |||
341 | } | 341 | } |
342 | #endif | 342 | #endif |
343 | 343 | ||
344 | #ifndef pte_access_permitted | ||
345 | #define pte_access_permitted(pte, write) \ | ||
346 | (pte_present(pte) && (!(write) || pte_write(pte))) | ||
347 | #endif | ||
348 | |||
349 | #ifndef pmd_access_permitted | ||
350 | #define pmd_access_permitted(pmd, write) \ | ||
351 | (pmd_present(pmd) && (!(write) || pmd_write(pmd))) | ||
352 | #endif | ||
353 | |||
354 | #ifndef pud_access_permitted | ||
355 | #define pud_access_permitted(pud, write) \ | ||
356 | (pud_present(pud) && (!(write) || pud_write(pud))) | ||
357 | #endif | ||
358 | |||
359 | #ifndef p4d_access_permitted | ||
360 | #define p4d_access_permitted(p4d, write) \ | ||
361 | (p4d_present(p4d) && (!(write) || p4d_write(p4d))) | ||
362 | #endif | ||
363 | |||
364 | #ifndef pgd_access_permitted | ||
365 | #define pgd_access_permitted(pgd, write) \ | ||
366 | (pgd_present(pgd) && (!(write) || pgd_write(pgd))) | ||
367 | #endif | ||
368 | |||
344 | #ifndef __HAVE_ARCH_PMD_SAME | 369 | #ifndef __HAVE_ARCH_PMD_SAME |
345 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 370 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
346 | static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) | 371 | static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) |
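Each default folds a present check and an optional write check into a single predicate per page-table level; an architecture overrides one by defining the macro of the same name before this header is included. A hedged sketch of such an override, using a hypothetical arch-private _PAGE_NO_GUP veto bit:

    /* In an arch's asm/pgtable.h (the flag is invented for illustration): */
    #define pte_access_permitted pte_access_permitted
    static inline bool pte_access_permitted(pte_t pte, bool write)
    {
        if (!pte_present(pte) || (write && !pte_write(pte)))
            return false;
        return !(pte_val(pte) & _PAGE_NO_GUP);  /* arch-specific veto */
    }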
diff --git a/include/linux/mm.h b/include/linux/mm.h index 00a8fa7e366a..695da2a19b4c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -432,6 +432,10 @@ static inline int pud_devmap(pud_t pud) | |||
432 | { | 432 | { |
433 | return 0; | 433 | return 0; |
434 | } | 434 | } |
435 | static inline int pgd_devmap(pgd_t pgd) | ||
436 | { | ||
437 | return 0; | ||
438 | } | ||
435 | #endif | 439 | #endif |
436 | 440 | ||
437 | /* | 441 | /* |
@@ -758,19 +762,11 @@ static inline enum zone_type page_zonenum(const struct page *page) | |||
758 | } | 762 | } |
759 | 763 | ||
760 | #ifdef CONFIG_ZONE_DEVICE | 764 | #ifdef CONFIG_ZONE_DEVICE |
761 | void get_zone_device_page(struct page *page); | ||
762 | void put_zone_device_page(struct page *page); | ||
763 | static inline bool is_zone_device_page(const struct page *page) | 765 | static inline bool is_zone_device_page(const struct page *page) |
764 | { | 766 | { |
765 | return page_zonenum(page) == ZONE_DEVICE; | 767 | return page_zonenum(page) == ZONE_DEVICE; |
766 | } | 768 | } |
767 | #else | 769 | #else |
768 | static inline void get_zone_device_page(struct page *page) | ||
769 | { | ||
770 | } | ||
771 | static inline void put_zone_device_page(struct page *page) | ||
772 | { | ||
773 | } | ||
774 | static inline bool is_zone_device_page(const struct page *page) | 770 | static inline bool is_zone_device_page(const struct page *page) |
775 | { | 771 | { |
776 | return false; | 772 | return false; |
@@ -786,9 +782,6 @@ static inline void get_page(struct page *page) | |||
786 | */ | 782 | */ |
787 | VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); | 783 | VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); |
788 | page_ref_inc(page); | 784 | page_ref_inc(page); |
789 | |||
790 | if (unlikely(is_zone_device_page(page))) | ||
791 | get_zone_device_page(page); | ||
792 | } | 785 | } |
793 | 786 | ||
794 | static inline void put_page(struct page *page) | 787 | static inline void put_page(struct page *page) |
@@ -797,9 +790,6 @@ static inline void put_page(struct page *page) | |||
797 | 790 | ||
798 | if (put_page_testzero(page)) | 791 | if (put_page_testzero(page)) |
799 | __put_page(page); | 792 | __put_page(page); |
800 | |||
801 | if (unlikely(is_zone_device_page(page))) | ||
802 | put_zone_device_page(page); | ||
803 | } | 793 | } |
804 | 794 | ||
805 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | 795 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) |
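With the per-page hooks removed, get_page() and put_page() are plain reference-count operations even for ZONE_DEVICE pages; the device's percpu reference is taken once per page in devm_memremap_pages() and dropped on the final put in __put_page() (see the kernel/memremap.c and mm/swap.c hunks below). From a caller's point of view the lifecycle is now uniform, as in this sketch:

    #include <linux/mm.h>

    static void pin_briefly(struct page *page)
    {
        get_page(page);   /* page_ref_inc() only, no pgmap ref taken */
        /* ... use the page ... */
        put_page(page);   /* a final put reaches __put_page(), which
                           * calls put_dev_pagemap() for ZONE_DEVICE */
    }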
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f60f45fe226f..45cdb27791a3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -367,6 +367,11 @@ struct mm_struct { | |||
367 | #endif | 367 | #endif |
368 | unsigned long mmap_base; /* base of mmap area */ | 368 | unsigned long mmap_base; /* base of mmap area */ |
369 | unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ | 369 | unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ |
370 | #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES | ||
371 | /* Base addresses for compatible mmap() */ | ||
372 | unsigned long mmap_compat_base; | ||
373 | unsigned long mmap_compat_legacy_base; | ||
374 | #endif | ||
370 | unsigned long task_size; /* size of task vm space */ | 375 | unsigned long task_size; /* size of task vm space */ |
371 | unsigned long highest_vm_end; /* highest vma end address */ | 376 | unsigned long highest_vm_end; /* highest vma end address */ |
372 | pgd_t * pgd; | 377 | pgd_t * pgd; |
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 84943e8057ef..316a19f6b635 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h | |||
@@ -148,7 +148,7 @@ static inline int page_cache_get_speculative(struct page *page) | |||
148 | 148 | ||
149 | #ifdef CONFIG_TINY_RCU | 149 | #ifdef CONFIG_TINY_RCU |
150 | # ifdef CONFIG_PREEMPT_COUNT | 150 | # ifdef CONFIG_PREEMPT_COUNT |
151 | VM_BUG_ON(!in_atomic()); | 151 | VM_BUG_ON(!in_atomic() && !irqs_disabled()); |
152 | # endif | 152 | # endif |
153 | /* | 153 | /* |
154 | * Preempt must be disabled here - we rely on rcu_read_lock doing | 154 | * Preempt must be disabled here - we rely on rcu_read_lock doing |
@@ -186,7 +186,7 @@ static inline int page_cache_add_speculative(struct page *page, int count) | |||
186 | 186 | ||
187 | #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) | 187 | #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) |
188 | # ifdef CONFIG_PREEMPT_COUNT | 188 | # ifdef CONFIG_PREEMPT_COUNT |
189 | VM_BUG_ON(!in_atomic()); | 189 | VM_BUG_ON(!in_atomic() && !irqs_disabled()); |
190 | # endif | 190 | # endif |
191 | VM_BUG_ON_PAGE(page_count(page) == 0, page); | 191 | VM_BUG_ON_PAGE(page_count(page) == 0, page); |
192 | page_ref_add(page, count); | 192 | page_ref_add(page, count); |
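The relaxed assertion reflects that disabled interrupts give the same guarantee as disabled preemption for these speculative lookups; the generic fast-GUP path added below runs with IRQs off rather than under preempt_disable(). Both legal contexts in sketch form:

    #include <linux/pagemap.h>

    static void speculative_get_examples(struct page *page)
    {
        preempt_disable();
        if (page_cache_get_speculative(page))   /* in_atomic() holds */
            put_page(page);
        preempt_enable();

        local_irq_disable();
        if (page_cache_get_speculative(page))   /* irqs_disabled() holds */
            put_page(page);
        local_irq_enable();
    }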
diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index bce990f5a35d..31acce9019a6 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h | |||
@@ -241,21 +241,21 @@ TRACE_EVENT(xen_mmu_set_pud, | |||
241 | (int)sizeof(pudval_t) * 2, (unsigned long long)__entry->pudval) | 241 | (int)sizeof(pudval_t) * 2, (unsigned long long)__entry->pudval) |
242 | ); | 242 | ); |
243 | 243 | ||
244 | TRACE_EVENT(xen_mmu_set_pgd, | 244 | TRACE_EVENT(xen_mmu_set_p4d, |
245 | TP_PROTO(pgd_t *pgdp, pgd_t *user_pgdp, pgd_t pgdval), | 245 | TP_PROTO(p4d_t *p4dp, p4d_t *user_p4dp, p4d_t p4dval), |
246 | TP_ARGS(pgdp, user_pgdp, pgdval), | 246 | TP_ARGS(p4dp, user_p4dp, p4dval), |
247 | TP_STRUCT__entry( | 247 | TP_STRUCT__entry( |
248 | __field(pgd_t *, pgdp) | 248 | __field(p4d_t *, p4dp) |
249 | __field(pgd_t *, user_pgdp) | 249 | __field(p4d_t *, user_p4dp) |
250 | __field(pgdval_t, pgdval) | 250 | __field(p4dval_t, p4dval) |
251 | ), | 251 | ), |
252 | TP_fast_assign(__entry->pgdp = pgdp; | 252 | TP_fast_assign(__entry->p4dp = p4dp; |
253 | __entry->user_pgdp = user_pgdp; | 253 | __entry->user_p4dp = user_p4dp; |
254 | __entry->pgdval = pgdval.pgd), | 254 | __entry->p4dval = p4d_val(p4dval)), |
255 | TP_printk("pgdp %p user_pgdp %p pgdval %0*llx (raw %0*llx)", | 255 | TP_printk("p4dp %p user_p4dp %p p4dval %0*llx (raw %0*llx)", |
256 | __entry->pgdp, __entry->user_pgdp, | 256 | __entry->p4dp, __entry->user_p4dp, |
257 | (int)sizeof(pgdval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->pgdval)), | 257 | (int)sizeof(p4dval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->p4dval)), |
258 | (int)sizeof(pgdval_t) * 2, (unsigned long long)__entry->pgdval) | 258 | (int)sizeof(p4dval_t) * 2, (unsigned long long)__entry->p4dval) |
259 | ); | 259 | ); |
260 | 260 | ||
261 | TRACE_EVENT(xen_mmu_pud_clear, | 261 | TRACE_EVENT(xen_mmu_pud_clear, |
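TRACE_EVENT(xen_mmu_set_p4d, ...) generates a trace_xen_mmu_set_p4d() call-site helper; with the pgd-level event moved down to the p4d level for 5-level paging, the Xen setter emits it roughly as below (a sketch; the surrounding Xen MMU logic is assumed, not shown in this diff):

    #include <trace/events/xen.h>

    static void example_set_p4d(p4d_t *p4dp, p4d_t *user_p4dp, p4d_t val)
    {
        trace_xen_mmu_set_p4d(p4dp, user_p4dp, val);
        /* ... hypercall or direct write elided ... */
    }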
diff --git a/kernel/memremap.c b/kernel/memremap.c index 07e85e5229da..23a6483c3666 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
@@ -182,18 +182,6 @@ struct page_map { | |||
182 | struct vmem_altmap altmap; | 182 | struct vmem_altmap altmap; |
183 | }; | 183 | }; |
184 | 184 | ||
185 | void get_zone_device_page(struct page *page) | ||
186 | { | ||
187 | percpu_ref_get(page->pgmap->ref); | ||
188 | } | ||
189 | EXPORT_SYMBOL(get_zone_device_page); | ||
190 | |||
191 | void put_zone_device_page(struct page *page) | ||
192 | { | ||
193 | put_dev_pagemap(page->pgmap); | ||
194 | } | ||
195 | EXPORT_SYMBOL(put_zone_device_page); | ||
196 | |||
197 | static void pgmap_radix_release(struct resource *res) | 185 | static void pgmap_radix_release(struct resource *res) |
198 | { | 186 | { |
199 | resource_size_t key, align_start, align_size, align_end; | 187 | resource_size_t key, align_start, align_size, align_end; |
@@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data) | |||
237 | struct resource *res = &page_map->res; | 225 | struct resource *res = &page_map->res; |
238 | resource_size_t align_start, align_size; | 226 | resource_size_t align_start, align_size; |
239 | struct dev_pagemap *pgmap = &page_map->pgmap; | 227 | struct dev_pagemap *pgmap = &page_map->pgmap; |
228 | unsigned long pfn; | ||
229 | |||
230 | for_each_device_pfn(pfn, page_map) | ||
231 | put_page(pfn_to_page(pfn)); | ||
240 | 232 | ||
241 | if (percpu_ref_tryget_live(pgmap->ref)) { | 233 | if (percpu_ref_tryget_live(pgmap->ref)) { |
242 | dev_WARN(dev, "%s: page mapping is still live!\n", __func__); | 234 | dev_WARN(dev, "%s: page mapping is still live!\n", __func__); |
@@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) | |||
277 | * | 269 | * |
278 | * Notes: | 270 | * Notes: |
279 | * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time | 271 | * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time |
280 | * (or devm release event). | 272 | * (or devm release event). The expected order of events is that @ref has |
273 | * been through percpu_ref_kill() before devm_memremap_pages_release(). The | ||
274 | * wait for all references to be dropped, and percpu_ref_exit(), must |
275 | * occur after devm_memremap_pages_release(). |
281 | * | 276 | * |
282 | * 2/ @res is expected to be a host memory range that could feasibly be | 277 | * 2/ @res is expected to be a host memory range that could feasibly be |
283 | * treated as a "System RAM" range, i.e. not a device mmio range, but | 278 | * treated as a "System RAM" range, i.e. not a device mmio range, but |
@@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
379 | */ | 374 | */ |
380 | list_del(&page->lru); | 375 | list_del(&page->lru); |
381 | page->pgmap = pgmap; | 376 | page->pgmap = pgmap; |
377 | percpu_ref_get(ref); | ||
382 | } | 378 | } |
383 | devres_add(dev, page_map); | 379 | devres_add(dev, page_map); |
384 | return __va(res->start); | 380 | return __va(res->start); |
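Because every mapped page now pins @ref, the teardown ordering spelled out in the updated kernel-doc is load-bearing: kill the ref, let devm_memremap_pages_release() drop the per-page references, then wait and exit. A condensed driver-side sketch (the example_drv structure and its completion are hypothetical):

    #include <linux/completion.h>
    #include <linux/percpu-refcount.h>

    struct example_drv {
        struct percpu_ref ref;       /* passed to devm_memremap_pages() */
        struct completion killed;    /* completed by the ref's release cb */
    };

    static void example_teardown(struct example_drv *drv)
    {
        percpu_ref_kill(&drv->ref);        /* 1: before the devm release */
        /* 2: devm unwind puts every page, dropping per-page refs */
        wait_for_completion(&drv->killed); /* 3: last reference gone */
        percpu_ref_exit(&drv->ref);        /* 4: only after the unwind */
    }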
diff --git a/mm/gup.c b/mm/gup.c --- a/mm/gup.c +++ b/mm/gup.c | |||
@@ -1189,34 +1189,57 @@ struct page *get_dump_page(unsigned long addr) | |||
1189 | */ | 1189 | */ |
1190 | #ifdef CONFIG_HAVE_GENERIC_RCU_GUP | 1190 | #ifdef CONFIG_HAVE_GENERIC_RCU_GUP |
1191 | 1191 | ||
1192 | #ifndef gup_get_pte | ||
1193 | /* | ||
1194 | * We assume that the PTE can be read atomically. If this is not the case for | ||
1195 | * your architecture, please provide the helper. | ||
1196 | */ | ||
1197 | static inline pte_t gup_get_pte(pte_t *ptep) | ||
1198 | { | ||
1199 | return READ_ONCE(*ptep); | ||
1200 | } | ||
1201 | #endif | ||
1202 | |||
1203 | static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) | ||
1204 | { | ||
1205 | while ((*nr) - nr_start) { | ||
1206 | struct page *page = pages[--(*nr)]; | ||
1207 | |||
1208 | ClearPageReferenced(page); | ||
1209 | put_page(page); | ||
1210 | } | ||
1211 | } | ||
1212 | |||
1192 | #ifdef __HAVE_ARCH_PTE_SPECIAL | 1213 | #ifdef __HAVE_ARCH_PTE_SPECIAL |
1193 | static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | 1214 | static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, |
1194 | int write, struct page **pages, int *nr) | 1215 | int write, struct page **pages, int *nr) |
1195 | { | 1216 | { |
1217 | struct dev_pagemap *pgmap = NULL; | ||
1218 | int nr_start = *nr, ret = 0; | ||
1196 | pte_t *ptep, *ptem; | 1219 | pte_t *ptep, *ptem; |
1197 | int ret = 0; | ||
1198 | 1220 | ||
1199 | ptem = ptep = pte_offset_map(&pmd, addr); | 1221 | ptem = ptep = pte_offset_map(&pmd, addr); |
1200 | do { | 1222 | do { |
1201 | /* | 1223 | pte_t pte = gup_get_pte(ptep); |
1202 | * In the line below we are assuming that the pte can be read | ||
1203 | * atomically. If this is not the case for your architecture, | ||
1204 | * please wrap this in a helper function! | ||
1205 | * | ||
1206 | * for an example see gup_get_pte in arch/x86/mm/gup.c | ||
1207 | */ | ||
1208 | pte_t pte = READ_ONCE(*ptep); | ||
1209 | struct page *head, *page; | 1224 | struct page *head, *page; |
1210 | 1225 | ||
1211 | /* | 1226 | /* |
1212 | * Similar to the PMD case below, NUMA hinting must take the slow | 1227 | * Similar to the PMD case below, NUMA hinting must take the slow |
1213 | * path using the pte_protnone check. | 1228 | * path using the pte_protnone check. |
1214 | */ | 1229 | */ |
1215 | if (!pte_present(pte) || pte_special(pte) || | 1230 | if (pte_protnone(pte)) |
1216 | pte_protnone(pte) || (write && !pte_write(pte))) | ||
1217 | goto pte_unmap; | 1231 | goto pte_unmap; |
1218 | 1232 | ||
1219 | if (!arch_pte_access_permitted(pte, write)) | 1233 | if (!pte_access_permitted(pte, write)) |
1234 | goto pte_unmap; | ||
1235 | |||
1236 | if (pte_devmap(pte)) { | ||
1237 | pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); | ||
1238 | if (unlikely(!pgmap)) { | ||
1239 | undo_dev_pagemap(nr, nr_start, pages); | ||
1240 | goto pte_unmap; | ||
1241 | } | ||
1242 | } else if (pte_special(pte)) | ||
1220 | goto pte_unmap; | 1243 | goto pte_unmap; |
1221 | 1244 | ||
1222 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 1245 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
@@ -1232,6 +1255,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | |||
1232 | } | 1255 | } |
1233 | 1256 | ||
1234 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | 1257 | VM_BUG_ON_PAGE(compound_head(page) != head, page); |
1258 | |||
1259 | put_dev_pagemap(pgmap); | ||
1260 | SetPageReferenced(page); | ||
1235 | pages[*nr] = page; | 1261 | pages[*nr] = page; |
1236 | (*nr)++; | 1262 | (*nr)++; |
1237 | 1263 | ||
@@ -1261,15 +1287,76 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | |||
1261 | } | 1287 | } |
1262 | #endif /* __HAVE_ARCH_PTE_SPECIAL */ | 1288 | #endif /* __HAVE_ARCH_PTE_SPECIAL */ |
1263 | 1289 | ||
1290 | #ifdef __HAVE_ARCH_PTE_DEVMAP | ||
1291 | static int __gup_device_huge(unsigned long pfn, unsigned long addr, | ||
1292 | unsigned long end, struct page **pages, int *nr) | ||
1293 | { | ||
1294 | int nr_start = *nr; | ||
1295 | struct dev_pagemap *pgmap = NULL; | ||
1296 | |||
1297 | do { | ||
1298 | struct page *page = pfn_to_page(pfn); | ||
1299 | |||
1300 | pgmap = get_dev_pagemap(pfn, pgmap); | ||
1301 | if (unlikely(!pgmap)) { | ||
1302 | undo_dev_pagemap(nr, nr_start, pages); | ||
1303 | return 0; | ||
1304 | } | ||
1305 | SetPageReferenced(page); | ||
1306 | pages[*nr] = page; | ||
1307 | get_page(page); | ||
1308 | put_dev_pagemap(pgmap); | ||
1309 | (*nr)++; | ||
1310 | pfn++; | ||
1311 | } while (addr += PAGE_SIZE, addr != end); | ||
1312 | return 1; | ||
1313 | } | ||
1314 | |||
1315 | static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, | ||
1316 | unsigned long end, struct page **pages, int *nr) | ||
1317 | { | ||
1318 | unsigned long fault_pfn; | ||
1319 | |||
1320 | fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); | ||
1321 | return __gup_device_huge(fault_pfn, addr, end, pages, nr); | ||
1322 | } | ||
1323 | |||
1324 | static int __gup_device_huge_pud(pud_t pud, unsigned long addr, | ||
1325 | unsigned long end, struct page **pages, int *nr) | ||
1326 | { | ||
1327 | unsigned long fault_pfn; | ||
1328 | |||
1329 | fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); | ||
1330 | return __gup_device_huge(fault_pfn, addr, end, pages, nr); | ||
1331 | } | ||
1332 | #else | ||
1333 | static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, | ||
1334 | unsigned long end, struct page **pages, int *nr) | ||
1335 | { | ||
1336 | BUILD_BUG(); | ||
1337 | return 0; | ||
1338 | } | ||
1339 | |||
1340 | static int __gup_device_huge_pud(pud_t pud, unsigned long addr, | ||
1341 | unsigned long end, struct page **pages, int *nr) | ||
1342 | { | ||
1343 | BUILD_BUG(); | ||
1344 | return 0; | ||
1345 | } | ||
1346 | #endif | ||
1347 | |||
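The BUILD_BUG() stubs are safe because, without __HAVE_ARCH_PTE_DEVMAP, the pmd_devmap()/pud_devmap() guards at the call sites below evaluate to a compile-time zero, so the calls are discarded before the assertion can fire. The same pattern in miniature:

    #include <linux/bug.h>

    static inline int devmap_stub(void)
    {
        BUILD_BUG();    /* trips only if a call survives optimization */
        return 0;
    }

    static int caller(void)
    {
        if (0 /* e.g. pmd_devmap() on archs without PTE_DEVMAP */)
            return devmap_stub();   /* dead code, eliminated */
        return 1;
    }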
1264 | static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, | 1348 | static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, |
1265 | unsigned long end, int write, struct page **pages, int *nr) | 1349 | unsigned long end, int write, struct page **pages, int *nr) |
1266 | { | 1350 | { |
1267 | struct page *head, *page; | 1351 | struct page *head, *page; |
1268 | int refs; | 1352 | int refs; |
1269 | 1353 | ||
1270 | if (write && !pmd_write(orig)) | 1354 | if (!pmd_access_permitted(orig, write)) |
1271 | return 0; | 1355 | return 0; |
1272 | 1356 | ||
1357 | if (pmd_devmap(orig)) | ||
1358 | return __gup_device_huge_pmd(orig, addr, end, pages, nr); | ||
1359 | |||
1273 | refs = 0; | 1360 | refs = 0; |
1274 | head = pmd_page(orig); | 1361 | head = pmd_page(orig); |
1275 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); | 1362 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); |
@@ -1293,6 +1380,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, | |||
1293 | return 0; | 1380 | return 0; |
1294 | } | 1381 | } |
1295 | 1382 | ||
1383 | SetPageReferenced(head); | ||
1296 | return 1; | 1384 | return 1; |
1297 | } | 1385 | } |
1298 | 1386 | ||
@@ -1302,9 +1390,12 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, | |||
1302 | struct page *head, *page; | 1390 | struct page *head, *page; |
1303 | int refs; | 1391 | int refs; |
1304 | 1392 | ||
1305 | if (write && !pud_write(orig)) | 1393 | if (!pud_access_permitted(orig, write)) |
1306 | return 0; | 1394 | return 0; |
1307 | 1395 | ||
1396 | if (pud_devmap(orig)) | ||
1397 | return __gup_device_huge_pud(orig, addr, end, pages, nr); | ||
1398 | |||
1308 | refs = 0; | 1399 | refs = 0; |
1309 | head = pud_page(orig); | 1400 | head = pud_page(orig); |
1310 | page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); | 1401 | page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); |
@@ -1328,6 +1419,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, | |||
1328 | return 0; | 1419 | return 0; |
1329 | } | 1420 | } |
1330 | 1421 | ||
1422 | SetPageReferenced(head); | ||
1331 | return 1; | 1423 | return 1; |
1332 | } | 1424 | } |
1333 | 1425 | ||
@@ -1338,9 +1430,10 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, | |||
1338 | int refs; | 1430 | int refs; |
1339 | struct page *head, *page; | 1431 | struct page *head, *page; |
1340 | 1432 | ||
1341 | if (write && !pgd_write(orig)) | 1433 | if (!pgd_access_permitted(orig, write)) |
1342 | return 0; | 1434 | return 0; |
1343 | 1435 | ||
1436 | BUILD_BUG_ON(pgd_devmap(orig)); | ||
1344 | refs = 0; | 1437 | refs = 0; |
1345 | head = pgd_page(orig); | 1438 | head = pgd_page(orig); |
1346 | page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); | 1439 | page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); |
@@ -1364,6 +1457,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, | |||
1364 | return 0; | 1457 | return 0; |
1365 | } | 1458 | } |
1366 | 1459 | ||
1460 | SetPageReferenced(head); | ||
1367 | return 1; | 1461 | return 1; |
1368 | } | 1462 | } |
1369 | 1463 | ||
@@ -1520,6 +1614,21 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
1520 | return nr; | 1614 | return nr; |
1521 | } | 1615 | } |
1522 | 1616 | ||
1617 | #ifndef gup_fast_permitted | ||
1618 | /* | ||
1619 | * Check if it's allowed to use __get_user_pages_fast() for the range; | ||
1620 | * if not, we need to fall back to the slow version: | ||
1621 | */ | ||
1622 | bool gup_fast_permitted(unsigned long start, int nr_pages, int write) | ||
1623 | { | ||
1624 | unsigned long len, end; | ||
1625 | |||
1626 | len = (unsigned long) nr_pages << PAGE_SHIFT; | ||
1627 | end = start + len; | ||
1628 | return end >= start; | ||
1629 | } | ||
1630 | #endif | ||
1631 | |||
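The default implementation only rejects ranges that wrap around the top of the address space; an architecture with stricter rules defines its own check. In the spirit of the x86 override added elsewhere in this series (exact form assumed), an arch version might also refuse anything reaching past the user address limit:

    #define gup_fast_permitted gup_fast_permitted
    static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
                                          int write)
    {
        unsigned long len = (unsigned long)nr_pages << PAGE_SHIFT;
        unsigned long end = start + len;

        return end >= start && end <= TASK_SIZE_MAX;
    }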
1523 | /** | 1632 | /** |
1524 | * get_user_pages_fast() - pin user pages in memory | 1633 | * get_user_pages_fast() - pin user pages in memory |
1525 | * @start: starting user address | 1634 | * @start: starting user address |
@@ -1539,11 +1648,14 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
1539 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | 1648 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, |
1540 | struct page **pages) | 1649 | struct page **pages) |
1541 | { | 1650 | { |
1542 | int nr, ret; | 1651 | int nr = 0, ret = 0; |
1543 | 1652 | ||
1544 | start &= PAGE_MASK; | 1653 | start &= PAGE_MASK; |
1545 | nr = __get_user_pages_fast(start, nr_pages, write, pages); | 1654 | |
1546 | ret = nr; | 1655 | if (gup_fast_permitted(start, nr_pages, write)) { |
1656 | nr = __get_user_pages_fast(start, nr_pages, write, pages); | ||
1657 | ret = nr; | ||
1658 | } | ||
1547 | 1659 | ||
1548 | if (nr < nr_pages) { | 1660 | if (nr < nr_pages) { |
1549 | /* Try to get the remaining pages with get_user_pages */ | 1661 | /* Try to get the remaining pages with get_user_pages */ |
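With nr and ret both initialized to 0, a range refused by gup_fast_permitted() simply falls through to the slow path for every page. A usage sketch of the API from a caller's side (the helper itself is hypothetical):

    #include <linux/mm.h>

    static int pin_user_buffer(unsigned long uaddr, struct page **pages,
                               int nr_pages)
    {
        int i, got;

        got = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
        if (got < 0)
            return got;             /* e.g. -EFAULT */

        /* ... access the pinned pages ... */

        for (i = 0; i < got; i++)
            put_page(pages[i]);     /* drop the pins */
        return got;
    }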
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -97,6 +97,16 @@ static void __put_compound_page(struct page *page) | |||
97 | 97 | ||
98 | void __put_page(struct page *page) | 98 | void __put_page(struct page *page) |
99 | { | 99 | { |
100 | if (is_zone_device_page(page)) { | ||
101 | put_dev_pagemap(page->pgmap); | ||
102 | |||
103 | /* | ||
104 | * The page belongs to the device that created pgmap. Do | ||
105 | * not return it to page allocator. | ||
106 | */ | ||
107 | return; | ||
108 | } | ||
109 | |||
100 | if (unlikely(PageCompound(page))) | 110 | if (unlikely(PageCompound(page))) |
101 | __put_compound_page(page); | 111 | __put_compound_page(page); |
102 | else | 112 | else |
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c index f6121612e769..b9a22f18566a 100644 --- a/tools/testing/selftests/x86/ldt_gdt.c +++ b/tools/testing/selftests/x86/ldt_gdt.c | |||
@@ -409,6 +409,51 @@ static void *threadproc(void *ctx) | |||
409 | } | 409 | } |
410 | } | 410 | } |
411 | 411 | ||
412 | #ifdef __i386__ | ||
413 | |||
414 | #ifndef SA_RESTORER | ||
415 | #define SA_RESTORER 0x04000000 | ||
416 | #endif | ||
417 | |||
418 | /* | ||
419 | * The UAPI header calls this 'struct sigaction', which conflicts with | ||
420 | * glibc. Sigh. | ||
421 | */ | ||
422 | struct fake_ksigaction { | ||
423 | void *handler; /* the real type is nasty */ | ||
424 | unsigned long sa_flags; | ||
425 | void (*sa_restorer)(void); | ||
426 | unsigned char sigset[8]; | ||
427 | }; | ||
428 | |||
429 | static void fix_sa_restorer(int sig) | ||
430 | { | ||
431 | struct fake_ksigaction ksa; | ||
432 | |||
433 | if (syscall(SYS_rt_sigaction, sig, NULL, &ksa, 8) == 0) { | ||
434 | /* | ||
435 | * glibc has a nasty bug: it sometimes writes garbage to | ||
436 | * sa_restorer. This interacts quite badly with anything | ||
437 | * that fiddles with SS because it can trigger legacy | ||
438 | * stack switching. Patch it up. See: | ||
439 | * | ||
440 | * https://sourceware.org/bugzilla/show_bug.cgi?id=21269 | ||
441 | */ | ||
442 | if (!(ksa.sa_flags & SA_RESTORER) && ksa.sa_restorer) { | ||
443 | ksa.sa_restorer = NULL; | ||
444 | if (syscall(SYS_rt_sigaction, sig, &ksa, NULL, | ||
445 | sizeof(ksa.sigset)) != 0) | ||
446 | err(1, "rt_sigaction"); | ||
447 | } | ||
448 | } | ||
449 | } | ||
450 | #else | ||
451 | static void fix_sa_restorer(int sig) | ||
452 | { | ||
453 | /* 64-bit glibc works fine. */ | ||
454 | } | ||
455 | #endif | ||
456 | |||
412 | static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), | 457 | static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), |
413 | int flags) | 458 | int flags) |
414 | { | 459 | { |
@@ -420,6 +465,7 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), | |||
420 | if (sigaction(sig, &sa, 0)) | 465 | if (sigaction(sig, &sa, 0)) |
421 | err(1, "sigaction"); | 466 | err(1, "sigaction"); |
422 | 467 | ||
468 | fix_sa_restorer(sig); | ||
423 | } | 469 | } |
424 | 470 | ||
425 | static jmp_buf jmpbuf; | 471 | static jmp_buf jmpbuf; |
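sethandler() now scrubs the stale sa_restorer immediately after every sigaction() call. A standalone probe for the glibc bug, reusing struct fake_ksigaction and the raw-syscall convention from the hunk above (sketch):

    #include <signal.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static void probe_glibc_bug(int sig)
    {
        struct fake_ksigaction ksa;

        if (syscall(SYS_rt_sigaction, sig, NULL, &ksa, 8) == 0 &&
            !(ksa.sa_flags & SA_RESTORER) && ksa.sa_restorer)
            printf("sig %d: stale sa_restorer %p\n",
                   sig, (void *)ksa.sa_restorer);
    }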
diff --git a/tools/testing/selftests/x86/mpx-mini-test.c b/tools/testing/selftests/x86/mpx-mini-test.c index 616ee9673339..a8df159a8924 100644 --- a/tools/testing/selftests/x86/mpx-mini-test.c +++ b/tools/testing/selftests/x86/mpx-mini-test.c | |||
@@ -404,8 +404,6 @@ void handler(int signum, siginfo_t *si, void *vucontext) | |||
404 | dprintf2("info->si_lower: %p\n", __si_bounds_lower(si)); | 404 | dprintf2("info->si_lower: %p\n", __si_bounds_lower(si)); |
405 | dprintf2("info->si_upper: %p\n", __si_bounds_upper(si)); | 405 | dprintf2("info->si_upper: %p\n", __si_bounds_upper(si)); |
406 | 406 | ||
407 | check_siginfo_vs_shadow(si); | ||
408 | |||
409 | for (i = 0; i < 8; i++) | 407 | for (i = 0; i < 8; i++) |
410 | dprintf3("[%d]: %p\n", i, si_addr_ptr[i]); | 408 | dprintf3("[%d]: %p\n", i, si_addr_ptr[i]); |
411 | switch (br_reason) { | 409 | switch (br_reason) { |
@@ -416,6 +414,9 @@ void handler(int signum, siginfo_t *si, void *vucontext) | |||
416 | exit(5); | 414 | exit(5); |
417 | case 1: /* #BR MPX bounds exception */ | 415 | case 1: /* #BR MPX bounds exception */ |
418 | /* these are normal and we expect to see them */ | 416 | /* these are normal and we expect to see them */ |
417 | |||
418 | check_siginfo_vs_shadow(si); | ||
419 | |||
419 | dprintf1("bounds exception (normal): status 0x%jx at %p si_addr: %p\n", | 420 | dprintf1("bounds exception (normal): status 0x%jx at %p si_addr: %p\n", |
420 | status, (void *)ip, si->si_addr); | 421 | status, (void *)ip, si->si_addr); |
421 | num_bnd_chk++; | 422 | num_bnd_chk++; |