author     Linus Torvalds <torvalds@linux-foundation.org>   2017-07-03 17:45:09 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2017-07-03 17:45:09 -0400
commit     7a69f9c60b49699579f5bfb71f928cceba0afe1a (patch)
tree       bf3b5640bbd9f23beeb5a55d18348d65bafff8e8 /arch/x86/kernel
parent     9bc088ab66be8978fbc981ba9644468fa2c2fd3f (diff)
parent     8781fb7e9749da424e01daacd14834b674658c63 (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
"The main changes in this cycle were:
- Continued work to add support for 5-level paging provided by future
Intel CPUs. In particular we switch the x86 GUP code to the generic
implementation. (Kirill A. Shutemov)
- Continued work to add PCID CPU support to native kernels as well.
In this round most of the focus is on reworking/refreshing the TLB
flush infrastructure for the upcoming PCID changes. (Andy
Lutomirski)"
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits)
x86/mm: Delete a big outdated comment about TLB flushing
x86/mm: Don't reenter flush_tlb_func_common()
x86/KASLR: Fix detection 32/64 bit bootloaders for 5-level paging
x86/ftrace: Exclude functions in head64.c from function-tracing
x86/mmap, ASLR: Do not treat unlimited-stack tasks as legacy mmap
x86/mm: Remove reset_lazy_tlbstate()
x86/ldt: Simplify the LDT switching logic
x86/boot/64: Put __startup_64() into .head.text
x86/mm: Add support for 5-level paging for KASLR
x86/mm: Make kernel_physical_mapping_init() support 5-level paging
x86/mm: Add sync_global_pgds() for configuration with 5-level paging
x86/boot/64: Add support of additional page table level during early boot
x86/boot/64: Rename init_level4_pgt and early_level4_pgt
x86/boot/64: Rewrite startup_64() in C
x86/boot/compressed: Enable 5-level paging during decompression stage
x86/boot/efi: Define __KERNEL32_CS GDT on 64-bit configurations
x86/boot/efi: Fix __KERNEL_CS definition of GDT entry on 64-bit configurations
x86/boot/efi: Cleanup initialization of GDT entries
x86/asm: Fix comment in return_from_SYSCALL_64()
x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation
...
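
For context on the 5-level paging entries in the shortlog above: CONFIG_X86_5LEVEL inserts a new "p4d" level between the pgd and the pud, widening virtual addresses from 48 to 57 bits while keeping 4 KiB pages and 512-entry tables. A minimal sketch of the resulting index split (illustrative, distilled from the x86 paging definitions rather than taken from this merge):

        /* each level consumes 9 bits of the virtual address */
        #define PMD_SHIFT       21      /* bits 21..29 index the pmd */
        #define PUD_SHIFT       30      /* bits 30..38 index the pud */
        #define P4D_SHIFT       39      /* bits 39..47 index the p4d */
        #define PGDIR_SHIFT     48      /* bits 48..56 index the pgd; 39 when 4-level */
        #define PTRS_PER_P4D    512

        /* the p4d index of a virtual address, as head_64.S now defines it */
        #define p4d_index(va)   (((va) >> P4D_SHIFT) & (PTRS_PER_P4D - 1))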
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile             |   1
-rw-r--r--  arch/x86/kernel/espfix_64.c          |   2
-rw-r--r--  arch/x86/kernel/head64.c             | 145
-rw-r--r--  arch/x86/kernel/head_64.S            | 131
-rw-r--r--  arch/x86/kernel/ldt.c                |  56
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c   |   2
-rw-r--r--  arch/x86/kernel/paravirt.c           |   2
-rw-r--r--  arch/x86/kernel/process_32.c         |   2
-rw-r--r--  arch/x86/kernel/process_64.c         |   4
-rw-r--r--  arch/x86/kernel/smpboot.c            |   1
-rw-r--r--  arch/x86/kernel/step.c               |   2
11 files changed, 198 insertions, 150 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3c7c419c4e3e..a01892bdd61a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,6 +18,7 @@ CFLAGS_REMOVE_pvclock.o = -pg
 CFLAGS_REMOVE_kvmclock.o = -pg
 CFLAGS_REMOVE_ftrace.o = -pg
 CFLAGS_REMOVE_early_printk.o = -pg
+CFLAGS_REMOVE_head64.o = -pg
 endif
 
 KASAN_SANITIZE_head$(BITS).o := n
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 8e598a1ad986..6b91e2eb8d3f 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -125,7 +125,7 @@ void __init init_espfix_bsp(void)
         p4d_t *p4d;
 
         /* Install the espfix pud into the kernel page directory */
-        pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+        pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
         p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
         p4d_populate(&init_mm, p4d, espfix_pud_page);
 
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 43b7002f44fb..46c3c73e7f43 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -33,17 +33,120 @@
 /*
  * Manage page tables very early on.
  */
-extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
 extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
-static unsigned int __initdata next_early_pgt = 2;
+static unsigned int __initdata next_early_pgt;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
 
+#define __head __section(.head.text)
+
+static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
+{
+        return ptr - (void *)_text + (void *)physaddr;
+}
+
+void __head __startup_64(unsigned long physaddr)
+{
+        unsigned long load_delta, *p;
+        pgdval_t *pgd;
+        p4dval_t *p4d;
+        pudval_t *pud;
+        pmdval_t *pmd, pmd_entry;
+        int i;
+
+        /* Is the address too large? */
+        if (physaddr >> MAX_PHYSMEM_BITS)
+                for (;;);
+
+        /*
+         * Compute the delta between the address I am compiled to run at
+         * and the address I am actually running at.
+         */
+        load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
+
+        /* Is the address not 2M aligned? */
+        if (load_delta & ~PMD_PAGE_MASK)
+                for (;;);
+
+        /* Fixup the physical addresses in the page table */
+
+        pgd = fixup_pointer(&early_top_pgt, physaddr);
+        pgd[pgd_index(__START_KERNEL_map)] += load_delta;
+
+        if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+                p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
+                p4d[511] += load_delta;
+        }
+
+        pud = fixup_pointer(&level3_kernel_pgt, physaddr);
+        pud[510] += load_delta;
+        pud[511] += load_delta;
+
+        pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
+        pmd[506] += load_delta;
+
+        /*
+         * Set up the identity mapping for the switchover.  These
+         * entries should *NOT* have the global bit set!  This also
+         * creates a bunch of nonsense entries but that is fine --
+         * it avoids problems around wraparound.
+         */
+
+        pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+        pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+
+        if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+                p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+
+                i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+                pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
+                pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
+
+                i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
+                p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
+                p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+        } else {
+                i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+                pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
+                pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+        }
+
+        i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
+        pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
+        pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
+
+        pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+        pmd_entry += physaddr;
+
+        for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
+                int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;
+                pmd[idx] = pmd_entry + i * PMD_SIZE;
+        }
+
+        /*
+         * Fixup the kernel text+data virtual addresses. Note that
+         * we might write invalid pmds, when the kernel is relocated
+         * cleanup_highmap() fixes this up along with the mappings
+         * beyond _end.
+         */
+
+        pmd = fixup_pointer(level2_kernel_pgt, physaddr);
+        for (i = 0; i < PTRS_PER_PMD; i++) {
+                if (pmd[i] & _PAGE_PRESENT)
+                        pmd[i] += load_delta;
+        }
+
+        /* Fixup phys_base */
+        p = fixup_pointer(&phys_base, physaddr);
+        *p += load_delta;
+}
+
 /* Wipe all early page tables except for the kernel symbol map */
 static void __init reset_early_page_tables(void)
 {
-        memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
+        memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
         next_early_pgt = 0;
-        write_cr3(__pa_nodebug(early_level4_pgt));
+        write_cr3(__pa_nodebug(early_top_pgt));
 }
 
 /* Create a new PMD entry */
@@ -51,15 +154,16 @@ int __init early_make_pgtable(unsigned long address)
 {
         unsigned long physaddr = address - __PAGE_OFFSET;
         pgdval_t pgd, *pgd_p;
+        p4dval_t p4d, *p4d_p;
         pudval_t pud, *pud_p;
         pmdval_t pmd, *pmd_p;
 
         /* Invalid address or early pgt is done ? */
-        if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
+        if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
                 return -1;
 
 again:
-        pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
+        pgd_p = &early_top_pgt[pgd_index(address)].pgd;
         pgd = *pgd_p;
 
         /*
@@ -67,8 +171,25 @@ again:
          * critical -- __PAGE_OFFSET would point us back into the dynamic
          * range and we might end up looping forever...
          */
-        if (pgd)
-                pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+        if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+                p4d_p = pgd_p;
+        else if (pgd)
+                p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+        else {
+                if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+                        reset_early_page_tables();
+                        goto again;
+                }
+
+                p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
+                memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+                *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+        }
+        p4d_p += p4d_index(address);
+        p4d = *p4d_p;
+
+        if (p4d)
+                pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
         else {
                 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
                         reset_early_page_tables();
@@ -77,7 +198,7 @@ again:
 
                 pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
                 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
-                *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+                *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
         }
         pud_p += pud_index(address);
         pud = *pud_p;
@@ -156,7 +277,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 
         clear_bss();
 
-        clear_page(init_level4_pgt);
+        clear_page(init_top_pgt);
 
         kasan_early_init();
 
@@ -171,8 +292,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
          */
         load_ucode_bsp();
 
-        /* set init_level4_pgt kernel high mapping*/
-        init_level4_pgt[511] = early_level4_pgt[511];
+        /* set init_top_pgt kernel high mapping*/
+        init_top_pgt[511] = early_top_pgt[511];
 
         x86_64_start_reservations(real_mode_data);
 }
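
The pattern to note in the new __startup_64() above: it runs from the identity mapping before relocations are applied, so every access to a global symbol goes through fixup_pointer(), which rebases a link-time (__START_KERNEL_map) address onto the physical address the bootloader actually chose. A worked example of that arithmetic (the concrete addresses here are invented for illustration):

        /* kernel linked at _text = 0xffffffff81000000, loaded at 16 MiB */
        unsigned long physaddr  = 0x0000000001000000UL;
        unsigned long link_addr = 0xffffffff81234000UL;  /* some global */
        unsigned long offset    = link_addr - 0xffffffff81000000UL;  /* 0x234000 */
        unsigned long usable    = offset + physaddr;  /* 0x1234000: safe to dereference */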
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ac9d327d2e42..6225550883df 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -37,10 +37,11 @@
  *
  */
 
+#define p4d_index(x)    (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
 #define pud_index(x)    (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
 
-L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
-L4_START_KERNEL = pgd_index(__START_KERNEL_map)
+PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
+PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
 L3_START_KERNEL = pud_index(__START_KERNEL_map)
 
 .text
@@ -72,101 +73,12 @@ startup_64:
         /* Sanitize CPU configuration */
         call verify_cpu
 
-        /*
-         * Compute the delta between the address I am compiled to run at and the
-         * address I am actually running at.
-         */
-        leaq    _text(%rip), %rbp
-        subq    $_text - __START_KERNEL_map, %rbp
-
-        /* Is the address not 2M aligned? */
-        testl   $~PMD_PAGE_MASK, %ebp
-        jnz     bad_address
-
-        /*
-         * Is the address too large?
-         */
-        leaq    _text(%rip), %rax
-        shrq    $MAX_PHYSMEM_BITS, %rax
-        jnz     bad_address
-
-        /*
-         * Fixup the physical addresses in the page table
-         */
-        addq    %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
-
-        addq    %rbp, level3_kernel_pgt + (510*8)(%rip)
-        addq    %rbp, level3_kernel_pgt + (511*8)(%rip)
-
-        addq    %rbp, level2_fixmap_pgt + (506*8)(%rip)
-
-        /*
-         * Set up the identity mapping for the switchover.  These
-         * entries should *NOT* have the global bit set!  This also
-         * creates a bunch of nonsense entries but that is fine --
-         * it avoids problems around wraparound.
-         */
         leaq    _text(%rip), %rdi
-        leaq    early_level4_pgt(%rip), %rbx
-
-        movq    %rdi, %rax
-        shrq    $PGDIR_SHIFT, %rax
-
-        leaq    (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx
-        movq    %rdx, 0(%rbx,%rax,8)
-        movq    %rdx, 8(%rbx,%rax,8)
-
-        addq    $PAGE_SIZE, %rdx
-        movq    %rdi, %rax
-        shrq    $PUD_SHIFT, %rax
-        andl    $(PTRS_PER_PUD-1), %eax
-        movq    %rdx, PAGE_SIZE(%rbx,%rax,8)
-        incl    %eax
-        andl    $(PTRS_PER_PUD-1), %eax
-        movq    %rdx, PAGE_SIZE(%rbx,%rax,8)
-
-        addq    $PAGE_SIZE * 2, %rbx
-        movq    %rdi, %rax
-        shrq    $PMD_SHIFT, %rdi
-        addq    $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
-        leaq    (_end - 1)(%rip), %rcx
-        shrq    $PMD_SHIFT, %rcx
-        subq    %rdi, %rcx
-        incl    %ecx
-
-1:
-        andq    $(PTRS_PER_PMD - 1), %rdi
-        movq    %rax, (%rbx,%rdi,8)
-        incq    %rdi
-        addq    $PMD_SIZE, %rax
-        decl    %ecx
-        jnz     1b
-
-        test %rbp, %rbp
-        jz .Lskip_fixup
+        pushq   %rsi
+        call    __startup_64
+        popq    %rsi
 
-        /*
-         * Fixup the kernel text+data virtual addresses. Note that
-         * we might write invalid pmds, when the kernel is relocated
-         * cleanup_highmap() fixes this up along with the mappings
-         * beyond _end.
-         */
-        leaq    level2_kernel_pgt(%rip), %rdi
-        leaq    PAGE_SIZE(%rdi), %r8
-        /* See if it is a valid page table entry */
-1:      testb   $_PAGE_PRESENT, 0(%rdi)
-        jz      2f
-        addq    %rbp, 0(%rdi)
-        /* Go to the next page */
-2:      addq    $8, %rdi
-        cmp     %r8, %rdi
-        jne     1b
-
-        /* Fixup phys_base */
-        addq    %rbp, phys_base(%rip)
-
-.Lskip_fixup:
-        movq    $(early_level4_pgt - __START_KERNEL_map), %rax
+        movq    $(early_top_pgt - __START_KERNEL_map), %rax
         jmp 1f
 ENTRY(secondary_startup_64)
         /*
@@ -186,14 +98,17 @@ ENTRY(secondary_startup_64)
         /* Sanitize CPU configuration */
         call verify_cpu
 
-        movq    $(init_level4_pgt - __START_KERNEL_map), %rax
+        movq    $(init_top_pgt - __START_KERNEL_map), %rax
 1:
 
-        /* Enable PAE mode and PGE */
+        /* Enable PAE mode, PGE and LA57 */
         movl    $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+#ifdef CONFIG_X86_5LEVEL
+        orl     $X86_CR4_LA57, %ecx
+#endif
         movq    %rcx, %cr4
 
-        /* Setup early boot stage 4 level pagetables. */
+        /* Setup early boot stage 4-/5-level pagetables. */
         addq    phys_base(%rip), %rax
         movq    %rax, %cr3
 
@@ -417,9 +332,13 @@ GLOBAL(name)
         .endr
 
         __INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PAGE(early_top_pgt)
         .fill   511,8,0
+#ifdef CONFIG_X86_5LEVEL
+        .quad   level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#else
         .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#endif
 
 NEXT_PAGE(early_dynamic_pgts)
         .fill   512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -427,14 +346,14 @@ NEXT_PAGE(early_dynamic_pgts)
         .data
 
 #ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
+NEXT_PAGE(init_top_pgt)
         .fill   512,8,0
 #else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PAGE(init_top_pgt)
         .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-        .org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
+        .org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
         .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-        .org    init_level4_pgt + L4_START_KERNEL*8, 0
+        .org    init_top_pgt + PGD_START_KERNEL*8, 0
         /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
         .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 
@@ -448,6 +367,12 @@ NEXT_PAGE(level2_ident_pgt)
         PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
 #endif
 
+#ifdef CONFIG_X86_5LEVEL
+NEXT_PAGE(level4_kernel_pgt)
+        .fill   511,8,0
+        .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#endif
+
 NEXT_PAGE(level3_kernel_pgt)
         .fill   L3_START_KERNEL,8,0
         /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
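
A way to read the head_64.S changes above: with the relocation fixups moved into C, the assembly only has to pick the right top-level table and tell the CPU how deep the page-table walk is, and CR4.LA57 must agree with the shape of early_top_pgt before CR3 is loaded. In C-flavoured pseudocode (a sketch of the assembly, not kernel source):

        unsigned long cr4 = X86_CR4_PAE | X86_CR4_PGE;
        #ifdef CONFIG_X86_5LEVEL
        cr4 |= X86_CR4_LA57;            /* CPU walks 5 levels instead of 4 */
        #endif
        write_cr4(cr4);
        write_cr3(__pa(early_top_pgt)); /* top slot matches the LA57 choice:
                                         * level4_kernel_pgt (5-level) or
                                         * level3_kernel_pgt (4-level) */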
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index d4a15831ac58..a870910c8565 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -22,24 +22,25 @@
 #include <asm/syscalls.h>
 
 /* context.lock is held for us, so we don't need any locking. */
-static void flush_ldt(void *current_mm)
+static void flush_ldt(void *__mm)
 {
+        struct mm_struct *mm = __mm;
         mm_context_t *pc;
 
-        if (current->active_mm != current_mm)
+        if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
                 return;
 
-        pc = &current->active_mm->context;
-        set_ldt(pc->ldt->entries, pc->ldt->size);
+        pc = &mm->context;
+        set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
 }
 
 /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
-static struct ldt_struct *alloc_ldt_struct(unsigned int size)
+static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
 {
         struct ldt_struct *new_ldt;
         unsigned int alloc_size;
 
-        if (size > LDT_ENTRIES)
+        if (num_entries > LDT_ENTRIES)
                 return NULL;
 
         new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
@@ -47,7 +48,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
                 return NULL;
 
         BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
-        alloc_size = size * LDT_ENTRY_SIZE;
+        alloc_size = num_entries * LDT_ENTRY_SIZE;
 
         /*
          * Xen is very picky: it requires a page-aligned LDT that has no
@@ -65,14 +66,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
                 return NULL;
         }
 
-        new_ldt->size = size;
+        new_ldt->nr_entries = num_entries;
         return new_ldt;
 }
 
 /* After calling this, the LDT is immutable. */
 static void finalize_ldt_struct(struct ldt_struct *ldt)
 {
-        paravirt_alloc_ldt(ldt->entries, ldt->size);
+        paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
 }
 
 /* context.lock is held */
@@ -91,8 +92,8 @@ static void free_ldt_struct(struct ldt_struct *ldt)
         if (likely(!ldt))
                 return;
 
-        paravirt_free_ldt(ldt->entries, ldt->size);
-        if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+        paravirt_free_ldt(ldt->entries, ldt->nr_entries);
+        if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
                 vfree_atomic(ldt->entries);
         else
                 free_page((unsigned long)ldt->entries);
@@ -122,14 +123,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
                 goto out_unlock;
         }
 
-        new_ldt = alloc_ldt_struct(old_mm->context.ldt->size);
+        new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
         if (!new_ldt) {
                 retval = -ENOMEM;
                 goto out_unlock;
         }
 
         memcpy(new_ldt->entries, old_mm->context.ldt->entries,
-               new_ldt->size * LDT_ENTRY_SIZE);
+               new_ldt->nr_entries * LDT_ENTRY_SIZE);
         finalize_ldt_struct(new_ldt);
 
         mm->context.ldt = new_ldt;
@@ -152,9 +153,9 @@ void destroy_context_ldt(struct mm_struct *mm)
 
 static int read_ldt(void __user *ptr, unsigned long bytecount)
 {
-        int retval;
-        unsigned long size;
         struct mm_struct *mm = current->mm;
+        unsigned long entries_size;
+        int retval;
 
         mutex_lock(&mm->context.lock);
 
@@ -166,18 +167,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
         if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
                 bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
 
-        size = mm->context.ldt->size * LDT_ENTRY_SIZE;
-        if (size > bytecount)
-                size = bytecount;
+        entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
+        if (entries_size > bytecount)
+                entries_size = bytecount;
 
-        if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
+        if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
                 retval = -EFAULT;
                 goto out_unlock;
         }
 
-        if (size != bytecount) {
+        if (entries_size != bytecount) {
                 /* Zero-fill the rest and pretend we read bytecount bytes. */
-                if (clear_user(ptr + size, bytecount - size)) {
+                if (clear_user(ptr + entries_size, bytecount - entries_size)) {
                         retval = -EFAULT;
                         goto out_unlock;
                 }
@@ -208,7 +209,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 {
         struct mm_struct *mm = current->mm;
         struct ldt_struct *new_ldt, *old_ldt;
-        unsigned int oldsize, newsize;
+        unsigned int old_nr_entries, new_nr_entries;
         struct user_desc ldt_info;
         struct desc_struct ldt;
         int error;
@@ -247,17 +248,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 
         mutex_lock(&mm->context.lock);
 
         old_ldt = mm->context.ldt;
-        oldsize = old_ldt ? old_ldt->size : 0;
-        newsize = max(ldt_info.entry_number + 1, oldsize);
+        old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
+        new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);
 
         error = -ENOMEM;
-        new_ldt = alloc_ldt_struct(newsize);
+        new_ldt = alloc_ldt_struct(new_nr_entries);
         if (!new_ldt)
                 goto out_unlock;
 
         if (old_ldt)
-                memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE);
+                memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
+
         new_ldt->entries[ldt_info.entry_number] = ldt;
         finalize_ldt_struct(new_ldt);
 
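
The ldt.c hunks are a mechanical size -> nr_entries rename plus one behavioural change: flush_ldt() now compares against cpu_tlbstate.loaded_mm, the new per-CPU notion of which mm's page tables are loaded, instead of current->active_mm. The read_ldt() semantics preserved here are what userspace sees through modify_ldt(2): a read copies out the live entries and zero-fills the rest of the requested buffer. A small userspace sketch (error handling elided):

        #include <asm/ldt.h>            /* LDT_ENTRY_SIZE */
        #include <sys/syscall.h>
        #include <unistd.h>

        int main(void)
        {
                unsigned char buf[LDT_ENTRY_SIZE * 16];
                /* func 0 = read; zero-fills past the live entries and
                 * reports the full byte count as read */
                long n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf));
                return n < 0;
        }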
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6f5ca4ebe6e5..cb0a30473c23 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -347,7 +347,7 @@ void machine_kexec(struct kimage *image)
 void arch_crash_save_vmcoreinfo(void)
 {
         VMCOREINFO_NUMBER(phys_base);
-        VMCOREINFO_SYMBOL(init_level4_pgt);
+        VMCOREINFO_SYMBOL(init_top_pgt);
 
 #ifdef CONFIG_NUMA
         VMCOREINFO_SYMBOL(node_data);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3586996fc50d..bc0a849589bb 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 
         .read_cr2 = native_read_cr2,
         .write_cr2 = native_write_cr2,
-        .read_cr3 = native_read_cr3,
+        .read_cr3 = __native_read_cr3,
         .write_cr3 = native_write_cr3,
 
         .flush_tlb_user = native_flush_tlb,
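
The read_cr3() -> __read_cr3() rename here and in the files below comes from the TLB/PCID preparation in this pull: once PCIDs are in use, CR3 carries an ASID in its low bits as well as the page-table base, so a bare "read CR3" becomes ambiguous. A sketch of the resulting split (the shape of the helpers, assuming the CR3_ADDR_MASK-style masking this series introduces):

        static inline unsigned long __read_cr3(void)
        {
                return native_read_cr3();       /* raw value: base + flags/PCID */
        }

        static inline unsigned long read_cr3_pa(void)
        {
                return __read_cr3() & CR3_ADDR_MASK;    /* just the table base */
        }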
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ffeae818aa7a..c6d6dc5f8bb2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
         cr0 = read_cr0();
         cr2 = read_cr2();
-        cr3 = read_cr3();
+        cr3 = __read_cr3();
         cr4 = __read_cr4();
         printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
                 cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b6840bf3940b..c3169be4c596 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
         cr0 = read_cr0();
         cr2 = read_cr2();
-        cr3 = read_cr3();
+        cr3 = __read_cr3();
         cr4 = __read_cr4();
 
         printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
@@ -142,7 +142,7 @@ void release_thread(struct task_struct *dead_task)
                 pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
                         dead_task->comm,
                         dead_task->mm->context.ldt->entries,
-                        dead_task->mm->context.ldt->size);
+                        dead_task->mm->context.ldt->nr_entries);
                 BUG();
         }
 #endif
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 045e4f993bd2..b474c8de7fba 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1589,7 +1589,6 @@ void native_cpu_die(unsigned int cpu)
 void play_dead_common(void)
 {
         idle_task_exit();
-        reset_lazy_tlbstate();
 
         /* Ack it */
         (void)cpu_report_death();
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index f07f83b3611b..5f25cfbd952e 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -34,7 +34,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 
         mutex_lock(&child->mm->context.lock);
         if (unlikely(!child->mm->context.ldt ||
-                     seg >= child->mm->context.ldt->size))
+                     seg >= child->mm->context.ldt->nr_entries))
                 addr = -1L; /* bogus selector, access would fault */
         else {
                 desc = &child->mm->context.ldt->entries[seg];