author    H. Peter Anvin <hpa@zytor.com>          2013-01-24 15:19:52 -0500
committer H. Peter Anvin <hpa@linux.intel.com>    2013-01-29 18:20:06 -0500
commit    8170e6bed465b4b0c7687f93e9948aca4358a33b (patch)
tree      4a10d8a14af51dd0a0f51539a3fdc1cb7e9f304b
parent    4f7b92263ad68cdc72b11808320d9c881bfa857e (diff)
x86, 64bit: Use a #PF handler to materialize early mappings on demand
Linear mode (CR0.PG = 0) is mutually exclusive with 64-bit mode; all 64-bit
code has to use page tables.  This makes it awkward, before we have properly
set up all-covering page tables, to access objects that are outside the
static kernel range.

So far we have dealt with that simply by mapping a fixed amount of low
memory, but that fails in at least two upcoming use cases:

1. We will support loading and running the kernel, struct boot_params,
   ramdisk, command line, etc. above the 4 GiB mark.
2. We need to access the ramdisk early to get the microcode and apply that
   update as early as possible.

We could use early_ioremap to access them too, but it would make the code
messy and hard to unify with the 32-bit path.  Hence, set up a #PF handler
and use a fixed number of buffers to set up page tables on demand.  If the
buffers fill up, we simply flush them and start over.  These buffers are all
in __initdata, so they do not increase RAM usage at runtime.

Thus, with the help of the #PF handler, we can set up the final kernel
mapping from scratch, and switch to init_level4_pgt later.

During the switchover in head_64.S, before the #PF handler is available, we
use three pages to handle the kernel crossing the 1G and 512G boundaries
with a shared page, by playing games with page aliasing: the same page is
mapped twice in the higher-level tables with the appropriate wraparound.
The kernel region itself will be properly mapped; other mappings may be
spurious.

early_make_pgtable uses the kernel high mapping address to access the pages
used to set up the page tables.

-v4:  Add phys_base offset to make kexec happy, and add
      init_mapping_kernel().                                    - Yinghai
-v5:  Fix compiling with Xen, and add back ident level3 and level2 for Xen;
      also move init_level4_pgt back from BSS to DATA again, because we
      have to clear it anyway.                                  - Yinghai
-v6:  Switch to init_level4_pgt in init_mem_mapping.            - Yinghai
-v7:  Remove the unneeded clear_page for init_level4_pgt; it is already
      filled with .fill 512,8,0 in head_64.S.                   - Yinghai
-v8:  We need to keep that handler alive until init_mem_mapping and must
      not let early_trap_init trash the early #PF handler, so split out
      early_trap_pf_init and move it down.                      - Yinghai
-v9:  Make the switchover cover only the kernel space instead of 1G, so it
      avoids touching possible memory holes.                    - Yinghai
-v11: Change the far jmp back to a far return to initial_code; that is
      needed to fix the failure reported by Konrad on AMD systems.
                                                                - Yinghai

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/1359058816-7615-12-git-send-email-yinghai@kernel.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
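To make the buffer-recycling idea above concrete, here is a minimal,
self-contained userspace sketch (not the kernel code itself) of how a fixed
pool of early page-table pages can be handed out on demand and simply reset
when it runs dry.  The names NR_EARLY_PGTS, early_pool and alloc_early_pgt
are illustrative only; the kernel's real counterparts are
EARLY_DYNAMIC_PAGE_TABLES, early_dynamic_pgts and early_make_pgtable in the
patch below.

/*
 * Userspace sketch of "fixed pool of early page-table pages, flushed and
 * reused on exhaustion".  All names here are hypothetical.
 */
#include <stdio.h>
#include <string.h>

#define NR_EARLY_PGTS	4	/* stands in for EARLY_DYNAMIC_PAGE_TABLES */
#define ENTRIES		512	/* entries per page-table page */

static unsigned long early_pool[NR_EARLY_PGTS][ENTRIES]; /* __initdata in the kernel */
static unsigned int next_early_pgt;

/* Throw away every mapping built so far and start over with an empty pool. */
static void reset_early_page_tables(void)
{
	memset(early_pool, 0, sizeof(early_pool));
	next_early_pgt = 0;
	/* the kernel additionally reloads CR3 with early_level4_pgt here */
}

/* Hand out the next page-table page; recycle the pool when it runs dry. */
static unsigned long *alloc_early_pgt(void)
{
	if (next_early_pgt >= NR_EARLY_PGTS)
		reset_early_page_tables();
	return early_pool[next_early_pgt++];
}

int main(void)
{
	/* Simulate a burst of early faults that each need a fresh table. */
	for (int i = 0; i < 10; i++) {
		unsigned long *pgt = alloc_early_pgt();

		pgt[0] = 0x123;	/* pretend to install an entry */
		printf("fault %d -> pool slot %u\n", i, next_early_pgt - 1);
	}
	return 0;
}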
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h |   4
-rw-r--r--  arch/x86/include/asm/processor.h        |   1
-rw-r--r--  arch/x86/kernel/head64.c                |  81
-rw-r--r--  arch/x86/kernel/head_64.S               | 210
-rw-r--r--  arch/x86/kernel/setup.c                 |   2
-rw-r--r--  arch/x86/kernel/traps.c                 |   9
-rw-r--r--  arch/x86/mm/init.c                      |   3
7 files changed, 219 insertions(+), 91 deletions(-)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 766ea16fbbbd..2d883440cb9a 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_PGTABLE_64_DEFS_H
 #define _ASM_X86_PGTABLE_64_DEFS_H
 
+#include <asm/sparsemem.h>
+
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 
@@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t;
 #define MODULES_END	_AC(0xffffffffff000000, UL)
 #define MODULES_LEN	(MODULES_END - MODULES_VADDR)
 
+#define EARLY_DYNAMIC_PAGE_TABLES	64
+
 #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 888184b2fc85..bdee8bd318ea 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -731,6 +731,7 @@ extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
 extern void early_trap_init(void);
+void early_trap_pf_init(void);
 
 /* Defined in head.S */
 extern struct desc_ptr early_gdt_descr;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7785e66840a4..f57df05ea126 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,11 +27,73 @@
 #include <asm/bios_ebda.h>
 #include <asm/bootparam_utils.h>
 
-static void __init zap_identity_mappings(void)
+/*
+ * Manage page tables very early on.
+ */
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt = 2;
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
 {
-	pgd_t *pgd = pgd_offset_k(0UL);
-	pgd_clear(pgd);
-	__flush_tlb_all();
+	unsigned long i;
+
+	for (i = 0; i < PTRS_PER_PGD-1; i++)
+		early_level4_pgt[i].pgd = 0;
+
+	next_early_pgt = 0;
+
+	write_cr3(__pa(early_level4_pgt));
+}
+
+/* Create a new PMD entry */
+int __init early_make_pgtable(unsigned long address)
+{
+	unsigned long physaddr = address - __PAGE_OFFSET;
+	unsigned long i;
+	pgdval_t pgd, *pgd_p;
+	pudval_t *pud_p;
+	pmdval_t pmd, *pmd_p;
+
+	/* Invalid address or early pgt is done ? */
+	if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+		return -1;
+
+	i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
+	pgd_p = &early_level4_pgt[i].pgd;
+	pgd = *pgd_p;
+
+	/*
+	 * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+	 * critical -- __PAGE_OFFSET would point us back into the dynamic
+	 * range and we might end up looping forever...
+	 */
+	if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) {
+		pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+	} else {
+		if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1)
+			reset_early_page_tables();
+
+		pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+		for (i = 0; i < PTRS_PER_PUD; i++)
+			pud_p[i] = 0;
+
+		*pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+	}
+	i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+	pud_p += i;
+
+	pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+	pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd_p[i] = pmd;
+		pmd += PMD_SIZE;
+	}
+
+	*pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+
+	return 0;
 }
 
 /* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -72,12 +134,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
 					(__START_KERNEL & PGDIR_MASK)));
 	BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
 
+	/* Kill off the identity-map trampoline */
+	reset_early_page_tables();
+
 	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
 
-	/* Make NULL pointers segfault */
-	zap_identity_mappings();
-
+	/* XXX - this is wrong... we need to build page tables from scratch */
 	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
@@ -94,6 +157,10 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	if (console_loglevel == 10)
 		early_printk("Kernel alive\n");
 
+	clear_page(init_level4_pgt);
+	/* set init_level4_pgt kernel high mapping*/
+	init_level4_pgt[511] = early_level4_pgt[511];
+
 	x86_64_start_reservations(real_mode_data);
 }
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 980053c4b9cc..d94f6d68be2a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
 	.code64
 	.globl startup_64
 startup_64:
-
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
 	 * and someone has loaded an identity mapped page table
 	 * for us.  These identity mapped page tables map all of the
 	 * kernel pages and possibly all of memory.
 	 *
-	 * %esi holds a physical pointer to real_mode_data.
+	 * %rsi holds a physical pointer to real_mode_data.
 	 *
 	 * We come here either directly from a 64bit bootloader, or from
 	 * arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
 	 * tables and then reload them.
 	 */
 
-	/* Compute the delta between the address I am compiled to run at and the
+	/*
+	 * Compute the delta between the address I am compiled to run at and the
 	 * address I am actually running at.
 	 */
 	leaq	_text(%rip), %rbp
@@ -78,45 +78,62 @@ startup_64:
 	testl	%eax, %eax
 	jnz	bad_address
 
-	/* Is the address too large? */
-	leaq	_text(%rip), %rdx
-	movq	$PGDIR_SIZE, %rax
-	cmpq	%rax, %rdx
-	jae	bad_address
-
-	/* Fixup the physical addresses in the page table
+	/*
+	 * Is the address too large?
 	 */
-	addq	%rbp, init_level4_pgt + 0(%rip)
-	addq	%rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
-	addq	%rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
+	leaq	_text(%rip), %rax
+	shrq	$MAX_PHYSMEM_BITS, %rax
+	jnz	bad_address
 
-	addq	%rbp, level3_ident_pgt + 0(%rip)
+	/*
+	 * Fixup the physical addresses in the page table
+	 */
+	addq	%rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
 
 	addq	%rbp, level3_kernel_pgt + (510*8)(%rip)
 	addq	%rbp, level3_kernel_pgt + (511*8)(%rip)
 
 	addq	%rbp, level2_fixmap_pgt + (506*8)(%rip)
 
-	/* Add an Identity mapping if I am above 1G */
+	/*
+	 * Set up the identity mapping for the switchover.  These
+	 * entries should *NOT* have the global bit set!  This also
+	 * creates a bunch of nonsense entries but that is fine --
+	 * it avoids problems around wraparound.
+	 */
 	leaq	_text(%rip), %rdi
-	andq	$PMD_PAGE_MASK, %rdi
+	leaq	early_level4_pgt(%rip), %rbx
 
 	movq	%rdi, %rax
-	shrq	$PUD_SHIFT, %rax
-	andq	$(PTRS_PER_PUD - 1), %rax
-	jz	ident_complete
+	shrq	$PGDIR_SHIFT, %rax
 
-	leaq	(level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
-	leaq	level3_ident_pgt(%rip), %rbx
-	movq	%rdx, 0(%rbx, %rax, 8)
+	leaq	(4096 + _KERNPG_TABLE)(%rbx), %rdx
+	movq	%rdx, 0(%rbx,%rax,8)
+	movq	%rdx, 8(%rbx,%rax,8)
 
+	addq	$4096, %rdx
 	movq	%rdi, %rax
-	shrq	$PMD_SHIFT, %rax
-	andq	$(PTRS_PER_PMD - 1), %rax
-	leaq	__PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
-	leaq	level2_spare_pgt(%rip), %rbx
-	movq	%rdx, 0(%rbx, %rax, 8)
-ident_complete:
+	shrq	$PUD_SHIFT, %rax
+	andl	$(PTRS_PER_PUD-1), %eax
+	movq	%rdx, (4096+0)(%rbx,%rax,8)
+	movq	%rdx, (4096+8)(%rbx,%rax,8)
+
+	addq	$8192, %rbx
+	movq	%rdi, %rax
+	shrq	$PMD_SHIFT, %rdi
+	addq	$(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
+	leaq	(_end - 1)(%rip), %rcx
+	shrq	$PMD_SHIFT, %rcx
+	subq	%rdi, %rcx
+	incl	%ecx
+
+1:
+	andq	$(PTRS_PER_PMD - 1), %rdi
+	movq	%rax, (%rbx,%rdi,8)
+	incq	%rdi
+	addq	$PMD_SIZE, %rax
+	decl	%ecx
+	jnz	1b
 
 	/*
 	 * Fixup the kernel text+data virtual addresses. Note that
@@ -124,7 +141,6 @@ ident_complete:
 	 * cleanup_highmap() fixes this up along with the mappings
 	 * beyond _end.
 	 */
-
 	leaq	level2_kernel_pgt(%rip), %rdi
 	leaq	4096(%rdi), %r8
 	/* See if it is a valid page table entry */
@@ -139,17 +155,14 @@ ident_complete:
 	/* Fixup phys_base */
 	addq	%rbp, phys_base(%rip)
 
-	/* Due to ENTRY(), sometimes the empty space gets filled with
-	 * zeros. Better take a jmp than relying on empty space being
-	 * filled with 0x90 (nop)
-	 */
-	jmp secondary_startup_64
+	movq	$(early_level4_pgt - __START_KERNEL_map), %rax
+	jmp 1f
 ENTRY(secondary_startup_64)
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
 	 * and someone has loaded a mapped page table.
 	 *
-	 * %esi holds a physical pointer to real_mode_data.
+	 * %rsi holds a physical pointer to real_mode_data.
 	 *
 	 * We come here either from startup_64 (using physical addresses)
 	 * or from trampoline.S (using virtual addresses).
@@ -159,12 +172,14 @@ ENTRY(secondary_startup_64)
 	 * after the boot processor executes this code.
 	 */
 
+	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
+1:
+
 	/* Enable PAE mode and PGE */
-	movl	$(X86_CR4_PAE | X86_CR4_PGE), %eax
-	movq	%rax, %cr4
+	movl	$(X86_CR4_PAE | X86_CR4_PGE), %ecx
+	movq	%rcx, %cr4
 
 	/* Setup early boot stage 4 level pagetables. */
-	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
 	addq	phys_base(%rip), %rax
 	movq	%rax, %cr3
 
@@ -196,7 +211,7 @@ ENTRY(secondary_startup_64)
 	movq	%rax, %cr0
 
 	/* Setup a boot time stack */
-	movq stack_start(%rip),%rsp
+	movq stack_start(%rip), %rsp
 
 	/* zero EFLAGS after setting rsp */
 	pushq $0
@@ -236,15 +251,33 @@ ENTRY(secondary_startup_64)
 	movl	initial_gs+4(%rip),%edx
 	wrmsr	
 
-	/* esi is pointer to real mode structure with interesting info.
+	/* rsi is pointer to real mode structure with interesting info.
 	   pass it to C */
-	movl	%esi, %edi
+	movq	%rsi, %rdi
 	
 	/* Finally jump to run C code and to be on real kernel address
 	 * Since we are running on identity-mapped space we have to jump
 	 * to the full 64bit address, this is only possible as indirect
 	 * jump.  In addition we need to ensure %cs is set so we make this
 	 * a far return.
+	 *
+	 * Note: do not change to far jump indirect with 64bit offset.
+	 *
+	 * AMD does not support far jump indirect with 64bit offset.
+	 * AMD64 Architecture Programmer's Manual, Volume 3: states only
+	 *	JMP FAR mem16:16 FF /5 Far jump indirect,
+	 *		with the target specified by a far pointer in memory.
+	 *	JMP FAR mem16:32 FF /5 Far jump indirect,
+	 *		with the target specified by a far pointer in memory.
+	 *
+	 * Intel64 does support 64bit offset.
+	 * Software Developer Manual Vol 2: states:
+	 *	FF /5 JMP m16:16 Jump far, absolute indirect,
+	 *		address given in m16:16
+	 *	FF /5 JMP m16:32 Jump far, absolute indirect,
+	 *		address given in m16:32.
+	 *	REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
+	 *		address given in m16:64.
 	 */
 	movq	initial_code(%rip),%rax
 	pushq	$0		# fake return address to stop unwinder
@@ -270,13 +303,13 @@ ENDPROC(start_cpu0)
 
 	/* SMP bootup changes these two */
 	__REFDATA
-	.align	8
-	ENTRY(initial_code)
+	.balign	8
+	GLOBAL(initial_code)
 	.quad	x86_64_start_kernel
-	ENTRY(initial_gs)
+	GLOBAL(initial_gs)
 	.quad	INIT_PER_CPU_VAR(irq_stack_union)
 
-	ENTRY(stack_start)
+	GLOBAL(stack_start)
 	.quad  init_thread_union+THREAD_SIZE-8
 	.word  0
 	__FINITDATA
@@ -284,7 +317,7 @@ ENDPROC(start_cpu0)
 bad_address:
 	jmp bad_address
 
-	.section ".init.text","ax"
+	__INIT
 	.globl early_idt_handlers
 early_idt_handlers:
 	# 104(%rsp) %rflags
@@ -321,14 +354,22 @@ ENTRY(early_idt_handler)
 	pushq %r11		# 0(%rsp)
 
 	cmpl $__KERNEL_CS,96(%rsp)
-	jne 10f
+	jne 11f
+
+	cmpl $14,72(%rsp)	# Page fault?
+	jnz 10f
+	GET_CR2_INTO(%rdi)	# can clobber any volatile register if pv
+	call early_make_pgtable
+	andl %eax,%eax
+	jz 20f			# All good
 
+10:
 	leaq 88(%rsp),%rdi	# Pointer to %rip
 	call early_fixup_exception
 	andl %eax,%eax
 	jnz 20f			# Found an exception entry
 
-10:
+11:
 #ifdef CONFIG_EARLY_PRINTK
 	GET_CR2_INTO(%r9)	# can clobber any volatile register if pv
 	movl 80(%rsp),%r8d	# error code
@@ -350,7 +391,7 @@ ENTRY(early_idt_handler)
 1:	hlt
 	jmp 1b
 
-20:	# Exception table entry found
+20:	# Exception table entry found or page table generated
 	popq %r11
 	popq %r10
 	popq %r9
@@ -364,6 +405,8 @@ ENTRY(early_idt_handler)
 	decl early_recursion_flag(%rip)
 	INTERRUPT_RETURN
 
+	__INITDATA
+
 	.balign 4
 early_recursion_flag:
 	.long 0
@@ -374,11 +417,10 @@ early_idt_msg:
 early_idt_ripmsg:
 	.asciz "RIP %s\n"
 #endif /* CONFIG_EARLY_PRINTK */
-	.previous
 
 #define NEXT_PAGE(name) \
 	.balign	PAGE_SIZE; \
-ENTRY(name)
+GLOBAL(name)
 
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)			\
@@ -388,24 +430,37 @@ ENTRY(name)
 	i = i + 1 ;					\
 	.endr
 
+	__INITDATA
+NEXT_PAGE(early_level4_pgt)
+	.fill	511,8,0
+	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+
+NEXT_PAGE(early_dynamic_pgts)
+	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+
 	.data
-	/*
-	 * This default setting generates an ident mapping at address 0x100000
-	 * and a mapping for the kernel that precisely maps virtual address
-	 * 0xffffffff80000000 to physical address 0x000000. (always using
-	 * 2Mbyte large pages provided by PAE mode)
-	 */
+
+#ifndef CONFIG_XEN
 NEXT_PAGE(init_level4_pgt)
-	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.org	init_level4_pgt + L4_PAGE_OFFSET*8, 0
-	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.org	init_level4_pgt + L4_START_KERNEL*8, 0
+	.fill	512,8,0
+#else
+NEXT_PAGE(init_level4_pgt)
+	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.org	init_level4_pgt + L4_PAGE_OFFSET*8, 0
+	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.org	init_level4_pgt + L4_START_KERNEL*8, 0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 
 NEXT_PAGE(level3_ident_pgt)
 	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.fill	511,8,0
+	.fill	511, 8, 0
+NEXT_PAGE(level2_ident_pgt)
+	/* Since I easily can, map the first 1G.
+	 * Don't set NX because code runs from these pages.
+	 */
+	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#endif
 
 NEXT_PAGE(level3_kernel_pgt)
 	.fill	L3_START_KERNEL,8,0
@@ -413,21 +468,6 @@ NEXT_PAGE(level3_kernel_pgt)
 	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
 	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
 
-NEXT_PAGE(level2_fixmap_pgt)
-	.fill	506,8,0
-	.quad	level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
-	/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
-	.fill	5,8,0
-
-NEXT_PAGE(level1_fixmap_pgt)
-	.fill	512,8,0
-
-NEXT_PAGE(level2_ident_pgt)
-	/* Since I easily can, map the first 1G.
-	 * Don't set NX because code runs from these pages.
-	 */
-	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
-
 NEXT_PAGE(level2_kernel_pgt)
 	/*
 	 * 512 MB kernel mapping. We spend a full page on this pagetable
@@ -442,11 +482,16 @@ NEXT_PAGE(level2_kernel_pgt)
 	PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
 		KERNEL_IMAGE_SIZE/PMD_SIZE)
 
-NEXT_PAGE(level2_spare_pgt)
-	.fill	512, 8, 0
+NEXT_PAGE(level2_fixmap_pgt)
+	.fill	506,8,0
+	.quad	level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+	/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
+	.fill	5,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+	.fill	512,8,0
 
 #undef PMDS
-#undef NEXT_PAGE
 
 	.data
 	.align 16
@@ -472,6 +517,5 @@ ENTRY(nmi_idt_table)
 	.skip IDT_ENTRIES * 16
 
 	__PAGE_ALIGNED_BSS
-	.align PAGE_SIZE
-ENTRY(empty_zero_page)
+NEXT_PAGE(empty_zero_page)
 	.skip PAGE_SIZE
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 85a8290801df..db9c41dae8d7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1005,6 +1005,8 @@ void __init setup_arch(char **cmdline_p)
 
 	init_mem_mapping();
 
+	early_trap_pf_init();
+
 	setup_real_mode();
 
 	memblock.current_limit = get_max_mapped();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ecffca11f4e9..68bda7a84159 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -688,10 +688,19 @@ void __init early_trap_init(void)
 	set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
 	/* int3 can be called from all */
 	set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+#ifdef CONFIG_X86_32
 	set_intr_gate(X86_TRAP_PF, &page_fault);
+#endif
 	load_idt(&idt_descr);
 }
 
+void __init early_trap_pf_init(void)
+{
+#ifdef CONFIG_X86_64
+	set_intr_gate(X86_TRAP_PF, &page_fault);
+#endif
+}
+
 void __init trap_init(void)
 {
 	int i;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 78d1ef3eab66..3364a7643a4c 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -446,9 +446,10 @@ void __init init_mem_mapping(void)
 	}
 #else
 	early_ioremap_page_table_range_init();
+#endif
+
 	load_cr3(swapper_pg_dir);
 	__flush_tlb_all();
-#endif
 
 	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
 }