author     H. Peter Anvin <hpa@zytor.com>         2013-01-24 15:19:52 -0500
committer  H. Peter Anvin <hpa@linux.intel.com>   2013-01-29 18:20:06 -0500
commit     8170e6bed465b4b0c7687f93e9948aca4358a33b (patch)
tree       4a10d8a14af51dd0a0f51539a3fdc1cb7e9f304b
parent     4f7b92263ad68cdc72b11808320d9c881bfa857e (diff)
x86, 64bit: Use a #PF handler to materialize early mappings on demand
Linear mode (CR0.PG = 0) is mutually exclusive with 64-bit mode; all
64-bit code has to use page tables. This makes it awkward to access
objects outside the static kernel range before we have set up
properly covering page tables.
So far we have dealt with that simply by mapping a fixed amount of
low memory, but that fails in at least two upcoming use cases:
1. We will support loading and running the kernel, struct boot_params,
ramdisk, command line, etc. above the 4 GiB mark.
2. We need to access the ramdisk early so the microcode update can be
applied as early as possible.
We could use early_ioremap to access them too, but that would make the
code messy and hard to unify with the 32-bit side.
Hence, set up a #PF handler and use a fixed number of buffers to set up
page tables on demand. If the buffers fill up, we simply flush them and
start over. These buffers are all in __initdata, so they do not
increase RAM usage at runtime.
Thus, with the help of the #PF handler, we can build the final kernel
mapping from scratch, and switch to init_level4_pgt later.
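
In outline, the pool management behaves like the toy model below
(illustration only: entries are plain integers, alloc_early_pgt() is an
invented helper name, and the real code -- early_make_pgtable() and
reset_early_page_tables() in the head64.c hunk further down -- installs
actual PUD/PMD entries and reloads CR3 on reset):

    #include <stdint.h>
    #include <string.h>

    #define EARLY_DYNAMIC_PAGE_TABLES 64   /* as in pgtable_64_types.h below */
    #define PTRS_PER_TABLE 512             /* entries per page-table page */

    static uint64_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_TABLE];
    static unsigned int next_early_pgt;

    /* Pool exhausted: wipe all dynamic tables; later faults refill them. */
    static void reset_early_page_tables(void)
    {
            memset(early_dynamic_pgts, 0, sizeof(early_dynamic_pgts));
            next_early_pgt = 0;
    }

    /* Hand out the next __initdata buffer, recycling the pool on overflow. */
    static uint64_t *alloc_early_pgt(void)
    {
            if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES - 1)
                    reset_early_page_tables();
            return early_dynamic_pgts[next_early_pgt++];
    }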
During the switchover in head_64.S, before the #PF handler is available,
we use three pages to handle the kernel crossing 1G and 512G boundaries,
sharing pages by playing games with page aliasing: the same page is
mapped twice in the higher-level tables with appropriate wraparound.
The kernel region itself will be properly mapped; other mappings may
be spurious.
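
The aliasing idea itself fits in a few lines (an illustrative sketch
with invented names, not the patch's code; the real stores are the
paired movq %rdx, 0(%rbx,%rax,8) / movq %rdx, 8(%rbx,%rax,8)
instructions in the head_64.S hunk below):

    #include <stdint.h>

    /*
     * Install the same next-level table in two adjacent slots.  If the
     * kernel image crosses a 512G (PGD) or 1G (PUD) boundary, the index
     * computed for the "next" slot still finds a valid table; slots the
     * kernel never touches become nonsense mappings, which is harmless
     * for the brief switchover window.
     */
    static void alias_adjacent_slots(uint64_t *table, unsigned int idx,
                                     uint64_t next_table_phys, uint64_t flags)
    {
            table[idx + 0] = next_table_phys | flags;
            table[idx + 1] = next_table_phys | flags;      /* the alias */
    }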
early_make_pgtable() uses kernel high mapping addresses to access the
pages in which it builds the page tables.
-v4: Add phys_base offset to make kexec happy, and add
     init_mapping_kernel() - Yinghai
-v5: Fix compiling with Xen, and add back the ident level3 and level2
     for Xen; also move init_level4_pgt back from BSS to DATA again,
     because we have to clear it anyway. - Yinghai
-v6: Switch to init_level4_pgt in init_mem_mapping. - Yinghai
-v7: Remove the unneeded clear_page for init_level4_pgt;
     it is already filled with .fill 512,8,0 in head_64.S. - Yinghai
-v8: We need to keep that handler alive until init_mem_mapping and must
     not let early_trap_init trash the early #PF handler.
     So split early_trap_pf_init out and move it down. - Yinghai
-v9: Make the switchover cover only kernel space instead of 1G, so we
     avoid touching possible memory holes. - Yinghai
-v11: Change the far jmp back to a far return to initial_code; that is
     needed to fix the failure reported by Konrad on AMD systems. - Yinghai
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/1359058816-7615-12-git-send-email-yinghai@kernel.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h |   4
-rw-r--r--  arch/x86/include/asm/processor.h        |   1
-rw-r--r--  arch/x86/kernel/head64.c                |  81
-rw-r--r--  arch/x86/kernel/head_64.S               | 210
-rw-r--r--  arch/x86/kernel/setup.c                 |   2
-rw-r--r--  arch/x86/kernel/traps.c                 |   9
-rw-r--r--  arch/x86/mm/init.c                      |   3
7 files changed, 219 insertions(+), 91 deletions(-)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 766ea16fbbbd..2d883440cb9a 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_PGTABLE_64_DEFS_H
 #define _ASM_X86_PGTABLE_64_DEFS_H
 
+#include <asm/sparsemem.h>
+
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 
@@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t;
 #define MODULES_END      _AC(0xffffffffff000000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
 
+#define EARLY_DYNAMIC_PAGE_TABLES	64
+
 #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 888184b2fc85..bdee8bd318ea 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -731,6 +731,7 @@ extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
 extern void early_trap_init(void);
+void early_trap_pf_init(void);
 
 /* Defined in head.S */
 extern struct desc_ptr early_gdt_descr;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7785e66840a4..f57df05ea126 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,11 +27,73 @@
 #include <asm/bios_ebda.h>
 #include <asm/bootparam_utils.h>
 
-static void __init zap_identity_mappings(void)
+/*
+ * Manage page tables very early on.
+ */
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt = 2;
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
 {
-	pgd_t *pgd = pgd_offset_k(0UL);
-	pgd_clear(pgd);
-	__flush_tlb_all();
+	unsigned long i;
+
+	for (i = 0; i < PTRS_PER_PGD-1; i++)
+		early_level4_pgt[i].pgd = 0;
+
+	next_early_pgt = 0;
+
+	write_cr3(__pa(early_level4_pgt));
+}
+
+/* Create a new PMD entry */
+int __init early_make_pgtable(unsigned long address)
+{
+	unsigned long physaddr = address - __PAGE_OFFSET;
+	unsigned long i;
+	pgdval_t pgd, *pgd_p;
+	pudval_t *pud_p;
+	pmdval_t pmd, *pmd_p;
+
+	/* Invalid address or early pgt is done ? */
+	if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+		return -1;
+
+	i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
+	pgd_p = &early_level4_pgt[i].pgd;
+	pgd = *pgd_p;
+
+	/*
+	 * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+	 * critical -- __PAGE_OFFSET would point us back into the dynamic
+	 * range and we might end up looping forever...
+	 */
+	if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) {
+		pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+	} else {
+		if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1)
+			reset_early_page_tables();
+
+		pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+		for (i = 0; i < PTRS_PER_PUD; i++)
+			pud_p[i] = 0;
+
+		*pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+	}
+	i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+	pud_p += i;
+
+	pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+	pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd_p[i] = pmd;
+		pmd += PMD_SIZE;
+	}
+
+	*pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+
+	return 0;
 }
 
 /* Don't add a printk in there. printk relies on the PDA which is not initialized 
@@ -72,12 +134,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
 				(__START_KERNEL & PGDIR_MASK)));
 	BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
 
+	/* Kill off the identity-map trampoline */
+	reset_early_page_tables();
+
 	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
 
-	/* Make NULL pointers segfault */
-	zap_identity_mappings();
-
+	/* XXX - this is wrong... we need to build page tables from scratch */
 	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
@@ -94,6 +157,10 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	if (console_loglevel == 10)
 		early_printk("Kernel alive\n");
 
+	clear_page(init_level4_pgt);
+	/* set init_level4_pgt kernel high mapping*/
+	init_level4_pgt[511] = early_level4_pgt[511];
+
 	x86_64_start_reservations(real_mode_data);
 }
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 980053c4b9cc..d94f6d68be2a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
 	.code64
 	.globl startup_64
 startup_64:
-
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
 	 * and someone has loaded an identity mapped page table
 	 * for us.  These identity mapped page tables map all of the
 	 * kernel pages and possibly all of memory.
 	 *
-	 * %esi holds a physical pointer to real_mode_data.
+	 * %rsi holds a physical pointer to real_mode_data.
 	 *
 	 * We come here either directly from a 64bit bootloader, or from
 	 * arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
 	 * tables and then reload them.
 	 */
 
-	/* Compute the delta between the address I am compiled to run at and the
+	/*
+	 * Compute the delta between the address I am compiled to run at and the
 	 * address I am actually running at.
 	 */
 	leaq	_text(%rip), %rbp
@@ -78,45 +78,62 @@ startup_64:
 	testl	%eax, %eax
 	jnz	bad_address
 
-	/* Is the address too large? */
-	leaq	_text(%rip), %rdx
-	movq	$PGDIR_SIZE, %rax
-	cmpq	%rax, %rdx
-	jae	bad_address
-
-	/* Fixup the physical addresses in the page table
+	/*
+	 * Is the address too large?
 	 */
-	addq	%rbp, init_level4_pgt + 0(%rip)
-	addq	%rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
-	addq	%rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
+	leaq	_text(%rip), %rax
+	shrq	$MAX_PHYSMEM_BITS, %rax
+	jnz	bad_address
 
-	addq	%rbp, level3_ident_pgt + 0(%rip)
+	/*
+	 * Fixup the physical addresses in the page table
+	 */
+	addq	%rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
 
 	addq	%rbp, level3_kernel_pgt + (510*8)(%rip)
 	addq	%rbp, level3_kernel_pgt + (511*8)(%rip)
 
 	addq	%rbp, level2_fixmap_pgt + (506*8)(%rip)
 
-	/* Add an Identity mapping if I am above 1G */
+	/*
+	 * Set up the identity mapping for the switchover.  These
+	 * entries should *NOT* have the global bit set!  This also
+	 * creates a bunch of nonsense entries but that is fine --
+	 * it avoids problems around wraparound.
+	 */
 	leaq	_text(%rip), %rdi
-	andq	$PMD_PAGE_MASK, %rdi
+	leaq	early_level4_pgt(%rip), %rbx
 
 	movq	%rdi, %rax
-	shrq	$PUD_SHIFT, %rax
-	andq	$(PTRS_PER_PUD - 1), %rax
-	jz	ident_complete
+	shrq	$PGDIR_SHIFT, %rax
 
-	leaq	(level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
-	leaq	level3_ident_pgt(%rip), %rbx
-	movq	%rdx, 0(%rbx, %rax, 8)
+	leaq	(4096 + _KERNPG_TABLE)(%rbx), %rdx
+	movq	%rdx, 0(%rbx,%rax,8)
+	movq	%rdx, 8(%rbx,%rax,8)
 
+	addq	$4096, %rdx
 	movq	%rdi, %rax
-	shrq	$PMD_SHIFT, %rax
-	andq	$(PTRS_PER_PMD - 1), %rax
-	leaq	__PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
-	leaq	level2_spare_pgt(%rip), %rbx
-	movq	%rdx, 0(%rbx, %rax, 8)
-ident_complete:
+	shrq	$PUD_SHIFT, %rax
+	andl	$(PTRS_PER_PUD-1), %eax
+	movq	%rdx, (4096+0)(%rbx,%rax,8)
+	movq	%rdx, (4096+8)(%rbx,%rax,8)
+
+	addq	$8192, %rbx
+	movq	%rdi, %rax
+	shrq	$PMD_SHIFT, %rdi
+	addq	$(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
+	leaq	(_end - 1)(%rip), %rcx
+	shrq	$PMD_SHIFT, %rcx
+	subq	%rdi, %rcx
+	incl	%ecx
+
+1:
+	andq	$(PTRS_PER_PMD - 1), %rdi
+	movq	%rax, (%rbx,%rdi,8)
+	incq	%rdi
+	addq	$PMD_SIZE, %rax
+	decl	%ecx
+	jnz	1b
 
 	/*
 	 * Fixup the kernel text+data virtual addresses. Note that
@@ -124,7 +141,6 @@ ident_complete:
 	 * cleanup_highmap() fixes this up along with the mappings
 	 * beyond _end.
 	 */
-
 	leaq	level2_kernel_pgt(%rip), %rdi
 	leaq	4096(%rdi), %r8
 	/* See if it is a valid page table entry */
@@ -139,17 +155,14 @@ ident_complete:
 	/* Fixup phys_base */
 	addq	%rbp, phys_base(%rip)
 
-	/* Due to ENTRY(), sometimes the empty space gets filled with
-	 * zeros. Better take a jmp than relying on empty space being
-	 * filled with 0x90 (nop)
-	 */
-	jmp secondary_startup_64
+	movq	$(early_level4_pgt - __START_KERNEL_map), %rax
+	jmp 1f
 ENTRY(secondary_startup_64)
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
 	 * and someone has loaded a mapped page table.
 	 *
-	 * %esi holds a physical pointer to real_mode_data.
+	 * %rsi holds a physical pointer to real_mode_data.
 	 *
 	 * We come here either from startup_64 (using physical addresses)
 	 * or from trampoline.S (using virtual addresses).
@@ -159,12 +172,14 @@ ENTRY(secondary_startup_64)
 	 * after the boot processor executes this code.
 	 */
 
+	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
+1:
+
 	/* Enable PAE mode and PGE */
-	movl	$(X86_CR4_PAE | X86_CR4_PGE), %eax
-	movq	%rax, %cr4
+	movl	$(X86_CR4_PAE | X86_CR4_PGE), %ecx
+	movq	%rcx, %cr4
 
 	/* Setup early boot stage 4 level pagetables. */
-	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
 	addq	phys_base(%rip), %rax
 	movq	%rax, %cr3
 
@@ -196,7 +211,7 @@ ENTRY(secondary_startup_64)
 	movq	%rax, %cr0
 
 	/* Setup a boot time stack */
-	movq stack_start(%rip),%rsp
+	movq stack_start(%rip), %rsp
 
 	/* zero EFLAGS after setting rsp */
 	pushq $0
@@ -236,15 +251,33 @@ ENTRY(secondary_startup_64)
 	movl	initial_gs+4(%rip),%edx
 	wrmsr
 
-	/* esi is pointer to real mode structure with interesting info.
+	/* rsi is pointer to real mode structure with interesting info.
 	   pass it to C */
-	movl	%esi, %edi
+	movq	%rsi, %rdi
 
 	/* Finally jump to run C code and to be on real kernel address
 	 * Since we are running on identity-mapped space we have to jump
 	 * to the full 64bit address, this is only possible as indirect
 	 * jump.  In addition we need to ensure %cs is set so we make this
 	 * a far return.
+	 *
+	 * Note: do not change to far jump indirect with 64bit offset.
+	 *
+	 * AMD does not support far jump indirect with 64bit offset.
+	 * AMD64 Architecture Programmer's Manual, Volume 3: states only
+	 *	JMP FAR mem16:16 FF /5 Far jump indirect,
+	 *		with the target specified by a far pointer in memory.
+	 *	JMP FAR mem16:32 FF /5 Far jump indirect,
+	 *		with the target specified by a far pointer in memory.
+	 *
+	 * Intel64 does support 64bit offset.
+	 * Software Developer Manual Vol 2: states:
+	 *	FF /5 JMP m16:16 Jump far, absolute indirect,
+	 *		address given in m16:16
+	 *	FF /5 JMP m16:32 Jump far, absolute indirect,
+	 *		address given in m16:32.
+	 *	REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
+	 *		address given in m16:64.
 	 */
 	movq	initial_code(%rip),%rax
 	pushq	$0		# fake return address to stop unwinder
@@ -270,13 +303,13 @@ ENDPROC(start_cpu0)
 
 	/* SMP bootup changes these two */
 	__REFDATA
-	.align	8
-	ENTRY(initial_code)
+	.balign	8
+	GLOBAL(initial_code)
 	.quad	x86_64_start_kernel
-	ENTRY(initial_gs)
+	GLOBAL(initial_gs)
 	.quad	INIT_PER_CPU_VAR(irq_stack_union)
 
-	ENTRY(stack_start)
+	GLOBAL(stack_start)
 	.quad  init_thread_union+THREAD_SIZE-8
 	.word  0
 	__FINITDATA
@@ -284,7 +317,7 @@ ENDPROC(start_cpu0)
 bad_address:
 	jmp bad_address
 
-	.section ".init.text","ax"
+	__INIT
 	.globl early_idt_handlers
 early_idt_handlers:
 	# 104(%rsp) %rflags
@@ -321,14 +354,22 @@ ENTRY(early_idt_handler)
 	pushq %r11		#  0(%rsp)
 
 	cmpl $__KERNEL_CS,96(%rsp)
-	jne 10f
+	jne 11f
+
+	cmpl $14,72(%rsp)	# Page fault?
+	jnz 10f
+	GET_CR2_INTO(%rdi)	# can clobber any volatile register if pv
+	call early_make_pgtable
+	andl %eax,%eax
+	jz 20f			# All good
 
+10:
 	leaq 88(%rsp),%rdi	# Pointer to %rip
 	call early_fixup_exception
 	andl %eax,%eax
 	jnz 20f			# Found an exception entry
 
-10:
+11:
 #ifdef CONFIG_EARLY_PRINTK
 	GET_CR2_INTO(%r9)	# can clobber any volatile register if pv
 	movl 80(%rsp),%r8d	# error code
@@ -350,7 +391,7 @@ ENTRY(early_idt_handler)
 1:	hlt
 	jmp 1b
 
-20:	# Exception table entry found
+20:	# Exception table entry found or page table generated
 	popq %r11
 	popq %r10
 	popq %r9
@@ -364,6 +405,8 @@ ENTRY(early_idt_handler)
 	decl early_recursion_flag(%rip)
 	INTERRUPT_RETURN
 
+	__INITDATA
+
 	.balign 4
 early_recursion_flag:
 	.long 0
@@ -374,11 +417,10 @@ early_idt_msg:
 early_idt_ripmsg:
 	.asciz "RIP %s\n"
 #endif /* CONFIG_EARLY_PRINTK */
-	.previous
 
 #define NEXT_PAGE(name) \
 	.balign	PAGE_SIZE; \
-ENTRY(name)
+GLOBAL(name)
 
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)			\
@@ -388,24 +430,37 @@ ENTRY(name)
 	i = i + 1 ;					\
 	.endr
 
+	__INITDATA
+NEXT_PAGE(early_level4_pgt)
+	.fill	511,8,0
+	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+
+NEXT_PAGE(early_dynamic_pgts)
+	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+
 	.data
-/*
- * This default setting generates an ident mapping at address 0x100000
- * and a mapping for the kernel that precisely maps virtual address
- * 0xffffffff80000000 to physical address 0x000000. (always using
- * 2Mbyte large pages provided by PAE mode)
- */
+
+#ifndef CONFIG_XEN
+NEXT_PAGE(init_level4_pgt)
+	.fill	512,8,0
+#else
 NEXT_PAGE(init_level4_pgt)
 	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
 	.org	init_level4_pgt + L4_PAGE_OFFSET*8, 0
 	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
 	.org	init_level4_pgt + L4_START_KERNEL*8, 0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 
 NEXT_PAGE(level3_ident_pgt)
 	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.fill	511,8,0
+	.fill	511, 8, 0
+NEXT_PAGE(level2_ident_pgt)
+	/* Since I easily can, map the first 1G.
+	 * Don't set NX because code runs from these pages.
+	 */
+	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#endif
 
 NEXT_PAGE(level3_kernel_pgt)
 	.fill	L3_START_KERNEL,8,0
@@ -413,21 +468,6 @@ NEXT_PAGE(level3_kernel_pgt)
 	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
 	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
 
-NEXT_PAGE(level2_fixmap_pgt)
-	.fill	506,8,0
-	.quad	level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
-	/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
-	.fill	5,8,0
-
-NEXT_PAGE(level1_fixmap_pgt)
-	.fill	512,8,0
-
-NEXT_PAGE(level2_ident_pgt)
-	/* Since I easily can, map the first 1G.
-	 * Don't set NX because code runs from these pages.
-	 */
-	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
-
 NEXT_PAGE(level2_kernel_pgt)
 	/*
 	 * 512 MB kernel mapping. We spend a full page on this pagetable
@@ -442,11 +482,16 @@ NEXT_PAGE(level2_kernel_pgt)
 	PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
 		KERNEL_IMAGE_SIZE/PMD_SIZE)
 
-NEXT_PAGE(level2_spare_pgt)
-	.fill	512, 8, 0
+NEXT_PAGE(level2_fixmap_pgt)
+	.fill	506,8,0
+	.quad	level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+	/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
+	.fill	5,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+	.fill	512,8,0
 
 #undef PMDS
-#undef NEXT_PAGE
 
 	.data
 	.align 16
@@ -472,6 +517,5 @@ ENTRY(nmi_idt_table)
 	.skip IDT_ENTRIES * 16
 
 	__PAGE_ALIGNED_BSS
-	.align PAGE_SIZE
-ENTRY(empty_zero_page)
+NEXT_PAGE(empty_zero_page)
 	.skip PAGE_SIZE
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 85a8290801df..db9c41dae8d7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1005,6 +1005,8 @@ void __init setup_arch(char **cmdline_p)
 
 	init_mem_mapping();
 
+	early_trap_pf_init();
+
 	setup_real_mode();
 
 	memblock.current_limit = get_max_mapped();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ecffca11f4e9..68bda7a84159 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -688,10 +688,19 @@ void __init early_trap_init(void)
 	set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
 	/* int3 can be called from all */
 	set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+#ifdef CONFIG_X86_32
 	set_intr_gate(X86_TRAP_PF, &page_fault);
+#endif
 	load_idt(&idt_descr);
 }
 
+void __init early_trap_pf_init(void)
+{
+#ifdef CONFIG_X86_64
+	set_intr_gate(X86_TRAP_PF, &page_fault);
+#endif
+}
+
 void __init trap_init(void)
 {
 	int i;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 78d1ef3eab66..3364a7643a4c 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -446,9 +446,10 @@ void __init init_mem_mapping(void)
 	}
 #else
 	early_ioremap_page_table_range_init();
+#endif
+
 	load_cr3(swapper_pg_dir);
 	__flush_tlb_all();
-#endif
 
 	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
 }