| author | H. Peter Anvin <hpa@zytor.com> | 2013-01-24 15:19:52 -0500 |
|---|---|---|
| committer | H. Peter Anvin <hpa@linux.intel.com> | 2013-01-29 18:20:06 -0500 |
| commit | 8170e6bed465b4b0c7687f93e9948aca4358a33b | |
| tree | 4a10d8a14af51dd0a0f51539a3fdc1cb7e9f304b | |
| parent | 4f7b92263ad68cdc72b11808320d9c881bfa857e | |
x86, 64bit: Use a #PF handler to materialize early mappings on demand
Linear mode (CR0.PG = 0) is mutually exclusive with 64-bit mode; all
64-bit code has to use page tables. This makes it awkward to access
objects outside the static kernel range before we have properly set up
all-covering page tables.
So far we have dealt with that simply by mapping a fixed amount of
low memory, but that fails in at least two upcoming use cases:
1. We will support loading and running the kernel, struct boot_params,
ramdisk, command line, etc. above the 4 GiB mark.
2. We need to access the ramdisk early to get the microcode, so that
the update can be applied as early as possible.
We could use early_ioremap to access them too, but that would make the
code messy and hard to unify with the 32-bit path.
Hence, set up a #PF handler and use a fixed number of buffers to build
page tables on demand. If the buffers fill up, we simply flush them and
start over. These buffers are all in __initdata, so they do not
increase RAM usage at runtime.
Thus, with the help of the #PF handler, we can build the final kernel
mapping starting from a blank page table and switch to init_level4_pgt
later.
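The recycling policy is simple enough to model in a few lines of plain C. The sketch below only illustrates the allocate-or-flush idea; the names, the fault-simulating main(), and the printf are illustrative, not kernel code. The real logic is early_make_pgtable()/reset_early_page_tables() in the head64.c hunk below.

```c
#include <stdio.h>
#include <string.h>

/*
 * Toy model of the early page-table buffer pool.  POOL_PAGES mirrors
 * EARLY_DYNAMIC_PAGE_TABLES (64); everything else is illustrative.
 */
#define POOL_PAGES 64

static unsigned long pool[POOL_PAGES][512];	/* one 4K table per slot */
static unsigned int next_pgt;			/* next free slot */

static unsigned long *alloc_early_pgt(void)
{
	if (next_pgt >= POOL_PAGES) {
		/* Pool exhausted: flush everything and start over.
		 * Any mapping we just threw away will simply fault
		 * again and be rebuilt on demand. */
		memset(pool, 0, sizeof(pool));
		next_pgt = 0;
	}
	return pool[next_pgt++];
}

int main(void)
{
	/* 200 "faults" against a 64-slot pool: the pool wraps three
	 * times and ends up at slot 8; no allocation ever fails. */
	for (int i = 0; i < 200; i++)
		alloc_early_pgt();
	printf("pool slot after 200 allocations: %u\n", next_pgt);
	return 0;
}
```

The point of the flush-and-restart policy is that correctness never depends on which mappings survive: anything wiped simply faults again and is rebuilt.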
During the switchover in head_64.S, before the #PF handler is
available, we use three pages to handle the kernel crossing the 1 GiB
and 512 GiB boundaries, sharing pages by playing games with page
aliasing: the same page is mapped twice in the higher-level tables with
appropriate wraparound. The kernel region itself will be properly
mapped; other mappings may be spurious.
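The aliasing trick can also be shown as a toy C model (the slot index 307 and the flat arrays are illustrative only; the real tables are built by the startup_64 switchover code in the head_64.S hunk below). Installing the same lower-level table at two adjacent top-level slots keeps a region mapped even when it straddles the boundary between them:

```c
#include <stdio.h>

int main(void)
{
	static long pud_page[512];	/* the single shared next-level table */
	static long *pgd[512];		/* top-level table, 512 slots */

	/* Hypothetical slot where the kernel image starts; if the image
	 * crosses into slot i+1, the alias still resolves it. */
	unsigned int i = 307;

	pgd[i]     = pud_page;		/* the normal entry */
	pgd[i + 1] = pud_page;		/* the alias: same page again */

	/* A walk that runs past the end of slot i lands in the same
	 * next-level table; entries that make no sense there are simply
	 * never used, so they stay harmless. */
	printf("pgd[%u] == pgd[%u]: %s\n", i, i + 1,
	       pgd[i] == pgd[i + 1] ? "aliased" : "distinct");
	return 0;
}
```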
early_make_pgtable uses the kernel high-mapping addresses to access the
pages in which it builds the page tables.
-v4: Add a phys_base offset to make kexec happy, and add
     init_mapping_kernel() - Yinghai
-v5: Fix compiling with Xen, and add back the ident level3 and level2 for Xen;
     also move init_level4_pgt back from BSS to DATA again,
     because we have to clear it anyway. - Yinghai
-v6: Switch to init_level4_pgt in init_mem_mapping. - Yinghai
-v7: Remove the unneeded clear_page for init_level4_pgt;
     it is already zeroed by the .fill 512,8,0 in head_64.S. - Yinghai
-v8: We need to keep that handler alive until init_mem_mapping, and must not
     let early_trap_init trash the early #PF handler,
     so split out early_trap_pf_init and move it down. - Yinghai
-v9: Make the switchover cover only the kernel space instead of 1G, so we
     avoid touching possible memory holes. - Yinghai
-v11: Change the far jmp back to a far return to initial_code; that is needed
      to fix the failure reported by Konrad on AMD systems. - Yinghai
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/1359058816-7615-12-git-send-email-yinghai@kernel.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
| -rw-r--r-- | arch/x86/include/asm/pgtable_64_types.h | 4 |
|---|---|---|
| -rw-r--r-- | arch/x86/include/asm/processor.h | 1 |
| -rw-r--r-- | arch/x86/kernel/head64.c | 81 |
| -rw-r--r-- | arch/x86/kernel/head_64.S | 210 |
| -rw-r--r-- | arch/x86/kernel/setup.c | 2 |
| -rw-r--r-- | arch/x86/kernel/traps.c | 9 |
| -rw-r--r-- | arch/x86/mm/init.c | 3 |

7 files changed, 219 insertions(+), 91 deletions(-)
```diff
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 766ea16fbbbd..2d883440cb9a 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_PGTABLE_64_DEFS_H
 #define _ASM_X86_PGTABLE_64_DEFS_H
 
+#include <asm/sparsemem.h>
+
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 
@@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t;
 #define MODULES_END	_AC(0xffffffffff000000, UL)
 #define MODULES_LEN	(MODULES_END - MODULES_VADDR)
 
+#define EARLY_DYNAMIC_PAGE_TABLES	64
+
 #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
```
```diff
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 888184b2fc85..bdee8bd318ea 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -731,6 +731,7 @@ extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
 extern void early_trap_init(void);
+void early_trap_pf_init(void);
 
 /* Defined in head.S */
 extern struct desc_ptr early_gdt_descr;
```
```diff
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7785e66840a4..f57df05ea126 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,11 +27,73 @@
 #include <asm/bios_ebda.h>
 #include <asm/bootparam_utils.h>
 
-static void __init zap_identity_mappings(void)
+/*
+ * Manage page tables very early on.
+ */
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt = 2;
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
 {
-	pgd_t *pgd = pgd_offset_k(0UL);
-	pgd_clear(pgd);
-	__flush_tlb_all();
+	unsigned long i;
+
+	for (i = 0; i < PTRS_PER_PGD-1; i++)
+		early_level4_pgt[i].pgd = 0;
+
+	next_early_pgt = 0;
+
+	write_cr3(__pa(early_level4_pgt));
+}
+
+/* Create a new PMD entry */
+int __init early_make_pgtable(unsigned long address)
+{
+	unsigned long physaddr = address - __PAGE_OFFSET;
+	unsigned long i;
+	pgdval_t pgd, *pgd_p;
+	pudval_t *pud_p;
+	pmdval_t pmd, *pmd_p;
+
+	/* Invalid address or early pgt is done ? */
+	if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+		return -1;
+
+	i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
+	pgd_p = &early_level4_pgt[i].pgd;
+	pgd = *pgd_p;
+
+	/*
+	 * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+	 * critical -- __PAGE_OFFSET would point us back into the dynamic
+	 * range and we might end up looping forever...
+	 */
+	if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) {
+		pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+	} else {
+		if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1)
+			reset_early_page_tables();
+
+		pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+		for (i = 0; i < PTRS_PER_PUD; i++)
+			pud_p[i] = 0;
+
+		*pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+	}
+	i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+	pud_p += i;
+
+	pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+	pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd_p[i] = pmd;
+		pmd += PMD_SIZE;
+	}
+
+	*pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+
+	return 0;
 }
 
 /* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -72,12 +134,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
 				(__START_KERNEL & PGDIR_MASK)));
 	BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
 
+	/* Kill off the identity-map trampoline */
+	reset_early_page_tables();
+
 	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
 
-	/* Make NULL pointers segfault */
-	zap_identity_mappings();
-
+	/* XXX - this is wrong... we need to build page tables from scratch */
 	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
@@ -94,6 +157,10 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	if (console_loglevel == 10)
 		early_printk("Kernel alive\n");
 
+	clear_page(init_level4_pgt);
+	/* set init_level4_pgt kernel high mapping*/
+	init_level4_pgt[511] = early_level4_pgt[511];
+
 	x86_64_start_reservations(real_mode_data);
 }
 
```
```diff
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 980053c4b9cc..d94f6d68be2a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
 	.code64
 	.globl startup_64
 startup_64:
-
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
 	 * and someone has loaded an identity mapped page table
 	 * for us.  These identity mapped page tables map all of the
 	 * kernel pages and possibly all of memory.
 	 *
-	 * %esi holds a physical pointer to real_mode_data.
+	 * %rsi holds a physical pointer to real_mode_data.
 	 *
 	 * We come here either directly from a 64bit bootloader, or from
 	 * arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
 	 * tables and then reload them.
 	 */
 
-	/* Compute the delta between the address I am compiled to run at and the
+	/*
+	 * Compute the delta between the address I am compiled to run at and the
 	 * address I am actually running at.
 	 */
 	leaq	_text(%rip), %rbp
@@ -78,45 +78,62 @@ startup_64:
 	testl	%eax, %eax
 	jnz	bad_address
 
-	/* Is the address too large? */
-	leaq	_text(%rip), %rdx
-	movq	$PGDIR_SIZE, %rax
-	cmpq	%rax, %rdx
-	jae	bad_address
-
-	/* Fixup the physical addresses in the page table
+	/*
+	 * Is the address too large?
 	 */
-	addq	%rbp, init_level4_pgt + 0(%rip)
-	addq	%rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
-	addq	%rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
+	leaq	_text(%rip), %rax
+	shrq	$MAX_PHYSMEM_BITS, %rax
+	jnz	bad_address
 
-	addq	%rbp, level3_ident_pgt + 0(%rip)
+	/*
+	 * Fixup the physical addresses in the page table
+	 */
+	addq	%rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
 
 	addq	%rbp, level3_kernel_pgt + (510*8)(%rip)
 	addq	%rbp, level3_kernel_pgt + (511*8)(%rip)
 
 	addq	%rbp, level2_fixmap_pgt + (506*8)(%rip)
 
-	/* Add an Identity mapping if I am above 1G */
+	/*
+	 * Set up the identity mapping for the switchover.  These
+	 * entries should *NOT* have the global bit set!  This also
+	 * creates a bunch of nonsense entries but that is fine --
+	 * it avoids problems around wraparound.
+	 */
 	leaq	_text(%rip), %rdi
-	andq	$PMD_PAGE_MASK, %rdi
+	leaq	early_level4_pgt(%rip), %rbx
 
 	movq	%rdi, %rax
-	shrq	$PUD_SHIFT, %rax
-	andq	$(PTRS_PER_PUD - 1), %rax
-	jz	ident_complete
+	shrq	$PGDIR_SHIFT, %rax
 
-	leaq	(level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
-	leaq	level3_ident_pgt(%rip), %rbx
-	movq	%rdx, 0(%rbx, %rax, 8)
+	leaq	(4096 + _KERNPG_TABLE)(%rbx), %rdx
+	movq	%rdx, 0(%rbx,%rax,8)
+	movq	%rdx, 8(%rbx,%rax,8)
 
+	addq	$4096, %rdx
 	movq	%rdi, %rax
-	shrq	$PMD_SHIFT, %rax
-	andq	$(PTRS_PER_PMD - 1), %rax
-	leaq	__PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
-	leaq	level2_spare_pgt(%rip), %rbx
-	movq	%rdx, 0(%rbx, %rax, 8)
-ident_complete:
+	shrq	$PUD_SHIFT, %rax
+	andl	$(PTRS_PER_PUD-1), %eax
+	movq	%rdx, (4096+0)(%rbx,%rax,8)
+	movq	%rdx, (4096+8)(%rbx,%rax,8)
+
+	addq	$8192, %rbx
+	movq	%rdi, %rax
+	shrq	$PMD_SHIFT, %rdi
+	addq	$(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
+	leaq	(_end - 1)(%rip), %rcx
+	shrq	$PMD_SHIFT, %rcx
+	subq	%rdi, %rcx
+	incl	%ecx
+
+1:
+	andq	$(PTRS_PER_PMD - 1), %rdi
+	movq	%rax, (%rbx,%rdi,8)
+	incq	%rdi
+	addq	$PMD_SIZE, %rax
+	decl	%ecx
+	jnz	1b
 
 	/*
 	 * Fixup the kernel text+data virtual addresses. Note that
@@ -124,7 +141,6 @@ ident_complete:
 	 * cleanup_highmap() fixes this up along with the mappings
 	 * beyond _end.
 	 */
-
 	leaq	level2_kernel_pgt(%rip), %rdi
 	leaq	4096(%rdi), %r8
 	/* See if it is a valid page table entry */
@@ -139,17 +155,14 @@ ident_complete:
 	/* Fixup phys_base */
 	addq	%rbp, phys_base(%rip)
 
-	/* Due to ENTRY(), sometimes the empty space gets filled with
-	 * zeros. Better take a jmp than relying on empty space being
-	 * filled with 0x90 (nop)
-	 */
-	jmp secondary_startup_64
+	movq	$(early_level4_pgt - __START_KERNEL_map), %rax
+	jmp 1f
 ENTRY(secondary_startup_64)
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
 	 * and someone has loaded a mapped page table.
 	 *
-	 * %esi holds a physical pointer to real_mode_data.
+	 * %rsi holds a physical pointer to real_mode_data.
 	 *
 	 * We come here either from startup_64 (using physical addresses)
 	 * or from trampoline.S (using virtual addresses).
@@ -159,12 +172,14 @@ ENTRY(secondary_startup_64)
 	 * after the boot processor executes this code.
 	 */
 
+	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
+1:
+
 	/* Enable PAE mode and PGE */
-	movl	$(X86_CR4_PAE | X86_CR4_PGE), %eax
-	movq	%rax, %cr4
+	movl	$(X86_CR4_PAE | X86_CR4_PGE), %ecx
+	movq	%rcx, %cr4
 
 	/* Setup early boot stage 4 level pagetables. */
-	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
 	addq	phys_base(%rip), %rax
 	movq	%rax, %cr3
 
@@ -196,7 +211,7 @@ ENTRY(secondary_startup_64)
 	movq	%rax, %cr0
 
 	/* Setup a boot time stack */
-	movq stack_start(%rip),%rsp
+	movq stack_start(%rip), %rsp
 
 	/* zero EFLAGS after setting rsp */
 	pushq $0
@@ -236,15 +251,33 @@ ENTRY(secondary_startup_64)
 	movl	initial_gs+4(%rip),%edx
 	wrmsr
 
-	/* esi is pointer to real mode structure with interesting info.
+	/* rsi is pointer to real mode structure with interesting info.
 	   pass it to C */
-	movl	%esi, %edi
+	movq	%rsi, %rdi
 
 	/* Finally jump to run C code and to be on real kernel address
 	 * Since we are running on identity-mapped space we have to jump
 	 * to the full 64bit address, this is only possible as indirect
 	 * jump.  In addition we need to ensure %cs is set so we make this
 	 * a far return.
+	 *
+	 * Note: do not change to far jump indirect with 64bit offset.
+	 *
+	 * AMD does not support far jump indirect with 64bit offset.
+	 * AMD64 Architecture Programmer's Manual, Volume 3: states only
+	 *	JMP FAR mem16:16 FF /5 Far jump indirect,
+	 *		with the target specified by a far pointer in memory.
+	 *	JMP FAR mem16:32 FF /5 Far jump indirect,
+	 *		with the target specified by a far pointer in memory.
+	 *
+	 * Intel64 does support 64bit offset.
+	 * Software Developer Manual Vol 2: states:
+	 *	FF /5 JMP m16:16 Jump far, absolute indirect,
+	 *		address given in m16:16
+	 *	FF /5 JMP m16:32 Jump far, absolute indirect,
+	 *		address given in m16:32.
+	 *	REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
+	 *		address given in m16:64.
 	 */
 	movq	initial_code(%rip),%rax
 	pushq	$0		# fake return address to stop unwinder
@@ -270,13 +303,13 @@ ENDPROC(start_cpu0)
 
 	/* SMP bootup changes these two */
 	__REFDATA
-	.align	8
-	ENTRY(initial_code)
+	.balign	8
+	GLOBAL(initial_code)
 	.quad	x86_64_start_kernel
-	ENTRY(initial_gs)
+	GLOBAL(initial_gs)
 	.quad	INIT_PER_CPU_VAR(irq_stack_union)
 
-ENTRY(stack_start)
+GLOBAL(stack_start)
 	.quad  init_thread_union+THREAD_SIZE-8
 	.word  0
 	__FINITDATA
@@ -284,7 +317,7 @@ ENDPROC(start_cpu0)
 bad_address:
 	jmp bad_address
 
-	.section ".init.text","ax"
+	__INIT
 	.globl early_idt_handlers
 early_idt_handlers:
 	# 104(%rsp) %rflags
@@ -321,14 +354,22 @@ ENTRY(early_idt_handler)
 	pushq %r11		#  0(%rsp)
 
 	cmpl $__KERNEL_CS,96(%rsp)
-	jne 10f
+	jne 11f
+
+	cmpl $14,72(%rsp)	# Page fault?
+	jnz 10f
+	GET_CR2_INTO(%rdi)	# can clobber any volatile register if pv
+	call early_make_pgtable
+	andl %eax,%eax
+	jz 20f			# All good
 
+10:
 	leaq 88(%rsp),%rdi	# Pointer to %rip
 	call early_fixup_exception
 	andl %eax,%eax
 	jnz 20f			# Found an exception entry
 
-10:
+11:
 #ifdef CONFIG_EARLY_PRINTK
 	GET_CR2_INTO(%r9)	# can clobber any volatile register if pv
 	movl 80(%rsp),%r8d	# error code
@@ -350,7 +391,7 @@ ENTRY(early_idt_handler)
 1:	hlt
 	jmp 1b
 
-20:	# Exception table entry found
+20:	# Exception table entry found or page table generated
 	popq %r11
 	popq %r10
 	popq %r9
@@ -364,6 +405,8 @@ ENTRY(early_idt_handler)
 	decl early_recursion_flag(%rip)
 	INTERRUPT_RETURN
 
+	__INITDATA
+
 	.balign 4
 early_recursion_flag:
 	.long 0
@@ -374,11 +417,10 @@ early_idt_msg:
 early_idt_ripmsg:
 	.asciz "RIP %s\n"
 #endif /* CONFIG_EARLY_PRINTK */
-	.previous
 
 #define NEXT_PAGE(name) \
 	.balign	PAGE_SIZE; \
-ENTRY(name)
+GLOBAL(name)
 
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)			\
@@ -388,24 +430,37 @@ ENTRY(name)
 	i = i + 1 ;					\
 	.endr
 
+	__INITDATA
+NEXT_PAGE(early_level4_pgt)
+	.fill	511,8,0
+	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+
+NEXT_PAGE(early_dynamic_pgts)
+	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+
 	.data
-/*
- * This default setting generates an ident mapping at address 0x100000
- * and a mapping for the kernel that precisely maps virtual address
- * 0xffffffff80000000 to physical address 0x000000. (always using
- * 2Mbyte large pages provided by PAE mode)
- */
+
+#ifndef CONFIG_XEN
 NEXT_PAGE(init_level4_pgt)
-	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.org	init_level4_pgt + L4_PAGE_OFFSET*8, 0
-	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.org	init_level4_pgt + L4_START_KERNEL*8, 0
+	.fill	512,8,0
+#else
+NEXT_PAGE(init_level4_pgt)
+	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.org	init_level4_pgt + L4_PAGE_OFFSET*8, 0
+	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.org	init_level4_pgt + L4_START_KERNEL*8, 0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 
 NEXT_PAGE(level3_ident_pgt)
 	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.fill	511,8,0
+	.fill	511, 8, 0
+NEXT_PAGE(level2_ident_pgt)
+	/* Since I easily can, map the first 1G.
+	 * Don't set NX because code runs from these pages.
+	 */
+	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#endif
 
 NEXT_PAGE(level3_kernel_pgt)
 	.fill	L3_START_KERNEL,8,0
@@ -413,21 +468,6 @@ NEXT_PAGE(level3_kernel_pgt)
 	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
 	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
 
-NEXT_PAGE(level2_fixmap_pgt)
-	.fill	506,8,0
-	.quad	level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
-	/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
-	.fill	5,8,0
-
-NEXT_PAGE(level1_fixmap_pgt)
-	.fill	512,8,0
-
-NEXT_PAGE(level2_ident_pgt)
-	/* Since I easily can, map the first 1G.
-	 * Don't set NX because code runs from these pages.
-	 */
-	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
-
 NEXT_PAGE(level2_kernel_pgt)
 	/*
 	 * 512 MB kernel mapping. We spend a full page on this pagetable
@@ -442,11 +482,16 @@ NEXT_PAGE(level2_kernel_pgt)
 	PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
 		KERNEL_IMAGE_SIZE/PMD_SIZE)
 
-NEXT_PAGE(level2_spare_pgt)
-	.fill	512, 8, 0
+NEXT_PAGE(level2_fixmap_pgt)
+	.fill	506,8,0
+	.quad	level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+	/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
+	.fill	5,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+	.fill	512,8,0
 
 #undef PMDS
-#undef NEXT_PAGE
 
 	.data
 	.align 16
@@ -472,6 +517,5 @@ ENTRY(nmi_idt_table)
 	.skip IDT_ENTRIES * 16
 
 	__PAGE_ALIGNED_BSS
-	.align PAGE_SIZE
-ENTRY(empty_zero_page)
+NEXT_PAGE(empty_zero_page)
 	.skip PAGE_SIZE
```
```diff
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 85a8290801df..db9c41dae8d7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1005,6 +1005,8 @@ void __init setup_arch(char **cmdline_p)
 
 	init_mem_mapping();
 
+	early_trap_pf_init();
+
 	setup_real_mode();
 
 	memblock.current_limit = get_max_mapped();
```
```diff
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ecffca11f4e9..68bda7a84159 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -688,10 +688,19 @@ void __init early_trap_init(void)
 	set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
 	/* int3 can be called from all */
 	set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+#ifdef CONFIG_X86_32
 	set_intr_gate(X86_TRAP_PF, &page_fault);
+#endif
 	load_idt(&idt_descr);
 }
 
+void __init early_trap_pf_init(void)
+{
+#ifdef CONFIG_X86_64
+	set_intr_gate(X86_TRAP_PF, &page_fault);
+#endif
+}
+
 void __init trap_init(void)
 {
 	int i;
```
```diff
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 78d1ef3eab66..3364a7643a4c 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -446,9 +446,10 @@ void __init init_mem_mapping(void)
 	}
 #else
 	early_ioremap_page_table_range_init();
+#endif
+
 	load_cr3(swapper_pg_dir);
 	__flush_tlb_all();
-#endif
 
 	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
 }
```
