author		Vivek Goyal <vgoyal@in.ibm.com>		2007-05-02 13:27:07 -0400
committer	Andi Kleen <andi@basil.nowhere.org>	2007-05-02 13:27:07 -0400
commit		1ab60e0f72f71ec54831e525a3e1154f1c092408 (patch)
tree		bd7dd8bbff43e3e2e3597f2b7780e82a856bb9d7 /arch/x86_64/kernel/head.S
parent		0dbf7028c0c1f266c9631139450a1502d3cd457e (diff)
[PATCH] x86-64: Relocatable Kernel Support
This patch modifies the x86_64 kernel so that it can be loaded and run
at any 2M aligned address, below 512G. The technique used is to
compile the decompressor with -fPIC and modify it so the decompressor
is fully relocatable. For the main kernel the page tables are
modified so the kernel remains at the same virtual address. In
addition a variable phys_base is kept that holds the physical address
the kernel is loaded at. __pa_symbol is modified to add that when
we take the address of a kernel symbol.
When loaded with a normal bootloader the decompressor will decompress
the kernel to 2M and it will run there. This both ensures the
relocation code is always working, and makes it easier to use 2M
pages for the kernel and the cpu.
AK: changed to not make RELOCATABLE default in Kconfig
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
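
A quick illustration of the phys_base / __pa_symbol idea described above (an editor-added sketch, not part of the patch; the real definitions live in include/asm-x86_64/page.h and may differ in detail): the kernel is still linked to run at __START_KERNEL_map, so the physical address of a kernel symbol is its link-time offset plus the physical address the kernel was actually loaded at.

/* Sketch only -- not the kernel's exact macro. */
extern unsigned long phys_base;		/* physical address the kernel was loaded at,
					 * fixed up by head.S at boot */

#define __START_KERNEL_map	0xffffffff80000000UL	/* link-time virtual base (assumed) */

/* Physical address of a symbol that is part of the kernel image. */
#define __pa_symbol(x) \
	(((unsigned long)(x) - __START_KERNEL_map) + phys_base)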
Diffstat (limited to 'arch/x86_64/kernel/head.S')
-rw-r--r--	arch/x86_64/kernel/head.S	233
1 file changed, 130 insertions, 103 deletions
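
The new startup_64 code in the diff below patches the boot page tables by the load offset and, when the kernel sits above 1G, installs one extra identity-mapping entry. A small editor-added C sketch of that index arithmetic (the constants are the usual x86_64 4-level paging values, assumed here rather than quoted from the patch):

#include <stdint.h>
#include <stdio.h>

#define PMD_SHIFT	21			/* one PMD entry maps 2M */
#define PUD_SHIFT	30			/* one PUD entry maps 1G */
#define PTRS_PER_PMD	512
#define PTRS_PER_PUD	512
#define LARGE_PAGE_SIZE	(1UL << PMD_SHIFT)
#define LARGE_PAGE_MASK	(~(LARGE_PAGE_SIZE - 1))

int main(void)
{
	uint64_t text_phys = 0x90000000UL;		/* example: kernel loaded at 2.25G */
	uint64_t base = text_phys & LARGE_PAGE_MASK;	/* round down to 2M, as head.S does */
	uint64_t pud_idx = (base >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
	uint64_t pmd_idx = (base >> PMD_SHIFT) & (PTRS_PER_PMD - 1);

	/* head.S installs level2_spare_pgt only when pud_idx != 0, i.e. when
	 * the kernel is loaded above the first 1G that level2_ident_pgt
	 * already identity-maps. */
	printf("PUD index %llu, PMD index %llu\n",
	       (unsigned long long)pud_idx, (unsigned long long)pmd_idx);
	return 0;
}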
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index c211e52f1333..36aa98a6d15c 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -5,6 +5,7 @@
  * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
  * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
  * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
+ * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
  */
 
 
@@ -17,95 +18,127 @@
 #include <asm/page.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
 
 /* we are not able to switch in one step to the final KERNEL ADRESS SPACE
- * because we need identity-mapped pages on setup so define __START_KERNEL to
- * 0x100000 for this stage
- *
+ * because we need identity-mapped pages.
+ *
  */
 
 	.text
 	.section .bootstrap.text
-	.code32
-	.globl startup_32
-/* %bx:	 1 if coming from smp trampoline on secondary cpu */
-startup_32:
-
+	.code64
+	.globl startup_64
+startup_64:
+
 	/*
-	 * At this point the CPU runs in 32bit protected mode (CS.D = 1) with
-	 * paging disabled and the point of this file is to switch to 64bit
-	 * long mode with a kernel mapping for kerneland to jump into the
-	 * kernel virtual addresses.
-	 * There is no stack until we set one up.
+	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+	 * and someone has loaded an identity mapped page table
+	 * for us.  These identity mapped page tables map all of the
+	 * kernel pages and possibly all of memory.
+	 *
+	 * %esi holds a physical pointer to real_mode_data.
+	 *
+	 * We come here either directly from a 64bit bootloader, or from
+	 * arch/x86_64/boot/compressed/head.S.
+	 *
+	 * We only come here initially at boot nothing else comes here.
+	 *
+	 * Since we may be loaded at an address different from what we were
+	 * compiled to run at we first fixup the physical addresses in our page
+	 * tables and then reload them.
 	 */
 
-	/* Initialize the %ds segment register */
-	movl $__KERNEL_DS,%eax
-	movl %eax,%ds
-
-	/* Load new GDT with the 64bit segments using 32bit descriptor */
-	lgdt	pGDT32 - __START_KERNEL_map
-
-	/* If the CPU doesn't support CPUID this will double fault.
-	 * Unfortunately it is hard to check for CPUID without a stack.
+	/* Compute the delta between the address I am compiled to run at and the
+	 * address I am actually running at.
 	 */
+	leaq	_text(%rip), %rbp
+	subq	$_text - __START_KERNEL_map, %rbp
 
-	/* Check if extended functions are implemented */
-	movl	$0x80000000, %eax
-	cpuid
-	cmpl	$0x80000000, %eax
-	jbe	no_long_mode
-	/* Check if long mode is implemented */
-	mov	$0x80000001, %eax
-	cpuid
-	btl	$29, %edx
-	jnc	no_long_mode
-
-	/*
-	 * Prepare for entering 64bits mode
+	/* Is the address not 2M aligned? */
+	movq	%rbp, %rax
+	andl	$~LARGE_PAGE_MASK, %eax
+	testl	%eax, %eax
+	jnz	bad_address
+
+	/* Is the address too large? */
+	leaq	_text(%rip), %rdx
+	movq	$PGDIR_SIZE, %rax
+	cmpq	%rax, %rdx
+	jae	bad_address
+
+	/* Fixup the physical addresses in the page table
 	 */
+	addq	%rbp, init_level4_pgt + 0(%rip)
+	addq	%rbp, init_level4_pgt + (258*8)(%rip)
+	addq	%rbp, init_level4_pgt + (511*8)(%rip)
+
+	addq	%rbp, level3_ident_pgt + 0(%rip)
+	addq	%rbp, level3_kernel_pgt + (510*8)(%rip)
+
+	/* Add an Identity mapping if I am above 1G */
+	leaq	_text(%rip), %rdi
+	andq	$LARGE_PAGE_MASK, %rdi
+
+	movq	%rdi, %rax
+	shrq	$PUD_SHIFT, %rax
+	andq	$(PTRS_PER_PUD - 1), %rax
+	jz	ident_complete
+
+	leaq	(level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	leaq	level3_ident_pgt(%rip), %rbx
+	movq	%rdx, 0(%rbx, %rax, 8)
+
+	movq	%rdi, %rax
+	shrq	$PMD_SHIFT, %rax
+	andq	$(PTRS_PER_PMD - 1), %rax
+	leaq	__PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
+	leaq	level2_spare_pgt(%rip), %rbx
+	movq	%rdx, 0(%rbx, %rax, 8)
+ident_complete:
+
+	/* Fixup the kernel text+data virtual addresses
+	 */
+	leaq	level2_kernel_pgt(%rip), %rdi
+	leaq	4096(%rdi), %r8
+	/* See if it is a valid page table entry */
+1:	testq	$1, 0(%rdi)
+	jz	2f
+	addq	%rbp, 0(%rdi)
+	/* Go to the next page */
+2:	addq	$8, %rdi
+	cmp	%r8, %rdi
+	jne	1b
+
+	/* Fixup phys_base */
+	addq	%rbp, phys_base(%rip)
 
-	/* Enable PAE mode */
-	xorl	%eax, %eax
-	btsl	$5, %eax
-	movl	%eax, %cr4
-
-	/* Setup early boot stage 4 level pagetables */
-	movl	$(init_level4_pgt - __START_KERNEL_map), %eax
-	movl	%eax, %cr3
-
-	/* Setup EFER (Extended Feature Enable Register) */
-	movl	$MSR_EFER, %ecx
-	rdmsr
-
-	/* Enable Long Mode */
-	btsl	$_EFER_LME, %eax
-
-	/* Make changes effective */
-	wrmsr
+#ifdef CONFIG_SMP
+	addq	%rbp, trampoline_level4_pgt + 0(%rip)
+	addq	%rbp, trampoline_level4_pgt + (511*8)(%rip)
+#endif
+#ifdef CONFIG_ACPI_SLEEP
+	addq	%rbp, wakeup_level4_pgt + 0(%rip)
+	addq	%rbp, wakeup_level4_pgt + (511*8)(%rip)
+#endif
 
-	xorl	%eax, %eax
-	btsl	$31, %eax			/* Enable paging and in turn activate Long Mode */
-	btsl	$0, %eax			/* Enable protected mode */
-	/* Make changes effective */
-	movl	%eax, %cr0
-	/*
-	 * At this point we're in long mode but in 32bit compatibility mode
-	 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
-	 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
-	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+	/* Due to ENTRY(), sometimes the empty space gets filled with
+	 * zeros. Better take a jmp than relying on empty space being
+	 * filled with 0x90 (nop)
 	 */
-	ljmp	$__KERNEL_CS, $(startup_64 - __START_KERNEL_map)
-
-	.code64
-	.org 0x100
-	.globl startup_64
-startup_64:
+	jmp secondary_startup_64
 ENTRY(secondary_startup_64)
-	/* We come here either from startup_32
-	 * or directly from a 64bit bootloader.
-	 * Since we may have come directly from a bootloader we
-	 * reload the page tables here.
+	/*
+	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+	 * and someone has loaded a mapped page table.
+	 *
+	 * %esi holds a physical pointer to real_mode_data.
+	 *
+	 * We come here either from startup_64 (using physical addresses)
+	 * or from trampoline.S (using virtual addresses).
+	 *
+	 * Using virtual addresses from trampoline.S removes the need
+	 * to have any identity mapped pages in the kernel page table
+	 * after the boot processor executes this code.
 	 */
 
 	/* Enable PAE mode and PGE */
@@ -116,8 +149,14 @@ ENTRY(secondary_startup_64)
 
 	/* Setup early boot stage 4 level pagetables. */
 	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
+	addq	phys_base(%rip), %rax
 	movq	%rax, %cr3
 
+	/* Ensure I am executing from virtual addresses */
+	movq	$1f, %rax
+	jmp	*%rax
+1:
+
 	/* Check if nx is implemented */
 	movl	$0x80000001, %eax
 	cpuid
@@ -126,17 +165,11 @@ ENTRY(secondary_startup_64)
 	/* Setup EFER (Extended Feature Enable Register) */
 	movl	$MSR_EFER, %ecx
 	rdmsr
-
-	/* Enable System Call */
-	btsl	$_EFER_SCE, %eax
-
-	/* No Execute supported? */
-	btl	$20,%edi
+	btsl	$_EFER_SCE, %eax	/* Enable System Call */
+	btl	$20,%edi		/* No Execute supported? */
 	jnc	1f
 	btsl	$_EFER_NX, %eax
-1:
-	/* Make changes effective */
-	wrmsr
+1:	wrmsr				/* Make changes effective */
 
 	/* Setup cr0 */
 #define CR0_PM	1	/* protected mode */
@@ -163,7 +196,7 @@ ENTRY(secondary_startup_64)
 	 * addresses where we're currently running on. We have to do that here
 	 * because in 32bit we couldn't load a 64bit linear address.
 	 */
-	lgdt	cpu_gdt_descr
+	lgdt	cpu_gdt_descr(%rip)
 
 	/* set up data segments. actually 0 would do too */
 	movl	$__KERNEL_DS,%eax
@@ -214,6 +247,9 @@ initial_code:
 init_rsp:
 	.quad	init_thread_union+THREAD_SIZE-8
 
+bad_address:
+	jmp bad_address
+
 ENTRY(early_idt_handler)
 	cmpl $2,early_recursion_flag(%rip)
 	jz  1f
@@ -242,23 +278,7 @@ early_idt_msg:
 early_idt_ripmsg:
 	.asciz "RIP %s\n"
 
-.code32
-ENTRY(no_long_mode)
-	/* This isn't an x86-64 CPU so hang */
-1:
-	jmp	1b
-
-	.org 0xf00
-	.globl pGDT32
-pGDT32:
-	.word	gdt_end-cpu_gdt_table-1
-	.long	cpu_gdt_table-__START_KERNEL_map
-
-	.org 0xf10
-ljumpvector:
-	.long	startup_64-__START_KERNEL_map
-	.word	__KERNEL_CS
-
+.balign PAGE_SIZE
 ENTRY(stext)
 ENTRY(_stext)
 
@@ -303,7 +323,7 @@ NEXT_PAGE(level2_ident_pgt)
 	 * Don't set NX because code runs from these pages.
 	 */
 	PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
-	
+
 NEXT_PAGE(level2_kernel_pgt)
 	/* 40MB kernel mapping. The kernel code cannot be bigger than that.
 	   When you change this change KERNEL_TEXT_SIZE in page.h too. */
@@ -313,6 +333,9 @@ NEXT_PAGE(level2_kernel_pgt)
 	/* Module mapping starts here */
 	.fill	(PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
 
+NEXT_PAGE(level2_spare_pgt)
+	.fill	512,8,0
+
 #undef PMDS
 #undef NEXT_PAGE
 
@@ -330,6 +353,10 @@ gdt:
 	.endr
 #endif
 
+ENTRY(phys_base)
+	/* This must match the first entry in level2_kernel_pgt */
+	.quad	0x0000000000000000
+
 	/* We need valid kernel segments for data and code in long mode too
 	 * IRET will check the segment types kkeil 2000/10/28
 	 * Also sysret mandates a special GDT layout