diff options
author | Vivek Goyal <vgoyal@in.ibm.com> | 2007-05-02 13:27:07 -0400 |
---|---|---|
committer | Andi Kleen <andi@basil.nowhere.org> | 2007-05-02 13:27:07 -0400 |
commit | 1ab60e0f72f71ec54831e525a3e1154f1c092408 (patch) | |
tree | bd7dd8bbff43e3e2e3597f2b7780e82a856bb9d7 | |
parent | 0dbf7028c0c1f266c9631139450a1502d3cd457e (diff) |
[PATCH] x86-64: Relocatable Kernel Support
This patch modifies the x86_64 kernel so that it can be loaded and run
at any 2M aligned address, below 512G. The technique used is to
compile the decompressor with -fPIC and modify it so the decompressor
is fully relocatable. For the main kernel the page tables are
modified so the kernel remains at the same virtual address. In
addition a variable phys_base is kept that holds the physical address
the kernel is loaded at. __pa_symbol is modified to add that when
we take the address of a kernel symbol.
When loaded with a normal bootloader the decompressor will decompress
the kernel to 2M and it will run there. This both ensures the
relocation code is always working, and makes it easier to use 2M
pages for the kernel and the cpu.
AK: changed to not make RELOCATABLE default in Kconfig
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
-rw-r--r-- | arch/x86_64/Kconfig | 49 | ||||
-rw-r--r-- | arch/x86_64/boot/compressed/Makefile | 12 | ||||
-rw-r--r-- | arch/x86_64/boot/compressed/head.S | 322 | ||||
-rw-r--r-- | arch/x86_64/boot/compressed/misc.c | 247 | ||||
-rw-r--r-- | arch/x86_64/boot/compressed/vmlinux.lds | 44 | ||||
-rw-r--r-- | arch/x86_64/boot/compressed/vmlinux.scr | 9 | ||||
-rw-r--r-- | arch/x86_64/kernel/head.S | 233 | ||||
-rw-r--r-- | arch/x86_64/kernel/suspend_asm.S | 7 | ||||
-rw-r--r-- | include/asm-x86_64/page.h | 6 |
9 files changed, 597 insertions, 332 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index b3dbf11eb82c..715632026073 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig | |||
@@ -565,23 +565,56 @@ config CRASH_DUMP | |||
565 | PHYSICAL_START. | 565 | PHYSICAL_START. |
566 | For more details see Documentation/kdump/kdump.txt | 566 | For more details see Documentation/kdump/kdump.txt |
567 | 567 | ||
568 | config RELOCATABLE | ||
569 | bool "Build a relocatable kernel (EXPERIMENTAL)" | ||
570 | depends on EXPERIMENTAL | ||
571 | help | ||
572 | Builds a relocatable kernel. This enables loading and running | ||
573 | a kernel binary from a different physical address than it has | ||
574 | been compiled for. | ||
575 | |||
576 | One use is for the kexec on panic case where the recovery kernel | ||
577 | must live at a different physical address than the primary | ||
578 | kernel. | ||
579 | |||
580 | Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address | ||
581 | it has been loaded at and the compile-time physical address | ||
582 | (CONFIG_PHYSICAL_START) is ignored. | ||
583 | |||
568 | config PHYSICAL_START | 584 | config PHYSICAL_START |
569 | hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) | 585 | hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) |
570 | default "0x1000000" if CRASH_DUMP | ||
571 | default "0x200000" | 586 | default "0x200000" |
572 | help | 587 | help |
573 | This gives the physical address where the kernel is loaded. Normally | 588 | This gives the physical address where the kernel is loaded. It |
574 | for regular kernels this value is 0x200000 (2MB). But in the case | 589 | should be aligned to 2MB boundary. |
575 | of kexec on panic the fail safe kernel needs to run at a different | 590 | |
576 | address than the panic-ed kernel. This option is used to set the load | 591 | If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then |
577 | address for kernels used to capture crash dump on being kexec'ed | 592 | bzImage will decompress itself to above physical address and |
578 | after panic. The default value for crash dump kernels is | 593 | run from there. Otherwise, bzImage will run from the address where |
579 | 0x1000000 (16MB). This can also be set based on the "X" value as | 594 | it has been loaded by the boot loader and will ignore above physical |
595 | address. | ||
596 | |||
597 | In normal kdump cases one does not have to set/change this option | ||
598 | as now bzImage can be compiled as a completely relocatable image | ||
599 | (CONFIG_RELOCATABLE=y) and be used to load and run from a different | ||
600 | address. This option is mainly useful for the folks who don't want | ||
601 | to use a bzImage for capturing the crash dump and want to use a | ||
602 | vmlinux instead. | ||
603 | |||
604 | So if you are using bzImage for capturing the crash dump, leave | ||
605 | the value here unchanged to 0x200000 and set CONFIG_RELOCATABLE=y. | ||
606 | Otherwise if you plan to use vmlinux for capturing the crash dump | ||
607 | change this value to start of the reserved region (Typically 16MB | ||
608 | 0x1000000). In other words, it can be set based on the "X" value as | ||
580 | specified in the "crashkernel=YM@XM" command line boot parameter | 609 | specified in the "crashkernel=YM@XM" command line boot parameter |
581 | passed to the panic-ed kernel. Typically this parameter is set as | 610 | passed to the panic-ed kernel. Typically this parameter is set as |
582 | crashkernel=64M@16M. Please take a look at | 611 | crashkernel=64M@16M. Please take a look at |
583 | Documentation/kdump/kdump.txt for more details about crash dumps. | 612 | Documentation/kdump/kdump.txt for more details about crash dumps. |
584 | 613 | ||
614 | Usage of bzImage for capturing the crash dump is advantageous as | ||
615 | one does not have to build two kernels. Same kernel can be used | ||
616 | as production kernel and capture kernel. | ||
617 | |||
585 | Don't change this unless you know what you are doing. | 618 | Don't change this unless you know what you are doing. |
586 | 619 | ||
587 | config SECCOMP | 620 | config SECCOMP |
diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile index e70fa6e1da08..705a3e33d7e1 100644 --- a/arch/x86_64/boot/compressed/Makefile +++ b/arch/x86_64/boot/compressed/Makefile | |||
@@ -8,16 +8,14 @@ | |||
8 | 8 | ||
9 | targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o | 9 | targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o |
10 | EXTRA_AFLAGS := -traditional | 10 | EXTRA_AFLAGS := -traditional |
11 | AFLAGS := $(subst -m64,-m32,$(AFLAGS)) | ||
12 | 11 | ||
13 | # cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with | 12 | # cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with |
14 | # -m32 | 13 | # -m32 |
15 | CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing | 14 | CFLAGS := -m64 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing -fPIC -mcmodel=small -fno-builtin |
16 | LDFLAGS := -m elf_i386 | 15 | LDFLAGS := -m elf_x86_64 |
17 | 16 | ||
18 | LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 -m elf_i386 | 17 | LDFLAGS_vmlinux := -T |
19 | 18 | $(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE | |
20 | $(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE | ||
21 | $(call if_changed,ld) | 19 | $(call if_changed,ld) |
22 | @: | 20 | @: |
23 | 21 | ||
@@ -27,7 +25,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE | |||
27 | $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE | 25 | $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE |
28 | $(call if_changed,gzip) | 26 | $(call if_changed,gzip) |
29 | 27 | ||
30 | LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T | 28 | LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T |
31 | 29 | ||
32 | $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE | 30 | $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE |
33 | $(call if_changed,ld) | 31 | $(call if_changed,ld) |
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S index 6f55565e4d42..c353a9266ea4 100644 --- a/arch/x86_64/boot/compressed/head.S +++ b/arch/x86_64/boot/compressed/head.S | |||
@@ -26,116 +26,262 @@ | |||
26 | 26 | ||
27 | #include <linux/linkage.h> | 27 | #include <linux/linkage.h> |
28 | #include <asm/segment.h> | 28 | #include <asm/segment.h> |
29 | #include <asm/pgtable.h> | ||
29 | #include <asm/page.h> | 30 | #include <asm/page.h> |
31 | #include <asm/msr.h> | ||
30 | 32 | ||
33 | .section ".text.head" | ||
31 | .code32 | 34 | .code32 |
32 | .globl startup_32 | 35 | .globl startup_32 |
33 | 36 | ||
34 | startup_32: | 37 | startup_32: |
35 | cld | 38 | cld |
36 | cli | 39 | cli |
37 | movl $(__KERNEL_DS),%eax | 40 | movl $(__KERNEL_DS), %eax |
38 | movl %eax,%ds | 41 | movl %eax, %ds |
39 | movl %eax,%es | 42 | movl %eax, %es |
40 | movl %eax,%fs | 43 | movl %eax, %ss |
41 | movl %eax,%gs | 44 | |
42 | 45 | /* Calculate the delta between where we were compiled to run | |
43 | lss stack_start,%esp | 46 | * at and where we were actually loaded at. This can only be done |
44 | xorl %eax,%eax | 47 | * with a short local call on x86. Nothing else will tell us what |
45 | 1: incl %eax # check that A20 really IS enabled | 48 | * address we are running at. The reserved chunk of the real-mode |
46 | movl %eax,0x000000 # loop forever if it isn't | 49 | * data at 0x34-0x3f are used as the stack for this calculation. |
47 | cmpl %eax,0x100000 | 50 | * Only 4 bytes are needed. |
48 | je 1b | 51 | */ |
52 | leal 0x40(%esi), %esp | ||
53 | call 1f | ||
54 | 1: popl %ebp | ||
55 | subl $1b, %ebp | ||
56 | |||
57 | /* Compute the delta between where we were compiled to run at | ||
58 | * and where the code will actually run at. | ||
59 | */ | ||
60 | /* %ebp contains the address we are loaded at by the boot loader and %ebx | ||
61 | * contains the address where we should move the kernel image temporarily | ||
62 | * for safe in-place decompression. | ||
63 | */ | ||
64 | |||
65 | #ifdef CONFIG_RELOCATABLE | ||
66 | movl %ebp, %ebx | ||
67 | addl $(LARGE_PAGE_SIZE -1), %ebx | ||
68 | andl $LARGE_PAGE_MASK, %ebx | ||
69 | #else | ||
70 | movl $CONFIG_PHYSICAL_START, %ebx | ||
71 | #endif | ||
72 | |||
73 | /* Replace the compressed data size with the uncompressed size */ | ||
74 | subl input_len(%ebp), %ebx | ||
75 | movl output_len(%ebp), %eax | ||
76 | addl %eax, %ebx | ||
77 | /* Add 8 bytes for every 32K input block */ | ||
78 | shrl $12, %eax | ||
79 | addl %eax, %ebx | ||
80 | /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ | ||
81 | addl $(32768 + 18 + 4095), %ebx | ||
82 | andl $~4095, %ebx | ||
49 | 83 | ||
50 | /* | 84 | /* |
51 | * Initialize eflags. Some BIOS's leave bits like NT set. This would | 85 | * Prepare for entering 64 bit mode |
52 | * confuse the debugger if this code is traced. | ||
53 | * XXX - best to initialize before switching to protected mode. | ||
54 | */ | 86 | */ |
55 | pushl $0 | 87 | |
56 | popfl | 88 | /* Load new GDT with the 64bit segments using 32bit descriptor */ |
89 | leal gdt(%ebp), %eax | ||
90 | movl %eax, gdt+2(%ebp) | ||
91 | lgdt gdt(%ebp) | ||
92 | |||
93 | /* Enable PAE mode */ | ||
94 | xorl %eax, %eax | ||
95 | orl $(1 << 5), %eax | ||
96 | movl %eax, %cr4 | ||
97 | |||
98 | /* | ||
99 | * Build early 4G boot pagetable | ||
100 | */ | ||
101 | /* Initialize Page tables to 0*/ | ||
102 | leal pgtable(%ebx), %edi | ||
103 | xorl %eax, %eax | ||
104 | movl $((4096*6)/4), %ecx | ||
105 | rep stosl | ||
106 | |||
107 | /* Build Level 4 */ | ||
108 | leal pgtable + 0(%ebx), %edi | ||
109 | leal 0x1007 (%edi), %eax | ||
110 | movl %eax, 0(%edi) | ||
111 | |||
112 | /* Build Level 3 */ | ||
113 | leal pgtable + 0x1000(%ebx), %edi | ||
114 | leal 0x1007(%edi), %eax | ||
115 | movl $4, %ecx | ||
116 | 1: movl %eax, 0x00(%edi) | ||
117 | addl $0x00001000, %eax | ||
118 | addl $8, %edi | ||
119 | decl %ecx | ||
120 | jnz 1b | ||
121 | |||
122 | /* Build Level 2 */ | ||
123 | leal pgtable + 0x2000(%ebx), %edi | ||
124 | movl $0x00000183, %eax | ||
125 | movl $2048, %ecx | ||
126 | 1: movl %eax, 0(%edi) | ||
127 | addl $0x00200000, %eax | ||
128 | addl $8, %edi | ||
129 | decl %ecx | ||
130 | jnz 1b | ||
131 | |||
132 | /* Enable the boot page tables */ | ||
133 | leal pgtable(%ebx), %eax | ||
134 | movl %eax, %cr3 | ||
135 | |||
136 | /* Enable Long mode in EFER (Extended Feature Enable Register) */ | ||
137 | movl $MSR_EFER, %ecx | ||
138 | rdmsr | ||
139 | btsl $_EFER_LME, %eax | ||
140 | wrmsr | ||
141 | |||
142 | /* Setup for the jump to 64bit mode | ||
143 | * | ||
144 | * When the jump is performed we will be in long mode but | ||
145 | * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 | ||
146 | * (and in turn EFER.LMA = 1). To jump into 64bit mode we use | ||
147 | * the new gdt/idt that has __KERNEL_CS with CS.L = 1. | ||
148 | * We place all of the values on our mini stack so lret can | ||
149 | * used to perform that far jump. | ||
150 | */ | ||
151 | pushl $__KERNEL_CS | ||
152 | leal startup_64(%ebp), %eax | ||
153 | pushl %eax | ||
154 | |||
155 | /* Enter paged protected Mode, activating Long Mode */ | ||
156 | movl $0x80000001, %eax /* Enable Paging and Protected mode */ | ||
157 | movl %eax, %cr0 | ||
158 | |||
159 | /* Jump from 32bit compatibility mode into 64bit mode. */ | ||
160 | lret | ||
161 | |||
162 | /* Be careful here startup_64 needs to be at a predictable | ||
163 | * address so I can export it in an ELF header. Bootloaders | ||
164 | * should look at the ELF header to find this address, as | ||
165 | * it may change in the future. | ||
166 | */ | ||
167 | .code64 | ||
168 | .org 0x100 | ||
169 | ENTRY(startup_64) | ||
170 | /* We come here either from startup_32 or directly from a | ||
171 | * 64bit bootloader. If we come here from a bootloader we depend on | ||
172 | * an identity mapped page table being provided that maps our | ||
173 | * entire text+data+bss and hopefully all of memory. | ||
174 | */ | ||
175 | |||
176 | /* Setup data segments. */ | ||
177 | xorl %eax, %eax | ||
178 | movl %eax, %ds | ||
179 | movl %eax, %es | ||
180 | movl %eax, %ss | ||
181 | |||
182 | /* Compute the decompressed kernel start address. It is where | ||
183 | * we were loaded at aligned to a 2M boundary. %rbp contains the | ||
184 | * decompressed kernel start address. | ||
185 | * | ||
186 | * If it is a relocatable kernel then decompress and run the kernel | ||
187 | * from load address aligned to 2MB addr, otherwise decompress and | ||
188 | * run the kernel from CONFIG_PHYSICAL_START | ||
189 | */ | ||
190 | |||
191 | /* Start with the delta to where the kernel will run at. */ | ||
192 | #ifdef CONFIG_RELOCATABLE | ||
193 | leaq startup_32(%rip) /* - $startup_32 */, %rbp | ||
194 | addq $(LARGE_PAGE_SIZE - 1), %rbp | ||
195 | andq $LARGE_PAGE_MASK, %rbp | ||
196 | movq %rbp, %rbx | ||
197 | #else | ||
198 | movq $CONFIG_PHYSICAL_START, %rbp | ||
199 | movq %rbp, %rbx | ||
200 | #endif | ||
201 | |||
202 | /* Replace the compressed data size with the uncompressed size */ | ||
203 | movl input_len(%rip), %eax | ||
204 | subq %rax, %rbx | ||
205 | movl output_len(%rip), %eax | ||
206 | addq %rax, %rbx | ||
207 | /* Add 8 bytes for every 32K input block */ | ||
208 | shrq $12, %rax | ||
209 | addq %rax, %rbx | ||
210 | /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ | ||
211 | addq $(32768 + 18 + 4095), %rbx | ||
212 | andq $~4095, %rbx | ||
213 | |||
214 | /* Copy the compressed kernel to the end of our buffer | ||
215 | * where decompression in place becomes safe. | ||
216 | */ | ||
217 | leaq _end(%rip), %r8 | ||
218 | leaq _end(%rbx), %r9 | ||
219 | movq $_end /* - $startup_32 */, %rcx | ||
220 | 1: subq $8, %r8 | ||
221 | subq $8, %r9 | ||
222 | movq 0(%r8), %rax | ||
223 | movq %rax, 0(%r9) | ||
224 | subq $8, %rcx | ||
225 | jnz 1b | ||
226 | |||
227 | /* | ||
228 | * Jump to the relocated address. | ||
229 | */ | ||
230 | leaq relocated(%rbx), %rax | ||
231 | jmp *%rax | ||
232 | |||
233 | .section ".text" | ||
234 | relocated: | ||
235 | |||
57 | /* | 236 | /* |
58 | * Clear BSS | 237 | * Clear BSS |
59 | */ | 238 | */ |
60 | xorl %eax,%eax | 239 | xorq %rax, %rax |
61 | movl $_edata,%edi | 240 | leaq _edata(%rbx), %rdi |
62 | movl $_end,%ecx | 241 | leaq _end(%rbx), %rcx |
63 | subl %edi,%ecx | 242 | subq %rdi, %rcx |
64 | cld | 243 | cld |
65 | rep | 244 | rep |
66 | stosb | 245 | stosb |
246 | |||
247 | /* Setup the stack */ | ||
248 | leaq user_stack_end(%rip), %rsp | ||
249 | |||
250 | /* zero EFLAGS after setting rsp */ | ||
251 | pushq $0 | ||
252 | popfq | ||
253 | |||
67 | /* | 254 | /* |
68 | * Do the decompression, and jump to the new kernel.. | 255 | * Do the decompression, and jump to the new kernel.. |
69 | */ | 256 | */ |
70 | subl $16,%esp # place for structure on the stack | 257 | pushq %rsi # Save the real mode argument |
71 | movl %esp,%eax | 258 | movq %rsi, %rdi # real mode address |
72 | pushl %esi # real mode pointer as second arg | 259 | leaq _heap(%rip), %rsi # _heap |
73 | pushl %eax # address of structure as first arg | 260 | leaq input_data(%rip), %rdx # input_data |
74 | call decompress_kernel | 261 | movl input_len(%rip), %eax |
75 | orl %eax,%eax | 262 | movq %rax, %rcx # input_len |
76 | jnz 3f | 263 | movq %rbp, %r8 # output |
77 | addl $8,%esp | 264 | call decompress_kernel |
78 | xorl %ebx,%ebx | 265 | popq %rsi |
79 | ljmp $(__KERNEL_CS), $__PHYSICAL_START | ||
80 | 266 | ||
81 | /* | ||
82 | * We come here, if we were loaded high. | ||
83 | * We need to move the move-in-place routine down to 0x1000 | ||
84 | * and then start it with the buffer addresses in registers, | ||
85 | * which we got from the stack. | ||
86 | */ | ||
87 | 3: | ||
88 | movl %esi,%ebx | ||
89 | movl $move_routine_start,%esi | ||
90 | movl $0x1000,%edi | ||
91 | movl $move_routine_end,%ecx | ||
92 | subl %esi,%ecx | ||
93 | addl $3,%ecx | ||
94 | shrl $2,%ecx | ||
95 | cld | ||
96 | rep | ||
97 | movsl | ||
98 | |||
99 | popl %esi # discard the address | ||
100 | addl $4,%esp # real mode pointer | ||
101 | popl %esi # low_buffer_start | ||
102 | popl %ecx # lcount | ||
103 | popl %edx # high_buffer_start | ||
104 | popl %eax # hcount | ||
105 | movl $__PHYSICAL_START,%edi | ||
106 | cli # make sure we don't get interrupted | ||
107 | ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine | ||
108 | 267 | ||
109 | /* | 268 | /* |
110 | * Routine (template) for moving the decompressed kernel in place, | 269 | * Jump to the decompressed kernel. |
111 | * if we were high loaded. This _must_ PIC-code ! | ||
112 | */ | 270 | */ |
113 | move_routine_start: | 271 | jmp *%rbp |
114 | movl %ecx,%ebp | ||
115 | shrl $2,%ecx | ||
116 | rep | ||
117 | movsl | ||
118 | movl %ebp,%ecx | ||
119 | andl $3,%ecx | ||
120 | rep | ||
121 | movsb | ||
122 | movl %edx,%esi | ||
123 | movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0 | ||
124 | addl $3,%ecx | ||
125 | shrl $2,%ecx | ||
126 | rep | ||
127 | movsl | ||
128 | movl %ebx,%esi # Restore setup pointer | ||
129 | xorl %ebx,%ebx | ||
130 | ljmp $(__KERNEL_CS), $__PHYSICAL_START | ||
131 | move_routine_end: | ||
132 | 272 | ||
133 | 273 | .data | |
134 | /* Stack for uncompression */ | 274 | gdt: |
135 | .align 32 | 275 | .word gdt_end - gdt |
136 | user_stack: | 276 | .long gdt |
277 | .word 0 | ||
278 | .quad 0x0000000000000000 /* NULL descriptor */ | ||
279 | .quad 0x00af9a000000ffff /* __KERNEL_CS */ | ||
280 | .quad 0x00cf92000000ffff /* __KERNEL_DS */ | ||
281 | gdt_end: | ||
282 | .bss | ||
283 | /* Stack for uncompression */ | ||
284 | .balign 4 | ||
285 | user_stack: | ||
137 | .fill 4096,4,0 | 286 | .fill 4096,4,0 |
138 | stack_start: | 287 | user_stack_end: |
139 | .long user_stack+4096 | ||
140 | .word __KERNEL_DS | ||
141 | |||
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index 3755b2e394d0..fee54dbf1749 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c | |||
@@ -9,10 +9,95 @@ | |||
9 | * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 | 9 | * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #define _LINUX_STRING_H_ 1 | ||
13 | #define __LINUX_BITMAP_H 1 | ||
14 | |||
15 | #include <linux/linkage.h> | ||
12 | #include <linux/screen_info.h> | 16 | #include <linux/screen_info.h> |
13 | #include <asm/io.h> | 17 | #include <asm/io.h> |
14 | #include <asm/page.h> | 18 | #include <asm/page.h> |
15 | 19 | ||
20 | /* WARNING!! | ||
21 | * This code is compiled with -fPIC and it is relocated dynamically | ||
22 | * at run time, but no relocation processing is performed. | ||
23 | * This means that it is not safe to place pointers in static structures. | ||
24 | */ | ||
25 | |||
26 | /* | ||
27 | * Getting to provable safe in place decompression is hard. | ||
28 | * Worst case behaviours need to be analyzed. | ||
29 | * Background information: | ||
30 | * | ||
31 | * The file layout is: | ||
32 | * magic[2] | ||
33 | * method[1] | ||
34 | * flags[1] | ||
35 | * timestamp[4] | ||
36 | * extraflags[1] | ||
37 | * os[1] | ||
38 | * compressed data blocks[N] | ||
39 | * crc[4] orig_len[4] | ||
40 | * | ||
41 | * resulting in 18 bytes of non compressed data overhead. | ||
42 | * | ||
43 | * Files divided into blocks | ||
44 | * 1 bit (last block flag) | ||
45 | * 2 bits (block type) | ||
46 | * | ||
47 | * 1 block occurs every 32K -1 bytes or when 50% compression has been achieved. | ||
48 | * The smallest block type encoding is always used. | ||
49 | * | ||
50 | * stored: | ||
51 | * 32 bits length in bytes. | ||
52 | * | ||
53 | * fixed: | ||
54 | * magic fixed tree. | ||
55 | * symbols. | ||
56 | * | ||
57 | * dynamic: | ||
58 | * dynamic tree encoding. | ||
59 | * symbols. | ||
60 | * | ||
61 | * | ||
62 | * The buffer for decompression in place is the length of the | ||
63 | * uncompressed data, plus a small amount extra to keep the algorithm safe. | ||
64 | * The compressed data is placed at the end of the buffer. The output | ||
65 | * pointer is placed at the start of the buffer and the input pointer | ||
66 | * is placed where the compressed data starts. Problems will occur | ||
67 | * when the output pointer overruns the input pointer. | ||
68 | * | ||
69 | * The output pointer can only overrun the input pointer if the input | ||
70 | * pointer is moving faster than the output pointer. A condition only | ||
71 | * triggered by data whose compressed form is larger than the uncompressed | ||
72 | * form. | ||
73 | * | ||
74 | * The worst case at the block level is a growth of the compressed data | ||
75 | * of 5 bytes per 32767 bytes. | ||
76 | * | ||
77 | * The worst case internal to a compressed block is very hard to figure. | ||
78 | * The worst case can at least be bounded by having one bit that represents | ||
79 | * 32764 bytes and then all of the rest of the bytes representing the very | ||
80 | * very last byte. | ||
81 | * | ||
82 | * All of which is enough to compute an amount of extra data that is required | ||
83 | * to be safe. To avoid problems at the block level allocating 5 extra bytes | ||
84 | * per 32767 bytes of data is sufficient. To avoid problems internal to a block | ||
85 | * adding an extra 32767 bytes (the worst case uncompressed block size) is | ||
86 | * sufficient, to ensure that in the worst case the decompressed data for | ||
87 | * block will stop the byte before the compressed data for a block begins. | ||
88 | * To avoid problems with the compressed data's meta information an extra 18 | ||
89 | * bytes are needed. Leading to the formula: | ||
90 | * | ||
91 | * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. | ||
92 | * | ||
93 | * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. | ||
94 | * Adding 32768 instead of 32767 just makes for round numbers. | ||
95 | * Adding the decompressor_size is necessary as it must live after all | ||
96 | * of the data as well. Last I measured the decompressor is about 14K. | ||
97 | * 10K of actual data and 4K of bss. | ||
98 | * | ||
99 | */ | ||
100 | |||
16 | /* | 101 | /* |
17 | * gzip declarations | 102 | * gzip declarations |
18 | */ | 103 | */ |
@@ -28,15 +113,20 @@ typedef unsigned char uch; | |||
28 | typedef unsigned short ush; | 113 | typedef unsigned short ush; |
29 | typedef unsigned long ulg; | 114 | typedef unsigned long ulg; |
30 | 115 | ||
31 | #define WSIZE 0x8000 /* Window size must be at least 32k, */ | 116 | #define WSIZE 0x80000000 /* Window size must be at least 32k, |
32 | /* and a power of two */ | 117 | * and a power of two |
118 | * We don't actually have a window just | ||
119 | * a huge output buffer so I report | ||
120 | * a 2G windows size, as that should | ||
121 | * always be larger than our output buffer. | ||
122 | */ | ||
33 | 123 | ||
34 | static uch *inbuf; /* input buffer */ | 124 | static uch *inbuf; /* input buffer */ |
35 | static uch window[WSIZE]; /* Sliding window buffer */ | 125 | static uch *window; /* Sliding window buffer, (and final output buffer) */ |
36 | 126 | ||
37 | static unsigned insize = 0; /* valid bytes in inbuf */ | 127 | static unsigned insize; /* valid bytes in inbuf */ |
38 | static unsigned inptr = 0; /* index of next byte to be processed in inbuf */ | 128 | static unsigned inptr; /* index of next byte to be processed in inbuf */ |
39 | static unsigned outcnt = 0; /* bytes in output buffer */ | 129 | static unsigned outcnt; /* bytes in output buffer */ |
40 | 130 | ||
41 | /* gzip flag byte */ | 131 | /* gzip flag byte */ |
42 | #define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ | 132 | #define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ |
@@ -87,8 +177,6 @@ extern unsigned char input_data[]; | |||
87 | extern int input_len; | 177 | extern int input_len; |
88 | 178 | ||
89 | static long bytes_out = 0; | 179 | static long bytes_out = 0; |
90 | static uch *output_data; | ||
91 | static unsigned long output_ptr = 0; | ||
92 | 180 | ||
93 | static void *malloc(int size); | 181 | static void *malloc(int size); |
94 | static void free(void *where); | 182 | static void free(void *where); |
@@ -98,17 +186,10 @@ static void *memcpy(void *dest, const void *src, unsigned n); | |||
98 | 186 | ||
99 | static void putstr(const char *); | 187 | static void putstr(const char *); |
100 | 188 | ||
101 | extern int end; | 189 | static long free_mem_ptr; |
102 | static long free_mem_ptr = (long)&end; | ||
103 | static long free_mem_end_ptr; | 190 | static long free_mem_end_ptr; |
104 | 191 | ||
105 | #define INPLACE_MOVE_ROUTINE 0x1000 | 192 | #define HEAP_SIZE 0x6000 |
106 | #define LOW_BUFFER_START 0x2000 | ||
107 | #define LOW_BUFFER_MAX 0x90000 | ||
108 | #define HEAP_SIZE 0x3000 | ||
109 | static unsigned int low_buffer_end, low_buffer_size; | ||
110 | static int high_loaded =0; | ||
111 | static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; | ||
112 | 193 | ||
113 | static char *vidmem = (char *)0xb8000; | 194 | static char *vidmem = (char *)0xb8000; |
114 | static int vidport; | 195 | static int vidport; |
@@ -218,58 +299,31 @@ static void* memcpy(void* dest, const void* src, unsigned n) | |||
218 | */ | 299 | */ |
219 | static int fill_inbuf(void) | 300 | static int fill_inbuf(void) |
220 | { | 301 | { |
221 | if (insize != 0) { | 302 | error("ran out of input data"); |
222 | error("ran out of input data"); | 303 | return 0; |
223 | } | ||
224 | |||
225 | inbuf = input_data; | ||
226 | insize = input_len; | ||
227 | inptr = 1; | ||
228 | return inbuf[0]; | ||
229 | } | 304 | } |
230 | 305 | ||
231 | /* =========================================================================== | 306 | /* =========================================================================== |
232 | * Write the output window window[0..outcnt-1] and update crc and bytes_out. | 307 | * Write the output window window[0..outcnt-1] and update crc and bytes_out. |
233 | * (Used for the decompressed data only.) | 308 | * (Used for the decompressed data only.) |
234 | */ | 309 | */ |
235 | static void flush_window_low(void) | ||
236 | { | ||
237 | ulg c = crc; /* temporary variable */ | ||
238 | unsigned n; | ||
239 | uch *in, *out, ch; | ||
240 | |||
241 | in = window; | ||
242 | out = &output_data[output_ptr]; | ||
243 | for (n = 0; n < outcnt; n++) { | ||
244 | ch = *out++ = *in++; | ||
245 | c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); | ||
246 | } | ||
247 | crc = c; | ||
248 | bytes_out += (ulg)outcnt; | ||
249 | output_ptr += (ulg)outcnt; | ||
250 | outcnt = 0; | ||
251 | } | ||
252 | |||
253 | static void flush_window_high(void) | ||
254 | { | ||
255 | ulg c = crc; /* temporary variable */ | ||
256 | unsigned n; | ||
257 | uch *in, ch; | ||
258 | in = window; | ||
259 | for (n = 0; n < outcnt; n++) { | ||
260 | ch = *output_data++ = *in++; | ||
261 | if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start; | ||
262 | c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); | ||
263 | } | ||
264 | crc = c; | ||
265 | bytes_out += (ulg)outcnt; | ||
266 | outcnt = 0; | ||
267 | } | ||
268 | |||
269 | static void flush_window(void) | 310 | static void flush_window(void) |
270 | { | 311 | { |
271 | if (high_loaded) flush_window_high(); | 312 | /* With my window equal to my output buffer |
272 | else flush_window_low(); | 313 | * I only need to compute the crc here. |
314 | */ | ||
315 | ulg c = crc; /* temporary variable */ | ||
316 | unsigned n; | ||
317 | uch *in, ch; | ||
318 | |||
319 | in = window; | ||
320 | for (n = 0; n < outcnt; n++) { | ||
321 | ch = *in++; | ||
322 | c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); | ||
323 | } | ||
324 | crc = c; | ||
325 | bytes_out += (ulg)outcnt; | ||
326 | outcnt = 0; | ||
273 | } | 327 | } |
274 | 328 | ||
275 | static void error(char *x) | 329 | static void error(char *x) |
@@ -281,57 +335,8 @@ static void error(char *x) | |||
281 | while(1); /* Halt */ | 335 | while(1); /* Halt */ |
282 | } | 336 | } |
283 | 337 | ||
284 | static void setup_normal_output_buffer(void) | 338 | asmlinkage void decompress_kernel(void *rmode, unsigned long heap, |
285 | { | 339 | uch *input_data, unsigned long input_len, uch *output) |
286 | #ifdef STANDARD_MEMORY_BIOS_CALL | ||
287 | if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory"); | ||
288 | #else | ||
289 | if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory"); | ||
290 | #endif | ||
291 | output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */ | ||
292 | free_mem_end_ptr = (long)real_mode; | ||
293 | } | ||
294 | |||
295 | struct moveparams { | ||
296 | uch *low_buffer_start; int lcount; | ||
297 | uch *high_buffer_start; int hcount; | ||
298 | }; | ||
299 | |||
300 | static void setup_output_buffer_if_we_run_high(struct moveparams *mv) | ||
301 | { | ||
302 | high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE); | ||
303 | #ifdef STANDARD_MEMORY_BIOS_CALL | ||
304 | if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory"); | ||
305 | #else | ||
306 | if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory"); | ||
307 | #endif | ||
308 | mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START; | ||
309 | low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX | ||
310 | ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff; | ||
311 | low_buffer_size = low_buffer_end - LOW_BUFFER_START; | ||
312 | high_loaded = 1; | ||
313 | free_mem_end_ptr = (long)high_buffer_start; | ||
314 | if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { | ||
315 | high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size); | ||
316 | mv->hcount = 0; /* say: we need not to move high_buffer */ | ||
317 | } | ||
318 | else mv->hcount = -1; | ||
319 | mv->high_buffer_start = high_buffer_start; | ||
320 | } | ||
321 | |||
322 | static void close_output_buffer_if_we_run_high(struct moveparams *mv) | ||
323 | { | ||
324 | if (bytes_out > low_buffer_size) { | ||
325 | mv->lcount = low_buffer_size; | ||
326 | if (mv->hcount) | ||
327 | mv->hcount = bytes_out - low_buffer_size; | ||
328 | } else { | ||
329 | mv->lcount = bytes_out; | ||
330 | mv->hcount = 0; | ||
331 | } | ||
332 | } | ||
333 | |||
334 | int decompress_kernel(struct moveparams *mv, void *rmode) | ||
335 | { | 340 | { |
336 | real_mode = rmode; | 341 | real_mode = rmode; |
337 | 342 | ||
@@ -346,13 +351,21 @@ int decompress_kernel(struct moveparams *mv, void *rmode) | |||
346 | lines = RM_SCREEN_INFO.orig_video_lines; | 351 | lines = RM_SCREEN_INFO.orig_video_lines; |
347 | cols = RM_SCREEN_INFO.orig_video_cols; | 352 | cols = RM_SCREEN_INFO.orig_video_cols; |
348 | 353 | ||
349 | if (free_mem_ptr < 0x100000) setup_normal_output_buffer(); | 354 | window = output; /* Output buffer (Normally at 1M) */ |
350 | else setup_output_buffer_if_we_run_high(mv); | 355 | free_mem_ptr = heap; /* Heap */ |
356 | free_mem_end_ptr = heap + HEAP_SIZE; | ||
357 | inbuf = input_data; /* Input buffer */ | ||
358 | insize = input_len; | ||
359 | inptr = 0; | ||
360 | |||
361 | if ((ulg)output & 0x1fffffUL) | ||
362 | error("Destination address not 2M aligned"); | ||
363 | if ((ulg)output >= 0xffffffffffUL) | ||
364 | error("Destination address too large"); | ||
351 | 365 | ||
352 | makecrc(); | 366 | makecrc(); |
353 | putstr(".\nDecompressing Linux..."); | 367 | putstr(".\nDecompressing Linux..."); |
354 | gunzip(); | 368 | gunzip(); |
355 | putstr("done.\nBooting the kernel.\n"); | 369 | putstr("done.\nBooting the kernel.\n"); |
356 | if (high_loaded) close_output_buffer_if_we_run_high(mv); | 370 | return; |
357 | return high_loaded; | ||
358 | } | 371 | } |
diff --git a/arch/x86_64/boot/compressed/vmlinux.lds b/arch/x86_64/boot/compressed/vmlinux.lds new file mode 100644 index 000000000000..94c13e557fb4 --- /dev/null +++ b/arch/x86_64/boot/compressed/vmlinux.lds | |||
@@ -0,0 +1,44 @@ | |||
1 | OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") | ||
2 | OUTPUT_ARCH(i386:x86-64) | ||
3 | ENTRY(startup_64) | ||
4 | SECTIONS | ||
5 | { | ||
6 | /* Be careful parts of head.S assume startup_32 is at | ||
7 | * address 0. | ||
8 | */ | ||
9 | . = 0; | ||
10 | .text : { | ||
11 | _head = . ; | ||
12 | *(.text.head) | ||
13 | _ehead = . ; | ||
14 | *(.text.compressed) | ||
15 | _text = .; /* Text */ | ||
16 | *(.text) | ||
17 | *(.text.*) | ||
18 | _etext = . ; | ||
19 | } | ||
20 | .rodata : { | ||
21 | _rodata = . ; | ||
22 | *(.rodata) /* read-only data */ | ||
23 | *(.rodata.*) | ||
24 | _erodata = . ; | ||
25 | } | ||
26 | .data : { | ||
27 | _data = . ; | ||
28 | *(.data) | ||
29 | *(.data.*) | ||
30 | _edata = . ; | ||
31 | } | ||
32 | .bss : { | ||
33 | _bss = . ; | ||
34 | *(.bss) | ||
35 | *(.bss.*) | ||
36 | *(COMMON) | ||
37 | . = ALIGN(8); | ||
38 | _end = . ; | ||
39 | . = ALIGN(4096); | ||
40 | pgtable = . ; | ||
41 | . = . + 4096 * 6; | ||
42 | _heap = .; | ||
43 | } | ||
44 | } | ||
diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr index 1ed9d791f863..bd1429ce193e 100644 --- a/arch/x86_64/boot/compressed/vmlinux.scr +++ b/arch/x86_64/boot/compressed/vmlinux.scr | |||
@@ -1,9 +1,10 @@ | |||
1 | SECTIONS | 1 | SECTIONS |
2 | { | 2 | { |
3 | .data : { | 3 | .text.compressed : { |
4 | input_len = .; | 4 | input_len = .; |
5 | LONG(input_data_end - input_data) input_data = .; | 5 | LONG(input_data_end - input_data) input_data = .; |
6 | *(.data) | 6 | *(.data) |
7 | input_data_end = .; | 7 | output_len = . - 4; |
8 | input_data_end = .; | ||
8 | } | 9 | } |
9 | } | 10 | } |
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index c211e52f1333..36aa98a6d15c 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S | |||
@@ -5,6 +5,7 @@ | |||
5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> | 5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> |
6 | * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> | 6 | * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> |
7 | * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> | 7 | * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> |
8 | * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com> | ||
8 | */ | 9 | */ |
9 | 10 | ||
10 | 11 | ||
@@ -17,95 +18,127 @@ | |||
17 | #include <asm/page.h> | 18 | #include <asm/page.h> |
18 | #include <asm/msr.h> | 19 | #include <asm/msr.h> |
19 | #include <asm/cache.h> | 20 | #include <asm/cache.h> |
20 | 21 | ||
21 | /* we are not able to switch in one step to the final KERNEL ADRESS SPACE | 22 | /* we are not able to switch in one step to the final KERNEL ADRESS SPACE |
22 | * because we need identity-mapped pages on setup so define __START_KERNEL to | 23 | * because we need identity-mapped pages. |
23 | * 0x100000 for this stage | 24 | * |
24 | * | ||
25 | */ | 25 | */ |
26 | 26 | ||
27 | .text | 27 | .text |
28 | .section .bootstrap.text | 28 | .section .bootstrap.text |
29 | .code32 | 29 | .code64 |
30 | .globl startup_32 | 30 | .globl startup_64 |
31 | /* %bx: 1 if coming from smp trampoline on secondary cpu */ | 31 | startup_64: |
32 | startup_32: | 32 | |
33 | |||
34 | /* | 33 | /* |
35 | * At this point the CPU runs in 32bit protected mode (CS.D = 1) with | 34 | * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, |
36 | * paging disabled and the point of this file is to switch to 64bit | 35 | * and someone has loaded an identity mapped page table |
37 | * long mode with a kernel mapping for kerneland to jump into the | 36 | * for us. These identity mapped page tables map all of the |
38 | * kernel virtual addresses. | 37 | * kernel pages and possibly all of memory. |
39 | * There is no stack until we set one up. | 38 | * |
39 | * %esi holds a physical pointer to real_mode_data. | ||
40 | * | ||
41 | * We come here either directly from a 64bit bootloader, or from | ||
42 | * arch/x86_64/boot/compressed/head.S. | ||
43 | * | ||
44 | * We only come here initially at boot nothing else comes here. | ||
45 | * | ||
46 | * Since we may be loaded at an address different from what we were | ||
47 | * compiled to run at we first fixup the physical addresses in our page | ||
48 | * tables and then reload them. | ||
40 | */ | 49 | */ |
41 | 50 | ||
42 | /* Initialize the %ds segment register */ | 51 | /* Compute the delta between the address I am compiled to run at and the |
43 | movl $__KERNEL_DS,%eax | 52 | * address I am actually running at. |
44 | movl %eax,%ds | ||
45 | |||
46 | /* Load new GDT with the 64bit segments using 32bit descriptor */ | ||
47 | lgdt pGDT32 - __START_KERNEL_map | ||
48 | |||
49 | /* If the CPU doesn't support CPUID this will double fault. | ||
50 | * Unfortunately it is hard to check for CPUID without a stack. | ||
51 | */ | 53 | */ |
52 | 54 | leaq _text(%rip), %rbp | |
53 | /* Check if extended functions are implemented */ | 55 | subq $_text - __START_KERNEL_map, %rbp |
54 | movl $0x80000000, %eax | 56 | |
55 | cpuid | 57 | /* Is the address not 2M aligned? */ |
56 | cmpl $0x80000000, %eax | 58 | movq %rbp, %rax |
57 | jbe no_long_mode | 59 | andl $~LARGE_PAGE_MASK, %eax |
58 | /* Check if long mode is implemented */ | 60 | testl %eax, %eax |
59 | mov $0x80000001, %eax | 61 | jnz bad_address |
60 | cpuid | 62 | |
61 | btl $29, %edx | 63 | /* Is the address too large? */ |
62 | jnc no_long_mode | 64 | leaq _text(%rip), %rdx |
63 | 65 | movq $PGDIR_SIZE, %rax | |
64 | /* | 66 | cmpq %rax, %rdx |
65 | * Prepare for entering 64bits mode | 67 | jae bad_address |
68 | |||
69 | /* Fixup the physical addresses in the page table | ||
66 | */ | 70 | */ |
71 | addq %rbp, init_level4_pgt + 0(%rip) | ||
72 | addq %rbp, init_level4_pgt + (258*8)(%rip) | ||
73 | addq %rbp, init_level4_pgt + (511*8)(%rip) | ||
74 | |||
75 | addq %rbp, level3_ident_pgt + 0(%rip) | ||
76 | addq %rbp, level3_kernel_pgt + (510*8)(%rip) | ||
77 | |||
78 | /* Add an Identity mapping if I am above 1G */ | ||
79 | leaq _text(%rip), %rdi | ||
80 | andq $LARGE_PAGE_MASK, %rdi | ||
81 | |||
82 | movq %rdi, %rax | ||
83 | shrq $PUD_SHIFT, %rax | ||
84 | andq $(PTRS_PER_PUD - 1), %rax | ||
85 | jz ident_complete | ||
86 | |||
87 | leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx | ||
88 | leaq level3_ident_pgt(%rip), %rbx | ||
89 | movq %rdx, 0(%rbx, %rax, 8) | ||
90 | |||
91 | movq %rdi, %rax | ||
92 | shrq $PMD_SHIFT, %rax | ||
93 | andq $(PTRS_PER_PMD - 1), %rax | ||
94 | leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx | ||
95 | leaq level2_spare_pgt(%rip), %rbx | ||
96 | movq %rdx, 0(%rbx, %rax, 8) | ||
97 | ident_complete: | ||
98 | |||
99 | /* Fixup the kernel text+data virtual addresses | ||
100 | */ | ||
101 | leaq level2_kernel_pgt(%rip), %rdi | ||
102 | leaq 4096(%rdi), %r8 | ||
103 | /* See if it is a valid page table entry */ | ||
104 | 1: testq $1, 0(%rdi) | ||
105 | jz 2f | ||
106 | addq %rbp, 0(%rdi) | ||
107 | /* Go to the next page */ | ||
108 | 2: addq $8, %rdi | ||
109 | cmp %r8, %rdi | ||
110 | jne 1b | ||
111 | |||
112 | /* Fixup phys_base */ | ||
113 | addq %rbp, phys_base(%rip) | ||
67 | 114 | ||
68 | /* Enable PAE mode */ | 115 | #ifdef CONFIG_SMP |
69 | xorl %eax, %eax | 116 | addq %rbp, trampoline_level4_pgt + 0(%rip) |
70 | btsl $5, %eax | 117 | addq %rbp, trampoline_level4_pgt + (511*8)(%rip) |
71 | movl %eax, %cr4 | 118 | #endif |
72 | 119 | #ifdef CONFIG_ACPI_SLEEP | |
73 | /* Setup early boot stage 4 level pagetables */ | 120 | addq %rbp, wakeup_level4_pgt + 0(%rip) |
74 | movl $(init_level4_pgt - __START_KERNEL_map), %eax | 121 | addq %rbp, wakeup_level4_pgt + (511*8)(%rip) |
75 | movl %eax, %cr3 | 122 | #endif |
76 | |||
77 | /* Setup EFER (Extended Feature Enable Register) */ | ||
78 | movl $MSR_EFER, %ecx | ||
79 | rdmsr | ||
80 | |||
81 | /* Enable Long Mode */ | ||
82 | btsl $_EFER_LME, %eax | ||
83 | |||
84 | /* Make changes effective */ | ||
85 | wrmsr | ||
86 | 123 | ||
87 | xorl %eax, %eax | 124 | /* Due to ENTRY(), sometimes the empty space gets filled with |
88 | btsl $31, %eax /* Enable paging and in turn activate Long Mode */ | 125 | * zeros. Better take a jmp than relying on empty space being |
89 | btsl $0, %eax /* Enable protected mode */ | 126 | * filled with 0x90 (nop) |
90 | /* Make changes effective */ | ||
91 | movl %eax, %cr0 | ||
92 | /* | ||
93 | * At this point we're in long mode but in 32bit compatibility mode | ||
94 | * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn | ||
95 | * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use | ||
96 | * the new gdt/idt that has __KERNEL_CS with CS.L = 1. | ||
97 | */ | 127 | */ |
98 | ljmp $__KERNEL_CS, $(startup_64 - __START_KERNEL_map) | 128 | jmp secondary_startup_64 |
99 | |||
100 | .code64 | ||
101 | .org 0x100 | ||
102 | .globl startup_64 | ||
103 | startup_64: | ||
104 | ENTRY(secondary_startup_64) | 129 | ENTRY(secondary_startup_64) |
105 | /* We come here either from startup_32 | 130 | /* |
106 | * or directly from a 64bit bootloader. | 131 | * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, |
107 | * Since we may have come directly from a bootloader we | 132 | * and someone has loaded a mapped page table. |
108 | * reload the page tables here. | 133 | * |
134 | * %esi holds a physical pointer to real_mode_data. | ||
135 | * | ||
136 | * We come here either from startup_64 (using physical addresses) | ||
137 | * or from trampoline.S (using virtual addresses). | ||
138 | * | ||
139 | * Using virtual addresses from trampoline.S removes the need | ||
140 | * to have any identity mapped pages in the kernel page table | ||
141 | * after the boot processor executes this code. | ||
109 | */ | 142 | */ |
110 | 143 | ||
111 | /* Enable PAE mode and PGE */ | 144 | /* Enable PAE mode and PGE */ |
@@ -116,8 +149,14 @@ ENTRY(secondary_startup_64) | |||
116 | 149 | ||
117 | /* Setup early boot stage 4 level pagetables. */ | 150 | /* Setup early boot stage 4 level pagetables. */ |
118 | movq $(init_level4_pgt - __START_KERNEL_map), %rax | 151 | movq $(init_level4_pgt - __START_KERNEL_map), %rax |
152 | addq phys_base(%rip), %rax | ||
119 | movq %rax, %cr3 | 153 | movq %rax, %cr3 |
120 | 154 | ||
155 | /* Ensure I am executing from virtual addresses */ | ||
156 | movq $1f, %rax | ||
157 | jmp *%rax | ||
158 | 1: | ||
159 | |||
121 | /* Check if nx is implemented */ | 160 | /* Check if nx is implemented */ |
122 | movl $0x80000001, %eax | 161 | movl $0x80000001, %eax |
123 | cpuid | 162 | cpuid |
@@ -126,17 +165,11 @@ ENTRY(secondary_startup_64) | |||
126 | /* Setup EFER (Extended Feature Enable Register) */ | 165 | /* Setup EFER (Extended Feature Enable Register) */ |
127 | movl $MSR_EFER, %ecx | 166 | movl $MSR_EFER, %ecx |
128 | rdmsr | 167 | rdmsr |
129 | 168 | btsl $_EFER_SCE, %eax /* Enable System Call */ | |
130 | /* Enable System Call */ | 169 | btl $20,%edi /* No Execute supported? */ |
131 | btsl $_EFER_SCE, %eax | ||
132 | |||
133 | /* No Execute supported? */ | ||
134 | btl $20,%edi | ||
135 | jnc 1f | 170 | jnc 1f |
136 | btsl $_EFER_NX, %eax | 171 | btsl $_EFER_NX, %eax |
137 | 1: | 172 | 1: wrmsr /* Make changes effective */ |
138 | /* Make changes effective */ | ||
139 | wrmsr | ||
140 | 173 | ||
141 | /* Setup cr0 */ | 174 | /* Setup cr0 */ |
142 | #define CR0_PM 1 /* protected mode */ | 175 | #define CR0_PM 1 /* protected mode */ |
@@ -163,7 +196,7 @@ ENTRY(secondary_startup_64) | |||
163 | * addresses where we're currently running on. We have to do that here | 196 | * addresses where we're currently running on. We have to do that here |
164 | * because in 32bit we couldn't load a 64bit linear address. | 197 | * because in 32bit we couldn't load a 64bit linear address. |
165 | */ | 198 | */ |
166 | lgdt cpu_gdt_descr | 199 | lgdt cpu_gdt_descr(%rip) |
167 | 200 | ||
168 | /* set up data segments. actually 0 would do too */ | 201 | /* set up data segments. actually 0 would do too */ |
169 | movl $__KERNEL_DS,%eax | 202 | movl $__KERNEL_DS,%eax |
@@ -214,6 +247,9 @@ initial_code: | |||
214 | init_rsp: | 247 | init_rsp: |
215 | .quad init_thread_union+THREAD_SIZE-8 | 248 | .quad init_thread_union+THREAD_SIZE-8 |
216 | 249 | ||
250 | bad_address: | ||
251 | jmp bad_address | ||
252 | |||
217 | ENTRY(early_idt_handler) | 253 | ENTRY(early_idt_handler) |
218 | cmpl $2,early_recursion_flag(%rip) | 254 | cmpl $2,early_recursion_flag(%rip) |
219 | jz 1f | 255 | jz 1f |
@@ -242,23 +278,7 @@ early_idt_msg: | |||
242 | early_idt_ripmsg: | 278 | early_idt_ripmsg: |
243 | .asciz "RIP %s\n" | 279 | .asciz "RIP %s\n" |
244 | 280 | ||
245 | .code32 | 281 | .balign PAGE_SIZE |
246 | ENTRY(no_long_mode) | ||
247 | /* This isn't an x86-64 CPU so hang */ | ||
248 | 1: | ||
249 | jmp 1b | ||
250 | |||
251 | .org 0xf00 | ||
252 | .globl pGDT32 | ||
253 | pGDT32: | ||
254 | .word gdt_end-cpu_gdt_table-1 | ||
255 | .long cpu_gdt_table-__START_KERNEL_map | ||
256 | |||
257 | .org 0xf10 | ||
258 | ljumpvector: | ||
259 | .long startup_64-__START_KERNEL_map | ||
260 | .word __KERNEL_CS | ||
261 | |||
262 | ENTRY(stext) | 282 | ENTRY(stext) |
263 | ENTRY(_stext) | 283 | ENTRY(_stext) |
264 | 284 | ||
@@ -303,7 +323,7 @@ NEXT_PAGE(level2_ident_pgt) | |||
303 | * Don't set NX because code runs from these pages. | 323 | * Don't set NX because code runs from these pages. |
304 | */ | 324 | */ |
305 | PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) | 325 | PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) |
306 | 326 | ||
307 | NEXT_PAGE(level2_kernel_pgt) | 327 | NEXT_PAGE(level2_kernel_pgt) |
308 | /* 40MB kernel mapping. The kernel code cannot be bigger than that. | 328 | /* 40MB kernel mapping. The kernel code cannot be bigger than that. |
309 | When you change this change KERNEL_TEXT_SIZE in page.h too. */ | 329 | When you change this change KERNEL_TEXT_SIZE in page.h too. */ |
@@ -313,6 +333,9 @@ NEXT_PAGE(level2_kernel_pgt) | |||
313 | /* Module mapping starts here */ | 333 | /* Module mapping starts here */ |
314 | .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 | 334 | .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 |
315 | 335 | ||
336 | NEXT_PAGE(level2_spare_pgt) | ||
337 | .fill 512,8,0 | ||
338 | |||
316 | #undef PMDS | 339 | #undef PMDS |
317 | #undef NEXT_PAGE | 340 | #undef NEXT_PAGE |
318 | 341 | ||
@@ -330,6 +353,10 @@ gdt: | |||
330 | .endr | 353 | .endr |
331 | #endif | 354 | #endif |
332 | 355 | ||
356 | ENTRY(phys_base) | ||
357 | /* This must match the first entry in level2_kernel_pgt */ | ||
358 | .quad 0x0000000000000000 | ||
359 | |||
333 | /* We need valid kernel segments for data and code in long mode too | 360 | /* We need valid kernel segments for data and code in long mode too |
334 | * IRET will check the segment types kkeil 2000/10/28 | 361 | * IRET will check the segment types kkeil 2000/10/28 |
335 | * Also sysret mandates a special GDT layout | 362 | * Also sysret mandates a special GDT layout |
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S index bfbe00763c68..16d183f67bc1 100644 --- a/arch/x86_64/kernel/suspend_asm.S +++ b/arch/x86_64/kernel/suspend_asm.S | |||
@@ -71,9 +71,10 @@ loop: | |||
71 | jmp loop | 71 | jmp loop |
72 | done: | 72 | done: |
73 | /* go back to the original page tables */ | 73 | /* go back to the original page tables */ |
74 | leaq init_level4_pgt(%rip), %rax | 74 | movq $(init_level4_pgt - __START_KERNEL_map), %rax |
75 | subq $__START_KERNEL_map, %rax | 75 | addq phys_base(%rip), %rax |
76 | movq %rax, %cr3 | 76 | movq %rax, %cr3 |
77 | |||
77 | /* Flush TLB, including "global" things (vmalloc) */ | 78 | /* Flush TLB, including "global" things (vmalloc) */ |
78 | movq mmu_cr4_features(%rip), %rax | 79 | movq mmu_cr4_features(%rip), %rax |
79 | movq %rax, %rdx | 80 | movq %rax, %rdx |
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index 4974433bbf34..40a24d0df090 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h | |||
@@ -61,6 +61,8 @@ typedef struct { unsigned long pgd; } pgd_t; | |||
61 | 61 | ||
62 | typedef struct { unsigned long pgprot; } pgprot_t; | 62 | typedef struct { unsigned long pgprot; } pgprot_t; |
63 | 63 | ||
64 | extern unsigned long phys_base; | ||
65 | |||
64 | #define pte_val(x) ((x).pte) | 66 | #define pte_val(x) ((x).pte) |
65 | #define pmd_val(x) ((x).pmd) | 67 | #define pmd_val(x) ((x).pmd) |
66 | #define pud_val(x) ((x).pud) | 68 | #define pud_val(x) ((x).pud) |
@@ -101,14 +103,14 @@ typedef struct { unsigned long pgprot; } pgprot_t; | |||
101 | #define PAGE_OFFSET __PAGE_OFFSET | 103 | #define PAGE_OFFSET __PAGE_OFFSET |
102 | 104 | ||
103 | /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. | 105 | /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. |
104 | Otherwise you risk miscompilation. */ | 106 | Otherwise you risk miscompilation. */ |
105 | #define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) | 107 | #define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) |
106 | /* __pa_symbol should be used for C visible symbols. | 108 | /* __pa_symbol should be used for C visible symbols. |
107 | This seems to be the official gcc blessed way to do such arithmetic. */ | 109 | This seems to be the official gcc blessed way to do such arithmetic. */ |
108 | #define __pa_symbol(x) \ | 110 | #define __pa_symbol(x) \ |
109 | ({unsigned long v; \ | 111 | ({unsigned long v; \ |
110 | asm("" : "=r" (v) : "0" (x)); \ | 112 | asm("" : "=r" (v) : "0" (x)); \ |
111 | (v - __START_KERNEL_map); }) | 113 | ((v - __START_KERNEL_map) + phys_base); }) |
112 | 114 | ||
113 | #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) | 115 | #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) |
114 | #ifdef CONFIG_FLATMEM | 116 | #ifdef CONFIG_FLATMEM |