 arch/x86_64/Kconfig                     |  49
 arch/x86_64/boot/compressed/Makefile    |  12
 arch/x86_64/boot/compressed/head.S      | 322
 arch/x86_64/boot/compressed/misc.c      | 247
 arch/x86_64/boot/compressed/vmlinux.lds |  44
 arch/x86_64/boot/compressed/vmlinux.scr |   9
 arch/x86_64/kernel/head.S               | 233
 arch/x86_64/kernel/suspend_asm.S        |   7
 include/asm-x86_64/page.h               |   6
 9 files changed, 597 insertions(+), 332 deletions(-)
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index b3dbf11eb82c..715632026073 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -565,23 +565,56 @@ config CRASH_DUMP
 	  PHYSICAL_START.
 	  For more details see Documentation/kdump/kdump.txt
 
+config RELOCATABLE
+	bool "Build a relocatable kernel (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  Builds a relocatable kernel. This enables loading and running
+	  a kernel binary from a different physical address than it was
+	  compiled for.
+
+	  One use is for the kexec-on-panic case, where the recovery kernel
+	  must live at a different physical address than the primary
+	  kernel.
+
+	  Note: If CONFIG_RELOCATABLE=y, the kernel runs from the address
+	  it has been loaded at, and the compile-time physical address
+	  (CONFIG_PHYSICAL_START) is ignored.
+
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
-	default "0x1000000" if CRASH_DUMP
 	default "0x200000"
 	help
-	  This gives the physical address where the kernel is loaded. Normally
-	  for regular kernels this value is 0x200000 (2MB). But in the case
-	  of kexec on panic the fail safe kernel needs to run at a different
-	  address than the panic-ed kernel. This option is used to set the load
-	  address for kernels used to capture crash dump on being kexec'ed
-	  after panic. The default value for crash dump kernels is
-	  0x1000000 (16MB). This can also be set based on the "X" value as
+	  This gives the physical address where the kernel is loaded. It
+	  should be aligned to a 2MB boundary.
+
+	  If the kernel is not relocatable (CONFIG_RELOCATABLE=n), the
+	  bzImage will decompress itself to the above physical address and
+	  run from there. Otherwise, the bzImage will run from the address
+	  where it has been loaded by the boot loader and will ignore the
+	  above physical address.
+
+	  In normal kdump cases one does not have to set or change this
+	  option, as the bzImage can now be compiled as a completely
+	  relocatable image (CONFIG_RELOCATABLE=y) and be loaded and run
+	  from a different address. This option is mainly useful for those
+	  who don't want to use a bzImage for capturing the crash dump and
+	  want to use a vmlinux instead.
+
+	  So if you are using a bzImage for capturing the crash dump, leave
+	  this value unchanged at 0x200000 and set CONFIG_RELOCATABLE=y.
+	  Otherwise, if you plan to use a vmlinux for capturing the crash
+	  dump, change this value to the start of the reserved region
+	  (typically 16MB, 0x1000000). It can be set based on the "X" value
 	  specified in the "crashkernel=YM@XM" command line boot parameter
 	  passed to the panic-ed kernel. Typically this parameter is set as
 	  crashkernel=64M@16M. Please take a look at
 	  Documentation/kdump/kdump.txt for more details about crash dumps.
 
+	  Using a bzImage for capturing the crash dump is advantageous, as
+	  one does not have to build two kernels. The same kernel can be
+	  used as both the production kernel and the capture kernel.
+
 	  Don't change this unless you know what you are doing.
 
 config SECCOMP
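Both addresses named in the help text are 2MB aligned, which is the constraint the decompressor later enforces on its output address. A minimal C sketch of that property (illustrative only; the macro and asserts below are not part of the patch):

    /* Illustrative only: 2MB alignment of the two addresses named above. */
    #define ALIGN_2M(x)  ((x) & ~((1UL << 21) - 1))   /* 2MB = 1 << 21 */

    _Static_assert(ALIGN_2M(0x200000UL)  == 0x200000UL,  "0x200000 (2MB) is 2MB aligned");
    _Static_assert(ALIGN_2M(0x1000000UL) == 0x1000000UL, "0x1000000 (16MB) is 2MB aligned");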
diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile
index e70fa6e1da08..705a3e33d7e1 100644
--- a/arch/x86_64/boot/compressed/Makefile
+++ b/arch/x86_64/boot/compressed/Makefile
@@ -8,16 +8,14 @@
 
 targets		:= vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o
 EXTRA_AFLAGS	:= -traditional
-AFLAGS		:= $(subst -m64,-m32,$(AFLAGS))
 
 # cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with
 # -m32
-CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing
-LDFLAGS := -m elf_i386
+CFLAGS := -m64 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing -fPIC -mcmodel=small -fno-builtin
+LDFLAGS := -m elf_x86_64
 
-LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 -m elf_i386
-
-$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
+LDFLAGS_vmlinux := -T
+$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
 	$(call if_changed,ld)
 	@:
 
@@ -27,7 +25,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
 $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
 	$(call if_changed,gzip)
 
-LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
+LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
 
 $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
 	$(call if_changed,ld)
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S
index 6f55565e4d42..c353a9266ea4 100644
--- a/arch/x86_64/boot/compressed/head.S
+++ b/arch/x86_64/boot/compressed/head.S
@@ -26,116 +26,262 @@
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
+#include <asm/pgtable.h>
 #include <asm/page.h>
+#include <asm/msr.h>
 
+	.section ".text.head"
 	.code32
 	.globl startup_32
 
 startup_32:
 	cld
 	cli
-	movl $(__KERNEL_DS),%eax
-	movl %eax,%ds
-	movl %eax,%es
-	movl %eax,%fs
-	movl %eax,%gs
-
-	lss stack_start,%esp
-	xorl %eax,%eax
-1:	incl %eax		# check that A20 really IS enabled
-	movl %eax,0x000000	# loop forever if it isn't
-	cmpl %eax,0x100000
-	je 1b
+	movl	$(__KERNEL_DS), %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %ss
+
+/* Calculate the delta between where we were compiled to run
+ * at and where we were actually loaded at.  This can only be done
+ * with a short local call on x86.  Nothing else will tell us what
+ * address we are running at.  The reserved chunk of the real-mode
+ * data at 0x34-0x3f is used as the stack for this calculation.
+ * Only 4 bytes are needed.
+ */
+	leal	0x40(%esi), %esp
+	call	1f
+1:	popl	%ebp
+	subl	$1b, %ebp
+
+/* Compute the delta between where we were compiled to run at
+ * and where the code will actually run at.
+ */
+/* %ebp contains the address we are loaded at by the boot loader and %ebx
+ * contains the address where we should move the kernel image temporarily
+ * for safe in-place decompression.
+ */
+
+#ifdef CONFIG_RELOCATABLE
+	movl	%ebp, %ebx
+	addl	$(LARGE_PAGE_SIZE - 1), %ebx
+	andl	$LARGE_PAGE_MASK, %ebx
+#else
+	movl	$CONFIG_PHYSICAL_START, %ebx
+#endif
+
+	/* Replace the compressed data size with the uncompressed size */
+	subl	input_len(%ebp), %ebx
+	movl	output_len(%ebp), %eax
+	addl	%eax, %ebx
+	/* Add 8 bytes for every 32K input block */
+	shrl	$12, %eax
+	addl	%eax, %ebx
+	/* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
+	addl	$(32768 + 18 + 4095), %ebx
+	andl	$~4095, %ebx
 
 /*
- * Initialize eflags.  Some BIOS's leave bits like NT set.  This would
- * confuse the debugger if this code is traced.
- * XXX - best to initialize before switching to protected mode.
+ * Prepare for entering 64 bit mode
  */
-	pushl $0
-	popfl
+
+	/* Load new GDT with the 64bit segments using 32bit descriptor */
+	leal	gdt(%ebp), %eax
+	movl	%eax, gdt+2(%ebp)
+	lgdt	gdt(%ebp)
+
+	/* Enable PAE mode */
+	xorl	%eax, %eax
+	orl	$(1 << 5), %eax
+	movl	%eax, %cr4
+
+	/*
+	 * Build early 4G boot pagetable
+	 */
+	/* Initialize page tables to 0 */
+	leal	pgtable(%ebx), %edi
+	xorl	%eax, %eax
+	movl	$((4096*6)/4), %ecx
+	rep	stosl
+
+	/* Build Level 4 */
+	leal	pgtable + 0(%ebx), %edi
+	leal	0x1007(%edi), %eax
+	movl	%eax, 0(%edi)
+
+	/* Build Level 3 */
+	leal	pgtable + 0x1000(%ebx), %edi
+	leal	0x1007(%edi), %eax
+	movl	$4, %ecx
+1:	movl	%eax, 0x00(%edi)
+	addl	$0x00001000, %eax
+	addl	$8, %edi
+	decl	%ecx
+	jnz	1b
+
+	/* Build Level 2 */
+	leal	pgtable + 0x2000(%ebx), %edi
+	movl	$0x00000183, %eax
+	movl	$2048, %ecx
+1:	movl	%eax, 0(%edi)
+	addl	$0x00200000, %eax
+	addl	$8, %edi
+	decl	%ecx
+	jnz	1b
+
+	/* Enable the boot page tables */
+	leal	pgtable(%ebx), %eax
+	movl	%eax, %cr3
+
+	/* Enable Long mode in EFER (Extended Feature Enable Register) */
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	btsl	$_EFER_LME, %eax
+	wrmsr
+
+	/* Setup for the jump to 64bit mode
+	 *
+	 * When the jump is performed we will be in long mode but
+	 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
+	 * (and in turn EFER.LMA = 1).  To jump into 64bit mode we use
+	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+	 * We place all of the values on our mini stack so lret can
+	 * be used to perform that far jump.
+	 */
+	pushl	$__KERNEL_CS
+	leal	startup_64(%ebp), %eax
+	pushl	%eax
+
+	/* Enter paged protected Mode, activating Long Mode */
+	movl	$0x80000001, %eax	/* Enable Paging and Protected mode */
+	movl	%eax, %cr0
+
+	/* Jump from 32bit compatibility mode into 64bit mode. */
+	lret
+
+	/* Be careful here: startup_64 needs to be at a predictable
+	 * address so it can be exported in an ELF header.  Bootloaders
+	 * should look at the ELF header to find this address, as
+	 * it may change in the future.
+	 */
+	.code64
+	.org 0x100
+ENTRY(startup_64)
+	/* We come here either from startup_32 or directly from a
+	 * 64bit bootloader.  If we come here from a bootloader we depend on
+	 * an identity-mapped page table being provided that maps our
+	 * entire text+data+bss and hopefully all of memory.
+	 */
+
+	/* Setup data segments. */
+	xorl	%eax, %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %ss
+
+	/* Compute the decompressed kernel start address.  It is where
+	 * we were loaded at, aligned to a 2M boundary.  %rbp contains the
+	 * decompressed kernel start address.
+	 *
+	 * If it is a relocatable kernel then decompress and run the kernel
+	 * from the load address aligned to a 2MB boundary, otherwise
+	 * decompress and run the kernel from CONFIG_PHYSICAL_START.
+	 */
+
+	/* Start with the delta to where the kernel will run at. */
+#ifdef CONFIG_RELOCATABLE
+	leaq	startup_32(%rip) /* - $startup_32 */, %rbp
+	addq	$(LARGE_PAGE_SIZE - 1), %rbp
+	andq	$LARGE_PAGE_MASK, %rbp
+	movq	%rbp, %rbx
+#else
+	movq	$CONFIG_PHYSICAL_START, %rbp
+	movq	%rbp, %rbx
+#endif
+
+	/* Replace the compressed data size with the uncompressed size */
+	movl	input_len(%rip), %eax
+	subq	%rax, %rbx
+	movl	output_len(%rip), %eax
+	addq	%rax, %rbx
+	/* Add 8 bytes for every 32K input block */
+	shrq	$12, %rax
+	addq	%rax, %rbx
+	/* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
+	addq	$(32768 + 18 + 4095), %rbx
+	andq	$~4095, %rbx
+
+	/* Copy the compressed kernel to the end of our buffer
+	 * where decompression in place becomes safe.
+	 */
+	leaq	_end(%rip), %r8
+	leaq	_end(%rbx), %r9
+	movq	$_end /* - $startup_32 */, %rcx
+1:	subq	$8, %r8
+	subq	$8, %r9
+	movq	0(%r8), %rax
+	movq	%rax, 0(%r9)
+	subq	$8, %rcx
+	jnz	1b
+
+	/*
+	 * Jump to the relocated address.
+	 */
+	leaq	relocated(%rbx), %rax
+	jmp	*%rax
+
+	.section ".text"
+relocated:
+
 /*
  * Clear BSS
  */
-	xorl %eax,%eax
-	movl $_edata,%edi
-	movl $_end,%ecx
-	subl %edi,%ecx
+	xorq	%rax, %rax
+	leaq	_edata(%rbx), %rdi
+	leaq	_end(%rbx), %rcx
+	subq	%rdi, %rcx
 	cld
 	rep
 	stosb
+
+	/* Setup the stack */
+	leaq	user_stack_end(%rip), %rsp
+
+	/* zero EFLAGS after setting rsp */
+	pushq	$0
+	popfq
+
 /*
  * Do the decompression, and jump to the new kernel..
  */
-	subl $16,%esp	# place for structure on the stack
-	movl %esp,%eax
-	pushl %esi	# real mode pointer as second arg
-	pushl %eax	# address of structure as first arg
-	call decompress_kernel
-	orl  %eax,%eax
-	jnz  3f
-	addl $8,%esp
-	xorl %ebx,%ebx
-	ljmp $(__KERNEL_CS), $__PHYSICAL_START
+	pushq	%rsi			# Save the real mode argument
+	movq	%rsi, %rdi		# real mode address
+	leaq	_heap(%rip), %rsi	# _heap
+	leaq	input_data(%rip), %rdx	# input_data
+	movl	input_len(%rip), %eax
+	movq	%rax, %rcx		# input_len
+	movq	%rbp, %r8		# output
+	call	decompress_kernel
+	popq	%rsi
 
-/*
- * We come here, if we were loaded high.
- * We need to move the move-in-place routine down to 0x1000
- * and then start it with the buffer addresses in registers,
- * which we got from the stack.
- */
-3:
-	movl %esi,%ebx
-	movl $move_routine_start,%esi
-	movl $0x1000,%edi
-	movl $move_routine_end,%ecx
-	subl %esi,%ecx
-	addl $3,%ecx
-	shrl $2,%ecx
-	cld
-	rep
-	movsl
-
-	popl %esi	# discard the address
-	addl $4,%esp	# real mode pointer
-	popl %esi	# low_buffer_start
-	popl %ecx	# lcount
-	popl %edx	# high_buffer_start
-	popl %eax	# hcount
-	movl $__PHYSICAL_START,%edi
-	cli		# make sure we don't get interrupted
-	ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine
 
 /*
- * Routine (template) for moving the decompressed kernel in place,
- * if we were high loaded. This _must_ PIC-code !
+ * Jump to the decompressed kernel.
  */
-move_routine_start:
-	movl %ecx,%ebp
-	shrl $2,%ecx
-	rep
-	movsl
-	movl %ebp,%ecx
-	andl $3,%ecx
-	rep
-	movsb
-	movl %edx,%esi
-	movl %eax,%ecx	# NOTE: rep movsb won't move if %ecx == 0
-	addl $3,%ecx
-	shrl $2,%ecx
-	rep
-	movsl
-	movl %ebx,%esi	# Restore setup pointer
-	xorl %ebx,%ebx
-	ljmp $(__KERNEL_CS), $__PHYSICAL_START
-move_routine_end:
+	jmp	*%rbp
 
-
-/* Stack for uncompression */
-	.align 32
-user_stack:
+	.data
+gdt:
+	.word	gdt_end - gdt
+	.long	gdt
+	.word	0
+	.quad	0x0000000000000000	/* NULL descriptor */
+	.quad	0x00af9a000000ffff	/* __KERNEL_CS */
+	.quad	0x00cf92000000ffff	/* __KERNEL_DS */
+gdt_end:
+	.bss
+/* Stack for uncompression */
+	.balign 4
+user_stack:
 	.fill 4096,4,0
-stack_start:
-	.long user_stack+4096
-	.word __KERNEL_DS
-
+user_stack_end:
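Both the 32-bit and the 64-bit entry paths above size the in-place decompression buffer the same way. A C sketch of that arithmetic (a hypothetical helper for illustration only; the real computation is the assembly above):

    #include <stdint.h>

    /* Sketch of the safe-decompression-buffer math above.  dest is the
     * 2MB-aligned address the kernel will run from (%ebx/%rbx before the
     * adjustments); the loader image is relocated to the returned base so
     * that in-place gunzip cannot overrun its own input.
     */
    static uint64_t relocate_base(uint64_t dest, uint64_t input_len,
                                  uint64_t output_len)
    {
            uint64_t base = dest;

            base -= input_len;           /* replace compressed size...        */
            base += output_len;          /* ...with the uncompressed size     */
            base += output_len >> 12;    /* "8 bytes for every 32K" of slack  */
            base += 32768 + 18 + 4095;   /* worst-case block + gzip overhead  */
            base &= ~4095ULL;            /* align on a 4K boundary            */
            return base;
    }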
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
index 3755b2e394d0..fee54dbf1749 100644
--- a/arch/x86_64/boot/compressed/misc.c
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -9,10 +9,95 @@
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
 
+#define _LINUX_STRING_H_ 1
+#define __LINUX_BITMAP_H 1
+
+#include <linux/linkage.h>
 #include <linux/screen_info.h>
 #include <asm/io.h>
 #include <asm/page.h>
 
+/* WARNING!!
+ * This code is compiled with -fPIC and it is relocated dynamically
+ * at run time, but no relocation processing is performed.
+ * This means that it is not safe to place pointers in static structures.
+ */
+
+/*
+ * Getting to provably safe in-place decompression is hard.
+ * Worst-case behaviours need to be analyzed.
+ * Background information:
+ *
+ * The file layout is:
+ *    magic[2]
+ *    method[1]
+ *    flags[1]
+ *    timestamp[4]
+ *    extraflags[1]
+ *    os[1]
+ *    compressed data blocks[N]
+ *    crc[4] orig_len[4]
+ *
+ * resulting in 18 bytes of non-compressed data overhead.
+ *
+ * Files divided into blocks
+ * 1 bit (last block flag)
+ * 2 bits (block type)
+ *
+ * 1 block occurs every 32K -1 bytes or when 50% compression has been achieved.
+ * The smallest block type encoding is always used.
+ *
+ * stored:
+ *    32 bits length in bytes.
+ *
+ * fixed:
+ *    magic fixed tree.
+ *    symbols.
+ *
+ * dynamic:
+ *    dynamic tree encoding.
+ *    symbols.
+ *
+ *
+ * The buffer for decompression in place is the length of the
+ * uncompressed data, plus a small amount extra to keep the algorithm safe.
+ * The compressed data is placed at the end of the buffer.  The output
+ * pointer is placed at the start of the buffer and the input pointer
+ * is placed where the compressed data starts.  Problems will occur
+ * when the output pointer overruns the input pointer.
+ *
+ * The output pointer can only overrun the input pointer if the input
+ * pointer is moving faster than the output pointer.  A condition only
+ * triggered by data whose compressed form is larger than the uncompressed
+ * form.
+ *
+ * The worst case at the block level is a growth of the compressed data
+ * of 5 bytes per 32767 bytes.
+ *
+ * The worst case internal to a compressed block is very hard to figure.
+ * The worst case can at least be bounded by having one bit that represents
+ * 32764 bytes and then all of the rest of the bytes representing the very
+ * very last byte.
+ *
+ * All of which is enough to compute an amount of extra data that is required
+ * to be safe.  To avoid problems at the block level, allocating 5 extra bytes
+ * per 32767 bytes of data is sufficient.  To avoid problems internal to a
+ * block, adding an extra 32767 bytes (the worst-case uncompressed block size)
+ * is sufficient, to ensure that in the worst case the decompressed data for
+ * a block will stop the byte before the compressed data for a block begins.
+ * To avoid problems with the compressed data's meta information an extra 18
+ * bytes are needed.  Leading to the formula:
+ *
+ * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
+ *
+ * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
+ * Adding 32768 instead of 32767 just makes for round numbers.
+ * Adding the decompressor_size is necessary as it must live after all
+ * of the data as well.  Last I measured the decompressor is about 14K.
+ * 10K of actual data and 4K of bss.
+ *
+ */
+
 /*
  * gzip declarations
  */
@@ -28,15 +113,20 @@ typedef unsigned char uch;
 typedef unsigned short ush;
 typedef unsigned long ulg;
 
-#define WSIZE 0x8000		/* Window size must be at least 32k, */
-				/* and a power of two */
+#define WSIZE 0x80000000	/* Window size must be at least 32k,
+				 * and a power of two.
+				 * We don't actually have a window, just
+				 * a huge output buffer, so we report
+				 * a 2G window size, as that should
+				 * always be larger than our output buffer.
+				 */
 
 static uch *inbuf;	/* input buffer */
-static uch window[WSIZE];    /* Sliding window buffer */
+static uch *window;	/* Sliding window buffer (and final output buffer) */
 
-static unsigned insize = 0;  /* valid bytes in inbuf */
-static unsigned inptr = 0;   /* index of next byte to be processed in inbuf */
-static unsigned outcnt = 0;  /* bytes in output buffer */
+static unsigned insize;  /* valid bytes in inbuf */
+static unsigned inptr;   /* index of next byte to be processed in inbuf */
+static unsigned outcnt;  /* bytes in output buffer */
 
 /* gzip flag byte */
 #define ASCII_FLAG   0x01 /* bit 0 set: file probably ASCII text */
@@ -87,8 +177,6 @@ extern unsigned char input_data[];
 extern int input_len;
 
 static long bytes_out = 0;
-static uch *output_data;
-static unsigned long output_ptr = 0;
 
 static void *malloc(int size);
 static void free(void *where);
@@ -98,17 +186,10 @@ static void *memcpy(void *dest, const void *src, unsigned n);
 
 static void putstr(const char *);
 
-extern int end;
-static long free_mem_ptr = (long)&end;
+static long free_mem_ptr;
 static long free_mem_end_ptr;
 
-#define INPLACE_MOVE_ROUTINE  0x1000
-#define LOW_BUFFER_START      0x2000
-#define LOW_BUFFER_MAX       0x90000
-#define HEAP_SIZE             0x3000
-static unsigned int low_buffer_end, low_buffer_size;
-static int high_loaded =0;
-static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
+#define HEAP_SIZE             0x6000
 
 static char *vidmem = (char *)0xb8000;
 static int vidport;
@@ -218,58 +299,31 @@ static void* memcpy(void* dest, const void* src, unsigned n)
  */
 static int fill_inbuf(void)
 {
-	if (insize != 0) {
-		error("ran out of input data");
-	}
-
-	inbuf = input_data;
-	insize = input_len;
-	inptr = 1;
-	return inbuf[0];
+	error("ran out of input data");
+	return 0;
 }
 
 /* ===========================================================================
  * Write the output window window[0..outcnt-1] and update crc and bytes_out.
  * (Used for the decompressed data only.)
  */
-static void flush_window_low(void)
-{
-	ulg c = crc;         /* temporary variable */
-	unsigned n;
-	uch *in, *out, ch;
-
-	in = window;
-	out = &output_data[output_ptr];
-	for (n = 0; n < outcnt; n++) {
-		ch = *out++ = *in++;
-		c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
-	}
-	crc = c;
-	bytes_out += (ulg)outcnt;
-	output_ptr += (ulg)outcnt;
-	outcnt = 0;
-}
-
-static void flush_window_high(void)
-{
-	ulg c = crc;         /* temporary variable */
-	unsigned n;
-	uch *in, ch;
-	in = window;
-	for (n = 0; n < outcnt; n++) {
-		ch = *output_data++ = *in++;
-		if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start;
-		c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
-	}
-	crc = c;
-	bytes_out += (ulg)outcnt;
-	outcnt = 0;
-}
-
 static void flush_window(void)
 {
-	if (high_loaded) flush_window_high();
-	else flush_window_low();
+	/* With my window equal to my output buffer
+	 * I only need to compute the crc here.
+	 */
+	ulg c = crc;         /* temporary variable */
+	unsigned n;
+	uch *in, ch;
+
+	in = window;
+	for (n = 0; n < outcnt; n++) {
+		ch = *in++;
+		c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
+	}
+	crc = c;
+	bytes_out += (ulg)outcnt;
+	outcnt = 0;
 }
 
 static void error(char *x)
@@ -281,57 +335,8 @@ static void error(char *x)
 	while(1);	/* Halt */
 }
 
-static void setup_normal_output_buffer(void)
-{
-#ifdef STANDARD_MEMORY_BIOS_CALL
-	if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory");
-#else
-	if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory");
-#endif
-	output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */
-	free_mem_end_ptr = (long)real_mode;
-}
-
-struct moveparams {
-	uch *low_buffer_start;  int lcount;
-	uch *high_buffer_start; int hcount;
-};
-
-static void setup_output_buffer_if_we_run_high(struct moveparams *mv)
-{
-	high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE);
-#ifdef STANDARD_MEMORY_BIOS_CALL
-	if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");
-#else
-	if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");
-#endif
-	mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START;
-	low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX
-	  ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff;
-	low_buffer_size = low_buffer_end - LOW_BUFFER_START;
-	high_loaded = 1;
-	free_mem_end_ptr = (long)high_buffer_start;
-	if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
-		high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size);
-		mv->hcount = 0; /* say: we need not to move high_buffer */
-	}
-	else mv->hcount = -1;
-	mv->high_buffer_start = high_buffer_start;
-}
-
-static void close_output_buffer_if_we_run_high(struct moveparams *mv)
-{
-	if (bytes_out > low_buffer_size) {
-		mv->lcount = low_buffer_size;
-		if (mv->hcount)
-			mv->hcount = bytes_out - low_buffer_size;
-	} else {
-		mv->lcount = bytes_out;
-		mv->hcount = 0;
-	}
-}
-
-int decompress_kernel(struct moveparams *mv, void *rmode)
+asmlinkage void decompress_kernel(void *rmode, unsigned long heap,
+	uch *input_data, unsigned long input_len, uch *output)
 {
 	real_mode = rmode;
 
@@ -346,13 +351,21 @@ int decompress_kernel(struct moveparams *mv, void *rmode)
 	lines = RM_SCREEN_INFO.orig_video_lines;
 	cols = RM_SCREEN_INFO.orig_video_cols;
 
-	if (free_mem_ptr < 0x100000) setup_normal_output_buffer();
-	else setup_output_buffer_if_we_run_high(mv);
+	window = output;		/* Output buffer (Normally at 1M) */
+	free_mem_ptr     = heap;	/* Heap */
+	free_mem_end_ptr = heap + HEAP_SIZE;
+	inbuf  = input_data;		/* Input buffer */
+	insize = input_len;
+	inptr  = 0;
+
+	if ((ulg)output & 0x1fffffUL)
+		error("Destination address not 2M aligned");
+	if ((ulg)output >= 0xffffffffffUL)
+		error("Destination address too large");
 
 	makecrc();
 	putstr(".\nDecompressing Linux...");
 	gunzip();
 	putstr("done.\nBooting the kernel.\n");
-	if (high_loaded) close_output_buffer_if_we_run_high(mv);
-	return high_loaded;
+	return;
 }
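The WARNING near the top of misc.c (pointers in static structures are unsafe because the -fPIC image is never relocated) can be illustrated with a hedged C sketch; none of this is code from the patch:

    /* Compiled with -fPIC but never relocated: the static initializer below
     * is resolved at link time, so the stored pointer goes stale as soon as
     * the image runs at a different address.
     */
    static char buffer[64];
    static char *stale = buffer;        /* needs a relocation: unsafe here */

    static char *current_buffer(void)
    {
            return buffer;              /* computed at run time (RIP-relative
                                         * lea): safe without relocations */
    }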
diff --git a/arch/x86_64/boot/compressed/vmlinux.lds b/arch/x86_64/boot/compressed/vmlinux.lds
new file mode 100644
index 000000000000..94c13e557fb4
--- /dev/null
+++ b/arch/x86_64/boot/compressed/vmlinux.lds
@@ -0,0 +1,44 @@
+OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(startup_64)
+SECTIONS
+{
+	/* Be careful: parts of head.S assume startup_32 is at
+	 * address 0.
+	 */
+	. = 0;
+	.text : {
+		_head = . ;
+		*(.text.head)
+		_ehead = . ;
+		*(.text.compressed)
+		_text = .;	/* Text */
+		*(.text)
+		*(.text.*)
+		_etext = . ;
+	}
+	.rodata : {
+		_rodata = . ;
+		*(.rodata)	/* read-only data */
+		*(.rodata.*)
+		_erodata = . ;
+	}
+	.data : {
+		_data = . ;
+		*(.data)
+		*(.data.*)
+		_edata = . ;
+	}
+	.bss : {
+		_bss = . ;
+		*(.bss)
+		*(.bss.*)
+		*(COMMON)
+		. = ALIGN(8);
+		_end = . ;
+		. = ALIGN(4096);
+		pgtable = . ;
+		. = . + 4096 * 6;
+		_heap = .;
+	}
+}
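head.S and misc.c consume the symbols this script defines; seen from C they would be declared roughly as follows (a sketch assuming the layout above, not declarations present in the patch):

    /* Sketch only: how the linker-script symbols above look from C. */
    extern char pgtable[6 * 4096];  /* six pages zeroed and filled by startup_32
                                     * to build the early 4G identity mapping */
    extern char _heap[];            /* start of the malloc() arena handed to
                                     * decompress_kernel() as 'heap' */
    extern char _end[];             /* end of the loaded image; bounds the
                                     * relocation copy loop in head.S */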
diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr
index 1ed9d791f863..bd1429ce193e 100644
--- a/arch/x86_64/boot/compressed/vmlinux.scr
+++ b/arch/x86_64/boot/compressed/vmlinux.scr
@@ -1,9 +1,10 @@
 SECTIONS
 {
-  .data : {
+  .text.compressed : {
 	input_len = .;
 	LONG(input_data_end - input_data) input_data = .;
 	*(.data)
+	output_len = . - 4;
 	input_data_end = .;
 	}
 }
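Placing output_len at input_data_end - 4 works because a gzip stream ends with crc[4] followed by orig_len[4] (see the misc.c file-layout comment above), so the last four bytes of the compressed payload hold the uncompressed size in little-endian order. A hypothetical C helper doing the same lookup, for illustration:

    #include <stddef.h>
    #include <stdint.h>

    /* Read the gzip ISIZE trailer that the output_len symbol aliases. */
    static uint32_t gzip_output_len(const uint8_t *image, size_t image_len)
    {
            const uint8_t *p = image + image_len - 4;   /* orig_len[4] */

            return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
                   (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
    }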
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index c211e52f1333..36aa98a6d15c 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -5,6 +5,7 @@
 *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
 *  Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
+ *  Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
 */
 
 
@@ -17,95 +18,127 @@
 #include <asm/page.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
 
 /* we are not able to switch in one step to the final KERNEL ADRESS SPACE
- * because we need identity-mapped pages on setup so define __START_KERNEL to
- * 0x100000 for this stage
+ * because we need identity-mapped pages.
  *
  */
 
 	.text
 	.section .bootstrap.text
-	.code32
-	.globl startup_32
-/* %bx:	 1 if coming from smp trampoline on secondary cpu */
-startup_32:
-
+	.code64
+	.globl startup_64
+startup_64:
+
 	/*
-	 * At this point the CPU runs in 32bit protected mode (CS.D = 1) with
-	 * paging disabled and the point of this file is to switch to 64bit
-	 * long mode with a kernel mapping for kerneland to jump into the
-	 * kernel virtual addresses.
-	 * There is no stack until we set one up.
+	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+	 * and someone has loaded an identity-mapped page table
+	 * for us.  These identity-mapped page tables map all of the
+	 * kernel pages and possibly all of memory.
+	 *
+	 * %esi holds a physical pointer to real_mode_data.
+	 *
+	 * We come here either directly from a 64bit bootloader, or from
+	 * arch/x86_64/boot/compressed/head.S.
+	 *
+	 * We only come here initially at boot, nothing else comes here.
+	 *
+	 * Since we may be loaded at an address different from what we were
+	 * compiled to run at, we first fix up the physical addresses in our
+	 * page tables and then reload them.
 	 */
 
-	/* Initialize the %ds segment register */
-	movl $__KERNEL_DS,%eax
-	movl %eax,%ds
-
-	/* Load new GDT with the 64bit segments using 32bit descriptor */
-	lgdt	pGDT32 - __START_KERNEL_map
-
-	/* If the CPU doesn't support CPUID this will double fault.
-	 * Unfortunately it is hard to check for CPUID without a stack.
+	/* Compute the delta between the address I am compiled to run at and
+	 * the address I am actually running at.
 	 */
-
-	/* Check if extended functions are implemented */
-	movl	$0x80000000, %eax
-	cpuid
-	cmpl	$0x80000000, %eax
-	jbe	no_long_mode
-	/* Check if long mode is implemented */
-	mov	$0x80000001, %eax
-	cpuid
-	btl	$29, %edx
-	jnc	no_long_mode
-
-	/*
-	 * Prepare for entering 64bits mode
+	leaq	_text(%rip), %rbp
+	subq	$_text - __START_KERNEL_map, %rbp
+
+	/* Is the address not 2M aligned? */
+	movq	%rbp, %rax
+	andl	$~LARGE_PAGE_MASK, %eax
+	testl	%eax, %eax
+	jnz	bad_address
+
+	/* Is the address too large? */
+	leaq	_text(%rip), %rdx
+	movq	$PGDIR_SIZE, %rax
+	cmpq	%rax, %rdx
+	jae	bad_address
+
+	/* Fixup the physical addresses in the page table
 	 */
+	addq	%rbp, init_level4_pgt + 0(%rip)
+	addq	%rbp, init_level4_pgt + (258*8)(%rip)
+	addq	%rbp, init_level4_pgt + (511*8)(%rip)
+
+	addq	%rbp, level3_ident_pgt + 0(%rip)
+	addq	%rbp, level3_kernel_pgt + (510*8)(%rip)
+
+	/* Add an Identity mapping if I am above 1G */
+	leaq	_text(%rip), %rdi
+	andq	$LARGE_PAGE_MASK, %rdi
+
+	movq	%rdi, %rax
+	shrq	$PUD_SHIFT, %rax
+	andq	$(PTRS_PER_PUD - 1), %rax
+	jz	ident_complete
+
+	leaq	(level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	leaq	level3_ident_pgt(%rip), %rbx
+	movq	%rdx, 0(%rbx, %rax, 8)
+
+	movq	%rdi, %rax
+	shrq	$PMD_SHIFT, %rax
+	andq	$(PTRS_PER_PMD - 1), %rax
+	leaq	__PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
+	leaq	level2_spare_pgt(%rip), %rbx
+	movq	%rdx, 0(%rbx, %rax, 8)
+ident_complete:
+
+	/* Fixup the kernel text+data virtual addresses
+	 */
+	leaq	level2_kernel_pgt(%rip), %rdi
+	leaq	4096(%rdi), %r8
+	/* See if it is a valid page table entry */
+1:	testq	$1, 0(%rdi)
+	jz	2f
+	addq	%rbp, 0(%rdi)
+	/* Go to the next page */
+2:	addq	$8, %rdi
+	cmp	%r8, %rdi
+	jne	1b
+
+	/* Fixup phys_base */
+	addq	%rbp, phys_base(%rip)
 
-	/* Enable PAE mode */
-	xorl	%eax, %eax
-	btsl	$5, %eax
-	movl	%eax, %cr4
-
-	/* Setup early boot stage 4 level pagetables */
-	movl	$(init_level4_pgt - __START_KERNEL_map), %eax
-	movl	%eax, %cr3
-
-	/* Setup EFER (Extended Feature Enable Register) */
-	movl	$MSR_EFER, %ecx
-	rdmsr
-
-	/* Enable Long Mode */
-	btsl	$_EFER_LME, %eax
-
-	/* Make changes effective */
-	wrmsr
+#ifdef CONFIG_SMP
+	addq	%rbp, trampoline_level4_pgt + 0(%rip)
+	addq	%rbp, trampoline_level4_pgt + (511*8)(%rip)
+#endif
+#ifdef CONFIG_ACPI_SLEEP
+	addq	%rbp, wakeup_level4_pgt + 0(%rip)
+	addq	%rbp, wakeup_level4_pgt + (511*8)(%rip)
+#endif
 
-	xorl	%eax, %eax
-	btsl	$31, %eax			/* Enable paging and in turn activate Long Mode */
-	btsl	$0, %eax			/* Enable protected mode */
-	/* Make changes effective */
-	movl	%eax, %cr0
-	/*
-	 * At this point we're in long mode but in 32bit compatibility mode
-	 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
-	 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
-	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+	/* Due to ENTRY(), sometimes the empty space gets filled with
+	 * zeros.  Better to take a jmp than rely on the empty space being
+	 * filled with 0x90 (nop).
 	 */
-	ljmp	$__KERNEL_CS, $(startup_64 - __START_KERNEL_map)
-
-	.code64
-	.org 0x100
-	.globl startup_64
-startup_64:
+	jmp	secondary_startup_64
 ENTRY(secondary_startup_64)
-	/* We come here either from startup_32
-	 * or directly from a 64bit bootloader.
-	 * Since we may have come directly from a bootloader we
-	 * reload the page tables here.
+	/*
+	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+	 * and someone has loaded a mapped page table.
+	 *
+	 * %esi holds a physical pointer to real_mode_data.
+	 *
+	 * We come here either from startup_64 (using physical addresses)
+	 * or from trampoline.S (using virtual addresses).
+	 *
+	 * Using virtual addresses from trampoline.S removes the need
+	 * to have any identity-mapped pages in the kernel page table
+	 * after the boot processor executes this code.
 	 */
 
 	/* Enable PAE mode and PGE */
@@ -116,8 +149,14 @@ ENTRY(secondary_startup_64)
 
 	/* Setup early boot stage 4 level pagetables. */
 	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
+	addq	phys_base(%rip), %rax
 	movq	%rax, %cr3
 
+	/* Ensure I am executing from virtual addresses */
+	movq	$1f, %rax
+	jmp	*%rax
+1:
+
 	/* Check if nx is implemented */
 	movl	$0x80000001, %eax
 	cpuid
@@ -126,17 +165,11 @@ ENTRY(secondary_startup_64)
 	/* Setup EFER (Extended Feature Enable Register) */
 	movl	$MSR_EFER, %ecx
 	rdmsr
-
-	/* Enable System Call */
-	btsl	$_EFER_SCE, %eax
-
-	/* No Execute supported? */
-	btl	$20,%edi
+	btsl	$_EFER_SCE, %eax	/* Enable System Call */
+	btl	$20,%edi		/* No Execute supported? */
 	jnc     1f
 	btsl	$_EFER_NX, %eax
-1:
-	/* Make changes effective */
-	wrmsr
+1:	wrmsr				/* Make changes effective */
 
 	/* Setup cr0 */
 #define CR0_PM				1		/* protected mode */
@@ -163,7 +196,7 @@ ENTRY(secondary_startup_64)
 	 * addresses where we're currently running on. We have to do that here
 	 * because in 32bit we couldn't load a 64bit linear address.
 	 */
-	lgdt	cpu_gdt_descr
+	lgdt	cpu_gdt_descr(%rip)
 
 	/* set up data segments. actually 0 would do too */
 	movl	$__KERNEL_DS,%eax
@@ -214,6 +247,9 @@ initial_code:
 init_rsp:
 	.quad  init_thread_union+THREAD_SIZE-8
 
+bad_address:
+	jmp bad_address
+
 ENTRY(early_idt_handler)
 	cmpl $2,early_recursion_flag(%rip)
 	jz 1f
@@ -242,23 +278,7 @@ early_idt_msg:
 early_idt_ripmsg:
 	.asciz "RIP %s\n"
 
-.code32
-ENTRY(no_long_mode)
-	/* This isn't an x86-64 CPU so hang */
-1:
-	jmp 1b
-
-.org 0xf00
-.globl pGDT32
-pGDT32:
-	.word	gdt_end-cpu_gdt_table-1
-	.long	cpu_gdt_table-__START_KERNEL_map
-
-.org 0xf10
-ljumpvector:
-	.long	startup_64-__START_KERNEL_map
-	.word	__KERNEL_CS
-
+.balign PAGE_SIZE
 ENTRY(stext)
 ENTRY(_stext)
 
@@ -303,7 +323,7 @@ NEXT_PAGE(level2_ident_pgt)
 	 * Don't set NX because code runs from these pages.
 	 */
 	PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
-	
+
 NEXT_PAGE(level2_kernel_pgt)
 	/* 40MB kernel mapping. The kernel code cannot be bigger than that.
 	   When you change this change KERNEL_TEXT_SIZE in page.h too. */
@@ -313,6 +333,9 @@ NEXT_PAGE(level2_kernel_pgt)
 	/* Module mapping starts here */
 	.fill	(PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
 
+NEXT_PAGE(level2_spare_pgt)
+	.fill   512,8,0
+
 #undef PMDS
 #undef NEXT_PAGE
 
@@ -330,6 +353,10 @@ gdt:
 	.endr
 #endif
 
+ENTRY(phys_base)
+	/* This must match the first entry in level2_kernel_pgt */
+	.quad   0x0000000000000000
+
 /* We need valid kernel segments for data and code in long mode too
  * IRET will check the segment types  kkeil 2000/10/28
 * Also sysret mandates a special GDT layout
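The "Fixup the kernel text+data virtual addresses" loop above walks one 4K page of level2_kernel_pgt entries and rebases every present one by the load delta. Roughly the same walk in C (a sketch with hypothetical types, not patch code):

    /* Sketch of the level2_kernel_pgt fixup loop above; illustrative only. */
    typedef unsigned long pmd_raw_t;

    static void fixup_kernel_pmds(pmd_raw_t pmd[512], unsigned long delta)
    {
            int i;

            for (i = 0; i < 512; i++)        /* 4096 bytes of 8-byte entries */
                    if (pmd[i] & 1)          /* present? (testq $1 above)    */
                            pmd[i] += delta; /* rebase the physical address  */
    }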
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index bfbe00763c68..16d183f67bc1 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -71,9 +71,10 @@ loop:
 	jmp	loop
 done:
 	/* go back to the original page tables */
-	leaq	init_level4_pgt(%rip), %rax
-	subq	$__START_KERNEL_map, %rax
+	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
+	addq	phys_base(%rip), %rax
 	movq	%rax, %cr3
+
 	/* Flush TLB, including "global" things (vmalloc) */
 	movq	mmu_cr4_features(%rip), %rax
 	movq	%rax, %rdx
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index 4974433bbf34..40a24d0df090 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -61,6 +61,8 @@ typedef struct { unsigned long pgd; } pgd_t;
 
 typedef struct { unsigned long pgprot; } pgprot_t;
 
+extern unsigned long phys_base;
+
 #define pte_val(x)	((x).pte)
 #define pmd_val(x)	((x).pmd)
 #define pud_val(x)	((x).pud)
@@ -101,14 +103,14 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 #define PAGE_OFFSET		__PAGE_OFFSET
 
 /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
    Otherwise you risk miscompilation. */
 #define __pa(x)			((unsigned long)(x) - PAGE_OFFSET)
 /* __pa_symbol should be used for C visible symbols.
    This seems to be the official gcc blessed way to do such arithmetic. */
 #define __pa_symbol(x)		\
 	({unsigned long v;  \
 	  asm("" : "=r" (v) : "0" (x)); \
-	  (v - __START_KERNEL_map); })
+	  ((v - __START_KERNEL_map) + phys_base); })
 
 #define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
 #ifdef CONFIG_FLATMEM
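With phys_base in the picture, a kernel symbol's physical address is its offset from __START_KERNEL_map rebased by where the image actually landed in physical memory. A usage sketch (some_symbol is a hypothetical symbol, not one from the patch):

    #include <asm/page.h>

    extern char some_symbol[];      /* hypothetical kernel symbol */

    static unsigned long example(void)
    {
            /* Offset within the kernel image, rebased by the runtime
             * phys_base, exactly as the new __pa_symbol() above computes. */
            return __pa_symbol(&some_symbol);
    }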