author	Vivek Goyal <vgoyal@in.ibm.com>	2007-05-02 13:27:07 -0400
committer	Andi Kleen <andi@basil.nowhere.org>	2007-05-02 13:27:07 -0400
commit	1ab60e0f72f71ec54831e525a3e1154f1c092408 (patch)
tree	bd7dd8bbff43e3e2e3597f2b7780e82a856bb9d7
parent	0dbf7028c0c1f266c9631139450a1502d3cd457e (diff)
[PATCH] x86-64: Relocatable Kernel Support
This patch modifies the x86_64 kernel so that it can be loaded and run at
any 2M aligned address, below 512G. The technique used is to compile the
decompressor with -fPIC and modify it so the decompressor is fully
relocatable. For the main kernel the page tables are modified so the
kernel remains at the same virtual address. In addition a variable
phys_base is kept that holds the physical address the kernel is loaded
at. __pa_symbol is modified to add that when we take the address of a
kernel symbol.

When loaded with a normal bootloader the decompressor will decompress
the kernel to 2M and it will run there. This both ensures the relocation
code is always exercised, and makes it easier to use 2M pages for the
kernel and the cpu.

AK: changed to not make RELOCATABLE default in Kconfig

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
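Both entry paths in the patch (the decompressor's startup_32/startup_64 and the kernel's head.S) round the load address up to the next 2MB boundary before using it. In C the rounding looks like this (a minimal sketch; LARGE_PAGE_SIZE is the x86-64 2MB large-page size from the kernel headers):

/* Sketch: round a load address up to the next 2MB boundary,
 * mirroring the LARGE_PAGE_MASK arithmetic in head.S. */
#define LARGE_PAGE_SIZE (1UL << 21)              /* 2MB */
#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE - 1))

static unsigned long align_to_2mb(unsigned long load_addr)
{
        return (load_addr + LARGE_PAGE_SIZE - 1) & LARGE_PAGE_MASK;
}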
-rw-r--r--	arch/x86_64/Kconfig	49
-rw-r--r--	arch/x86_64/boot/compressed/Makefile	12
-rw-r--r--	arch/x86_64/boot/compressed/head.S	322
-rw-r--r--	arch/x86_64/boot/compressed/misc.c	247
-rw-r--r--	arch/x86_64/boot/compressed/vmlinux.lds	44
-rw-r--r--	arch/x86_64/boot/compressed/vmlinux.scr	9
-rw-r--r--	arch/x86_64/kernel/head.S	233
-rw-r--r--	arch/x86_64/kernel/suspend_asm.S	7
-rw-r--r--	include/asm-x86_64/page.h	6
9 files changed, 597 insertions(+), 332 deletions(-)
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index b3dbf11eb82c..715632026073 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -565,23 +565,56 @@ config CRASH_DUMP
 	  PHYSICAL_START.
 	  For more details see Documentation/kdump/kdump.txt
 
+config RELOCATABLE
+	bool "Build a relocatable kernel (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  Builds a relocatable kernel. This enables loading and running
+	  a kernel binary from a different physical address than it has
+	  been compiled for.
+
+	  One use is for the kexec on panic case where the recovery kernel
+	  must live at a different physical address than the primary
+	  kernel.
+
+	  Note: If CONFIG_RELOCATABLE=y, the kernel runs from the address
+	  it has been loaded at, and the compile time physical address
+	  (CONFIG_PHYSICAL_START) is ignored.
+
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
-	default "0x1000000" if CRASH_DUMP
 	default "0x200000"
 	help
-	  This gives the physical address where the kernel is loaded. Normally
-	  for regular kernels this value is 0x200000 (2MB). But in the case
-	  of kexec on panic the fail safe kernel needs to run at a different
-	  address than the panic-ed kernel. This option is used to set the load
-	  address for kernels used to capture crash dump on being kexec'ed
-	  after panic. The default value for crash dump kernels is
-	  0x1000000 (16MB). This can also be set based on the "X" value as
+	  This gives the physical address where the kernel is loaded. It
+	  should be aligned to a 2MB boundary.
+
+	  If the kernel is not relocatable (CONFIG_RELOCATABLE=n) then
+	  bzImage will decompress itself to the above physical address and
+	  run from there. Otherwise, bzImage will run from the address where
+	  it has been loaded by the boot loader and will ignore the above
+	  physical address.
+
+	  In normal kdump cases one does not have to set/change this option
+	  as now bzImage can be compiled as a completely relocatable image
+	  (CONFIG_RELOCATABLE=y) and be used to load and run from a different
+	  address. This option is mainly useful for the folks who don't want
+	  to use a bzImage for capturing the crash dump and want to use a
+	  vmlinux instead.
+
+	  So if you are using bzImage for capturing the crash dump, leave
+	  the value here unchanged at 0x200000 and set CONFIG_RELOCATABLE=y.
+	  Otherwise, if you plan to use vmlinux for capturing the crash dump,
+	  change this value to the start of the reserved region (typically
+	  16MB, 0x1000000). In other words, it can be set based on the "X" value as
 	  specified in the "crashkernel=YM@XM" command line boot parameter
 	  passed to the panic-ed kernel. Typically this parameter is set as
 	  crashkernel=64M@16M. Please take a look at
 	  Documentation/kdump/kdump.txt for more details about crash dumps.
 
+	  Usage of bzImage for capturing the crash dump is advantageous as
+	  one does not have to build two kernels. The same kernel can be used
+	  as the production kernel and the capture kernel.
+
 	  Don't change this unless you know what you are doing.
 
 config SECCOMP
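As a worked example of the help text above (a sketch, not part of the patch): booting the production kernel with crashkernel=64M@16M reserves 64MB starting at X = 16MB, so a non-relocatable vmlinux capture kernel would be built with CONFIG_PHYSICAL_START matching that start address:

/* Hypothetical helper mirroring the "crashkernel=YM@XM" convention:
 * the capture kernel's CONFIG_PHYSICAL_START is X megabytes. */
static unsigned long physical_start_for_crashkernel(unsigned long x_mb)
{
        return x_mb << 20;      /* 16MB -> 0x1000000 */
}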
diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile
index e70fa6e1da08..705a3e33d7e1 100644
--- a/arch/x86_64/boot/compressed/Makefile
+++ b/arch/x86_64/boot/compressed/Makefile
@@ -8,16 +8,14 @@
 
 targets		:= vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o
 EXTRA_AFLAGS	:= -traditional
-AFLAGS	:= $(subst -m64,-m32,$(AFLAGS))
 
 # cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with
 # -m32
-CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing
-LDFLAGS := -m elf_i386
+CFLAGS := -m64 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing -fPIC -mcmodel=small -fno-builtin
+LDFLAGS := -m elf_x86_64
 
-LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 -m elf_i386
-
-$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
+LDFLAGS_vmlinux := -T
+$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
 	$(call if_changed,ld)
 	@:
 
@@ -27,7 +25,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
 $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
 	$(call if_changed,gzip)
 
-LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
+LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
 
 $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
 	$(call if_changed,ld)
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S
index 6f55565e4d42..c353a9266ea4 100644
--- a/arch/x86_64/boot/compressed/head.S
+++ b/arch/x86_64/boot/compressed/head.S
@@ -26,116 +26,262 @@
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
+#include <asm/pgtable.h>
 #include <asm/page.h>
+#include <asm/msr.h>
 
+.section ".text.head"
 	.code32
 	.globl startup_32
 
 startup_32:
 	cld
 	cli
-	movl $(__KERNEL_DS),%eax
-	movl %eax,%ds
-	movl %eax,%es
-	movl %eax,%fs
-	movl %eax,%gs
-
-	lss stack_start,%esp
-	xorl %eax,%eax
-1:	incl %eax		# check that A20 really IS enabled
-	movl %eax,0x000000	# loop forever if it isn't
-	cmpl %eax,0x100000
-	je 1b
+	movl $(__KERNEL_DS), %eax
+	movl %eax, %ds
+	movl %eax, %es
+	movl %eax, %ss
+
+/* Calculate the delta between where we were compiled to run
+ * at and where we were actually loaded at. This can only be done
+ * with a short local call on x86. Nothing else will tell us what
+ * address we are running at. The reserved chunk of the real-mode
+ * data at 0x34-0x3f is used as the stack for this calculation.
+ * Only 4 bytes are needed.
+ */
+	leal 0x40(%esi), %esp
+	call 1f
+1:	popl %ebp
+	subl $1b, %ebp
+
+/* Compute the delta between where we were compiled to run at
+ * and where the code will actually run at.
+ */
+/* %ebp contains the address we are loaded at by the boot loader and %ebx
+ * contains the address where we should move the kernel image temporarily
+ * for safe in-place decompression.
+ */
+
+#ifdef CONFIG_RELOCATABLE
+	movl %ebp, %ebx
+	addl $(LARGE_PAGE_SIZE -1), %ebx
+	andl $LARGE_PAGE_MASK, %ebx
+#else
+	movl $CONFIG_PHYSICAL_START, %ebx
+#endif
+
+	/* Replace the compressed data size with the uncompressed size */
+	subl input_len(%ebp), %ebx
+	movl output_len(%ebp), %eax
+	addl %eax, %ebx
+	/* Add 8 bytes for every 32K input block */
+	shrl $12, %eax
+	addl %eax, %ebx
+	/* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
+	addl $(32768 + 18 + 4095), %ebx
+	andl $~4095, %ebx
 
 /*
- * Initialize eflags. Some BIOS's leave bits like NT set. This would
- * confuse the debugger if this code is traced.
- * XXX - best to initialize before switching to protected mode.
+ * Prepare for entering 64 bit mode
  */
-	pushl $0
-	popfl
+
+	/* Load new GDT with the 64bit segments using 32bit descriptor */
+	leal gdt(%ebp), %eax
+	movl %eax, gdt+2(%ebp)
+	lgdt gdt(%ebp)
+
+	/* Enable PAE mode */
+	xorl %eax, %eax
+	orl $(1 << 5), %eax
+	movl %eax, %cr4
+
+	/*
+	 * Build early 4G boot pagetable
+	 */
+	/* Initialize Page tables to 0 */
+	leal pgtable(%ebx), %edi
+	xorl %eax, %eax
+	movl $((4096*6)/4), %ecx
+	rep stosl
+
+	/* Build Level 4 */
+	leal pgtable + 0(%ebx), %edi
+	leal 0x1007 (%edi), %eax
+	movl %eax, 0(%edi)
+
+	/* Build Level 3 */
+	leal pgtable + 0x1000(%ebx), %edi
+	leal 0x1007(%edi), %eax
+	movl $4, %ecx
+1:	movl %eax, 0x00(%edi)
+	addl $0x00001000, %eax
+	addl $8, %edi
+	decl %ecx
+	jnz 1b
+
+	/* Build Level 2 */
+	leal pgtable + 0x2000(%ebx), %edi
+	movl $0x00000183, %eax
+	movl $2048, %ecx
+1:	movl %eax, 0(%edi)
+	addl $0x00200000, %eax
+	addl $8, %edi
+	decl %ecx
+	jnz 1b
+
+	/* Enable the boot page tables */
+	leal pgtable(%ebx), %eax
+	movl %eax, %cr3
+
+	/* Enable Long mode in EFER (Extended Feature Enable Register) */
+	movl $MSR_EFER, %ecx
+	rdmsr
+	btsl $_EFER_LME, %eax
+	wrmsr
+
+	/* Setup for the jump to 64bit mode
+	 *
+	 * When the jump is performed we will be in long mode but
+	 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
+	 * (and in turn EFER.LMA = 1). To jump into 64bit mode we use
+	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+	 * We place all of the values on our mini stack so lret can
+	 * be used to perform that far jump.
+	 */
+	pushl $__KERNEL_CS
+	leal startup_64(%ebp), %eax
+	pushl %eax
+
+	/* Enter paged protected Mode, activating Long Mode */
+	movl $0x80000001, %eax	/* Enable Paging and Protected mode */
+	movl %eax, %cr0
+
+	/* Jump from 32bit compatibility mode into 64bit mode. */
+	lret
+
+	/* Be careful here startup_64 needs to be at a predictable
+	 * address so I can export it in an ELF header. Bootloaders
+	 * should look at the ELF header to find this address, as
+	 * it may change in the future.
+	 */
+	.code64
+	.org 0x100
+ENTRY(startup_64)
+	/* We come here either from startup_32 or directly from a
+	 * 64bit bootloader. If we come here from a bootloader we depend on
+	 * an identity mapped page table being provided that maps our
+	 * entire text+data+bss and hopefully all of memory.
+	 */
+
+	/* Setup data segments. */
+	xorl %eax, %eax
+	movl %eax, %ds
+	movl %eax, %es
+	movl %eax, %ss
+
+	/* Compute the decompressed kernel start address. It is where
+	 * we were loaded at aligned to a 2M boundary. %rbp contains the
+	 * decompressed kernel start address.
+	 *
+	 * If it is a relocatable kernel then decompress and run the kernel
+	 * from the load address aligned to a 2MB boundary, otherwise decompress
+	 * and run the kernel from CONFIG_PHYSICAL_START
+	 */
+
+	/* Start with the delta to where the kernel will run at. */
+#ifdef CONFIG_RELOCATABLE
+	leaq startup_32(%rip) /* - $startup_32 */, %rbp
+	addq $(LARGE_PAGE_SIZE - 1), %rbp
+	andq $LARGE_PAGE_MASK, %rbp
+	movq %rbp, %rbx
+#else
+	movq $CONFIG_PHYSICAL_START, %rbp
+	movq %rbp, %rbx
+#endif
+
+	/* Replace the compressed data size with the uncompressed size */
+	movl input_len(%rip), %eax
+	subq %rax, %rbx
+	movl output_len(%rip), %eax
+	addq %rax, %rbx
+	/* Add 8 bytes for every 32K input block */
+	shrq $12, %rax
+	addq %rax, %rbx
+	/* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
+	addq $(32768 + 18 + 4095), %rbx
+	andq $~4095, %rbx
+
+/* Copy the compressed kernel to the end of our buffer
+ * where decompression in place becomes safe.
+ */
+	leaq _end(%rip), %r8
+	leaq _end(%rbx), %r9
+	movq $_end /* - $startup_32 */, %rcx
+1:	subq $8, %r8
+	subq $8, %r9
+	movq 0(%r8), %rax
+	movq %rax, 0(%r9)
+	subq $8, %rcx
+	jnz 1b
+
+/*
+ * Jump to the relocated address.
+ */
+	leaq relocated(%rbx), %rax
+	jmp *%rax
+
+.section ".text"
+relocated:
+
 /*
  * Clear BSS
  */
-	xorl %eax,%eax
-	movl $_edata,%edi
-	movl $_end,%ecx
-	subl %edi,%ecx
+	xorq %rax, %rax
+	leaq _edata(%rbx), %rdi
+	leaq _end(%rbx), %rcx
+	subq %rdi, %rcx
 	cld
 	rep
 	stosb
+
+	/* Setup the stack */
+	leaq user_stack_end(%rip), %rsp
+
+	/* zero EFLAGS after setting rsp */
+	pushq $0
+	popfq
+
 /*
  * Do the decompression, and jump to the new kernel..
  */
-	subl $16,%esp	# place for structure on the stack
-	movl %esp,%eax
-	pushl %esi	# real mode pointer as second arg
-	pushl %eax	# address of structure as first arg
-	call decompress_kernel
-	orl  %eax,%eax
-	jnz  3f
-	addl $8,%esp
-	xorl %ebx,%ebx
-	ljmp $(__KERNEL_CS), $__PHYSICAL_START
+	pushq %rsi			# Save the real mode argument
+	movq %rsi, %rdi			# real mode address
+	leaq _heap(%rip), %rsi		# _heap
+	leaq input_data(%rip), %rdx	# input_data
+	movl input_len(%rip), %eax
+	movq %rax, %rcx			# input_len
+	movq %rbp, %r8			# output
+	call decompress_kernel
+	popq %rsi
 
-/*
- * We come here, if we were loaded high.
- * We need to move the move-in-place routine down to 0x1000
- * and then start it with the buffer addresses in registers,
- * which we got from the stack.
- */
-3:
-	movl %esi,%ebx
-	movl $move_routine_start,%esi
-	movl $0x1000,%edi
-	movl $move_routine_end,%ecx
-	subl %esi,%ecx
-	addl $3,%ecx
-	shrl $2,%ecx
-	cld
-	rep
-	movsl
-
-	popl %esi	# discard the address
-	addl $4,%esp	# real mode pointer
-	popl %esi	# low_buffer_start
-	popl %ecx	# lcount
-	popl %edx	# high_buffer_start
-	popl %eax	# hcount
-	movl $__PHYSICAL_START,%edi
-	cli		# make sure we don't get interrupted
-	ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine
 
 /*
- * Routine (template) for moving the decompressed kernel in place,
- * if we were high loaded. This _must_ PIC-code !
+ * Jump to the decompressed kernel.
  */
-move_routine_start:
-	movl %ecx,%ebp
-	shrl $2,%ecx
-	rep
-	movsl
-	movl %ebp,%ecx
-	andl $3,%ecx
-	rep
-	movsb
-	movl %edx,%esi
-	movl %eax,%ecx	# NOTE: rep movsb won't move if %ecx == 0
-	addl $3,%ecx
-	shrl $2,%ecx
-	rep
-	movsl
-	movl %ebx,%esi	# Restore setup pointer
-	xorl %ebx,%ebx
-	ljmp $(__KERNEL_CS), $__PHYSICAL_START
-move_routine_end:
+	jmp *%rbp
 
-
-/* Stack for uncompression */
-	.align 32
-user_stack:
+	.data
+gdt:
+	.word gdt_end - gdt
+	.long gdt
+	.word 0
+	.quad 0x0000000000000000	/* NULL descriptor */
+	.quad 0x00af9a000000ffff	/* __KERNEL_CS */
+	.quad 0x00cf92000000ffff	/* __KERNEL_DS */
+gdt_end:
+	.bss
+/* Stack for uncompression */
+	.balign 4
+user_stack:
 	.fill 4096,4,0
-stack_start:
-	.long user_stack+4096
-	.word __KERNEL_DS
-
+user_stack_end:
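Both the 32-bit and 64-bit entry paths above size the in-place decompression buffer the same way. A C rendering of that arithmetic (a sketch; the names mirror the assembly and the gzip overhead analysis in misc.c below):

/* Sketch: given the 2M-aligned output address and the piggy.o sizes,
 * compute where the compressed image is copied so that decompressing
 * in place can never overrun its own input. */
static unsigned long safe_copy_offset(unsigned long output,
                                      unsigned long input_len,
                                      unsigned long output_len)
{
        unsigned long addr = output - input_len + output_len;
        addr += output_len >> 12;          /* 8 bytes per 32K input block */
        addr += 32768 + 18 + 4095;         /* worst-case block + gzip header slack */
        return addr & ~4095UL;             /* 4K align, as in head.S */
}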
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
index 3755b2e394d0..fee54dbf1749 100644
--- a/arch/x86_64/boot/compressed/misc.c
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -9,10 +9,95 @@
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
 
+#define _LINUX_STRING_H_ 1
+#define __LINUX_BITMAP_H 1
+
+#include <linux/linkage.h>
 #include <linux/screen_info.h>
 #include <asm/io.h>
 #include <asm/page.h>
 
+/* WARNING!!
+ * This code is compiled with -fPIC and it is relocated dynamically
+ * at run time, but no relocation processing is performed.
+ * This means that it is not safe to place pointers in static structures.
+ */
+
+/*
+ * Getting to provably safe in-place decompression is hard.
+ * Worst case behaviours need to be analyzed.
+ * Background information:
+ *
+ * The file layout is:
+ *    magic[2]
+ *    method[1]
+ *    flags[1]
+ *    timestamp[4]
+ *    extraflags[1]
+ *    os[1]
+ *    compressed data blocks[N]
+ *    crc[4] orig_len[4]
+ *
+ * resulting in 18 bytes of non compressed data overhead.
+ *
+ * Files divided into blocks
+ * 1 bit (last block flag)
+ * 2 bits (block type)
+ *
+ * 1 block occurs every 32K -1 bytes or when 50% compression has been achieved.
+ * The smallest block type encoding is always used.
+ *
+ * stored:
+ *    32 bits length in bytes.
+ *
+ * fixed:
+ *    magic fixed tree.
+ *    symbols.
+ *
+ * dynamic:
+ *    dynamic tree encoding.
+ *    symbols.
+ *
+ *
+ * The buffer for decompression in place is the length of the
+ * uncompressed data, plus a small amount extra to keep the algorithm safe.
+ * The compressed data is placed at the end of the buffer. The output
+ * pointer is placed at the start of the buffer and the input pointer
+ * is placed where the compressed data starts. Problems will occur
+ * when the output pointer overruns the input pointer.
+ *
+ * The output pointer can only overrun the input pointer if the input
+ * pointer is moving faster than the output pointer. A condition only
+ * triggered by data whose compressed form is larger than the uncompressed
+ * form.
+ *
+ * The worst case at the block level is a growth of the compressed data
+ * of 5 bytes per 32767 bytes.
+ *
+ * The worst case internal to a compressed block is very hard to figure.
+ * The worst case can at least be bounded by having one bit that represents
+ * 32764 bytes and then all of the rest of the bytes representing the very
+ * very last byte.
+ *
+ * All of which is enough to compute an amount of extra data that is required
+ * to be safe. To avoid problems at the block level allocating 5 extra bytes
+ * per 32767 bytes of data is sufficient. To avoid problems internal to a block
+ * adding an extra 32767 bytes (the worst case uncompressed block size) is
+ * sufficient, to ensure that in the worst case the decompressed data for
+ * a block will stop the byte before the compressed data for a block begins.
+ * To avoid problems with the compressed data's meta information an extra 18
+ * bytes are needed. Leading to the formula:
+ *
+ * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
+ *
+ * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
+ * Adding 32768 instead of 32767 just makes for round numbers.
+ * Adding the decompressor_size is necessary as it must live after all
+ * of the data as well. Last I measured the decompressor is about 14K.
+ * 10K of actual data and 4K of bss.
+ *
+ */
+
 /*
  * gzip declarations
  */
@@ -28,15 +113,20 @@ typedef unsigned char uch;
 typedef unsigned short ush;
 typedef unsigned long ulg;
 
-#define WSIZE 0x8000		/* Window size must be at least 32k, */
-				/* and a power of two */
+#define WSIZE 0x80000000	/* Window size must be at least 32k,
+				 * and a power of two
+				 * We don't actually have a window just
+				 * a huge output buffer so I report
+				 * a 2G window size, as that should
+				 * always be larger than our output buffer.
+				 */
 
 static uch *inbuf;	/* input buffer */
-static uch window[WSIZE];	/* Sliding window buffer */
+static uch *window;	/* Sliding window buffer, (and final output buffer) */
 
-static unsigned insize = 0;	/* valid bytes in inbuf */
-static unsigned inptr = 0;	/* index of next byte to be processed in inbuf */
-static unsigned outcnt = 0;	/* bytes in output buffer */
+static unsigned insize;	/* valid bytes in inbuf */
+static unsigned inptr;	/* index of next byte to be processed in inbuf */
+static unsigned outcnt;	/* bytes in output buffer */
 
 /* gzip flag byte */
 #define ASCII_FLAG	0x01	/* bit 0 set: file probably ASCII text */
@@ -87,8 +177,6 @@ extern unsigned char input_data[];
 extern int input_len;
 
 static long bytes_out = 0;
-static uch *output_data;
-static unsigned long output_ptr = 0;
 
 static void *malloc(int size);
 static void free(void *where);
@@ -98,17 +186,10 @@ static void *memcpy(void *dest, const void *src, unsigned n);
 
 static void putstr(const char *);
 
-extern int end;
-static long free_mem_ptr = (long)&end;
+static long free_mem_ptr;
 static long free_mem_end_ptr;
 
-#define INPLACE_MOVE_ROUTINE	0x1000
-#define LOW_BUFFER_START	0x2000
-#define LOW_BUFFER_MAX		0x90000
-#define HEAP_SIZE		0x3000
-static unsigned int low_buffer_end, low_buffer_size;
-static int high_loaded = 0;
-static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
+#define HEAP_SIZE	0x6000
 
 static char *vidmem = (char *)0xb8000;
 static int vidport;
@@ -218,58 +299,31 @@ static void* memcpy(void* dest, const void* src, unsigned n)
  */
 static int fill_inbuf(void)
 {
-	if (insize != 0) {
-		error("ran out of input data");
-	}
-
-	inbuf = input_data;
-	insize = input_len;
-	inptr = 1;
-	return inbuf[0];
+	error("ran out of input data");
+	return 0;
 }
 
 /* ===========================================================================
  * Write the output window window[0..outcnt-1] and update crc and bytes_out.
  * (Used for the decompressed data only.)
  */
-static void flush_window_low(void)
-{
-	ulg c = crc;	/* temporary variable */
-	unsigned n;
-	uch *in, *out, ch;
-
-	in = window;
-	out = &output_data[output_ptr];
-	for (n = 0; n < outcnt; n++) {
-		ch = *out++ = *in++;
-		c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
-	}
-	crc = c;
-	bytes_out += (ulg)outcnt;
-	output_ptr += (ulg)outcnt;
-	outcnt = 0;
-}
-
-static void flush_window_high(void)
-{
-	ulg c = crc;	/* temporary variable */
-	unsigned n;
-	uch *in, ch;
-	in = window;
-	for (n = 0; n < outcnt; n++) {
-		ch = *output_data++ = *in++;
-		if ((ulg)output_data == low_buffer_end) output_data = high_buffer_start;
-		c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
-	}
-	crc = c;
-	bytes_out += (ulg)outcnt;
-	outcnt = 0;
-}
-
 static void flush_window(void)
 {
-	if (high_loaded) flush_window_high();
-	else flush_window_low();
+	/* With my window equal to my output buffer
+	 * I only need to compute the crc here.
+	 */
+	ulg c = crc;	/* temporary variable */
+	unsigned n;
+	uch *in, ch;
+
+	in = window;
+	for (n = 0; n < outcnt; n++) {
+		ch = *in++;
+		c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
+	}
+	crc = c;
+	bytes_out += (ulg)outcnt;
+	outcnt = 0;
 }
 
 static void error(char *x)
@@ -281,57 +335,8 @@ static void error(char *x)
 	while(1);	/* Halt */
 }
 
-static void setup_normal_output_buffer(void)
-{
-#ifdef STANDARD_MEMORY_BIOS_CALL
-	if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory");
-#else
-	if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory");
-#endif
-	output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */
-	free_mem_end_ptr = (long)real_mode;
-}
-
-struct moveparams {
-	uch *low_buffer_start;  int lcount;
-	uch *high_buffer_start; int hcount;
-};
-
-static void setup_output_buffer_if_we_run_high(struct moveparams *mv)
-{
-	high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE);
-#ifdef STANDARD_MEMORY_BIOS_CALL
-	if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");
-#else
-	if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");
-#endif
-	mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START;
-	low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX
-	  ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff;
-	low_buffer_size = low_buffer_end - LOW_BUFFER_START;
-	high_loaded = 1;
-	free_mem_end_ptr = (long)high_buffer_start;
-	if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
-		high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size);
-		mv->hcount = 0; /* say: we need not to move high_buffer */
-	}
-	else mv->hcount = -1;
-	mv->high_buffer_start = high_buffer_start;
-}
-
-static void close_output_buffer_if_we_run_high(struct moveparams *mv)
-{
-	if (bytes_out > low_buffer_size) {
-		mv->lcount = low_buffer_size;
-		if (mv->hcount)
-			mv->hcount = bytes_out - low_buffer_size;
-	} else {
-		mv->lcount = bytes_out;
-		mv->hcount = 0;
-	}
-}
-
-int decompress_kernel(struct moveparams *mv, void *rmode)
+asmlinkage void decompress_kernel(void *rmode, unsigned long heap,
+				  uch *input_data, unsigned long input_len, uch *output)
 {
 	real_mode = rmode;
 
@@ -346,13 +351,21 @@ int decompress_kernel(struct moveparams *mv, void *rmode)
 	lines = RM_SCREEN_INFO.orig_video_lines;
 	cols = RM_SCREEN_INFO.orig_video_cols;
 
-	if (free_mem_ptr < 0x100000) setup_normal_output_buffer();
-	else setup_output_buffer_if_we_run_high(mv);
+	window = output;		/* Output buffer (Normally at 1M) */
+	free_mem_ptr = heap;		/* Heap */
+	free_mem_end_ptr = heap + HEAP_SIZE;
+	inbuf = input_data;		/* Input buffer */
+	insize = input_len;
+	inptr = 0;
+
+	if ((ulg)output & 0x1fffffUL)
+		error("Destination address not 2M aligned");
+	if ((ulg)output >= 0xffffffffffUL)
+		error("Destination address too large");
 
 	makecrc();
 	putstr(".\nDecompressing Linux...");
 	gunzip();
 	putstr("done.\nBooting the kernel.\n");
-	if (high_loaded) close_output_buffer_if_we_run_high(mv);
-	return high_loaded;
+	return;
 }
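The comment block added to misc.c above derives the safety margin for in-place decompression. Restated as plain C (a sketch; decompressor_size stands for the decompressor's own footprint, measured at roughly 14K in the comment):

/* extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size */
static unsigned long extra_bytes(unsigned long uncompressed_size,
                                 unsigned long decompressor_size)
{
        return (uncompressed_size >> 12) + 32768 + 18 + decompressor_size;
}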
diff --git a/arch/x86_64/boot/compressed/vmlinux.lds b/arch/x86_64/boot/compressed/vmlinux.lds
new file mode 100644
index 000000000000..94c13e557fb4
--- /dev/null
+++ b/arch/x86_64/boot/compressed/vmlinux.lds
@@ -0,0 +1,44 @@
+OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(startup_64)
+SECTIONS
+{
+	/* Be careful parts of head.S assume startup_32 is at
+	 * address 0.
+	 */
+	. = 0;
+	.text : {
+		_head = . ;
+		*(.text.head)
+		_ehead = . ;
+		*(.text.compressed)
+		_text = .;	/* Text */
+		*(.text)
+		*(.text.*)
+		_etext = . ;
+	}
+	.rodata : {
+		_rodata = . ;
+		*(.rodata)	/* read-only data */
+		*(.rodata.*)
+		_erodata = . ;
+	}
+	.data : {
+		_data = . ;
+		*(.data)
+		*(.data.*)
+		_edata = . ;
+	}
+	.bss : {
+		_bss = . ;
+		*(.bss)
+		*(.bss.*)
+		*(COMMON)
+		. = ALIGN(8);
+		_end = . ;
+		. = ALIGN(4096);
+		pgtable = . ;
+		. = . + 4096 * 6;
+		_heap = .;
+	}
+}
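The ". = . + 4096 * 6" reservation matches the early page-table builder in compressed/head.S: one level-4 page, one level-3 page, and four level-2 pages whose 4 * 512 large-page entries identity-map 4GB. A sketch of the accounting:

/* Sketch: why the linker script reserves exactly six 4K pages. */
enum {
        PT_PAGE  = 4096,
        L4_PAGES = 1,           /* one entry used                     */
        L3_PAGES = 1,           /* four entries used                  */
        L2_PAGES = 4,           /* 4 * 512 entries * 2MB = 4GB mapped */
        PGTABLE_SIZE = PT_PAGE * (L4_PAGES + L3_PAGES + L2_PAGES),
};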
diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr
index 1ed9d791f863..bd1429ce193e 100644
--- a/arch/x86_64/boot/compressed/vmlinux.scr
+++ b/arch/x86_64/boot/compressed/vmlinux.scr
@@ -1,9 +1,10 @@
 SECTIONS
 {
-  .data : {
+  .text.compressed : {
 	input_len = .;
 	LONG(input_data_end - input_data) input_data = .;
 	*(.data)
-	input_data_end = .;
+	output_len = . - 4;
+	input_data_end = .;
   }
 }
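The new "output_len = . - 4" symbol works because a gzip stream ends with its ISIZE field: the last 4 bytes hold the uncompressed length modulo 2^32, little-endian. A hedged C sketch of reading that trailer:

/* Sketch: recover the uncompressed length from a gzip blob's trailer. */
static unsigned int gzip_isize(const unsigned char *blob, unsigned long len)
{
        const unsigned char *p = blob + len - 4;
        return (unsigned int)p[0] | ((unsigned int)p[1] << 8) |
               ((unsigned int)p[2] << 16) | ((unsigned int)p[3] << 24);
}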
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index c211e52f1333..36aa98a6d15c 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -5,6 +5,7 @@
  * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
  * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
  * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
+ * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
  */
 
 
@@ -17,95 +18,127 @@
 #include <asm/page.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
 
 /* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
- * because we need identity-mapped pages on setup so define __START_KERNEL to
- * 0x100000 for this stage
- *
+ * because we need identity-mapped pages.
+ *
  */
 
 	.text
 	.section .bootstrap.text
-	.code32
-	.globl startup_32
-/* %bx:	1 if coming from smp trampoline on secondary cpu */
-startup_32:
-
+	.code64
+	.globl startup_64
+startup_64:
 
 	/*
-	 * At this point the CPU runs in 32bit protected mode (CS.D = 1) with
-	 * paging disabled and the point of this file is to switch to 64bit
-	 * long mode with a kernel mapping for kernel and to jump into the
-	 * kernel virtual addresses.
-	 * There is no stack until we set one up.
+	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+	 * and someone has loaded an identity mapped page table
+	 * for us. These identity mapped page tables map all of the
+	 * kernel pages and possibly all of memory.
+	 *
+	 * %esi holds a physical pointer to real_mode_data.
+	 *
+	 * We come here either directly from a 64bit bootloader, or from
+	 * arch/x86_64/boot/compressed/head.S.
+	 *
+	 * We only come here initially at boot, nothing else comes here.
+	 *
+	 * Since we may be loaded at an address different from what we were
+	 * compiled to run at we first fixup the physical addresses in our page
+	 * tables and then reload them.
 	 */
 
-	/* Initialize the %ds segment register */
-	movl $__KERNEL_DS,%eax
-	movl %eax,%ds
-
-	/* Load new GDT with the 64bit segments using 32bit descriptor */
-	lgdt	pGDT32 - __START_KERNEL_map
-
-	/* If the CPU doesn't support CPUID this will double fault.
-	 * Unfortunately it is hard to check for CPUID without a stack.
+	/* Compute the delta between the address I am compiled to run at and the
+	 * address I am actually running at.
 	 */
-
-	/* Check if extended functions are implemented */
-	movl	$0x80000000, %eax
-	cpuid
-	cmpl	$0x80000000, %eax
-	jbe	no_long_mode
-	/* Check if long mode is implemented */
-	mov	$0x80000001, %eax
-	cpuid
-	btl	$29, %edx
-	jnc	no_long_mode
-
-	/*
-	 * Prepare for entering 64bits mode
+	leaq	_text(%rip), %rbp
+	subq	$_text - __START_KERNEL_map, %rbp
+
+	/* Is the address not 2M aligned? */
+	movq	%rbp, %rax
+	andl	$~LARGE_PAGE_MASK, %eax
+	testl	%eax, %eax
+	jnz	bad_address
+
+	/* Is the address too large? */
+	leaq	_text(%rip), %rdx
+	movq	$PGDIR_SIZE, %rax
+	cmpq	%rax, %rdx
+	jae	bad_address
+
+	/* Fixup the physical addresses in the page table
 	 */
+	addq	%rbp, init_level4_pgt + 0(%rip)
+	addq	%rbp, init_level4_pgt + (258*8)(%rip)
+	addq	%rbp, init_level4_pgt + (511*8)(%rip)
+
+	addq	%rbp, level3_ident_pgt + 0(%rip)
+	addq	%rbp, level3_kernel_pgt + (510*8)(%rip)
+
+	/* Add an Identity mapping if I am above 1G */
+	leaq	_text(%rip), %rdi
+	andq	$LARGE_PAGE_MASK, %rdi
+
+	movq	%rdi, %rax
+	shrq	$PUD_SHIFT, %rax
+	andq	$(PTRS_PER_PUD - 1), %rax
+	jz	ident_complete
+
+	leaq	(level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+	leaq	level3_ident_pgt(%rip), %rbx
+	movq	%rdx, 0(%rbx, %rax, 8)
+
+	movq	%rdi, %rax
+	shrq	$PMD_SHIFT, %rax
+	andq	$(PTRS_PER_PMD - 1), %rax
+	leaq	__PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
+	leaq	level2_spare_pgt(%rip), %rbx
+	movq	%rdx, 0(%rbx, %rax, 8)
+ident_complete:
+
+	/* Fixup the kernel text+data virtual addresses
+	 */
+	leaq	level2_kernel_pgt(%rip), %rdi
+	leaq	4096(%rdi), %r8
+	/* See if it is a valid page table entry */
+1:	testq	$1, 0(%rdi)
+	jz	2f
+	addq	%rbp, 0(%rdi)
+	/* Go to the next page */
+2:	addq	$8, %rdi
+	cmp	%r8, %rdi
+	jne	1b
+
+	/* Fixup phys_base */
+	addq	%rbp, phys_base(%rip)
 
-	/* Enable PAE mode */
-	xorl	%eax, %eax
-	btsl	$5, %eax
-	movl	%eax, %cr4
-
-	/* Setup early boot stage 4 level pagetables */
-	movl	$(init_level4_pgt - __START_KERNEL_map), %eax
-	movl	%eax, %cr3
-
-	/* Setup EFER (Extended Feature Enable Register) */
-	movl	$MSR_EFER, %ecx
-	rdmsr
-
-	/* Enable Long Mode */
-	btsl	$_EFER_LME, %eax
-
-	/* Make changes effective */
-	wrmsr
+#ifdef CONFIG_SMP
+	addq	%rbp, trampoline_level4_pgt + 0(%rip)
+	addq	%rbp, trampoline_level4_pgt + (511*8)(%rip)
+#endif
+#ifdef CONFIG_ACPI_SLEEP
+	addq	%rbp, wakeup_level4_pgt + 0(%rip)
+	addq	%rbp, wakeup_level4_pgt + (511*8)(%rip)
+#endif
 
-	xorl	%eax, %eax
-	btsl	$31, %eax		/* Enable paging and in turn activate Long Mode */
-	btsl	$0, %eax		/* Enable protected mode */
-	/* Make changes effective */
-	movl	%eax, %cr0
-	/*
-	 * At this point we're in long mode but in 32bit compatibility mode
-	 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
-	 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
-	 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+	/* Due to ENTRY(), sometimes the empty space gets filled with
+	 * zeros. Better take a jmp than relying on empty space being
+	 * filled with 0x90 (nop)
 	 */
-	ljmp	$__KERNEL_CS, $(startup_64 - __START_KERNEL_map)
-
-	.code64
-	.org 0x100
-	.globl startup_64
-startup_64:
+	jmp	secondary_startup_64
 ENTRY(secondary_startup_64)
-	/* We come here either from startup_32
-	 * or directly from a 64bit bootloader.
-	 * Since we may have come directly from a bootloader we
-	 * reload the page tables here.
+	/*
+	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+	 * and someone has loaded a mapped page table.
+	 *
+	 * %esi holds a physical pointer to real_mode_data.
+	 *
+	 * We come here either from startup_64 (using physical addresses)
+	 * or from trampoline.S (using virtual addresses).
+	 *
+	 * Using virtual addresses from trampoline.S removes the need
+	 * to have any identity mapped pages in the kernel page table
+	 * after the boot processor executes this code.
 	 */
 
 	/* Enable PAE mode and PGE */
@@ -116,8 +149,14 @@ ENTRY(secondary_startup_64)
 
 	/* Setup early boot stage 4 level pagetables. */
 	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
+	addq	phys_base(%rip), %rax
 	movq	%rax, %cr3
 
+	/* Ensure I am executing from virtual addresses */
+	movq	$1f, %rax
+	jmp	*%rax
+1:
+
 	/* Check if nx is implemented */
 	movl	$0x80000001, %eax
 	cpuid
@@ -126,17 +165,11 @@ ENTRY(secondary_startup_64)
 	/* Setup EFER (Extended Feature Enable Register) */
 	movl	$MSR_EFER, %ecx
 	rdmsr
-
-	/* Enable System Call */
-	btsl	$_EFER_SCE, %eax
-
-	/* No Execute supported? */
-	btl	$20,%edi
+	btsl	$_EFER_SCE, %eax	/* Enable System Call */
+	btl	$20,%edi		/* No Execute supported? */
 	jnc	1f
 	btsl	$_EFER_NX, %eax
-1:
-	/* Make changes effective */
-	wrmsr
+1:	wrmsr				/* Make changes effective */
 
 	/* Setup cr0 */
 #define CR0_PM		1		/* protected mode */
@@ -163,7 +196,7 @@ ENTRY(secondary_startup_64)
 	 * addresses where we're currently running on. We have to do that here
 	 * because in 32bit we couldn't load a 64bit linear address.
 	 */
-	lgdt	cpu_gdt_descr
+	lgdt	cpu_gdt_descr(%rip)
 
 	/* set up data segments. actually 0 would do too */
 	movl	$__KERNEL_DS,%eax
@@ -214,6 +247,9 @@ initial_code:
 init_rsp:
 	.quad	init_thread_union+THREAD_SIZE-8
 
+bad_address:
+	jmp bad_address
+
 ENTRY(early_idt_handler)
 	cmpl $2,early_recursion_flag(%rip)
 	jz 1f
@@ -242,23 +278,7 @@ early_idt_msg:
 early_idt_ripmsg:
 	.asciz "RIP %s\n"
 
-.code32
-ENTRY(no_long_mode)
-	/* This isn't an x86-64 CPU so hang */
-1:
-	jmp 1b
-
-.org 0xf00
-	.globl pGDT32
-pGDT32:
-	.word	gdt_end-cpu_gdt_table-1
-	.long	cpu_gdt_table-__START_KERNEL_map
-
-.org 0xf10
-ljumpvector:
-	.long	startup_64-__START_KERNEL_map
-	.word	__KERNEL_CS
-
+.balign PAGE_SIZE
 ENTRY(stext)
 ENTRY(_stext)
 
@@ -303,7 +323,7 @@ NEXT_PAGE(level2_ident_pgt)
 	 * Don't set NX because code runs from these pages.
 	 */
 	PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
 
 NEXT_PAGE(level2_kernel_pgt)
 	/* 40MB kernel mapping. The kernel code cannot be bigger than that.
 	   When you change this change KERNEL_TEXT_SIZE in page.h too. */
@@ -313,6 +333,9 @@ NEXT_PAGE(level2_kernel_pgt)
 	/* Module mapping starts here */
 	.fill	(PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
 
+NEXT_PAGE(level2_spare_pgt)
+	.fill	512,8,0
+
 #undef PMDS
 #undef NEXT_PAGE
 
@@ -330,6 +353,10 @@ gdt:
 	.endr
 #endif
 
+ENTRY(phys_base)
+	/* This must match the first entry in level2_kernel_pgt */
+	.quad	0x0000000000000000
+
 /* We need valid kernel segments for data and code in long mode too
  * IRET will check the segment types kkeil 2000/10/28
  * Also sysret mandates a special GDT layout
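The fixup loop in startup_64 above walks level2_kernel_pgt and rebases every present entry by the load delta held in %rbp. The same walk in C (a sketch; the bare 1 stands for _PAGE_PRESENT):

/* Sketch of the level2_kernel_pgt fixup: one 4K page holds 512
 * 8-byte PMD entries; add the physical load delta to each present one. */
static void fixup_kernel_pmds(unsigned long pmd[512], unsigned long delta)
{
        int i;
        for (i = 0; i < 512; i++)
                if (pmd[i] & 1)                 /* _PAGE_PRESENT */
                        pmd[i] += delta;
}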
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index bfbe00763c68..16d183f67bc1 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -71,9 +71,10 @@ loop:
 	jmp loop
 done:
 	/* go back to the original page tables */
-	leaq	init_level4_pgt(%rip), %rax
-	subq	$__START_KERNEL_map, %rax
+	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
+	addq	phys_base(%rip), %rax
 	movq	%rax, %cr3
+
 	/* Flush TLB, including "global" things (vmalloc) */
 	movq	mmu_cr4_features(%rip), %rax
 	movq	%rax, %rdx
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index 4974433bbf34..40a24d0df090 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -61,6 +61,8 @@ typedef struct { unsigned long pgd; } pgd_t;
 
 typedef struct { unsigned long pgprot; } pgprot_t;
 
+extern unsigned long phys_base;
+
 #define pte_val(x)	((x).pte)
 #define pmd_val(x)	((x).pmd)
 #define pud_val(x)	((x).pud)
@@ -101,14 +103,14 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 #define PAGE_OFFSET		__PAGE_OFFSET
 
 /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
    Otherwise you risk miscompilation. */
 #define __pa(x)			((unsigned long)(x) - PAGE_OFFSET)
 /* __pa_symbol should be used for C visible symbols.
    This seems to be the official gcc blessed way to do such arithmetic. */
 #define __pa_symbol(x)		\
 	({unsigned long v;  \
 	  asm("" : "=r" (v) : "0" (x)); \
-	  (v - __START_KERNEL_map); })
+	  ((v - __START_KERNEL_map) + phys_base); })
 
 #define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
 #ifdef CONFIG_FLATMEM
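The net effect on address translation, written out as C (illustrative functions only, not the kernel macros): __pa still subtracts the direct-map offset, while __pa_symbol now also adds the runtime phys_base, so taking the physical address of a kernel symbol works wherever the kernel was actually loaded.

/* Sketch of the two translations after this patch. */
extern unsigned long phys_base;         /* fixed up early in kernel/head.S */

static unsigned long pa_direct(unsigned long vaddr)
{
        return vaddr - PAGE_OFFSET;                       /* __pa */
}

static unsigned long pa_symbol(unsigned long vaddr)
{
        return (vaddr - __START_KERNEL_map) + phys_base;  /* __pa_symbol */
}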