author		H. Peter Anvin <hpa@linux.intel.com>	2013-02-15 12:25:08 -0500
committer	H. Peter Anvin <hpa@linux.intel.com>	2013-02-15 12:25:08 -0500
commit		0da3e7f526fde7a6522a3038b7ce609fc50f6707 (patch)
tree		c6e2dacd96fe7eac8312f3d7c22e0995dc423879 /arch
parent		95c9608478d639dcffc14ea47b31bff021a99ed1 (diff)
parent		68d00bbebb5a48b7a9056a8c03476a71ecbc30a6 (diff)
Merge branch 'x86/mm2' into x86/mm
x86/mm2 is testing out fine, but has developed conflicts with x86/mm
due to patches in adjacent code.  Merge them so we can drop x86/mm2
and have a unified branch.

Resolved Conflicts:
	arch/x86/kernel/setup.c
Diffstat (limited to 'arch')
-rw-r--r--  arch/mips/cavium-octeon/dma-octeon.c | 3
-rw-r--r--  arch/sparc/mm/init_64.c | 24
-rw-r--r--  arch/x86/boot/boot.h | 18
-rw-r--r--  arch/x86/boot/cmdline.c | 12
-rw-r--r--  arch/x86/boot/compressed/cmdline.c | 12
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 48
-rw-r--r--  arch/x86/boot/compressed/misc.c | 2
-rw-r--r--  arch/x86/boot/compressed/misc.h | 1
-rw-r--r--  arch/x86/boot/header.S | 47
-rw-r--r--  arch/x86/boot/setup.ld | 2
-rw-r--r--  arch/x86/include/asm/bootparam_utils.h | 38
-rw-r--r--  arch/x86/include/asm/init.h | 28
-rw-r--r--  arch/x86/include/asm/kexec.h | 6
-rw-r--r--  arch/x86/include/asm/numa.h | 2
-rw-r--r--  arch/x86/include/asm/numa_64.h | 6
-rw-r--r--  arch/x86/include/asm/page.h | 4
-rw-r--r--  arch/x86/include/asm/page_types.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 4
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 1
-rw-r--r--  arch/x86/include/asm/processor.h | 1
-rw-r--r--  arch/x86/include/asm/realmode.h | 3
-rw-r--r--  arch/x86/include/asm/x86_init.h | 12
-rw-r--r--  arch/x86/include/uapi/asm/bootparam.h | 63
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 1
-rw-r--r--  arch/x86/kernel/amd_gart_64.c | 5
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 9
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 1
-rw-r--r--  arch/x86/kernel/e820.c | 16
-rw-r--r--  arch/x86/kernel/head32.c | 21
-rw-r--r--  arch/x86/kernel/head64.c | 133
-rw-r--r--  arch/x86/kernel/head_64.S | 210
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 171
-rw-r--r--  arch/x86/kernel/setup.c | 231
-rw-r--r--  arch/x86/kernel/traps.c | 9
-rw-r--r--  arch/x86/kernel/x86_init.c | 4
-rw-r--r--  arch/x86/mm/init.c | 459
-rw-r--r--  arch/x86/mm/init_32.c | 106
-rw-r--r--  arch/x86/mm/init_64.c | 237
-rw-r--r--  arch/x86/mm/mm_internal.h | 19
-rw-r--r--  arch/x86/mm/numa_64.c | 13
-rw-r--r--  arch/x86/mm/pageattr.c | 16
-rw-r--r--  arch/x86/platform/efi/efi.c | 7
-rw-r--r--  arch/x86/power/hibernate_64.c | 66
-rw-r--r--  arch/x86/realmode/init.c | 45
-rw-r--r--  arch/x86/tools/relocs.c | 6
-rw-r--r--  arch/x86/xen/mmu.c | 28
47 files changed, 1254 insertions(+), 900 deletions(-)
diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c
index 41dd00884975..02f244475207 100644
--- a/arch/mips/cavium-octeon/dma-octeon.c
+++ b/arch/mips/cavium-octeon/dma-octeon.c
@@ -317,7 +317,8 @@ void __init plat_swiotlb_setup(void)
 
 	octeon_swiotlb = alloc_bootmem_low_pages(swiotlbsize);
 
-	swiotlb_init_with_tbl(octeon_swiotlb, swiotlb_nslabs, 1);
+	if (swiotlb_init_with_tbl(octeon_swiotlb, swiotlb_nslabs, 1) == -ENOMEM)
+		panic("Cannot allocate SWIOTLB buffer");
 
 	mips_dma_map_ops = &octeon_linear_dma_map_ops.dma_map_ops;
 }
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index c3b72423c846..fc5a7c4bd9e8 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2021,6 +2021,16 @@ static void __init patch_tlb_miss_handler_bitmap(void)
 	flushi(&valid_addr_bitmap_insn[0]);
 }
 
+static void __init register_page_bootmem_info(void)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	int i;
+
+	for_each_online_node(i)
+		if (NODE_DATA(i)->node_spanned_pages)
+			register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
 void __init mem_init(void)
 {
 	unsigned long codepages, datapages, initpages;
@@ -2038,20 +2048,8 @@ void __init mem_init(void)
 
 	high_memory = __va(last_valid_pfn << PAGE_SHIFT);
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-	{
-		int i;
-		for_each_online_node(i) {
-			if (NODE_DATA(i)->node_spanned_pages != 0) {
-				totalram_pages +=
-					free_all_bootmem_node(NODE_DATA(i));
-			}
-		}
-		totalram_pages += free_low_memory_core_early(MAX_NUMNODES);
-	}
-#else
+	register_page_bootmem_info();
 	totalram_pages = free_all_bootmem();
-#endif
 
 	/* We subtract one to account for the mem_map_zero page
 	 * allocated below.
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 18997e5a1053..5b7531966b84 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -285,16 +285,26 @@ struct biosregs {
 void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
 
 /* cmdline.c */
-int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize);
-int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option);
+int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize);
+int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option);
 static inline int cmdline_find_option(const char *option, char *buffer, int bufsize)
 {
-	return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize);
+	unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+	if (cmd_line_ptr >= 0x100000)
+		return -1;	/* inaccessible */
+
+	return __cmdline_find_option(cmd_line_ptr, option, buffer, bufsize);
 }
 
 static inline int cmdline_find_option_bool(const char *option)
 {
-	return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option);
+	unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+	if (cmd_line_ptr >= 0x100000)
+		return -1;	/* inaccessible */
+
+	return __cmdline_find_option_bool(cmd_line_ptr, option);
 }
 
 
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index 6b3b6f708c04..625d21b0cd3f 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -27,7 +27,7 @@ static inline int myisspace(u8 c)
  * Returns the length of the argument (regardless of if it was
  * truncated to fit in the buffer), or -1 on not found.
  */
-int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize)
+int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize)
 {
 	addr_t cptr;
 	char c;
@@ -41,8 +41,8 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int
 		st_bufcpy	/* Copying this to buffer */
 	} state = st_wordstart;
 
-	if (!cmdline_ptr || cmdline_ptr >= 0x100000)
-		return -1;	/* No command line, or inaccessible */
+	if (!cmdline_ptr)
+		return -1;	/* No command line */
 
 	cptr = cmdline_ptr & 0xf;
 	set_fs(cmdline_ptr >> 4);
@@ -99,7 +99,7 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int
  * Returns the position of that option (starts counting with 1)
  * or 0 on not found
  */
-int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
+int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option)
 {
 	addr_t cptr;
 	char c;
@@ -111,8 +111,8 @@ int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
 		st_wordskip,	/* Miscompare, skip */
 	} state = st_wordstart;
 
-	if (!cmdline_ptr || cmdline_ptr >= 0x100000)
-		return -1;	/* No command line, or inaccessible */
+	if (!cmdline_ptr)
+		return -1;	/* No command line */
 
 	cptr = cmdline_ptr & 0xf;
 	set_fs(cmdline_ptr >> 4);
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index 10f6b1178c68..bffd73b45b1f 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -13,13 +13,21 @@ static inline char rdfs8(addr_t addr)
 	return *((char *)(fs + addr));
 }
 #include "../cmdline.c"
+static unsigned long get_cmd_line_ptr(void)
+{
+	unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr;
+
+	cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32;
+
+	return cmd_line_ptr;
+}
 int cmdline_find_option(const char *option, char *buffer, int bufsize)
 {
-	return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize);
+	return __cmdline_find_option(get_cmd_line_ptr(), option, buffer, bufsize);
 }
 int cmdline_find_option_bool(const char *option)
 {
-	return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);
+	return __cmdline_find_option_bool(get_cmd_line_ptr(), option);
 }
 
 #endif
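get_cmd_line_ptr() splices the new ext_cmd_line_ptr field in as bits 32-63 above the legacy 32-bit hdr.cmd_line_ptr; that widening is what lets a 64-bit bootloader place the command line above 4 GiB. The same bit-splicing in a self-contained sketch (the struct is a stand-in for illustration, not the real boot_params layout):

#include <stdint.h>
#include <stdio.h>

struct fake_boot_params {		/* stand-in, not the real layout */
	uint32_t cmd_line_ptr;		/* low 32 bits, from setup_header */
	uint32_t ext_cmd_line_ptr;	/* high 32 bits, new in protocol 2.12 */
};

static uint64_t get_cmd_line_ptr(const struct fake_boot_params *bp)
{
	uint64_t p = bp->cmd_line_ptr;

	p |= (uint64_t)bp->ext_cmd_line_ptr << 32;	/* splice in high half */
	return p;
}

int main(void)
{
	struct fake_boot_params bp = { 0xdeadbeef, 0x1 };

	printf("%#llx\n", (unsigned long long)get_cmd_line_ptr(&bp));
	return 0;	/* prints 0x1deadbeef */
}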
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 2c4b171eec33..d9ae9a4ffcb9 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -37,6 +37,12 @@
 	__HEAD
 	.code32
 ENTRY(startup_32)
+	/*
+	 * 32bit entry is 0 and it is ABI so immutable!
+	 * If we come here directly from a bootloader,
+	 * kernel(text+data+bss+brk) ramdisk, zero_page, command line
+	 * all need to be under the 4G limit.
+	 */
 	cld
 	/*
 	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
@@ -154,6 +160,12 @@ ENTRY(startup_32)
 	btsl	$_EFER_LME, %eax
 	wrmsr
 
+	/* After gdt is loaded */
+	xorl	%eax, %eax
+	lldt	%ax
+	movl	$0x20, %eax
+	ltr	%ax
+
 	/*
 	 * Setup for the jump to 64bit mode
 	 *
@@ -176,28 +188,18 @@ ENTRY(startup_32)
 	lret
 ENDPROC(startup_32)
 
-no_longmode:
-	/* This isn't an x86-64 CPU so hang */
-1:
-	hlt
-	jmp     1b
-
-#include "../../kernel/verify_cpu.S"
-
-	/*
-	 * Be careful here startup_64 needs to be at a predictable
-	 * address so I can export it in an ELF header.  Bootloaders
-	 * should look at the ELF header to find this address, as
-	 * it may change in the future.
-	 */
 	.code64
 	.org 0x200
 ENTRY(startup_64)
 	/*
+	 * 64bit entry is 0x200 and it is ABI so immutable!
 	 * We come here either from startup_32 or directly from a
-	 * 64bit bootloader. If we come here from a bootloader we depend on
-	 * an identity mapped page table being provied that maps our
-	 * entire text+data+bss and hopefully all of memory.
+	 * 64bit bootloader.
+	 * If we come here from a bootloader, kernel(text+data+bss+brk),
+	 * ramdisk, zero_page, command line could be above 4G.
+	 * We depend on an identity mapped page table being provided
+	 * that maps our entire kernel(text+data+bss+brk), zero page
+	 * and command line.
 	 */
 #ifdef CONFIG_EFI_STUB
 	/*
@@ -247,9 +249,6 @@ preferred_addr:
 	movl	%eax, %ss
 	movl	%eax, %fs
 	movl	%eax, %gs
-	lldt	%ax
-	movl	$0x20, %eax
-	ltr	%ax
 
 	/*
 	 * Compute the decompressed kernel start address.  It is where
@@ -349,6 +348,15 @@ relocated:
 	 */
 	jmp	*%rbp
 
+	.code32
+no_longmode:
+	/* This isn't an x86-64 CPU so hang */
+1:
+	hlt
+	jmp     1b
+
+#include "../../kernel/verify_cpu.S"
+
 	.data
 gdt:
 	.word	gdt_end - gdt
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 88f7ff6da404..7cb56c6ca351 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -325,6 +325,8 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 {
 	real_mode = rmode;
 
+	sanitize_boot_params(real_mode);
+
 	if (real_mode->screen_info.orig_video_mode == 7) {
 		vidmem = (char *) 0xb0000;
 		vidport = 0x3b4;
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 0e6dc0ee0eea..674019d8e235 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -18,6 +18,7 @@
 #include <asm/page.h>
 #include <asm/boot.h>
 #include <asm/bootparam.h>
+#include <asm/bootparam_utils.h>
 
 #define BOOT_BOOT_H
 #include "../ctype.h"
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 8c132a625b94..9ec06a1f6d61 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -21,6 +21,7 @@
 #include <asm/e820.h>
 #include <asm/page_types.h>
 #include <asm/setup.h>
+#include <asm/bootparam.h>
 #include "boot.h"
 #include "voffset.h"
 #include "zoffset.h"
@@ -255,6 +256,9 @@ section_table:
 	# header, from the old boot sector.
 
 	.section ".header", "a"
+	.globl	sentinel
+sentinel:	.byte 0xff, 0xff	/* Used to detect broken loaders */
+
 	.globl	hdr
 hdr:
 setup_sects:	.byte 0			/* Filled in by build.c */
@@ -279,7 +283,7 @@ _start:
 	# Part 2 of the header, from the old setup.S
 
 	.ascii	"HdrS"		# header signature
-	.word	0x020b		# header version number (>= 0x0105)
+	.word	0x020c		# header version number (>= 0x0105)
 			# or else old loadlin-1.5 will fail)
 	.globl realmode_swtch
 realmode_swtch:	.word	0, 0		# default_switch, SETUPSEG
@@ -297,13 +301,7 @@ type_of_loader: .byte 0 # 0 means ancient bootloader, newer
 
 # flags, unused bits must be zero (RFU) bit within loadflags
 loadflags:
-LOADED_HIGH	= 1			# If set, the kernel is loaded high
-CAN_USE_HEAP	= 0x80			# If set, the loader also has set
-					# heap_end_ptr to tell how much
-					# space behind setup.S can be used for
-					# heap purposes.
-					# Only the loader knows what is free
-		.byte	LOADED_HIGH
+		.byte	LOADED_HIGH	# The kernel is to be loaded high
 
 setup_move_size: .word  0x8000		# size to move, when setup is not
 			# loaded at 0x90000. We will move setup
@@ -369,7 +367,31 @@ relocatable_kernel: .byte 1
 relocatable_kernel:	.byte 0
 #endif
 min_alignment:		.byte MIN_KERNEL_ALIGN_LG2	# minimum alignment
-pad3:			.word 0
+
+xloadflags:
+#ifdef CONFIG_X86_64
+# define XLF0 XLF_KERNEL_64			/* 64-bit kernel */
+#else
+# define XLF0 0
+#endif
+
+#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64)
+   /* kernel/boot_param/ramdisk could be loaded above 4g */
+# define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G
+#else
+# define XLF1 0
+#endif
+
+#ifdef CONFIG_EFI_STUB
+# ifdef CONFIG_X86_64
+#  define XLF23 XLF_EFI_HANDOVER_64		/* 64-bit EFI handover ok */
+# else
+#  define XLF23 XLF_EFI_HANDOVER_32		/* 32-bit EFI handover ok */
+# endif
+#else
+# define XLF23 0
+#endif
+			.word XLF0 | XLF1 | XLF23
 
 cmdline_size:   .long   COMMAND_LINE_SIZE-1     #length of the command line,
                                                 #added with boot protocol
@@ -397,8 +419,13 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
 #define INIT_SIZE VO_INIT_SIZE
 #endif
 init_size:		.long INIT_SIZE		# kernel initialization size
-handover_offset:	.long 0x30		# offset to the handover
+handover_offset:
+#ifdef CONFIG_EFI_STUB
+			.long 0x30		# offset to the handover
 						# protocol entry point
+#else
+			.long 0
+#endif
 
 # End of setup header #####################################################
 
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index 03c0683636b6..96a6c7563538 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -13,7 +13,7 @@ SECTIONS
 	.bstext		: { *(.bstext) }
 	.bsdata		: { *(.bsdata) }
 
-	. = 497;
+	. = 495;
 	.header		: { *(.header) }
 	.entrytext	: { *(.entrytext) }
 	.inittext	: { *(.inittext) }
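The two-byte shift from 497 to 495 follows from the zero-page layout: hdr must still land at offset 0x1f1 (497), and the .header section now begins with the sentinel bytes at 0x1ef-0x1f0 (495-496) added in header.S above. A quick arithmetic check (plain C, just restating the offsets):

#include <assert.h>

int main(void)
{
	/* boot_params: sentinel at 0x1ef, one pad byte, hdr at 0x1f1 */
	int sentinel_off = 0x1ef;	/* = 495, new section start */
	int hdr_off      = 0x1f1;	/* = 497, unchanged */

	assert(sentinel_off == 495 && hdr_off == 497);
	assert(hdr_off - sentinel_off == 2);	/* the two bytes now in front */
	return 0;
}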
diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h
new file mode 100644
index 000000000000..5b5e9cb774b5
--- /dev/null
+++ b/arch/x86/include/asm/bootparam_utils.h
@@ -0,0 +1,38 @@
+#ifndef _ASM_X86_BOOTPARAM_UTILS_H
+#define _ASM_X86_BOOTPARAM_UTILS_H
+
+#include <asm/bootparam.h>
+
+/*
+ * This file is included from multiple environments.  Do not
+ * add completing #includes to make it standalone.
+ */
+
+/*
+ * Deal with bootloaders which fail to initialize unknown fields in
+ * boot_params to zero.  The list fields in this list are taken from
+ * analysis of kexec-tools; if other broken bootloaders initialize a
+ * different set of fields we will need to figure out how to disambiguate.
+ *
+ */
+static void sanitize_boot_params(struct boot_params *boot_params)
+{
+	if (boot_params->sentinel) {
+		/*fields in boot_params are not valid, clear them */
+		memset(&boot_params->olpc_ofw_header, 0,
+		       (char *)&boot_params->alt_mem_k -
+			(char *)&boot_params->olpc_ofw_header);
+		memset(&boot_params->kbd_status, 0,
+		       (char *)&boot_params->hdr -
+		       (char *)&boot_params->kbd_status);
+		memset(&boot_params->_pad7[0], 0,
+		       (char *)&boot_params->edd_mbr_sig_buffer[0] -
+			(char *)&boot_params->_pad7[0]);
+		memset(&boot_params->_pad8[0], 0,
+		       (char *)&boot_params->eddbuf[0] -
+		       (char *)&boot_params->_pad8[0]);
+		memset(&boot_params->_pad9[0], 0, sizeof(boot_params->_pad9));
+	}
+}
+
+#endif /* _ASM_X86_BOOTPARAM_UTILS_H */
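The scheme relies on a field bootloaders were never told about: header.S stamps 0xff into it, a well-behaved loader copies only setup_header into a zeroed boot_params, so a surviving 0xff means the surrounding fields are untrusted junk. A self-contained toy version of the same sentinel pattern (simplified struct, not the real boot_params layout):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_params {		/* simplified stand-in for boot_params */
	uint8_t known[16];	/* fields every loader initializes */
	uint8_t sentinel;	/* 0xff in the image; 0 if the loader is clean */
	uint8_t unknown[16];	/* fields old loaders may leave as junk */
};

static void sanitize(struct toy_params *p)
{
	if (p->sentinel)	/* loader copied too much: distrust the rest */
		memset(p->unknown, 0, sizeof(p->unknown));
}

int main(void)
{
	struct toy_params p;

	memset(&p, 0xff, sizeof(p));	/* a sloppy loader copied the image */
	sanitize(&p);
	printf("unknown[0] = %d\n", p.unknown[0]);	/* prints 0 */
	return 0;
}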
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index adcc0ae73d09..223042086f4e 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -1,20 +1,14 @@
-#ifndef _ASM_X86_INIT_32_H
-#define _ASM_X86_INIT_32_H
+#ifndef _ASM_X86_INIT_H
+#define _ASM_X86_INIT_H
 
-#ifdef CONFIG_X86_32
-extern void __init early_ioremap_page_table_range_init(void);
-#endif
+struct x86_mapping_info {
+	void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
+	void *context;			 /* context for alloc_pgt_page */
+	unsigned long pmd_flag;		 /* page flag for PMD entry */
+	bool kernel_mapping;		 /* kernel mapping or ident mapping */
+};
 
-extern void __init zone_sizes_init(void);
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+				unsigned long addr, unsigned long end);
 
-extern unsigned long __init
-kernel_physical_mapping_init(unsigned long start,
-			     unsigned long end,
-			     unsigned long page_size_mask);
-
-
-extern unsigned long __initdata pgt_buf_start;
-extern unsigned long __meminitdata pgt_buf_end;
-extern unsigned long __meminitdata pgt_buf_top;
-
-#endif /* _ASM_X86_INIT_32_H */
+#endif /* _ASM_X86_INIT_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 6080d2694bad..17483a492f18 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -48,11 +48,11 @@
 # define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
 #else
 /* Maximum physical address we can use pages from */
-# define KEXEC_SOURCE_MEMORY_LIMIT      (0xFFFFFFFFFFUL)
+# define KEXEC_SOURCE_MEMORY_LIMIT      (MAXMEM-1)
 /* Maximum address we can reach in physical address mode */
-# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+# define KEXEC_DESTINATION_MEMORY_LIMIT (MAXMEM-1)
 /* Maximum address we can use for the control pages */
-# define KEXEC_CONTROL_MEMORY_LIMIT     (0xFFFFFFFFFFUL)
+# define KEXEC_CONTROL_MEMORY_LIMIT     (MAXMEM-1)
 
 /* Allocate one page for the pdp and the second for the code */
 # define KEXEC_CONTROL_PAGE_SIZE  (4096UL + 4096UL)
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 49119fcea2dc..52560a2038e1 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -54,8 +54,6 @@ static inline int numa_cpu_node(int cpu)
 
 #ifdef CONFIG_X86_32
 # include <asm/numa_32.h>
-#else
-# include <asm/numa_64.h>
 #endif
 
 #ifdef CONFIG_NUMA
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
deleted file mode 100644
index 0c05f7ae46e8..000000000000
--- a/arch/x86/include/asm/numa_64.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_X86_NUMA_64_H
-#define _ASM_X86_NUMA_64_H
-
-extern unsigned long numa_free_all_bootmem(void);
-
-#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 3698a6a0a940..c87892442e53 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -17,6 +17,10 @@
 
 struct page;
 
+#include <linux/range.h>
+extern struct range pfn_mapped[];
+extern int nr_pfn_mapped;
+
 static inline void clear_user_page(void *page, unsigned long vaddr,
 				   struct page *pg)
 {
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index e21fdd10479f..54c97879195e 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,6 +51,8 @@ static inline phys_addr_t get_max_mapped(void)
 	return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
 }
 
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
+
 extern unsigned long init_memory_mapping(unsigned long start,
 					 unsigned long end);
 
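pfn_range_is_mapped() replaces comparisons against the old max_low_pfn_mapped/max_pfn_mapped high-water marks with an explicit query, backed by the pfn_mapped[]/nr_pfn_mapped ranges exported in page.h above; amd_gart_64.c and cpu/amd.c below become callers. One plausible shape for such a check, sketched against an assumed range array (the real implementation lives elsewhere in this merge, under arch/x86/mm/, and may differ):

#include <stdbool.h>

struct range { unsigned long start, end; };	/* [start, end) in pfns */

/* assumed stand-ins for the kernel's pfn_mapped[] / nr_pfn_mapped */
static struct range pfn_mapped[8];
static int nr_pfn_mapped;

static bool pfn_range_is_mapped(unsigned long start_pfn,
				unsigned long end_pfn)
{
	int i;

	for (i = 0; i < nr_pfn_mapped; i++)
		if (start_pfn >= pfn_mapped[i].start &&
		    end_pfn <= pfn_mapped[i].end)
			return true;	/* wholly inside one mapped range */

	return false;
}

int main(void)
{
	pfn_mapped[0] = (struct range){ 0, 0x100000 };	/* first 4 GiB of pfns */
	nr_pfn_mapped = 1;
	return !pfn_range_is_mapped(0x80, 0x100);	/* exits 0: mapped */
}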
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index bc28e6fe7052..b6e41b8cd659 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -616,6 +616,8 @@ static inline int pgd_none(pgd_t pgd)
 #ifndef __ASSEMBLY__
 
 extern int direct_gbpages;
+void init_mem_mapping(void);
+void early_alloc_pgt_buf(void);
 
 /* local pte updates need not use xchg for locking */
 static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 766ea16fbbbd..2d883440cb9a 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_PGTABLE_64_DEFS_H
 #define _ASM_X86_PGTABLE_64_DEFS_H
 
+#include <asm/sparsemem.h>
+
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 
@@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t;
 #define MODULES_END      _AC(0xffffffffff000000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
 
+#define EARLY_DYNAMIC_PAGE_TABLES	64
+
 #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 9f82690f81ed..e6423002c10b 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -321,7 +321,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
 
-extern void native_pagetable_reserve(u64 start, u64 end);
 #ifdef CONFIG_X86_32
 extern void native_pagetable_init(void);
 #else
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 888184b2fc85..bdee8bd318ea 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -731,6 +731,7 @@ extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
 extern void early_trap_init(void);
+void early_trap_pf_init(void);
 
 /* Defined in head.S */
 extern struct desc_ptr early_gdt_descr;
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index fe1ec5bcd846..9c6b890d5e7a 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -58,6 +58,7 @@ extern unsigned char boot_gdt[];
 extern unsigned char secondary_startup_64[];
 #endif
 
-extern void __init setup_real_mode(void);
+void reserve_real_mode(void);
+void setup_real_mode(void);
 
 #endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 57693498519c..3b2ce8fc995a 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -69,17 +69,6 @@ struct x86_init_oem {
 };
 
 /**
- * struct x86_init_mapping - platform specific initial kernel pagetable setup
- * @pagetable_reserve:	reserve a range of addresses for kernel pagetable usage
- *
- * For more details on the purpose of this hook, look in
- * init_memory_mapping and the commit that added it.
- */
-struct x86_init_mapping {
-	void (*pagetable_reserve)(u64 start, u64 end);
-};
-
-/**
  * struct x86_init_paging - platform specific paging functions
  * @pagetable_init: platform specific paging initialization call to setup
  *		the kernel pagetables and prepare accessors functions.
@@ -136,7 +125,6 @@ struct x86_init_ops {
 	struct x86_init_mpparse		mpparse;
 	struct x86_init_irqs		irqs;
 	struct x86_init_oem		oem;
-	struct x86_init_mapping		mapping;
 	struct x86_init_paging		paging;
 	struct x86_init_timers		timers;
 	struct x86_init_iommu		iommu;
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 92862cd90201..c15ddaf90710 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -1,6 +1,31 @@
 #ifndef _ASM_X86_BOOTPARAM_H
 #define _ASM_X86_BOOTPARAM_H
 
+/* setup_data types */
+#define SETUP_NONE			0
+#define SETUP_E820_EXT			1
+#define SETUP_DTB			2
+#define SETUP_PCI			3
+
+/* ram_size flags */
+#define RAMDISK_IMAGE_START_MASK	0x07FF
+#define RAMDISK_PROMPT_FLAG		0x8000
+#define RAMDISK_LOAD_FLAG		0x4000
+
+/* loadflags */
+#define LOADED_HIGH	(1<<0)
+#define QUIET_FLAG	(1<<5)
+#define KEEP_SEGMENTS	(1<<6)
+#define CAN_USE_HEAP	(1<<7)
+
+/* xloadflags */
+#define XLF_KERNEL_64			(1<<0)
+#define XLF_CAN_BE_LOADED_ABOVE_4G	(1<<1)
+#define XLF_EFI_HANDOVER_32		(1<<2)
+#define XLF_EFI_HANDOVER_64		(1<<3)
+
+#ifndef __ASSEMBLY__
+
 #include <linux/types.h>
 #include <linux/screen_info.h>
 #include <linux/apm_bios.h>
6#include <linux/apm_bios.h> 31#include <linux/apm_bios.h>
@@ -9,12 +34,6 @@
 #include <asm/ist.h>
 #include <video/edid.h>
 
-/* setup data types */
-#define SETUP_NONE			0
-#define SETUP_E820_EXT			1
-#define SETUP_DTB			2
-#define SETUP_PCI			3
-
 /* extensible setup data list node */
 struct setup_data {
 	__u64 next;
@@ -28,9 +47,6 @@ struct setup_header {
 	__u16	root_flags;
 	__u32	syssize;
 	__u16	ram_size;
-#define RAMDISK_IMAGE_START_MASK	0x07FF
-#define RAMDISK_PROMPT_FLAG		0x8000
-#define RAMDISK_LOAD_FLAG		0x4000
 	__u16	vid_mode;
 	__u16	root_dev;
 	__u16	boot_flag;
@@ -42,10 +58,6 @@ struct setup_header {
 	__u16	kernel_version;
 	__u8	type_of_loader;
 	__u8	loadflags;
-#define LOADED_HIGH	(1<<0)
-#define QUIET_FLAG	(1<<5)
-#define KEEP_SEGMENTS	(1<<6)
-#define CAN_USE_HEAP	(1<<7)
 	__u16	setup_move_size;
 	__u32	code32_start;
 	__u32	ramdisk_image;
@@ -58,7 +70,8 @@ struct setup_header {
 	__u32	initrd_addr_max;
 	__u32	kernel_alignment;
 	__u8	relocatable_kernel;
-	__u8	_pad2[3];
+	__u8	min_alignment;
+	__u16	xloadflags;
 	__u32	cmdline_size;
 	__u32	hardware_subarch;
 	__u64	hardware_subarch_data;
@@ -106,7 +119,10 @@ struct boot_params {
 	__u8  hd1_info[16];	/* obsolete! */		/* 0x090 */
 	struct sys_desc_table sys_desc_table;		/* 0x0a0 */
 	struct olpc_ofw_header olpc_ofw_header;		/* 0x0b0 */
-	__u8  _pad4[128];				/* 0x0c0 */
+	__u32 ext_ramdisk_image;			/* 0x0c0 */
+	__u32 ext_ramdisk_size;				/* 0x0c4 */
+	__u32 ext_cmd_line_ptr;				/* 0x0c8 */
+	__u8  _pad4[116];				/* 0x0cc */
 	struct edid_info edid_info;			/* 0x140 */
 	struct efi_info efi_info;			/* 0x1c0 */
 	__u32 alt_mem_k;				/* 0x1e0 */
@@ -115,7 +131,20 @@ struct boot_params {
 	__u8  eddbuf_entries;				/* 0x1e9 */
 	__u8  edd_mbr_sig_buf_entries;			/* 0x1ea */
 	__u8  kbd_status;				/* 0x1eb */
-	__u8  _pad6[5];					/* 0x1ec */
+	__u8  _pad5[3];					/* 0x1ec */
+	/*
+	 * The sentinel is set to a nonzero value (0xff) in header.S.
+	 *
+	 * A bootloader is supposed to only take setup_header and put
+	 * it into a clean boot_params buffer. If it turns out that
+	 * it is clumsy or too generous with the buffer, it most
+	 * probably will pick up the sentinel variable too. The fact
+	 * that this variable then is still 0xff will let kernel
+	 * know that some variables in boot_params are invalid and
+	 * kernel should zero out certain portions of boot_params.
+	 */
+	__u8  sentinel;					/* 0x1ef */
+	__u8  _pad6[1];					/* 0x1f0 */
 	struct setup_header hdr;    /* setup header */	/* 0x1f1 */
 	__u8  _pad7[0x290-0x1f1-sizeof(struct setup_header)];
 	__u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX];	/* 0x290 */
@@ -134,6 +163,6 @@ enum {
 	X86_NR_SUBARCHS,
 };
 
-
+#endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_BOOTPARAM_H */
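Hoisting the flag definitions above the structs and guarding the C declarations with #ifndef __ASSEMBLY__ makes the UAPI header usable from header.S, and gives bootloaders symbolic names for the new capability word. For example, a loader should only place the kernel or ramdisk above 4 GiB after testing xloadflags (read from offset 0x236 of the setup header); a hedged fragment using the UAPI names, with the defines inlined so it stands alone:

#include <stdint.h>
#include <stdio.h>

#define XLF_KERNEL_64			(1<<0)
#define XLF_CAN_BE_LOADED_ABOVE_4G	(1<<1)

/* xloadflags as a loader would read it out of the setup header */
static int can_load_high(uint16_t xloadflags)
{
	return (xloadflags & XLF_KERNEL_64) &&
	       (xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G);
}

int main(void)
{
	printf("%d\n", can_load_high(XLF_KERNEL_64 |
				     XLF_CAN_BE_LOADED_ABOVE_4G));	/* 1 */
	return 0;
}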
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index bacf4b0d91f4..cfc755dc1607 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled);
 
 #ifdef CONFIG_X86_64
 # include <asm/proto.h>
-# include <asm/numa_64.h>
 #endif				/* X86 */
 
 #define BAD_MADT_ENTRY(entry, end) (					    \
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index e66311200cbd..b574b295a2f9 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -768,10 +768,9 @@ int __init gart_iommu_init(void)
 	aper_base	= info.aper_base;
 	end_pfn		= (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
 
-	if (end_pfn > max_low_pfn_mapped) {
-		start_pfn = (aper_base>>PAGE_SHIFT);
+	start_pfn = PFN_DOWN(aper_base);
+	if (!pfn_range_is_mapped(start_pfn, end_pfn))
 		init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
-	}
 
 	pr_info("PCI-DMA: using GART IOMMU.\n");
 	iommu_size = check_iommu_size(info.aper_base, aper_size);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 15239fffd6fe..eafb084e80f8 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -12,7 +12,6 @@
 #include <asm/pci-direct.h>
 
 #ifdef CONFIG_X86_64
-# include <asm/numa_64.h>
 # include <asm/mmconfig.h>
 # include <asm/cacheflush.h>
 #endif
@@ -685,12 +684,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	 * benefit in doing so.
 	 */
 	if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+		unsigned long pfn = tseg >> PAGE_SHIFT;
+
 		printk(KERN_DEBUG "tseg: %010llx\n", tseg);
-		if ((tseg>>PMD_SHIFT) <
-		    (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
-		    ((tseg>>PMD_SHIFT) <
-		     (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
-		     (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
+		if (pfn_range_is_mapped(pfn, pfn + 1))
 			set_memory_4k((unsigned long)__va(tseg), 1);
 	}
 }
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fdfefa27b948..1905ce98bee0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -17,7 +17,6 @@
 
 #ifdef CONFIG_X86_64
 #include <linux/topology.h>
-#include <asm/numa_64.h>
 #endif
 
 #include "cpu.h"
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index df06ade26bef..d32abeabbda5 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -835,7 +835,7 @@ static int __init parse_memopt(char *p)
 }
 early_param("mem", parse_memopt);
 
-static int __init parse_memmap_opt(char *p)
+static int __init parse_memmap_one(char *p)
 {
 	char *oldp;
 	u64 start_at, mem_size;
@@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p)
 
 	return *p == '\0' ? 0 : -EINVAL;
 }
+static int __init parse_memmap_opt(char *str)
+{
+	while (str) {
+		char *k = strchr(str, ',');
+
+		if (k)
+			*k++ = 0;
+
+		parse_memmap_one(str);
+		str = k;
+	}
+
+	return 0;
+}
 early_param("memmap", parse_memmap_opt);
 
 void __init finish_e820_parsing(void)
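The new wrapper turns memmap= into a comma-separated list: it splits the argument destructively with strchr() and hands each piece to parse_memmap_one(), so several regions (e.g. multiple size@addr chunks) fit in one parameter. The same loop isolated into a runnable sketch, with a dummy printer standing in for parse_memmap_one():

#include <stdio.h>
#include <string.h>

static void parse_one(char *p)		/* stands in for parse_memmap_one */
{
	printf("entry: %s\n", p);
}

static void parse_list(char *str)
{
	while (str) {
		char *k = strchr(str, ',');	/* find the next separator */

		if (k)
			*k++ = 0;		/* terminate this entry */

		parse_one(str);
		str = k;			/* NULL after the last entry */
	}
}

int main(void)
{
	char arg[] = "4G$1G,1M#0x10000";	/* writable copy, made-up regions */

	parse_list(arg);
	return 0;
}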
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index e17554832991..138463a24877 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -18,6 +18,7 @@
 #include <asm/io_apic.h>
 #include <asm/bios_ebda.h>
 #include <asm/tlbflush.h>
+#include <asm/bootparam_utils.h>
 
 static void __init i386_default_early_setup(void)
 {
@@ -30,19 +31,7 @@ static void __init i386_default_early_setup(void)
 
 void __init i386_start_kernel(void)
 {
-	memblock_reserve(__pa_symbol(_text),
-			 (unsigned long)__bss_stop - (unsigned long)_text);
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	/* Reserve INITRD */
-	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
-		/* Assume only end is not page aligned */
-		u64 ramdisk_image = boot_params.hdr.ramdisk_image;
-		u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
-		u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
-		memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
-	}
-#endif
+	sanitize_boot_params(&boot_params);
 
 	/* Call the subarch specific early setup function */
 	switch (boot_params.hdr.hardware_subarch) {
@@ -57,11 +46,5 @@ void __init i386_start_kernel(void)
 		break;
 	}
 
-	/*
-	 * At this point everything still needed from the boot loader
-	 * or BIOS or kernel text should be early reserved or marked not
-	 * RAM in e820. All other memory is free game.
-	 */
-
 	start_kernel();
 }
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7b215a50ec1e..57334f4cd3af 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -25,12 +25,83 @@
 #include <asm/kdebug.h>
 #include <asm/e820.h>
 #include <asm/bios_ebda.h>
+#include <asm/bootparam_utils.h>
 
-static void __init zap_identity_mappings(void)
+/*
+ * Manage page tables very early on.
+ */
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt = 2;
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
+{
+	unsigned long i;
+
+	for (i = 0; i < PTRS_PER_PGD-1; i++)
+		early_level4_pgt[i].pgd = 0;
+
+	next_early_pgt = 0;
+
+	write_cr3(__pa(early_level4_pgt));
+}
+
+/* Create a new PMD entry */
+int __init early_make_pgtable(unsigned long address)
 {
-	pgd_t *pgd = pgd_offset_k(0UL);
-	pgd_clear(pgd);
-	__flush_tlb_all();
+	unsigned long physaddr = address - __PAGE_OFFSET;
+	unsigned long i;
+	pgdval_t pgd, *pgd_p;
+	pudval_t pud, *pud_p;
+	pmdval_t pmd, *pmd_p;
+
+	/* Invalid address or early pgt is done ? */
+	if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+		return -1;
+
+again:
+	pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
+	pgd = *pgd_p;
+
+	/*
+	 * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+	 * critical -- __PAGE_OFFSET would point us back into the dynamic
+	 * range and we might end up looping forever...
+	 */
+	if (pgd)
+		pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+	else {
+		if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+			reset_early_page_tables();
+			goto again;
+		}
+
+		pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+		for (i = 0; i < PTRS_PER_PUD; i++)
+			pud_p[i] = 0;
+		*pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+	}
+	pud_p += pud_index(address);
+	pud = *pud_p;
+
+	if (pud)
+		pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+	else {
+		if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+			reset_early_page_tables();
+			goto again;
+		}
+
+		pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+		for (i = 0; i < PTRS_PER_PMD; i++)
+			pmd_p[i] = 0;
+		*pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+	}
+	pmd = (physaddr & PMD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
+	pmd_p[pmd_index(address)] = pmd;
+
+	return 0;
 }
 
 /* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -41,13 +112,25 @@ static void __init clear_bss(void)
 			   (unsigned long) __bss_stop - (unsigned long) __bss_start);
 }
 
+static unsigned long get_cmd_line_ptr(void)
+{
+	unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+	cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32;
+
+	return cmd_line_ptr;
+}
+
 static void __init copy_bootdata(char *real_mode_data)
 {
 	char * command_line;
+	unsigned long cmd_line_ptr;
 
 	memcpy(&boot_params, real_mode_data, sizeof boot_params);
-	if (boot_params.hdr.cmd_line_ptr) {
-		command_line = __va(boot_params.hdr.cmd_line_ptr);
+	sanitize_boot_params(&boot_params);
+	cmd_line_ptr = get_cmd_line_ptr();
+	if (cmd_line_ptr) {
+		command_line = __va(cmd_line_ptr);
 		memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
 	}
 }
53} 136}
@@ -70,14 +153,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
 					(__START_KERNEL & PGDIR_MASK)));
 	BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
 
+	/* Kill off the identity-map trampoline */
+	reset_early_page_tables();
+
 	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
 
-	/* Make NULL pointers segfault */
-	zap_identity_mappings();
-
-	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
-
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
 #ifdef CONFIG_EARLY_PRINTK
 		set_intr_gate(i, &early_idt_handlers[i]);
@@ -87,37 +168,25 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	}
 	load_idt((const struct desc_ptr *)&idt_descr);
 
+	copy_bootdata(__va(real_mode_data));
+
 	if (console_loglevel == 10)
 		early_printk("Kernel alive\n");
 
+	clear_page(init_level4_pgt);
+	/* set init_level4_pgt kernel high mapping*/
+	init_level4_pgt[511] = early_level4_pgt[511];
+
 	x86_64_start_reservations(real_mode_data);
 }
 
 void __init x86_64_start_reservations(char *real_mode_data)
 {
-	copy_bootdata(__va(real_mode_data));
-
-	memblock_reserve(__pa_symbol(_text),
-			 (unsigned long)__bss_stop - (unsigned long)_text);
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	/* Reserve INITRD */
-	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
-		/* Assume only end is not page aligned */
-		unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-		unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-		unsigned long ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
-		memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
-	}
-#endif
+	/* version is always not zero if it is copied */
+	if (!boot_params.hdr.version)
+		copy_bootdata(__va(real_mode_data));
 
 	reserve_ebda_region();
 
-	/*
-	 * At this point everything still needed from the boot loader
-	 * or BIOS or kernel text should be early reserved or marked not
-	 * RAM in e820. All other memory is free game.
-	 */
-
 	start_kernel();
 }
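Taken with the head_64.S changes below, this is the new early-boot contract: only the kernel image is pre-mapped, any touch of other physical memory faults, and the #PF path calls early_make_pgtable() to build the missing PUD/PMD out of the fixed early_dynamic_pgts pool; when the pool runs dry, reset_early_page_tables() wipes everything and later faults rebuild only what is live. A toy two-level model of that pool-and-reset discipline (userspace, invented names and sizes, nothing kernel-specific):

#include <stdio.h>
#include <string.h>

#define SLOTS	4		/* stands in for EARLY_DYNAMIC_PAGE_TABLES */
#define ENTRIES	8

static long top[ENTRIES];		/* top-level table: index into pool */
static long pool[SLOTS][ENTRIES];	/* fixed pool of second-level tables */
static int next_slot;

static void reset_tables(void)
{
	memset(top, 0, sizeof(top));	/* 0 == not present */
	next_slot = 0;
}

/* Map addr on demand, as the fault handler would. */
static void make_mapping(unsigned long addr)
{
	unsigned long hi = (addr >> 3) % ENTRIES, lo = addr % ENTRIES;

again:
	if (!top[hi]) {
		if (next_slot >= SLOTS) {	/* pool exhausted: start over */
			reset_tables();
			goto again;
		}
		top[hi] = ++next_slot;		/* 1-based so 0 means empty */
	}
	pool[top[hi] - 1][lo] = addr | 1;	/* "present" leaf entry */
}

int main(void)
{
	unsigned long a;

	reset_tables();
	for (a = 0; a < 64; a++)		/* more mappings than the pool holds */
		make_mapping(a);
	printf("slots used: %d of %d\n", next_slot, SLOTS);
	return 0;
}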
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 980053c4b9cc..d94f6d68be2a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
 	.code64
 	.globl startup_64
 startup_64:
-
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
 	 * and someone has loaded an identity mapped page table
 	 * for us.  These identity mapped page tables map all of the
 	 * kernel pages and possibly all of memory.
 	 *
-	 * %esi holds a physical pointer to real_mode_data.
+	 * %rsi holds a physical pointer to real_mode_data.
 	 *
 	 * We come here either directly from a 64bit bootloader, or from
 	 * arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
 	 * tables and then reload them.
 	 */
 
-	/* Compute the delta between the address I am compiled to run at and the
+	/*
+	 * Compute the delta between the address I am compiled to run at and the
 	 * address I am actually running at.
 	 */
 	leaq	_text(%rip), %rbp
72 leaq _text(%rip), %rbp 72 leaq _text(%rip), %rbp
@@ -78,45 +78,62 @@ startup_64:
 	testl	%eax, %eax
 	jnz	bad_address
 
-	/* Is the address too large? */
-	leaq	_text(%rip), %rdx
-	movq	$PGDIR_SIZE, %rax
-	cmpq	%rax, %rdx
-	jae	bad_address
-
-	/* Fixup the physical addresses in the page table
+	/*
+	 * Is the address too large?
 	 */
-	addq	%rbp, init_level4_pgt + 0(%rip)
-	addq	%rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
-	addq	%rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
+	leaq	_text(%rip), %rax
+	shrq	$MAX_PHYSMEM_BITS, %rax
+	jnz	bad_address
 
-	addq	%rbp, level3_ident_pgt + 0(%rip)
+	/*
+	 * Fixup the physical addresses in the page table
+	 */
+	addq	%rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
 
 	addq	%rbp, level3_kernel_pgt + (510*8)(%rip)
 	addq	%rbp, level3_kernel_pgt + (511*8)(%rip)
 
 	addq	%rbp, level2_fixmap_pgt + (506*8)(%rip)
 
-	/* Add an Identity mapping if I am above 1G */
+	/*
+	 * Set up the identity mapping for the switchover.  These
+	 * entries should *NOT* have the global bit set!  This also
+	 * creates a bunch of nonsense entries but that is fine --
+	 * it avoids problems around wraparound.
+	 */
 	leaq	_text(%rip), %rdi
-	andq	$PMD_PAGE_MASK, %rdi
+	leaq	early_level4_pgt(%rip), %rbx
 
 	movq	%rdi, %rax
-	shrq	$PUD_SHIFT, %rax
-	andq	$(PTRS_PER_PUD - 1), %rax
-	jz	ident_complete
+	shrq	$PGDIR_SHIFT, %rax
 
-	leaq	(level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
-	leaq	level3_ident_pgt(%rip), %rbx
-	movq	%rdx, 0(%rbx, %rax, 8)
+	leaq	(4096 + _KERNPG_TABLE)(%rbx), %rdx
+	movq	%rdx, 0(%rbx,%rax,8)
+	movq	%rdx, 8(%rbx,%rax,8)
 
+	addq	$4096, %rdx
 	movq	%rdi, %rax
-	shrq	$PMD_SHIFT, %rax
-	andq	$(PTRS_PER_PMD - 1), %rax
-	leaq	__PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
-	leaq	level2_spare_pgt(%rip), %rbx
-	movq	%rdx, 0(%rbx, %rax, 8)
-ident_complete:
+	shrq	$PUD_SHIFT, %rax
+	andl	$(PTRS_PER_PUD-1), %eax
+	movq	%rdx, (4096+0)(%rbx,%rax,8)
+	movq	%rdx, (4096+8)(%rbx,%rax,8)
+
+	addq	$8192, %rbx
+	movq	%rdi, %rax
+	shrq	$PMD_SHIFT, %rdi
+	addq	$(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
+	leaq	(_end - 1)(%rip), %rcx
+	shrq	$PMD_SHIFT, %rcx
+	subq	%rdi, %rcx
+	incl	%ecx
+
+1:
+	andq	$(PTRS_PER_PMD - 1), %rdi
+	movq	%rax, (%rbx,%rdi,8)
+	incq	%rdi
+	addq	$PMD_SIZE, %rax
+	decl	%ecx
+	jnz	1b
 
 	/*
 	 * Fixup the kernel text+data virtual addresses. Note that
@@ -124,7 +141,6 @@ ident_complete:
 	 * cleanup_highmap() fixes this up along with the mappings
 	 * beyond _end.
 	 */
-
 	leaq	level2_kernel_pgt(%rip), %rdi
 	leaq	4096(%rdi), %r8
 	/* See if it is a valid page table entry */
@@ -139,17 +155,14 @@ ident_complete:
 	/* Fixup phys_base */
 	addq	%rbp, phys_base(%rip)
 
-	/* Due to ENTRY(), sometimes the empty space gets filled with
-	 * zeros. Better take a jmp than relying on empty space being
-	 * filled with 0x90 (nop)
-	 */
-	jmp secondary_startup_64
+	movq	$(early_level4_pgt - __START_KERNEL_map), %rax
+	jmp 1f
 ENTRY(secondary_startup_64)
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
 	 * and someone has loaded a mapped page table.
 	 *
-	 * %esi holds a physical pointer to real_mode_data.
+	 * %rsi holds a physical pointer to real_mode_data.
 	 *
 	 * We come here either from startup_64 (using physical addresses)
 	 * or from trampoline.S (using virtual addresses).
@@ -159,12 +172,14 @@ ENTRY(secondary_startup_64)
 	 * after the boot processor executes this code.
 	 */
 
+	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
+1:
+
 	/* Enable PAE mode and PGE */
-	movl	$(X86_CR4_PAE | X86_CR4_PGE), %eax
-	movq	%rax, %cr4
+	movl	$(X86_CR4_PAE | X86_CR4_PGE), %ecx
+	movq	%rcx, %cr4
 
 	/* Setup early boot stage 4 level pagetables. */
-	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
 	addq	phys_base(%rip), %rax
 	movq	%rax, %cr3
 
@@ -196,7 +211,7 @@ ENTRY(secondary_startup_64)
 	movq	%rax, %cr0
 
 	/* Setup a boot time stack */
-	movq stack_start(%rip),%rsp
+	movq stack_start(%rip), %rsp
 
 	/* zero EFLAGS after setting rsp */
 	pushq $0
@@ -236,15 +251,33 @@ ENTRY(secondary_startup_64)
 	movl	initial_gs+4(%rip),%edx
 	wrmsr
 
-	/* esi is pointer to real mode structure with interesting info.
+	/* rsi is pointer to real mode structure with interesting info.
 	   pass it to C */
-	movl	%esi, %edi
+	movq	%rsi, %rdi
 
 	/* Finally jump to run C code and to be on real kernel address
 	 * Since we are running on identity-mapped space we have to jump
 	 * to the full 64bit address, this is only possible as indirect
 	 * jump.  In addition we need to ensure %cs is set so we make this
 	 * a far return.
+	 *
+	 * Note: do not change to far jump indirect with 64bit offset.
+	 *
+	 * AMD does not support far jump indirect with 64bit offset.
+	 * AMD64 Architecture Programmer's Manual, Volume 3: states only
+	 *	JMP FAR mem16:16 FF /5 Far jump indirect,
+	 *		with the target specified by a far pointer in memory.
+	 *	JMP FAR mem16:32 FF /5 Far jump indirect,
+	 *		with the target specified by a far pointer in memory.
+	 *
+	 * Intel64 does support 64bit offset.
+	 * Software Developer Manual Vol 2: states:
+	 *	FF /5 JMP m16:16 Jump far, absolute indirect,
+	 *		address given in m16:16
+	 *	FF /5 JMP m16:32 Jump far, absolute indirect,
+	 *		address given in m16:32.
+	 *	REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
+	 *		address given in m16:64.
 	 */
 	movq	initial_code(%rip),%rax
 	pushq	$0		# fake return address to stop unwinder
@@ -270,13 +303,13 @@ ENDPROC(start_cpu0)
 
 	/* SMP bootup changes these two */
 	__REFDATA
-	.align	8
-	ENTRY(initial_code)
+	.balign	8
+	GLOBAL(initial_code)
 	.quad	x86_64_start_kernel
-	ENTRY(initial_gs)
+	GLOBAL(initial_gs)
 	.quad	INIT_PER_CPU_VAR(irq_stack_union)
 
-	ENTRY(stack_start)
+	GLOBAL(stack_start)
 	.quad  init_thread_union+THREAD_SIZE-8
 	.word  0
 	__FINITDATA
@@ -284,7 +317,7 @@ ENDPROC(start_cpu0)
284bad_address: 317bad_address:
285 jmp bad_address 318 jmp bad_address
286 319
287 .section ".init.text","ax" 320 __INIT
288 .globl early_idt_handlers 321 .globl early_idt_handlers
289early_idt_handlers: 322early_idt_handlers:
290 # 104(%rsp) %rflags 323 # 104(%rsp) %rflags
@@ -321,14 +354,22 @@ ENTRY(early_idt_handler)
321 pushq %r11 # 0(%rsp) 354 pushq %r11 # 0(%rsp)
322 355
323 cmpl $__KERNEL_CS,96(%rsp) 356 cmpl $__KERNEL_CS,96(%rsp)
324 jne 10f 357 jne 11f
358
359 cmpl $14,72(%rsp) # Page fault?
360 jnz 10f
361 GET_CR2_INTO(%rdi) # can clobber any volatile register if pv
362 call early_make_pgtable
363 andl %eax,%eax
364 jz 20f # All good
325 365
36610:
326 leaq 88(%rsp),%rdi # Pointer to %rip 367 leaq 88(%rsp),%rdi # Pointer to %rip
327 call early_fixup_exception 368 call early_fixup_exception
328 andl %eax,%eax 369 andl %eax,%eax
329 jnz 20f # Found an exception entry 370 jnz 20f # Found an exception entry
330 371
33110: 37211:
332#ifdef CONFIG_EARLY_PRINTK 373#ifdef CONFIG_EARLY_PRINTK
333 GET_CR2_INTO(%r9) # can clobber any volatile register if pv 374 GET_CR2_INTO(%r9) # can clobber any volatile register if pv
334 movl 80(%rsp),%r8d # error code 375 movl 80(%rsp),%r8d # error code
@@ -350,7 +391,7 @@ ENTRY(early_idt_handler)
3501: hlt 3911: hlt
351 jmp 1b 392 jmp 1b
352 393
35320: # Exception table entry found 39420: # Exception table entry found or page table generated
354 popq %r11 395 popq %r11
355 popq %r10 396 popq %r10
356 popq %r9 397 popq %r9
@@ -364,6 +405,8 @@ ENTRY(early_idt_handler)
364 decl early_recursion_flag(%rip) 405 decl early_recursion_flag(%rip)
365 INTERRUPT_RETURN 406 INTERRUPT_RETURN
366 407
408 __INITDATA
409
367 .balign 4 410 .balign 4
368early_recursion_flag: 411early_recursion_flag:
369 .long 0 412 .long 0
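
The interesting new behaviour in early_idt_handler is above: a page fault (vector 14) taken from __KERNEL_CS is no longer immediately fatal but is first offered to early_make_pgtable(), which builds an identity mapping for the faulting address out of the early_dynamic_pgts pool; only if that fails does the handler fall through to the usual exception-table fixup. A rough user-space model of that on-demand mapping step (two paging levels only; the names, pool size and the 0x83 flag value are illustrative stand-ins, not the kernel's own code):

#include <stdio.h>

/* Toy model of early_make_pgtable(): the #PF path above maps the
 * faulting address with a 2M entry, drawing intermediate tables from
 * a small static pool (early_dynamic_pgts in the kernel).  Two paging
 * levels only; names, sizes and the 0x83 flags are illustrative. */

#define PMD_SHIFT 21
#define PGD_SHIFT 30                     /* toy: 1G per top-level slot */
#define PTRS      512
#define POOL      64                     /* EARLY_DYNAMIC_PAGE_TABLES  */

static unsigned long long pool[POOL][PTRS];
static unsigned next_pgt;
static unsigned long long *pgd[PTRS];

static int early_make_pgtable(unsigned long long address)
{
	unsigned gi = (address >> PGD_SHIFT) & (PTRS - 1);
	unsigned mi = (address >> PMD_SHIFT) & (PTRS - 1);

	if (!pgd[gi]) {
		if (next_pgt >= POOL)
			return -1;       /* pool exhausted: a real fault */
		pgd[gi] = pool[next_pgt++];
	}
	pgd[gi][mi] = (address & ~((1ULL << PMD_SHIFT) - 1)) | 0x83;
	return 0;                        /* 0: handled, retry the access */
}

int main(void)
{
	if (early_make_pgtable(0x12345678ULL) == 0)
		printf("2M identity mapping installed\n");
	return 0;
}

In the real handler the value tested in %eax plays the same role as the return code here: zero means the fault was satisfied by a fresh page-table entry and the faulting access is simply retried via the 20: exit path.
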
@@ -374,11 +417,10 @@ early_idt_msg:
374early_idt_ripmsg: 417early_idt_ripmsg:
375 .asciz "RIP %s\n" 418 .asciz "RIP %s\n"
376#endif /* CONFIG_EARLY_PRINTK */ 419#endif /* CONFIG_EARLY_PRINTK */
377 .previous
378 420
379#define NEXT_PAGE(name) \ 421#define NEXT_PAGE(name) \
380 .balign PAGE_SIZE; \ 422 .balign PAGE_SIZE; \
381ENTRY(name) 423GLOBAL(name)
382 424
383/* Automate the creation of 1 to 1 mapping pmd entries */ 425/* Automate the creation of 1 to 1 mapping pmd entries */
384#define PMDS(START, PERM, COUNT) \ 426#define PMDS(START, PERM, COUNT) \
@@ -388,24 +430,37 @@ ENTRY(name)
388 i = i + 1 ; \ 430 i = i + 1 ; \
389 .endr 431 .endr
390 432
433 __INITDATA
434NEXT_PAGE(early_level4_pgt)
435 .fill 511,8,0
436 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
437
438NEXT_PAGE(early_dynamic_pgts)
439 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
440
391 .data 441 .data
392 /* 442
393 * This default setting generates an ident mapping at address 0x100000 443#ifndef CONFIG_XEN
394 * and a mapping for the kernel that precisely maps virtual address
395 * 0xffffffff80000000 to physical address 0x000000. (always using
396 * 2Mbyte large pages provided by PAE mode)
397 */
398NEXT_PAGE(init_level4_pgt) 444NEXT_PAGE(init_level4_pgt)
399 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 445 .fill 512,8,0
400 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 446#else
401 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 447NEXT_PAGE(init_level4_pgt)
402 .org init_level4_pgt + L4_START_KERNEL*8, 0 448 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
449 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
450 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
451 .org init_level4_pgt + L4_START_KERNEL*8, 0
403 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 452 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
404 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 453 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
405 454
406NEXT_PAGE(level3_ident_pgt) 455NEXT_PAGE(level3_ident_pgt)
407 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 456 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
408 .fill 511,8,0 457 .fill 511, 8, 0
458NEXT_PAGE(level2_ident_pgt)
459 /* Since I easily can, map the first 1G.
460 * Don't set NX because code runs from these pages.
461 */
462 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
463#endif
409 464
410NEXT_PAGE(level3_kernel_pgt) 465NEXT_PAGE(level3_kernel_pgt)
411 .fill L3_START_KERNEL,8,0 466 .fill L3_START_KERNEL,8,0
@@ -413,21 +468,6 @@ NEXT_PAGE(level3_kernel_pgt)
413 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE 468 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
414 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE 469 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
415 470
416NEXT_PAGE(level2_fixmap_pgt)
417 .fill 506,8,0
418 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
419 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
420 .fill 5,8,0
421
422NEXT_PAGE(level1_fixmap_pgt)
423 .fill 512,8,0
424
425NEXT_PAGE(level2_ident_pgt)
426 /* Since I easily can, map the first 1G.
427 * Don't set NX because code runs from these pages.
428 */
429 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
430
431NEXT_PAGE(level2_kernel_pgt) 471NEXT_PAGE(level2_kernel_pgt)
432 /* 472 /*
433 * 512 MB kernel mapping. We spend a full page on this pagetable 473 * 512 MB kernel mapping. We spend a full page on this pagetable
@@ -442,11 +482,16 @@ NEXT_PAGE(level2_kernel_pgt)
442 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, 482 PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
443 KERNEL_IMAGE_SIZE/PMD_SIZE) 483 KERNEL_IMAGE_SIZE/PMD_SIZE)
444 484
445NEXT_PAGE(level2_spare_pgt) 485NEXT_PAGE(level2_fixmap_pgt)
446 .fill 512, 8, 0 486 .fill 506,8,0
487 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
488 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
489 .fill 5,8,0
490
491NEXT_PAGE(level1_fixmap_pgt)
492 .fill 512,8,0
447 493
448#undef PMDS 494#undef PMDS
449#undef NEXT_PAGE
450 495
451 .data 496 .data
452 .align 16 497 .align 16
@@ -472,6 +517,5 @@ ENTRY(nmi_idt_table)
472 .skip IDT_ENTRIES * 16 517 .skip IDT_ENTRIES * 16
473 518
474 __PAGE_ALIGNED_BSS 519 __PAGE_ALIGNED_BSS
475 .align PAGE_SIZE 520NEXT_PAGE(empty_zero_page)
476ENTRY(empty_zero_page)
477 .skip PAGE_SIZE 521 .skip PAGE_SIZE
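
The arithmetic behind the "(2^48-(2*1024*1024*1024))/(2^39) = 511" comment in init_level4_pgt falls straight out of the 4-level paging shifts. A stand-alone sketch of the index calculation (the two virtual addresses are the values this kernel generation uses; the sketch itself is purely illustrative):

#include <stdio.h>

#define PGDIR_SHIFT 39                  /* bits 39..47 pick the L4 slot */
#define PTRS_PER_PGD 512

static unsigned pgd_index(unsigned long long addr)
{
	return (addr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
}

int main(void)
{
	unsigned long long start_kernel_map = 0xffffffff80000000ULL;
	unsigned long long page_offset      = 0xffff880000000000ULL;

	/* the last slot, where level3_kernel_pgt is wired in: 511 */
	printf("L4_START_KERNEL = %u\n", pgd_index(start_kernel_map));
	/* the direct-mapping slot filled in the Xen variant: 272 */
	printf("L4_PAGE_OFFSET  = %u\n", pgd_index(page_offset));
	return 0;
}
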
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index b3ea9db39db6..4eabc160696f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -16,125 +16,12 @@
16#include <linux/io.h> 16#include <linux/io.h>
17#include <linux/suspend.h> 17#include <linux/suspend.h>
18 18
19#include <asm/init.h>
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/mmu_context.h> 22#include <asm/mmu_context.h>
22#include <asm/debugreg.h> 23#include <asm/debugreg.h>
23 24
24static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
25 unsigned long addr)
26{
27 pud_t *pud;
28 pmd_t *pmd;
29 struct page *page;
30 int result = -ENOMEM;
31
32 addr &= PMD_MASK;
33 pgd += pgd_index(addr);
34 if (!pgd_present(*pgd)) {
35 page = kimage_alloc_control_pages(image, 0);
36 if (!page)
37 goto out;
38 pud = (pud_t *)page_address(page);
39 clear_page(pud);
40 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
41 }
42 pud = pud_offset(pgd, addr);
43 if (!pud_present(*pud)) {
44 page = kimage_alloc_control_pages(image, 0);
45 if (!page)
46 goto out;
47 pmd = (pmd_t *)page_address(page);
48 clear_page(pmd);
49 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
50 }
51 pmd = pmd_offset(pud, addr);
52 if (!pmd_present(*pmd))
53 set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
54 result = 0;
55out:
56 return result;
57}
58
59static void init_level2_page(pmd_t *level2p, unsigned long addr)
60{
61 unsigned long end_addr;
62
63 addr &= PAGE_MASK;
64 end_addr = addr + PUD_SIZE;
65 while (addr < end_addr) {
66 set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
67 addr += PMD_SIZE;
68 }
69}
70
71static int init_level3_page(struct kimage *image, pud_t *level3p,
72 unsigned long addr, unsigned long last_addr)
73{
74 unsigned long end_addr;
75 int result;
76
77 result = 0;
78 addr &= PAGE_MASK;
79 end_addr = addr + PGDIR_SIZE;
80 while ((addr < last_addr) && (addr < end_addr)) {
81 struct page *page;
82 pmd_t *level2p;
83
84 page = kimage_alloc_control_pages(image, 0);
85 if (!page) {
86 result = -ENOMEM;
87 goto out;
88 }
89 level2p = (pmd_t *)page_address(page);
90 init_level2_page(level2p, addr);
91 set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
92 addr += PUD_SIZE;
93 }
94 /* clear the unused entries */
95 while (addr < end_addr) {
96 pud_clear(level3p++);
97 addr += PUD_SIZE;
98 }
99out:
100 return result;
101}
102
103
104static int init_level4_page(struct kimage *image, pgd_t *level4p,
105 unsigned long addr, unsigned long last_addr)
106{
107 unsigned long end_addr;
108 int result;
109
110 result = 0;
111 addr &= PAGE_MASK;
112 end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
113 while ((addr < last_addr) && (addr < end_addr)) {
114 struct page *page;
115 pud_t *level3p;
116
117 page = kimage_alloc_control_pages(image, 0);
118 if (!page) {
119 result = -ENOMEM;
120 goto out;
121 }
122 level3p = (pud_t *)page_address(page);
123 result = init_level3_page(image, level3p, addr, last_addr);
124 if (result)
125 goto out;
126 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
127 addr += PGDIR_SIZE;
128 }
129 /* clear the unused entries */
130 while (addr < end_addr) {
131 pgd_clear(level4p++);
132 addr += PGDIR_SIZE;
133 }
134out:
135 return result;
136}
137
138static void free_transition_pgtable(struct kimage *image) 25static void free_transition_pgtable(struct kimage *image)
139{ 26{
140 free_page((unsigned long)image->arch.pud); 27 free_page((unsigned long)image->arch.pud);
@@ -184,22 +71,62 @@ err:
184 return result; 71 return result;
185} 72}
186 73
74static void *alloc_pgt_page(void *data)
75{
76 struct kimage *image = (struct kimage *)data;
77 struct page *page;
78 void *p = NULL;
79
80 page = kimage_alloc_control_pages(image, 0);
81 if (page) {
82 p = page_address(page);
83 clear_page(p);
84 }
85
86 return p;
87}
187 88
188static int init_pgtable(struct kimage *image, unsigned long start_pgtable) 89static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
189{ 90{
91 struct x86_mapping_info info = {
92 .alloc_pgt_page = alloc_pgt_page,
93 .context = image,
94 .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
95 };
96 unsigned long mstart, mend;
190 pgd_t *level4p; 97 pgd_t *level4p;
191 int result; 98 int result;
99 int i;
100
192 level4p = (pgd_t *)__va(start_pgtable); 101 level4p = (pgd_t *)__va(start_pgtable);
193 result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); 102 clear_page(level4p);
194 if (result) 103 for (i = 0; i < nr_pfn_mapped; i++) {
195 return result; 104 mstart = pfn_mapped[i].start << PAGE_SHIFT;
105 mend = pfn_mapped[i].end << PAGE_SHIFT;
106
107 result = kernel_ident_mapping_init(&info,
108 level4p, mstart, mend);
109 if (result)
110 return result;
111 }
112
196 /* 113 /*
197 * image->start may be outside 0 ~ max_pfn, for example when 114 * segments' mem ranges could be outside 0 ~ max_pfn,
198 * jump back to original kernel from kexeced kernel 115 * for example when jumping back to the original kernel from a kexeced
116 * kernel, or when the first kernel is booted with a user-supplied mem
117 * map and the second kernel is loaded outside that range.
199 */ 118 */
200 result = init_one_level2_page(image, level4p, image->start); 119 for (i = 0; i < image->nr_segments; i++) {
201 if (result) 120 mstart = image->segment[i].mem;
202 return result; 121 mend = mstart + image->segment[i].memsz;
122
123 result = kernel_ident_mapping_init(&info,
124 level4p, mstart, mend);
125
126 if (result)
127 return result;
128 }
129
203 return init_transition_pgtable(image, level4p); 130 return init_transition_pgtable(image, level4p);
204} 131}
205 132
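
init_pgtable() now feeds kernel_ident_mapping_init() an x86_mapping_info whose alloc_pgt_page callback hands back zeroed control pages owned by the kimage, instead of open-coding three levels of walkers. A reduced model of that callback pattern (user space, malloc in place of kimage_alloc_control_pages(), two levels instead of four; illustrative only):

#include <stdio.h>
#include <stdlib.h>

#define PMD_SHIFT 21
#define PGD_SHIFT 30
#define PTRS      512

struct mapping_info {
	void *(*alloc_pgt_page)(void *context); /* like x86_mapping_info */
	void *context;
	unsigned long long pmd_flag;
};

static void *alloc_page_cb(void *context)
{
	(void)context;                   /* kernel: the struct kimage */
	return calloc(PTRS, sizeof(unsigned long long));
}

static int ident_mapping_init(struct mapping_info *info,
			      unsigned long long **pgd,
			      unsigned long long start,
			      unsigned long long end)
{
	unsigned long long addr;

	for (addr = start & ~((1ULL << PMD_SHIFT) - 1); addr < end;
	     addr += 1ULL << PMD_SHIFT) {
		unsigned gi = (addr >> PGD_SHIFT) & (PTRS - 1);
		unsigned mi = (addr >> PMD_SHIFT) & (PTRS - 1);

		if (!pgd[gi]) {
			pgd[gi] = info->alloc_pgt_page(info->context);
			if (!pgd[gi])
				return -1;   /* -ENOMEM in the kernel */
		}
		pgd[gi][mi] = addr | info->pmd_flag;
	}
	return 0;
}

int main(void)
{
	struct mapping_info info = {
		.alloc_pgt_page = alloc_page_cb,
		.pmd_flag = 0xe3,        /* stand-in for LARGE_EXEC bits */
	};
	static unsigned long long *pgd[PTRS];

	return ident_mapping_init(&info, pgd, 0, 64ULL << 20);
}

The point of routing through a callback is that one generic walker can serve kexec, boot-time and other ident mappings; only the page source differs per caller.
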
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0aebd776018e..878cf1d326e5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -108,17 +108,16 @@
108#include <asm/topology.h> 108#include <asm/topology.h>
109#include <asm/apicdef.h> 109#include <asm/apicdef.h>
110#include <asm/amd_nb.h> 110#include <asm/amd_nb.h>
111#ifdef CONFIG_X86_64
112#include <asm/numa_64.h>
113#endif
114#include <asm/mce.h> 111#include <asm/mce.h>
115#include <asm/alternative.h> 112#include <asm/alternative.h>
116#include <asm/prom.h> 113#include <asm/prom.h>
117 114
118/* 115/*
119 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 116 * max_low_pfn_mapped: highest direct mapped pfn under 4GB
120 * The direct mapping extends to max_pfn_mapped, so that we can directly access 117 * max_pfn_mapped: highest direct mapped pfn over 4GB
121 * apertures, ACPI and other tables without having to play with fixmaps. 118 *
119 * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
120 * represented by pfn_mapped
122 */ 121 */
123unsigned long max_low_pfn_mapped; 122unsigned long max_low_pfn_mapped;
124unsigned long max_pfn_mapped; 123unsigned long max_pfn_mapped;
@@ -276,18 +275,7 @@ void * __init extend_brk(size_t size, size_t align)
276 return ret; 275 return ret;
277} 276}
278 277
279#ifdef CONFIG_X86_64 278#ifdef CONFIG_X86_32
280static void __init init_gbpages(void)
281{
282 if (direct_gbpages && cpu_has_gbpages)
283 printk(KERN_INFO "Using GB pages for direct mapping\n");
284 else
285 direct_gbpages = 0;
286}
287#else
288static inline void init_gbpages(void)
289{
290}
291static void __init cleanup_highmap(void) 279static void __init cleanup_highmap(void)
292{ 280{
293} 281}
@@ -306,27 +294,43 @@ static void __init reserve_brk(void)
306 294
307#ifdef CONFIG_BLK_DEV_INITRD 295#ifdef CONFIG_BLK_DEV_INITRD
308 296
297static u64 __init get_ramdisk_image(void)
298{
299 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
300
301 ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
302
303 return ramdisk_image;
304}
305static u64 __init get_ramdisk_size(void)
306{
307 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
308
309 ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
310
311 return ramdisk_size;
312}
313
309#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) 314#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
310static void __init relocate_initrd(void) 315static void __init relocate_initrd(void)
311{ 316{
312 /* Assume only end is not page aligned */ 317 /* Assume only end is not page aligned */
313 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 318 u64 ramdisk_image = get_ramdisk_image();
314 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 319 u64 ramdisk_size = get_ramdisk_size();
315 u64 area_size = PAGE_ALIGN(ramdisk_size); 320 u64 area_size = PAGE_ALIGN(ramdisk_size);
316 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
317 u64 ramdisk_here; 321 u64 ramdisk_here;
318 unsigned long slop, clen, mapaddr; 322 unsigned long slop, clen, mapaddr;
319 char *p, *q; 323 char *p, *q;
320 324
321 /* We need to move the initrd down into lowmem */ 325 /* We need to move the initrd down into directly mapped mem */
322 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, 326 ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
323 PAGE_SIZE); 327 area_size, PAGE_SIZE);
324 328
325 if (!ramdisk_here) 329 if (!ramdisk_here)
326 panic("Cannot find place for new RAMDISK of size %lld\n", 330 panic("Cannot find place for new RAMDISK of size %lld\n",
327 ramdisk_size); 331 ramdisk_size);
328 332
329 /* Note: this includes all the lowmem currently occupied by 333 /* Note: this includes all the mem currently occupied by
330 the initrd, we rely on that fact to keep the data intact. */ 334 the initrd, we rely on that fact to keep the data intact. */
331 memblock_reserve(ramdisk_here, area_size); 335 memblock_reserve(ramdisk_here, area_size);
332 initrd_start = ramdisk_here + PAGE_OFFSET; 336 initrd_start = ramdisk_here + PAGE_OFFSET;
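
get_ramdisk_image() and get_ramdisk_size() above splice the new ext_ramdisk_image/ext_ramdisk_size boot_params fields in as the upper 32 bits, which is what lets the initrd live above 4G. The arithmetic on its own (the sample values are made up):

#include <stdio.h>
#include <stdint.h>

/* Mirrors get_ramdisk_image(): low 32 bits from hdr.ramdisk_image,
 * high 32 bits from boot_params.ext_ramdisk_image. */
static uint64_t combine(uint32_t lo, uint32_t ext)
{
	return (uint64_t)lo | ((uint64_t)ext << 32);
}

int main(void)
{
	/* e.g. an initrd placed at 4G + 16M by a 64-bit capable loader */
	printf("%#llx\n",
	       (unsigned long long)combine(0x01000000, 0x1));
	return 0;
}
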
@@ -336,17 +340,7 @@ static void __init relocate_initrd(void)
336 340
337 q = (char *)initrd_start; 341 q = (char *)initrd_start;
338 342
339 /* Copy any lowmem portion of the initrd */ 343 /* Copy the initrd */
340 if (ramdisk_image < end_of_lowmem) {
341 clen = end_of_lowmem - ramdisk_image;
342 p = (char *)__va(ramdisk_image);
343 memcpy(q, p, clen);
344 q += clen;
345 ramdisk_image += clen;
346 ramdisk_size -= clen;
347 }
348
349 /* Copy the highmem portion of the initrd */
350 while (ramdisk_size) { 344 while (ramdisk_size) {
351 slop = ramdisk_image & ~PAGE_MASK; 345 slop = ramdisk_image & ~PAGE_MASK;
352 clen = ramdisk_size; 346 clen = ramdisk_size;
@@ -360,22 +354,35 @@ static void __init relocate_initrd(void)
360 ramdisk_image += clen; 354 ramdisk_image += clen;
361 ramdisk_size -= clen; 355 ramdisk_size -= clen;
362 } 356 }
363 /* high pages is not converted by early_res_to_bootmem */ 357
364 ramdisk_image = boot_params.hdr.ramdisk_image; 358 ramdisk_image = get_ramdisk_image();
365 ramdisk_size = boot_params.hdr.ramdisk_size; 359 ramdisk_size = get_ramdisk_size();
366 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" 360 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
367 " [mem %#010llx-%#010llx]\n", 361 " [mem %#010llx-%#010llx]\n",
368 ramdisk_image, ramdisk_image + ramdisk_size - 1, 362 ramdisk_image, ramdisk_image + ramdisk_size - 1,
369 ramdisk_here, ramdisk_here + ramdisk_size - 1); 363 ramdisk_here, ramdisk_here + ramdisk_size - 1);
370} 364}
371 365
366static void __init early_reserve_initrd(void)
367{
368 /* Assume only end is not page aligned */
369 u64 ramdisk_image = get_ramdisk_image();
370 u64 ramdisk_size = get_ramdisk_size();
371 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
372
373 if (!boot_params.hdr.type_of_loader ||
374 !ramdisk_image || !ramdisk_size)
375 return; /* No initrd provided by bootloader */
376
377 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
378}
372static void __init reserve_initrd(void) 379static void __init reserve_initrd(void)
373{ 380{
374 /* Assume only end is not page aligned */ 381 /* Assume only end is not page aligned */
375 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 382 u64 ramdisk_image = get_ramdisk_image();
376 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 383 u64 ramdisk_size = get_ramdisk_size();
377 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 384 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
378 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 385 u64 mapped_size;
379 386
380 if (!boot_params.hdr.type_of_loader || 387 if (!boot_params.hdr.type_of_loader ||
381 !ramdisk_image || !ramdisk_size) 388 !ramdisk_image || !ramdisk_size)
@@ -383,22 +390,18 @@ static void __init reserve_initrd(void)
383 390
384 initrd_start = 0; 391 initrd_start = 0;
385 392
386 if (ramdisk_size >= (end_of_lowmem>>1)) { 393 mapped_size = memblock_mem_size(max_pfn_mapped);
394 if (ramdisk_size >= (mapped_size>>1))
387 panic("initrd too large to handle, " 395 panic("initrd too large to handle, "
388 "disabling initrd (%lld needed, %lld available)\n", 396 "disabling initrd (%lld needed, %lld available)\n",
389 ramdisk_size, end_of_lowmem>>1); 397 ramdisk_size, mapped_size>>1);
390 }
391 398
392 printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, 399 printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
393 ramdisk_end - 1); 400 ramdisk_end - 1);
394 401
395 402 if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
396 if (ramdisk_end <= end_of_lowmem) { 403 PFN_DOWN(ramdisk_end))) {
397 /* All in lowmem, easy case */ 404 /* All are mapped, easy case */
398 /*
399 * don't need to reserve again, already reserved early
400 * in i386_start_kernel
401 */
402 initrd_start = ramdisk_image + PAGE_OFFSET; 405 initrd_start = ramdisk_image + PAGE_OFFSET;
403 initrd_end = initrd_start + ramdisk_size; 406 initrd_end = initrd_start + ramdisk_size;
404 return; 407 return;
@@ -409,6 +412,9 @@ static void __init reserve_initrd(void)
409 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); 412 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
410} 413}
411#else 414#else
415static void __init early_reserve_initrd(void)
416{
417}
412static void __init reserve_initrd(void) 418static void __init reserve_initrd(void)
413{ 419{
414} 420}
@@ -419,8 +425,6 @@ static void __init parse_setup_data(void)
419 struct setup_data *data; 425 struct setup_data *data;
420 u64 pa_data; 426 u64 pa_data;
421 427
422 if (boot_params.hdr.version < 0x0209)
423 return;
424 pa_data = boot_params.hdr.setup_data; 428 pa_data = boot_params.hdr.setup_data;
425 while (pa_data) { 429 while (pa_data) {
426 u32 data_len, map_len; 430 u32 data_len, map_len;
@@ -456,8 +460,6 @@ static void __init e820_reserve_setup_data(void)
456 u64 pa_data; 460 u64 pa_data;
457 int found = 0; 461 int found = 0;
458 462
459 if (boot_params.hdr.version < 0x0209)
460 return;
461 pa_data = boot_params.hdr.setup_data; 463 pa_data = boot_params.hdr.setup_data;
462 while (pa_data) { 464 while (pa_data) {
463 data = early_memremap(pa_data, sizeof(*data)); 465 data = early_memremap(pa_data, sizeof(*data));
@@ -481,8 +483,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
481 struct setup_data *data; 483 struct setup_data *data;
482 u64 pa_data; 484 u64 pa_data;
483 485
484 if (boot_params.hdr.version < 0x0209)
485 return;
486 pa_data = boot_params.hdr.setup_data; 486 pa_data = boot_params.hdr.setup_data;
487 while (pa_data) { 487 while (pa_data) {
488 data = early_memremap(pa_data, sizeof(*data)); 488 data = early_memremap(pa_data, sizeof(*data));
@@ -501,17 +501,51 @@ static void __init memblock_x86_reserve_range_setup_data(void)
501/* 501/*
502 * Keep the crash kernel below this limit. On 32 bits earlier kernels 502 * Keep the crash kernel below this limit. On 32 bits earlier kernels
503 * would limit the kernel to the low 512 MiB due to mapping restrictions. 503 * would limit the kernel to the low 512 MiB due to mapping restrictions.
504 * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
505 * limit once kexec-tools are fixed.
506 */ 504 */
507#ifdef CONFIG_X86_32 505#ifdef CONFIG_X86_32
508# define CRASH_KERNEL_ADDR_MAX (512 << 20) 506# define CRASH_KERNEL_ADDR_MAX (512 << 20)
509#else 507#else
510# define CRASH_KERNEL_ADDR_MAX (896 << 20) 508# define CRASH_KERNEL_ADDR_MAX MAXMEM
511#endif 509#endif
512 510
511static void __init reserve_crashkernel_low(void)
512{
513#ifdef CONFIG_X86_64
514 const unsigned long long alignment = 16<<20; /* 16M */
515 unsigned long long low_base = 0, low_size = 0;
516 unsigned long total_low_mem;
517 unsigned long long base;
518 int ret;
519
520 total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT));
521 ret = parse_crashkernel_low(boot_command_line, total_low_mem,
522 &low_size, &base);
523 if (ret != 0 || low_size <= 0)
524 return;
525
526 low_base = memblock_find_in_range(low_size, (1ULL<<32),
527 low_size, alignment);
528
529 if (!low_base) {
530 pr_info("crashkernel low reservation failed - No suitable area found.\n");
531
532 return;
533 }
534
535 memblock_reserve(low_base, low_size);
536 pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n",
537 (unsigned long)(low_size >> 20),
538 (unsigned long)(low_base >> 20),
539 (unsigned long)(total_low_mem >> 20));
540 crashk_low_res.start = low_base;
541 crashk_low_res.end = low_base + low_size - 1;
542 insert_resource(&iomem_resource, &crashk_low_res);
543#endif
544}
545
513static void __init reserve_crashkernel(void) 546static void __init reserve_crashkernel(void)
514{ 547{
548 const unsigned long long alignment = 16<<20; /* 16M */
515 unsigned long long total_mem; 549 unsigned long long total_mem;
516 unsigned long long crash_size, crash_base; 550 unsigned long long crash_size, crash_base;
517 int ret; 551 int ret;
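
reserve_crashkernel_low() exists because a crash kernel that lands above 4G still needs a slice of memory below 4G for swiotlb and DMA-able buffers in the second kernel. A toy stand-in for the placement search it performs via memblock_find_in_range() (sorted free list, highest 16M-aligned fit below 4G; illustrative, and the real memblock policy of this era differs in detail):

#include <stdio.h>

#define ALIGN_16M (16ULL << 20)

struct region { unsigned long long start, end; };

/* Toy memblock_find_in_range(): highest 16M-aligned block of the
 * wanted size below 4G, out of a sorted free list. */
static unsigned long long find_low(const struct region *avail, int n,
				   unsigned long long size)
{
	const unsigned long long limit = 1ULL << 32;

	for (int i = n - 1; i >= 0; i--) {
		unsigned long long end = avail[i].end < limit ?
					 avail[i].end : limit;
		unsigned long long base;

		if (end < size)
			continue;
		base = (end - size) & ~(ALIGN_16M - 1);
		if (base >= avail[i].start)
			return base;
	}
	return 0;                        /* like a failed reservation */
}

int main(void)
{
	struct region avail[] = {
		{ 0x00100000ULL, 0x7fffffffULL },   /* low RAM          */
		{ 0x100000000ULL, 0x200000000ULL }, /* above 4G: unused */
	};

	printf("crashkernel low base: %#llx\n",
	       find_low(avail, 2, 72ULL << 20));
	return 0;
}
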
@@ -525,8 +559,6 @@ static void __init reserve_crashkernel(void)
525 559
526 /* 0 means: find the address automatically */ 560 /* 0 means: find the address automatically */
527 if (crash_base <= 0) { 561 if (crash_base <= 0) {
528 const unsigned long long alignment = 16<<20; /* 16M */
529
530 /* 562 /*
531 * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX 563 * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
532 */ 564 */
@@ -537,6 +569,7 @@ static void __init reserve_crashkernel(void)
537 pr_info("crashkernel reservation failed - No suitable area found.\n"); 569 pr_info("crashkernel reservation failed - No suitable area found.\n");
538 return; 570 return;
539 } 571 }
572
540 } else { 573 } else {
541 unsigned long long start; 574 unsigned long long start;
542 575
@@ -558,6 +591,9 @@ static void __init reserve_crashkernel(void)
558 crashk_res.start = crash_base; 591 crashk_res.start = crash_base;
559 crashk_res.end = crash_base + crash_size - 1; 592 crashk_res.end = crash_base + crash_size - 1;
560 insert_resource(&iomem_resource, &crashk_res); 593 insert_resource(&iomem_resource, &crashk_res);
594
595 if (crash_base >= (1ULL<<32))
596 reserve_crashkernel_low();
561} 597}
562#else 598#else
563static void __init reserve_crashkernel(void) 599static void __init reserve_crashkernel(void)
@@ -708,6 +744,27 @@ static void __init trim_bios_range(void)
708 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 744 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
709} 745}
710 746
747/* called before trim_bios_range() to spare an extra sanitize pass */
748static void __init e820_add_kernel_range(void)
749{
750 u64 start = __pa_symbol(_text);
751 u64 size = __pa_symbol(_end) - start;
752
753 /*
754 * Complain if .text .data and .bss are not marked as E820_RAM and
755 * attempt to fix it by adding the range. We may have a confused BIOS,
756 * or the user may have used memmap=exactmap or memmap=xxM$yyM to
757 * exclude the kernel range. If we really are running on top of non-RAM,
758 * we will crash later anyways.
759 */
760 if (e820_all_mapped(start, start + size, E820_RAM))
761 return;
762
763 pr_warn(".text .data .bss are not marked as E820_RAM!\n");
764 e820_remove_range(start, size, E820_RAM, 0);
765 e820_add_region(start, size, E820_RAM);
766}
767
711static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; 768static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
712 769
713static int __init parse_reservelow(char *p) 770static int __init parse_reservelow(char *p)
@@ -752,6 +809,17 @@ static void __init trim_low_memory_range(void)
752 809
753void __init setup_arch(char **cmdline_p) 810void __init setup_arch(char **cmdline_p)
754{ 811{
812 memblock_reserve(__pa_symbol(_text),
813 (unsigned long)__bss_stop - (unsigned long)_text);
814
815 early_reserve_initrd();
816
817 /*
818 * At this point everything still needed from the boot loader
819 * or BIOS or kernel text should be early reserved or marked not
820 * RAM in e820. All other memory is free game.
821 */
822
755#ifdef CONFIG_X86_32 823#ifdef CONFIG_X86_32
756 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 824 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
757 visws_early_detect(); 825 visws_early_detect();
@@ -910,6 +978,7 @@ void __init setup_arch(char **cmdline_p)
910 insert_resource(&iomem_resource, &data_resource); 978 insert_resource(&iomem_resource, &data_resource);
911 insert_resource(&iomem_resource, &bss_resource); 979 insert_resource(&iomem_resource, &bss_resource);
912 980
981 e820_add_kernel_range();
913 trim_bios_range(); 982 trim_bios_range();
914#ifdef CONFIG_X86_32 983#ifdef CONFIG_X86_32
915 if (ppro_with_ram_bug()) { 984 if (ppro_with_ram_bug()) {
@@ -959,6 +1028,8 @@ void __init setup_arch(char **cmdline_p)
959 1028
960 reserve_ibft_region(); 1029 reserve_ibft_region();
961 1030
1031 early_alloc_pgt_buf();
1032
962 /* 1033 /*
963 * Need to conclude brk, before memblock_x86_fill() 1034 * Need to conclude brk, before memblock_x86_fill()
964 * it could use memblock_find_in_range, could overlap with 1035 * it could use memblock_find_in_range, could overlap with
@@ -968,7 +1039,7 @@ void __init setup_arch(char **cmdline_p)
968 1039
969 cleanup_highmap(); 1040 cleanup_highmap();
970 1041
971 memblock.current_limit = get_max_mapped(); 1042 memblock.current_limit = ISA_END_ADDRESS;
972 memblock_x86_fill(); 1043 memblock_x86_fill();
973 1044
974 /* 1045 /*
@@ -985,42 +1056,22 @@ void __init setup_arch(char **cmdline_p)
985 setup_bios_corruption_check(); 1056 setup_bios_corruption_check();
986#endif 1057#endif
987 1058
1059#ifdef CONFIG_X86_32
988 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", 1060 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
989 (max_pfn_mapped<<PAGE_SHIFT) - 1); 1061 (max_pfn_mapped<<PAGE_SHIFT) - 1);
1062#endif
990 1063
991 setup_real_mode(); 1064 reserve_real_mode();
992 1065
993 trim_platform_memory_ranges(); 1066 trim_platform_memory_ranges();
994 trim_low_memory_range(); 1067 trim_low_memory_range();
995 1068
996 init_gbpages(); 1069 init_mem_mapping();
997
998 /* max_pfn_mapped is updated here */
999 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
1000 max_pfn_mapped = max_low_pfn_mapped;
1001
1002#ifdef CONFIG_X86_64
1003 if (max_pfn > max_low_pfn) {
1004 int i;
1005 unsigned long start, end;
1006 unsigned long start_pfn, end_pfn;
1007
1008 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn,
1009 NULL) {
1010 1070
1011 end = PFN_PHYS(end_pfn); 1071 early_trap_pf_init();
1012 if (end <= (1UL<<32))
1013 continue;
1014 1072
1015 start = PFN_PHYS(start_pfn); 1073 setup_real_mode();
1016 max_pfn_mapped = init_memory_mapping(
1017 max((1UL<<32), start), end);
1018 }
1019 1074
1020 /* can we preserve max_low_pfn? */
1021 max_low_pfn = max_pfn;
1022 }
1023#endif
1024 memblock.current_limit = get_max_mapped(); 1075 memblock.current_limit = get_max_mapped();
1025 dma_contiguous_reserve(0); 1076 dma_contiguous_reserve(0);
1026 1077
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ecffca11f4e9..68bda7a84159 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -688,10 +688,19 @@ void __init early_trap_init(void)
688 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); 688 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
689 /* int3 can be called from all */ 689 /* int3 can be called from all */
690 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); 690 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
691#ifdef CONFIG_X86_32
691 set_intr_gate(X86_TRAP_PF, &page_fault); 692 set_intr_gate(X86_TRAP_PF, &page_fault);
693#endif
692 load_idt(&idt_descr); 694 load_idt(&idt_descr);
693} 695}
694 696
697void __init early_trap_pf_init(void)
698{
699#ifdef CONFIG_X86_64
700 set_intr_gate(X86_TRAP_PF, &page_fault);
701#endif
702}
703
695void __init trap_init(void) 704void __init trap_init(void)
696{ 705{
697 int i; 706 int i;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 7a3d075a814a..50cf83ecd32e 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -62,10 +62,6 @@ struct x86_init_ops x86_init __initdata = {
62 .banner = default_banner, 62 .banner = default_banner,
63 }, 63 },
64 64
65 .mapping = {
66 .pagetable_reserve = native_pagetable_reserve,
67 },
68
69 .paging = { 65 .paging = {
70 .pagetable_init = native_pagetable_init, 66 .pagetable_init = native_pagetable_init,
71 }, 67 },
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d7aea41563b3..d41815265a0b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -17,86 +17,132 @@
17#include <asm/proto.h> 17#include <asm/proto.h>
18#include <asm/dma.h> /* for MAX_DMA_PFN */ 18#include <asm/dma.h> /* for MAX_DMA_PFN */
19 19
20unsigned long __initdata pgt_buf_start; 20#include "mm_internal.h"
21unsigned long __meminitdata pgt_buf_end;
22unsigned long __meminitdata pgt_buf_top;
23 21
24int after_bootmem; 22static unsigned long __initdata pgt_buf_start;
23static unsigned long __initdata pgt_buf_end;
24static unsigned long __initdata pgt_buf_top;
25 25
26int direct_gbpages 26static unsigned long min_pfn_mapped;
27#ifdef CONFIG_DIRECT_GBPAGES
28 = 1
29#endif
30;
31 27
32struct map_range { 28static bool __initdata can_use_brk_pgt = true;
33 unsigned long start;
34 unsigned long end;
35 unsigned page_size_mask;
36};
37 29
38/* 30/*
39 * First calculate space needed for kernel direct mapping page tables to cover 31 * Pages returned are already directly mapped.
40 * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB 32 *
41 * pages. Then find enough contiguous space for those page tables. 33 * Changing that is likely to break Xen, see commit:
34 *
35 * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
36 *
37 * for detailed information.
42 */ 38 */
43static void __init find_early_table_space(struct map_range *mr, int nr_range) 39__ref void *alloc_low_pages(unsigned int num)
44{ 40{
41 unsigned long pfn;
45 int i; 42 int i;
46 unsigned long puds = 0, pmds = 0, ptes = 0, tables;
47 unsigned long start = 0, good_end;
48 phys_addr_t base;
49 43
50 for (i = 0; i < nr_range; i++) { 44 if (after_bootmem) {
51 unsigned long range, extra; 45 unsigned int order;
52 46
53 range = mr[i].end - mr[i].start; 47 order = get_order((unsigned long)num << PAGE_SHIFT);
54 puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; 48 return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
49 __GFP_ZERO, order);
50 }
55 51
56 if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { 52 if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
57 extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); 53 unsigned long ret;
58 pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; 54 if (min_pfn_mapped >= max_pfn_mapped)
59 } else { 55 panic("alloc_low_page: ran out of memory");
60 pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; 56 ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
61 } 57 max_pfn_mapped << PAGE_SHIFT,
58 PAGE_SIZE * num, PAGE_SIZE);
59 if (!ret)
60 panic("alloc_low_page: can not alloc memory");
61 memblock_reserve(ret, PAGE_SIZE * num);
62 pfn = ret >> PAGE_SHIFT;
63 } else {
64 pfn = pgt_buf_end;
65 pgt_buf_end += num;
66 printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
67 pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
68 }
62 69
63 if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { 70 for (i = 0; i < num; i++) {
64 extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); 71 void *adr;
65#ifdef CONFIG_X86_32 72
66 extra += PMD_SIZE; 73 adr = __va((pfn + i) << PAGE_SHIFT);
67#endif 74 clear_page(adr);
68 ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
69 } else {
70 ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
71 }
72 } 75 }
73 76
74 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); 77 return __va(pfn << PAGE_SHIFT);
75 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); 78}
76 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
77 79
78#ifdef CONFIG_X86_32 80/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
79 /* for fixmap */ 81#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE)
80 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); 82RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
81#endif 83void __init early_alloc_pgt_buf(void)
82 good_end = max_pfn_mapped << PAGE_SHIFT; 84{
85 unsigned long tables = INIT_PGT_BUF_SIZE;
86 phys_addr_t base;
83 87
84 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); 88 base = __pa(extend_brk(tables, PAGE_SIZE));
85 if (!base)
86 panic("Cannot find space for the kernel page tables");
87 89
88 pgt_buf_start = base >> PAGE_SHIFT; 90 pgt_buf_start = base >> PAGE_SHIFT;
89 pgt_buf_end = pgt_buf_start; 91 pgt_buf_end = pgt_buf_start;
90 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); 92 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
93}
94
95int after_bootmem;
96
97int direct_gbpages
98#ifdef CONFIG_DIRECT_GBPAGES
99 = 1
100#endif
101;
91 102
92 printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", 103static void __init init_gbpages(void)
93 mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, 104{
94 (pgt_buf_top << PAGE_SHIFT) - 1); 105#ifdef CONFIG_X86_64
106 if (direct_gbpages && cpu_has_gbpages)
107 printk(KERN_INFO "Using GB pages for direct mapping\n");
108 else
109 direct_gbpages = 0;
110#endif
95} 111}
96 112
97void __init native_pagetable_reserve(u64 start, u64 end) 113struct map_range {
114 unsigned long start;
115 unsigned long end;
116 unsigned page_size_mask;
117};
118
119static int page_size_mask;
120
121static void __init probe_page_size_mask(void)
98{ 122{
99 memblock_reserve(start, end - start); 123 init_gbpages();
124
125#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
126 /*
127 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
128 * This will simplify cpa(), which otherwise needs to support splitting
129 * large pages into small in interrupt context, etc.
130 */
131 if (direct_gbpages)
132 page_size_mask |= 1 << PG_LEVEL_1G;
133 if (cpu_has_pse)
134 page_size_mask |= 1 << PG_LEVEL_2M;
135#endif
136
137 /* Enable PSE if available */
138 if (cpu_has_pse)
139 set_in_cr4(X86_CR4_PSE);
140
141 /* Enable PGE if available */
142 if (cpu_has_pge) {
143 set_in_cr4(X86_CR4_PGE);
144 __supported_pte_mask |= _PAGE_GLOBAL;
145 }
100} 146}
101 147
102#ifdef CONFIG_X86_32 148#ifdef CONFIG_X86_32
@@ -122,58 +168,51 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
122} 168}
123 169
124/* 170/*
125 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 171 * adjust the page_size_mask for a small range so it can use a big
126 * This runs before bootmem is initialized and gets pages directly from 172 * page size instead of a small one if the nearby memory is RAM too.
127 * the physical memory. To access them they are temporarily mapped.
128 */ 173 */
129unsigned long __init_refok init_memory_mapping(unsigned long start, 174static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
130 unsigned long end) 175 int nr_range)
131{ 176{
132 unsigned long page_size_mask = 0; 177 int i;
133 unsigned long start_pfn, end_pfn;
134 unsigned long ret = 0;
135 unsigned long pos;
136
137 struct map_range mr[NR_RANGE_MR];
138 int nr_range, i;
139 int use_pse, use_gbpages;
140 178
141 printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n", 179 for (i = 0; i < nr_range; i++) {
142 start, end - 1); 180 if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
181 !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
182 unsigned long start = round_down(mr[i].start, PMD_SIZE);
183 unsigned long end = round_up(mr[i].end, PMD_SIZE);
143 184
144#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) 185#ifdef CONFIG_X86_32
145 /* 186 if ((end >> PAGE_SHIFT) > max_low_pfn)
146 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. 187 continue;
147 * This will simplify cpa(), which otherwise needs to support splitting
148 * large pages into small in interrupt context, etc.
149 */
150 use_pse = use_gbpages = 0;
151#else
152 use_pse = cpu_has_pse;
153 use_gbpages = direct_gbpages;
154#endif 188#endif
155 189
156 /* Enable PSE if available */ 190 if (memblock_is_region_memory(start, end - start))
157 if (cpu_has_pse) 191 mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
158 set_in_cr4(X86_CR4_PSE); 192 }
193 if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
194 !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
195 unsigned long start = round_down(mr[i].start, PUD_SIZE);
196 unsigned long end = round_up(mr[i].end, PUD_SIZE);
159 197
160 /* Enable PGE if available */ 198 if (memblock_is_region_memory(start, end - start))
161 if (cpu_has_pge) { 199 mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
162 set_in_cr4(X86_CR4_PGE); 200 }
163 __supported_pte_mask |= _PAGE_GLOBAL;
164 } 201 }
202}
165 203
166 if (use_gbpages) 204static int __meminit split_mem_range(struct map_range *mr, int nr_range,
167 page_size_mask |= 1 << PG_LEVEL_1G; 205 unsigned long start,
168 if (use_pse) 206 unsigned long end)
169 page_size_mask |= 1 << PG_LEVEL_2M; 207{
208 unsigned long start_pfn, end_pfn, limit_pfn;
209 unsigned long pfn;
210 int i;
170 211
171 memset(mr, 0, sizeof(mr)); 212 limit_pfn = PFN_DOWN(end);
172 nr_range = 0;
173 213
174 /* head if not big page alignment ? */ 214 /* head if not big page alignment ? */
175 start_pfn = start >> PAGE_SHIFT; 215 pfn = start_pfn = PFN_DOWN(start);
176 pos = start_pfn << PAGE_SHIFT;
177#ifdef CONFIG_X86_32 216#ifdef CONFIG_X86_32
178 /* 217 /*
179 * Don't use a large page for the first 2/4MB of memory 218 * Don't use a large page for the first 2/4MB of memory
@@ -181,66 +220,60 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
181 * and overlapping MTRRs into large pages can cause 220 * and overlapping MTRRs into large pages can cause
182 * slowdowns. 221 * slowdowns.
183 */ 222 */
184 if (pos == 0) 223 if (pfn == 0)
185 end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); 224 end_pfn = PFN_DOWN(PMD_SIZE);
186 else 225 else
187 end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) 226 end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
188 << (PMD_SHIFT - PAGE_SHIFT);
189#else /* CONFIG_X86_64 */ 227#else /* CONFIG_X86_64 */
190 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) 228 end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
191 << (PMD_SHIFT - PAGE_SHIFT);
192#endif 229#endif
193 if (end_pfn > (end >> PAGE_SHIFT)) 230 if (end_pfn > limit_pfn)
194 end_pfn = end >> PAGE_SHIFT; 231 end_pfn = limit_pfn;
195 if (start_pfn < end_pfn) { 232 if (start_pfn < end_pfn) {
196 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); 233 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
197 pos = end_pfn << PAGE_SHIFT; 234 pfn = end_pfn;
198 } 235 }
199 236
200 /* big page (2M) range */ 237 /* big page (2M) range */
201 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) 238 start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
202 << (PMD_SHIFT - PAGE_SHIFT);
203#ifdef CONFIG_X86_32 239#ifdef CONFIG_X86_32
204 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); 240 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
205#else /* CONFIG_X86_64 */ 241#else /* CONFIG_X86_64 */
206 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) 242 end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
207 << (PUD_SHIFT - PAGE_SHIFT); 243 if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
208 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) 244 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
209 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
210#endif 245#endif
211 246
212 if (start_pfn < end_pfn) { 247 if (start_pfn < end_pfn) {
213 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 248 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
214 page_size_mask & (1<<PG_LEVEL_2M)); 249 page_size_mask & (1<<PG_LEVEL_2M));
215 pos = end_pfn << PAGE_SHIFT; 250 pfn = end_pfn;
216 } 251 }
217 252
218#ifdef CONFIG_X86_64 253#ifdef CONFIG_X86_64
219 /* big page (1G) range */ 254 /* big page (1G) range */
220 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) 255 start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
221 << (PUD_SHIFT - PAGE_SHIFT); 256 end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
222 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
223 if (start_pfn < end_pfn) { 257 if (start_pfn < end_pfn) {
224 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 258 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
225 page_size_mask & 259 page_size_mask &
226 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); 260 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
227 pos = end_pfn << PAGE_SHIFT; 261 pfn = end_pfn;
228 } 262 }
229 263
230 /* tail is not big page (1G) alignment */ 264 /* tail is not big page (1G) alignment */
231 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) 265 start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
232 << (PMD_SHIFT - PAGE_SHIFT); 266 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
233 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
234 if (start_pfn < end_pfn) { 267 if (start_pfn < end_pfn) {
235 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 268 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
236 page_size_mask & (1<<PG_LEVEL_2M)); 269 page_size_mask & (1<<PG_LEVEL_2M));
237 pos = end_pfn << PAGE_SHIFT; 270 pfn = end_pfn;
238 } 271 }
239#endif 272#endif
240 273
241 /* tail is not big page (2M) alignment */ 274 /* tail is not big page (2M) alignment */
242 start_pfn = pos>>PAGE_SHIFT; 275 start_pfn = pfn;
243 end_pfn = end>>PAGE_SHIFT; 276 end_pfn = limit_pfn;
244 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); 277 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
245 278
246 /* try to merge contiguous ranges with the same page size */ 279 /* try to merge contiguous ranges with the same page size */
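
split_mem_range() carves [start, end) into at most five runs: a 4k head up to the first 2M boundary, a 2M run up to the first 1G boundary, a 1G middle, a 2M run down from the last 1G boundary, and a 4k tail, so that each run can be mapped with the largest page size its alignment allows. The pfn rounding as a stand-alone demo (the sample range is made up):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PFN_2M (1UL << (21 - PAGE_SHIFT))
#define PFN_1G (1UL << (30 - PAGE_SHIFT))

static unsigned long round_up_pfn(unsigned long x, unsigned long a)
{ return (x + a - 1) & ~(a - 1); }
static unsigned long round_down_pfn(unsigned long x, unsigned long a)
{ return x & ~(a - 1); }

static void emit(const char *sz, unsigned long s, unsigned long e)
{
	if (s < e)
		printf("pfns [%#lx-%#lx) as %s pages\n", s, e, sz);
}

int main(void)
{
	unsigned long pfn = 0x123, limit = 0x89abc;  /* sample range */
	unsigned long e;

	e = round_up_pfn(pfn, PFN_2M);               /* 4k head      */
	if (e > limit) e = limit;
	emit("4k", pfn, e); pfn = e;

	e = round_up_pfn(pfn, PFN_1G);               /* 2M run       */
	if (e > round_down_pfn(limit, PFN_2M))
		e = round_down_pfn(limit, PFN_2M);
	emit("2M", pfn, e); pfn = e;

	e = round_down_pfn(limit, PFN_1G);           /* 1G middle    */
	emit("1G", pfn, e); if (pfn < e) pfn = e;

	e = round_down_pfn(limit, PFN_2M);           /* 2M tail run  */
	emit("2M", pfn, e); if (pfn < e) pfn = e;

	emit("4k", pfn, limit);                      /* 4k tail      */
	return 0;
}
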
@@ -257,59 +290,169 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
257 nr_range--; 290 nr_range--;
258 } 291 }
259 292
293 if (!after_bootmem)
294 adjust_range_page_size_mask(mr, nr_range);
295
260 for (i = 0; i < nr_range; i++) 296 for (i = 0; i < nr_range; i++)
261 printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", 297 printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
262 mr[i].start, mr[i].end - 1, 298 mr[i].start, mr[i].end - 1,
263 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( 299 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
264 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); 300 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
265 301
266 /* 302 return nr_range;
267 * Find space for the kernel direct mapping tables. 303}
268 * 304
269 * Later we should allocate these tables in the local node of the 305struct range pfn_mapped[E820_X_MAX];
270 * memory mapped. Unfortunately this is done currently before the 306int nr_pfn_mapped;
271 * nodes are discovered. 307
272 */ 308static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
273 if (!after_bootmem) 309{
274 find_early_table_space(mr, nr_range); 310 nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
311 nr_pfn_mapped, start_pfn, end_pfn);
312 nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
313
314 max_pfn_mapped = max(max_pfn_mapped, end_pfn);
315
316 if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
317 max_low_pfn_mapped = max(max_low_pfn_mapped,
318 min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
319}
320
321bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
322{
323 int i;
324
325 for (i = 0; i < nr_pfn_mapped; i++)
326 if ((start_pfn >= pfn_mapped[i].start) &&
327 (end_pfn <= pfn_mapped[i].end))
328 return true;
329
330 return false;
331}
332
333/*
334 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
335 * This runs before bootmem is initialized and gets pages directly from
336 * the physical memory. To access them they are temporarily mapped.
337 */
338unsigned long __init_refok init_memory_mapping(unsigned long start,
339 unsigned long end)
340{
341 struct map_range mr[NR_RANGE_MR];
342 unsigned long ret = 0;
343 int nr_range, i;
344
345 pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
346 start, end - 1);
347
348 memset(mr, 0, sizeof(mr));
349 nr_range = split_mem_range(mr, 0, start, end);
275 350
276 for (i = 0; i < nr_range; i++) 351 for (i = 0; i < nr_range; i++)
277 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, 352 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
278 mr[i].page_size_mask); 353 mr[i].page_size_mask);
279 354
280#ifdef CONFIG_X86_32 355 add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
281 early_ioremap_page_table_range_init();
282 356
283 load_cr3(swapper_pg_dir); 357 return ret >> PAGE_SHIFT;
284#endif 358}
285 359
286 __flush_tlb_all(); 360/*
361 * The range may have holes in the middle or at the ends; only the RAM parts will be mapped.
362 */
363static unsigned long __init init_range_memory_mapping(
364 unsigned long r_start,
365 unsigned long r_end)
366{
367 unsigned long start_pfn, end_pfn;
368 unsigned long mapped_ram_size = 0;
369 int i;
287 370
288 /* 371 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
289 * Reserve the kernel pagetable pages we used (pgt_buf_start - 372 u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
290 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) 373 u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
291 * so that they can be reused for other purposes. 374 if (start >= end)
292 * 375 continue;
293 * On native it just means calling memblock_reserve, on Xen it also
294 * means marking RW the pagetable pages that we allocated before
295 * but that haven't been used.
296 *
297 * In fact on xen we mark RO the whole range pgt_buf_start -
298 * pgt_buf_top, because we have to make sure that when
299 * init_memory_mapping reaches the pagetable pages area, it maps
300 * RO all the pagetable pages, including the ones that are beyond
301 * pgt_buf_end at that time.
302 */
303 if (!after_bootmem && pgt_buf_end > pgt_buf_start)
304 x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
305 PFN_PHYS(pgt_buf_end));
306 376
307 if (!after_bootmem) 377 /*
308 early_memtest(start, end); 378 * if the range overlaps the brk pgt buffer, we need to
379 * alloc the pgt buf from memblock instead.
380 */
381 can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
382 min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
383 init_memory_mapping(start, end);
384 mapped_ram_size += end - start;
385 can_use_brk_pgt = true;
386 }
309 387
310 return ret >> PAGE_SHIFT; 388 return mapped_ram_size;
311} 389}
312 390
391/* (PUD_SHIFT-PMD_SHIFT)/2 */
392#define STEP_SIZE_SHIFT 5
393void __init init_mem_mapping(void)
394{
395 unsigned long end, real_end, start, last_start;
396 unsigned long step_size;
397 unsigned long addr;
398 unsigned long mapped_ram_size = 0;
399 unsigned long new_mapped_ram_size;
400
401 probe_page_size_mask();
402
403#ifdef CONFIG_X86_64
404 end = max_pfn << PAGE_SHIFT;
405#else
406 end = max_low_pfn << PAGE_SHIFT;
407#endif
408
409 /* the ISA range is always mapped regardless of memory holes */
410 init_memory_mapping(0, ISA_END_ADDRESS);
411
412 /* Xen has a big range reserved near the end of RAM; skip it at first */
413 addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE,
414 PAGE_SIZE);
415 real_end = addr + PMD_SIZE;
416
417 /* step_size needs to be small so the pgt_buf from BRK can cover it */
418 step_size = PMD_SIZE;
419 max_pfn_mapped = 0; /* will get exact value next */
420 min_pfn_mapped = real_end >> PAGE_SHIFT;
421 last_start = start = real_end;
422 while (last_start > ISA_END_ADDRESS) {
423 if (last_start > step_size) {
424 start = round_down(last_start - 1, step_size);
425 if (start < ISA_END_ADDRESS)
426 start = ISA_END_ADDRESS;
427 } else
428 start = ISA_END_ADDRESS;
429 new_mapped_ram_size = init_range_memory_mapping(start,
430 last_start);
431 last_start = start;
432 min_pfn_mapped = last_start >> PAGE_SHIFT;
433 /* only increase step_size after a big range gets mapped */
434 if (new_mapped_ram_size > mapped_ram_size)
435 step_size <<= STEP_SIZE_SHIFT;
436 mapped_ram_size += new_mapped_ram_size;
437 }
438
439 if (real_end < end)
440 init_range_memory_mapping(real_end, end);
441
442#ifdef CONFIG_X86_64
443 if (max_pfn > max_low_pfn) {
444 /* can we preserve max_low_pfn? */
445 max_low_pfn = max_pfn;
446 }
447#else
448 early_ioremap_page_table_range_init();
449#endif
450
451 load_cr3(swapper_pg_dir);
452 __flush_tlb_all();
453
454 early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
455}
313 456
314/* 457/*
315 * devmem_is_allowed() checks to see if /dev/mem access to a certain address 458 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
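
init_mem_mapping() replaces the old one-shot init_memory_mapping() calls with a top-down loop: it maps a PMD_SIZE chunk just below real_end first (the small BRK page-table buffer is enough for that), then lets each mapped chunk supply page-table pages for the next, growing the step by a factor of 2^STEP_SIZE_SHIFT once a chunk has mapped RAM. A stand-alone simulation of the chunking (the real loop only grows the step after new RAM was actually mapped; this toy grows it unconditionally):

#include <stdio.h>

#define ISA_END_ADDRESS  0x100000ULL
#define PMD_SIZE         (2ULL << 20)
#define STEP_SIZE_SHIFT  5               /* (PUD_SHIFT - PMD_SHIFT) / 2 */

static unsigned long long round_down(unsigned long long x,
				     unsigned long long a)
{ return x & ~(a - 1); }

int main(void)
{
	unsigned long long real_end = 4ULL << 30; /* sample: 4G of RAM */
	unsigned long long step_size = PMD_SIZE;
	unsigned long long last_start = real_end, start;

	while (last_start > ISA_END_ADDRESS) {
		if (last_start > step_size) {
			start = round_down(last_start - 1, step_size);
			if (start < ISA_END_ADDRESS)
				start = ISA_END_ADDRESS;
		} else
			start = ISA_END_ADDRESS;

		printf("map [%#llx-%#llx)\n", start, last_start);
		last_start = start;
		step_size <<= STEP_SIZE_SHIFT; /* grow once RAM mapped */
	}
	return 0;
}

With 4G of RAM this maps everything above 1M in three shrinking-address, growing-size chunks, which is why only a handful of BRK pages are needed to bootstrap the direct mapping.
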
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 745d66b843c8..b299724f6e34 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -53,25 +53,14 @@
53#include <asm/page_types.h> 53#include <asm/page_types.h>
54#include <asm/init.h> 54#include <asm/init.h>
55 55
56#include "mm_internal.h"
57
56unsigned long highstart_pfn, highend_pfn; 58unsigned long highstart_pfn, highend_pfn;
57 59
58static noinline int do_test_wp_bit(void); 60static noinline int do_test_wp_bit(void);
59 61
60bool __read_mostly __vmalloc_start_set = false; 62bool __read_mostly __vmalloc_start_set = false;
61 63
62static __init void *alloc_low_page(void)
63{
64 unsigned long pfn = pgt_buf_end++;
65 void *adr;
66
67 if (pfn >= pgt_buf_top)
68 panic("alloc_low_page: ran out of memory");
69
70 adr = __va(pfn * PAGE_SIZE);
71 clear_page(adr);
72 return adr;
73}
74
75/* 64/*
76 * Creates a middle page table and puts a pointer to it in the 65 * Creates a middle page table and puts a pointer to it in the
77 * given global directory entry. This only returns the gd entry 66 * given global directory entry. This only returns the gd entry
@@ -84,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
84 73
85#ifdef CONFIG_X86_PAE 74#ifdef CONFIG_X86_PAE
86 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 75 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
87 if (after_bootmem) 76 pmd_table = (pmd_t *)alloc_low_page();
88 pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
89 else
90 pmd_table = (pmd_t *)alloc_low_page();
91 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 77 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
92 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 78 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
93 pud = pud_offset(pgd, 0); 79 pud = pud_offset(pgd, 0);
@@ -109,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
109static pte_t * __init one_page_table_init(pmd_t *pmd) 95static pte_t * __init one_page_table_init(pmd_t *pmd)
110{ 96{
111 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 97 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
112 pte_t *page_table = NULL; 98 pte_t *page_table = (pte_t *)alloc_low_page();
113
114 if (after_bootmem) {
115#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
116 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
117#endif
118 if (!page_table)
119 page_table =
120 (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
121 } else
122 page_table = (pte_t *)alloc_low_page();
123 99
124 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); 100 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
125 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); 101 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -146,8 +122,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr)
146 return one_page_table_init(pmd) + pte_idx; 122 return one_page_table_init(pmd) + pte_idx;
147} 123}
148 124
125static unsigned long __init
126page_table_range_init_count(unsigned long start, unsigned long end)
127{
128 unsigned long count = 0;
129#ifdef CONFIG_HIGHMEM
130 int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
131 int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
132 int pgd_idx, pmd_idx;
133 unsigned long vaddr;
134
135 if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
136 return 0;
137
138 vaddr = start;
139 pgd_idx = pgd_index(vaddr);
140
141 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
142 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
143 pmd_idx++) {
144 if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
145 (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
146 count++;
147 vaddr += PMD_SIZE;
148 }
149 pmd_idx = 0;
150 }
151#endif
152 return count;
153}
154
149static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, 155static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
150 unsigned long vaddr, pte_t *lastpte) 156 unsigned long vaddr, pte_t *lastpte,
157 void **adr)
151{ 158{
152#ifdef CONFIG_HIGHMEM 159#ifdef CONFIG_HIGHMEM
153 /* 160 /*
@@ -161,16 +168,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
161 168
162 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 169 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
163 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 170 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
164 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 171 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
165 && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
166 || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
167 pte_t *newpte; 172 pte_t *newpte;
168 int i; 173 int i;
169 174
170 BUG_ON(after_bootmem); 175 BUG_ON(after_bootmem);
171 newpte = alloc_low_page(); 176 newpte = *adr;
172 for (i = 0; i < PTRS_PER_PTE; i++) 177 for (i = 0; i < PTRS_PER_PTE; i++)
173 set_pte(newpte + i, pte[i]); 178 set_pte(newpte + i, pte[i]);
179 *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
174 180
175 paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); 181 paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
176 set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); 182 set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
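
page_table_range_init() no longer pulls pte pages one at a time inside the walk: page_table_range_init_count() first counts how many pages the kmap range will need, a single alloc_low_pages(count) grabs them, and page_table_kmap_check() consumes them through the *adr cursor. The count/allocate/consume pattern, reduced to its shape (illustrative):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096

/* Pass 1: pre-count the pte pages the walk will need, the way
 * page_table_range_init_count() counts kmap-range pmds. */
static unsigned count_needed(unsigned pmds_in_kmap_range)
{
	return pmds_in_kmap_range;
}

int main(void)
{
	unsigned count = count_needed(3);
	/* Pass 2: one batched allocation (alloc_low_pages(count)). */
	unsigned char *base = count ? malloc((size_t)count * PAGE_SIZE)
				    : NULL;
	unsigned char *adr = base;

	/* Pass 3: the walk consumes pages by bumping the cursor, as
	 * page_table_kmap_check() does with its *adr argument. */
	for (unsigned i = 0; i < count; i++) {
		unsigned char *newpte = adr;

		adr += PAGE_SIZE;
		printf("pte page %u at %p\n", i, (void *)newpte);
	}
	free(base);
	return 0;
}
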
@@ -204,6 +210,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
204 pgd_t *pgd; 210 pgd_t *pgd;
205 pmd_t *pmd; 211 pmd_t *pmd;
206 pte_t *pte = NULL; 212 pte_t *pte = NULL;
213 unsigned long count = page_table_range_init_count(start, end);
214 void *adr = NULL;
215
216 if (count)
217 adr = alloc_low_pages(count);
207 218
208 vaddr = start; 219 vaddr = start;
209 pgd_idx = pgd_index(vaddr); 220 pgd_idx = pgd_index(vaddr);
@@ -216,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
216 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); 227 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
217 pmd++, pmd_idx++) { 228 pmd++, pmd_idx++) {
218 pte = page_table_kmap_check(one_page_table_init(pmd), 229 pte = page_table_kmap_check(one_page_table_init(pmd),
219 pmd, vaddr, pte); 230 pmd, vaddr, pte, &adr);
220 231
221 vaddr += PMD_SIZE; 232 vaddr += PMD_SIZE;
222 } 233 }
@@ -310,6 +321,7 @@ repeat:
310 __pgprot(PTE_IDENT_ATTR | 321 __pgprot(PTE_IDENT_ATTR |
311 _PAGE_PSE); 322 _PAGE_PSE);
312 323
324 pfn &= PMD_MASK >> PAGE_SHIFT;
313 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + 325 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
314 PAGE_OFFSET + PAGE_SIZE-1; 326 PAGE_OFFSET + PAGE_SIZE-1;
315 327
@@ -455,9 +467,14 @@ void __init native_pagetable_init(void)
455 467
456 /* 468 /*
457 * Remove any mappings which extend past the end of physical 469 * Remove any mappings which extend past the end of physical
458 * memory from the boot time page table: 470 * memory from the boot time page table.
471 * In virtual address space, we should have at least two pages
472 * from VMALLOC_END to pkmap or fixmap, according to the VMALLOC_END
473 * definition, and max_low_pfn is set to the VMALLOC_END physical
474 * address. If the initial memory mapping has done its job correctly,
475 * ptes should be in use near max_low_pfn, or a whole pmd should be absent.
459 */ 476 */
460 for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) { 477 for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
461 va = PAGE_OFFSET + (pfn<<PAGE_SHIFT); 478 va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
462 pgd = base + pgd_index(va); 479 pgd = base + pgd_index(va);
463 if (!pgd_present(*pgd)) 480 if (!pgd_present(*pgd))
@@ -468,10 +485,19 @@ void __init native_pagetable_init(void)
468 if (!pmd_present(*pmd)) 485 if (!pmd_present(*pmd))
469 break; 486 break;
470 487
488 /* There should not be a large page here. */
489 if (pmd_large(*pmd)) {
490 pr_warn("trying to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but the pmd is a large page and has no pte!\n",
491 pfn, pmd, __pa(pmd));
492 BUG();
493 }
494
471 pte = pte_offset_kernel(pmd, va); 495 pte = pte_offset_kernel(pmd, va);
472 if (!pte_present(*pte)) 496 if (!pte_present(*pte))
473 break; 497 break;
474 498
499 printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
500 pfn, pmd, __pa(pmd), pte, __pa(pte));
475 pte_clear(NULL, va, pte); 501 pte_clear(NULL, va, pte);
476 } 502 }
477 paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); 503 paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
@@ -550,7 +576,7 @@ early_param("highmem", parse_highmem);
550 * artificially via the highmem=x boot parameter then create 576 * artificially via the highmem=x boot parameter then create
551 * it: 577 * it:
552 */ 578 */
553void __init lowmem_pfn_init(void) 579static void __init lowmem_pfn_init(void)
554{ 580{
555 /* max_low_pfn is 0, we already have early_res support */ 581 /* max_low_pfn is 0, we already have early_res support */
556 max_low_pfn = max_pfn; 582 max_low_pfn = max_pfn;
@@ -586,7 +612,7 @@ void __init lowmem_pfn_init(void)
586 * We have more RAM than fits into lowmem - we try to put it into 612 * We have more RAM than fits into lowmem - we try to put it into
587 * highmem, also taking the highmem=x boot parameter into account: 613 * highmem, also taking the highmem=x boot parameter into account:
588 */ 614 */
589void __init highmem_pfn_init(void) 615static void __init highmem_pfn_init(void)
590{ 616{
591 max_low_pfn = MAXMEM_PFN; 617 max_low_pfn = MAXMEM_PFN;
592 618
@@ -669,8 +695,6 @@ void __init setup_bootmem_allocator(void)
669 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 695 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
670 max_pfn_mapped<<PAGE_SHIFT); 696 max_pfn_mapped<<PAGE_SHIFT);
671 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); 697 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
672
673 after_bootmem = 1;
674} 698}
675 699
676/* 700/*
@@ -753,6 +777,8 @@ void __init mem_init(void)
753 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 777 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
754 reservedpages++; 778 reservedpages++;
755 779
780 after_bootmem = 1;
781
756 codesize = (unsigned long) &_etext - (unsigned long) &_text; 782 codesize = (unsigned long) &_etext - (unsigned long) &_text;
757 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 783 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
758 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 784 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
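
The init_32.c changes above replace on-demand allocation in the kmap fixup path with a count-then-batch pattern: page_table_range_init_count() predicts how many PTE pages page_table_kmap_check() will need, page_table_range_init() grabs them in a single alloc_low_pages(count) call, and the *adr cursor hands them out one PAGE_SIZE step at a time. A standalone sketch of the same pattern (all names and constants below are illustrative, not kernel APIs):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define STEP	0x200000UL	/* stands in for PMD_SIZE  */
#define PAGESZ	4096UL		/* stands in for PAGE_SIZE */

/* Illustrative stand-in for the kmap-range test done above. */
static int needs_fixup(uintptr_t vaddr)
{
	return (vaddr / STEP) % 8 == 0;
}

int main(void)
{
	uintptr_t start = 0, end = 64 * STEP, vaddr;
	unsigned long count = 0;
	char *batch, *cursor;

	/* Counting pass, like page_table_range_init_count(). */
	for (vaddr = start; vaddr < end; vaddr += STEP)
		if (needs_fixup(vaddr))
			count++;

	/* One batch allocation, like alloc_low_pages(count). */
	batch = cursor = count ? calloc(count, PAGESZ) : NULL;

	/* The fixup pass consumes pages by bumping a cursor, like *adr. */
	for (vaddr = start; vaddr < end; vaddr += STEP) {
		if (!needs_fixup(vaddr))
			continue;
		printf("fixup at %#jx uses page %p\n",
		       (uintmax_t)vaddr, (void *)cursor);
		cursor += PAGESZ;
	}

	free(batch);
	return 0;
}
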
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 287c6d6a9ef1..edaa2daf4b37 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -54,6 +54,82 @@
54#include <asm/uv/uv.h> 54#include <asm/uv/uv.h>
55#include <asm/setup.h> 55#include <asm/setup.h>
56 56
57#include "mm_internal.h"
58
59static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
60 unsigned long addr, unsigned long end)
61{
62 addr &= PMD_MASK;
63 for (; addr < end; addr += PMD_SIZE) {
64 pmd_t *pmd = pmd_page + pmd_index(addr);
65
66 if (!pmd_present(*pmd))
67 set_pmd(pmd, __pmd(addr | pmd_flag));
68 }
69}
70static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
71 unsigned long addr, unsigned long end)
72{
73 unsigned long next;
74
75 for (; addr < end; addr = next) {
76 pud_t *pud = pud_page + pud_index(addr);
77 pmd_t *pmd;
78
79 next = (addr & PUD_MASK) + PUD_SIZE;
80 if (next > end)
81 next = end;
82
83 if (pud_present(*pud)) {
84 pmd = pmd_offset(pud, 0);
85 ident_pmd_init(info->pmd_flag, pmd, addr, next);
86 continue;
87 }
88 pmd = (pmd_t *)info->alloc_pgt_page(info->context);
89 if (!pmd)
90 return -ENOMEM;
91 ident_pmd_init(info->pmd_flag, pmd, addr, next);
92 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
93 }
94
95 return 0;
96}
97
98int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
99 unsigned long addr, unsigned long end)
100{
101 unsigned long next;
102 int result;
103 int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
104
105 for (; addr < end; addr = next) {
106 pgd_t *pgd = pgd_page + pgd_index(addr) + off;
107 pud_t *pud;
108
109 next = (addr & PGDIR_MASK) + PGDIR_SIZE;
110 if (next > end)
111 next = end;
112
113 if (pgd_present(*pgd)) {
114 pud = pud_offset(pgd, 0);
115 result = ident_pud_init(info, pud, addr, next);
116 if (result)
117 return result;
118 continue;
119 }
120
121 pud = (pud_t *)info->alloc_pgt_page(info->context);
122 if (!pud)
123 return -ENOMEM;
124 result = ident_pud_init(info, pud, addr, next);
125 if (result)
126 return result;
127 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
128 }
129
130 return 0;
131}
132
57static int __init parse_direct_gbpages_off(char *arg) 133static int __init parse_direct_gbpages_off(char *arg)
58{ 134{
59 direct_gbpages = 0; 135 direct_gbpages = 0;
@@ -302,10 +378,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
302void __init cleanup_highmap(void) 378void __init cleanup_highmap(void)
303{ 379{
304 unsigned long vaddr = __START_KERNEL_map; 380 unsigned long vaddr = __START_KERNEL_map;
305 unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); 381 unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
306 unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; 382 unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
307 pmd_t *pmd = level2_kernel_pgt; 383 pmd_t *pmd = level2_kernel_pgt;
308 384
385 /*
386 * On the native path, max_pfn_mapped is not set yet;
387 * Xen has a valid max_pfn_mapped set in
388 * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
389 */
390 if (max_pfn_mapped)
391 vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
392
309 for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { 393 for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
310 if (pmd_none(*pmd)) 394 if (pmd_none(*pmd))
311 continue; 395 continue;
@@ -314,69 +398,24 @@ void __init cleanup_highmap(void)
314 } 398 }
315} 399}
316 400
317static __ref void *alloc_low_page(unsigned long *phys)
318{
319 unsigned long pfn = pgt_buf_end++;
320 void *adr;
321
322 if (after_bootmem) {
323 adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
324 *phys = __pa(adr);
325
326 return adr;
327 }
328
329 if (pfn >= pgt_buf_top)
330 panic("alloc_low_page: ran out of memory");
331
332 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
333 clear_page(adr);
334 *phys = pfn * PAGE_SIZE;
335 return adr;
336}
337
338static __ref void *map_low_page(void *virt)
339{
340 void *adr;
341 unsigned long phys, left;
342
343 if (after_bootmem)
344 return virt;
345
346 phys = __pa(virt);
347 left = phys & (PAGE_SIZE - 1);
348 adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
349 adr = (void *)(((unsigned long)adr) | left);
350
351 return adr;
352}
353
354static __ref void unmap_low_page(void *adr)
355{
356 if (after_bootmem)
357 return;
358
359 early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
360}
361
362static unsigned long __meminit 401static unsigned long __meminit
363phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, 402phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
364 pgprot_t prot) 403 pgprot_t prot)
365{ 404{
366 unsigned pages = 0; 405 unsigned long pages = 0, next;
367 unsigned long last_map_addr = end; 406 unsigned long last_map_addr = end;
368 int i; 407 int i;
369 408
370 pte_t *pte = pte_page + pte_index(addr); 409 pte_t *pte = pte_page + pte_index(addr);
371 410
372 for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { 411 for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
373 412 next = (addr & PAGE_MASK) + PAGE_SIZE;
374 if (addr >= end) { 413 if (addr >= end) {
375 if (!after_bootmem) { 414 if (!after_bootmem &&
376 for(; i < PTRS_PER_PTE; i++, pte++) 415 !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
377 set_pte(pte, __pte(0)); 416 !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
378 } 417 set_pte(pte, __pte(0));
379 break; 418 continue;
380 } 419 }
381 420
382 /* 421 /*
@@ -414,28 +453,25 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
414 int i = pmd_index(address); 453 int i = pmd_index(address);
415 454
416 for (; i < PTRS_PER_PMD; i++, address = next) { 455 for (; i < PTRS_PER_PMD; i++, address = next) {
417 unsigned long pte_phys;
418 pmd_t *pmd = pmd_page + pmd_index(address); 456 pmd_t *pmd = pmd_page + pmd_index(address);
419 pte_t *pte; 457 pte_t *pte;
420 pgprot_t new_prot = prot; 458 pgprot_t new_prot = prot;
421 459
460 next = (address & PMD_MASK) + PMD_SIZE;
422 if (address >= end) { 461 if (address >= end) {
423 if (!after_bootmem) { 462 if (!after_bootmem &&
424 for (; i < PTRS_PER_PMD; i++, pmd++) 463 !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
425 set_pmd(pmd, __pmd(0)); 464 !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
426 } 465 set_pmd(pmd, __pmd(0));
427 break; 466 continue;
428 } 467 }
429 468
430 next = (address & PMD_MASK) + PMD_SIZE;
431
432 if (pmd_val(*pmd)) { 469 if (pmd_val(*pmd)) {
433 if (!pmd_large(*pmd)) { 470 if (!pmd_large(*pmd)) {
434 spin_lock(&init_mm.page_table_lock); 471 spin_lock(&init_mm.page_table_lock);
435 pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd)); 472 pte = (pte_t *)pmd_page_vaddr(*pmd);
436 last_map_addr = phys_pte_init(pte, address, 473 last_map_addr = phys_pte_init(pte, address,
437 end, prot); 474 end, prot);
438 unmap_low_page(pte);
439 spin_unlock(&init_mm.page_table_lock); 475 spin_unlock(&init_mm.page_table_lock);
440 continue; 476 continue;
441 } 477 }
@@ -464,19 +500,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
464 pages++; 500 pages++;
465 spin_lock(&init_mm.page_table_lock); 501 spin_lock(&init_mm.page_table_lock);
466 set_pte((pte_t *)pmd, 502 set_pte((pte_t *)pmd,
467 pfn_pte(address >> PAGE_SHIFT, 503 pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
468 __pgprot(pgprot_val(prot) | _PAGE_PSE))); 504 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
469 spin_unlock(&init_mm.page_table_lock); 505 spin_unlock(&init_mm.page_table_lock);
470 last_map_addr = next; 506 last_map_addr = next;
471 continue; 507 continue;
472 } 508 }
473 509
474 pte = alloc_low_page(&pte_phys); 510 pte = alloc_low_page();
475 last_map_addr = phys_pte_init(pte, address, end, new_prot); 511 last_map_addr = phys_pte_init(pte, address, end, new_prot);
476 unmap_low_page(pte);
477 512
478 spin_lock(&init_mm.page_table_lock); 513 spin_lock(&init_mm.page_table_lock);
479 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); 514 pmd_populate_kernel(&init_mm, pmd, pte);
480 spin_unlock(&init_mm.page_table_lock); 515 spin_unlock(&init_mm.page_table_lock);
481 } 516 }
482 update_page_count(PG_LEVEL_2M, pages); 517 update_page_count(PG_LEVEL_2M, pages);
@@ -492,27 +527,24 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
492 int i = pud_index(addr); 527 int i = pud_index(addr);
493 528
494 for (; i < PTRS_PER_PUD; i++, addr = next) { 529 for (; i < PTRS_PER_PUD; i++, addr = next) {
495 unsigned long pmd_phys;
496 pud_t *pud = pud_page + pud_index(addr); 530 pud_t *pud = pud_page + pud_index(addr);
497 pmd_t *pmd; 531 pmd_t *pmd;
498 pgprot_t prot = PAGE_KERNEL; 532 pgprot_t prot = PAGE_KERNEL;
499 533
500 if (addr >= end)
501 break;
502
503 next = (addr & PUD_MASK) + PUD_SIZE; 534 next = (addr & PUD_MASK) + PUD_SIZE;
504 535 if (addr >= end) {
505 if (!after_bootmem && !e820_any_mapped(addr, next, 0)) { 536 if (!after_bootmem &&
506 set_pud(pud, __pud(0)); 537 !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
538 !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
539 set_pud(pud, __pud(0));
507 continue; 540 continue;
508 } 541 }
509 542
510 if (pud_val(*pud)) { 543 if (pud_val(*pud)) {
511 if (!pud_large(*pud)) { 544 if (!pud_large(*pud)) {
512 pmd = map_low_page(pmd_offset(pud, 0)); 545 pmd = pmd_offset(pud, 0);
513 last_map_addr = phys_pmd_init(pmd, addr, end, 546 last_map_addr = phys_pmd_init(pmd, addr, end,
514 page_size_mask, prot); 547 page_size_mask, prot);
515 unmap_low_page(pmd);
516 __flush_tlb_all(); 548 __flush_tlb_all();
517 continue; 549 continue;
518 } 550 }
@@ -541,19 +573,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
541 pages++; 573 pages++;
542 spin_lock(&init_mm.page_table_lock); 574 spin_lock(&init_mm.page_table_lock);
543 set_pte((pte_t *)pud, 575 set_pte((pte_t *)pud,
544 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 576 pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
577 PAGE_KERNEL_LARGE));
545 spin_unlock(&init_mm.page_table_lock); 578 spin_unlock(&init_mm.page_table_lock);
546 last_map_addr = next; 579 last_map_addr = next;
547 continue; 580 continue;
548 } 581 }
549 582
550 pmd = alloc_low_page(&pmd_phys); 583 pmd = alloc_low_page();
551 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, 584 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
552 prot); 585 prot);
553 unmap_low_page(pmd);
554 586
555 spin_lock(&init_mm.page_table_lock); 587 spin_lock(&init_mm.page_table_lock);
556 pud_populate(&init_mm, pud, __va(pmd_phys)); 588 pud_populate(&init_mm, pud, pmd);
557 spin_unlock(&init_mm.page_table_lock); 589 spin_unlock(&init_mm.page_table_lock);
558 } 590 }
559 __flush_tlb_all(); 591 __flush_tlb_all();
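
Across phys_pte_init(), phys_pmd_init() and phys_pud_init() above, the out-of-range branch now follows one shared rule: an entry past `end` is cleared only when neither E820_RAM nor E820_RESERVED_KERN intersects the window it covers, so tail entries that still back real memory survive. A sketch of that shared rule, with the level-specific mask passed in purely for illustration (the real code inlines it per level):

/*
 * Sketch of the rule shared by the three walkers above; level_mask is
 * PAGE_MASK, PMD_MASK or PUD_MASK depending on the walker.
 */
static bool can_clear_unused_entry(unsigned long addr, unsigned long next,
				   unsigned long level_mask)
{
	return !after_bootmem &&
	       !e820_any_mapped(addr & level_mask, next, E820_RAM) &&
	       !e820_any_mapped(addr & level_mask, next, E820_RESERVED_KERN);
}
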
@@ -578,28 +610,23 @@ kernel_physical_mapping_init(unsigned long start,
578 610
579 for (; start < end; start = next) { 611 for (; start < end; start = next) {
580 pgd_t *pgd = pgd_offset_k(start); 612 pgd_t *pgd = pgd_offset_k(start);
581 unsigned long pud_phys;
582 pud_t *pud; 613 pud_t *pud;
583 614
584 next = (start + PGDIR_SIZE) & PGDIR_MASK; 615 next = (start & PGDIR_MASK) + PGDIR_SIZE;
585 if (next > end)
586 next = end;
587 616
588 if (pgd_val(*pgd)) { 617 if (pgd_val(*pgd)) {
589 pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); 618 pud = (pud_t *)pgd_page_vaddr(*pgd);
590 last_map_addr = phys_pud_init(pud, __pa(start), 619 last_map_addr = phys_pud_init(pud, __pa(start),
591 __pa(end), page_size_mask); 620 __pa(end), page_size_mask);
592 unmap_low_page(pud);
593 continue; 621 continue;
594 } 622 }
595 623
596 pud = alloc_low_page(&pud_phys); 624 pud = alloc_low_page();
597 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), 625 last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),
598 page_size_mask); 626 page_size_mask);
599 unmap_low_page(pud);
600 627
601 spin_lock(&init_mm.page_table_lock); 628 spin_lock(&init_mm.page_table_lock);
602 pgd_populate(&init_mm, pgd, __va(pud_phys)); 629 pgd_populate(&init_mm, pgd, pud);
603 spin_unlock(&init_mm.page_table_lock); 630 spin_unlock(&init_mm.page_table_lock);
604 pgd_changed = true; 631 pgd_changed = true;
605 } 632 }
@@ -664,13 +691,11 @@ int arch_add_memory(int nid, u64 start, u64 size)
664{ 691{
665 struct pglist_data *pgdat = NODE_DATA(nid); 692 struct pglist_data *pgdat = NODE_DATA(nid);
666 struct zone *zone = pgdat->node_zones + ZONE_NORMAL; 693 struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
667 unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; 694 unsigned long start_pfn = start >> PAGE_SHIFT;
668 unsigned long nr_pages = size >> PAGE_SHIFT; 695 unsigned long nr_pages = size >> PAGE_SHIFT;
669 int ret; 696 int ret;
670 697
671 last_mapped_pfn = init_memory_mapping(start, start + size); 698 init_memory_mapping(start, start + size);
672 if (last_mapped_pfn > max_pfn_mapped)
673 max_pfn_mapped = last_mapped_pfn;
674 699
675 ret = __add_pages(nid, zone, start_pfn, nr_pages); 700 ret = __add_pages(nid, zone, start_pfn, nr_pages);
676 WARN_ON_ONCE(ret); 701 WARN_ON_ONCE(ret);
@@ -686,6 +711,16 @@ EXPORT_SYMBOL_GPL(arch_add_memory);
686 711
687static struct kcore_list kcore_vsyscall; 712static struct kcore_list kcore_vsyscall;
688 713
714static void __init register_page_bootmem_info(void)
715{
716#ifdef CONFIG_NUMA
717 int i;
718
719 for_each_online_node(i)
720 register_page_bootmem_info_node(NODE_DATA(i));
721#endif
722}
723
689void __init mem_init(void) 724void __init mem_init(void)
690{ 725{
691 long codesize, reservedpages, datasize, initsize; 726 long codesize, reservedpages, datasize, initsize;
@@ -698,11 +733,8 @@ void __init mem_init(void)
698 reservedpages = 0; 733 reservedpages = 0;
699 734
700 /* this will put all low memory onto the freelists */ 735 /* this will put all low memory onto the freelists */
701#ifdef CONFIG_NUMA 736 register_page_bootmem_info();
702 totalram_pages = numa_free_all_bootmem();
703#else
704 totalram_pages = free_all_bootmem(); 737 totalram_pages = free_all_bootmem();
705#endif
706 738
707 absent_pages = absent_pages_in_range(0, max_pfn); 739 absent_pages = absent_pages_in_range(0, max_pfn);
708 reservedpages = max_pfn - totalram_pages - absent_pages; 740 reservedpages = max_pfn - totalram_pages - absent_pages;
@@ -776,6 +808,7 @@ void mark_rodata_ro(void)
776 unsigned long end = (unsigned long) &__end_rodata_hpage_align; 808 unsigned long end = (unsigned long) &__end_rodata_hpage_align;
777 unsigned long text_end = PFN_ALIGN(&__stop___ex_table); 809 unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
778 unsigned long rodata_end = PFN_ALIGN(&__end_rodata); 810 unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
811 unsigned long all_end = PFN_ALIGN(&_end);
779 812
780 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 813 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
781 (end - start) >> 10); 814 (end - start) >> 10);
@@ -784,10 +817,10 @@ void mark_rodata_ro(void)
784 kernel_set_to_readonly = 1; 817 kernel_set_to_readonly = 1;
785 818
786 /* 819 /*
787 * The rodata section (but not the kernel text!) should also be 820 * The rodata/data/bss/brk section (but not the kernel text!)
788 * not-executable. 821 * should also be not-executable.
789 */ 822 */
790 set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); 823 set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
791 824
792 rodata_test(); 825 rodata_test();
793 826
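
kernel_ident_mapping_init() above is deliberately allocator-agnostic: everything caller-specific comes in through struct x86_mapping_info — the page-table page allocator plus its context cookie, the PMD flags, and whether the range should additionally be mirrored at the kernel direct-mapping offset. A minimal caller, modeled on the hibernate_64.c conversion later in this patch (build_ident_map() is an illustrative wrapper; error handling abbreviated):

static void *alloc_pgt_page(void *context)	/* context unused here */
{
	return (void *)get_safe_page(GFP_ATOMIC);
}

static int build_ident_map(pgd_t *pgd, unsigned long mstart, unsigned long mend)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page	= alloc_pgt_page,
		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
		.kernel_mapping	= true,	/* also map at __PAGE_OFFSET */
	};

	return kernel_ident_mapping_init(&info, pgd, mstart, mend);
}
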
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
new file mode 100644
index 000000000000..6b563a118891
--- /dev/null
+++ b/arch/x86/mm/mm_internal.h
@@ -0,0 +1,19 @@
1#ifndef __X86_MM_INTERNAL_H
2#define __X86_MM_INTERNAL_H
3
4void *alloc_low_pages(unsigned int num);
5static inline void *alloc_low_page(void)
6{
7 return alloc_low_pages(1);
8}
9
10void early_ioremap_page_table_range_init(void);
11
12unsigned long kernel_physical_mapping_init(unsigned long start,
13 unsigned long end,
14 unsigned long page_size_mask);
15void zone_sizes_init(void);
16
17extern int after_bootmem;
18
19#endif /* __X86_MM_INTERNAL_H */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 92e27119ee1a..9405ffc91502 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -10,16 +10,3 @@ void __init initmem_init(void)
10{ 10{
11 x86_numa_init(); 11 x86_numa_init();
12} 12}
13
14unsigned long __init numa_free_all_bootmem(void)
15{
16 unsigned long pages = 0;
17 int i;
18
19 for_each_online_node(i)
20 pages += free_all_bootmem_node(NODE_DATA(i));
21
22 pages += free_low_memory_core_early(MAX_NUMNODES);
23
24 return pages;
25}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6d13d2a3f825..a1b1c88f9caf 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -579,16 +579,10 @@ static int split_large_page(pte_t *kpte, unsigned long address)
579 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 579 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
580 set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); 580 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
581 581
582 if (address >= (unsigned long)__va(0) && 582 if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
583 address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT)) 583 PFN_DOWN(__pa(address)) + 1))
584 split_page_count(level); 584 split_page_count(level);
585 585
586#ifdef CONFIG_X86_64
587 if (address >= (unsigned long)__va(1UL<<32) &&
588 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
589 split_page_count(level);
590#endif
591
592 /* 586 /*
593 * Install the new, split up pagetable. 587 * Install the new, split up pagetable.
594 * 588 *
@@ -757,13 +751,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
757 unsigned long vaddr; 751 unsigned long vaddr;
758 int ret; 752 int ret;
759 753
760 if (cpa->pfn >= max_pfn_mapped) 754 if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
761 return 0; 755 return 0;
762 756
763#ifdef CONFIG_X86_64
764 if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
765 return 0;
766#endif
767 /* 757 /*
768 * No need to redo, when the primary call touched the direct 758 * No need to redo, when the primary call touched the direct
769 * mapping already: 759 * mapping already:
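
Both pageattr.c call sites above now ask a single question — is this pfn range fully inside some direct-mapped region? — instead of comparing against the old max_low_pfn_mapped/max_pfn_mapped watermarks. The helper itself lives in arch/x86/mm/init.c, which this section does not show; its semantics over the pfn_mapped[]/nr_pfn_mapped records (also walked by the hibernate code below) amount to the following sketch:

/* Sketch only: assumes the pfn_mapped[]/nr_pfn_mapped bookkeeping that
 * this series adds in arch/x86/mm/init.c (not shown in this section).
 */
static bool pfn_range_is_mapped_sketch(unsigned long start_pfn,
				       unsigned long end_pfn)
{
	int i;

	for (i = 0; i < nr_pfn_mapped; i++)
		if (start_pfn >= pfn_mapped[i].start &&
		    end_pfn <= pfn_mapped[i].end)
			return true;

	return false;
}
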
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 1b600266265e..1743c1c92411 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -835,7 +835,7 @@ void __init efi_enter_virtual_mode(void)
835 efi_memory_desc_t *md, *prev_md = NULL; 835 efi_memory_desc_t *md, *prev_md = NULL;
836 efi_status_t status; 836 efi_status_t status;
837 unsigned long size; 837 unsigned long size;
838 u64 end, systab, end_pfn; 838 u64 end, systab, start_pfn, end_pfn;
839 void *p, *va, *new_memmap = NULL; 839 void *p, *va, *new_memmap = NULL;
840 int count = 0; 840 int count = 0;
841 841
@@ -888,10 +888,9 @@ void __init efi_enter_virtual_mode(void)
888 size = md->num_pages << EFI_PAGE_SHIFT; 888 size = md->num_pages << EFI_PAGE_SHIFT;
889 end = md->phys_addr + size; 889 end = md->phys_addr + size;
890 890
891 start_pfn = PFN_DOWN(md->phys_addr);
891 end_pfn = PFN_UP(end); 892 end_pfn = PFN_UP(end);
892 if (end_pfn <= max_low_pfn_mapped 893 if (pfn_range_is_mapped(start_pfn, end_pfn)) {
893 || (end_pfn > (1UL << (32 - PAGE_SHIFT))
894 && end_pfn <= max_pfn_mapped)) {
895 va = __va(md->phys_addr); 894 va = __va(md->phys_addr);
896 895
897 if (!(md->attribute & EFI_MEMORY_WB)) 896 if (!(md->attribute & EFI_MEMORY_WB))
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 460f314d13e5..a0fde91c16cf 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -11,6 +11,8 @@
11#include <linux/gfp.h> 11#include <linux/gfp.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/suspend.h> 13#include <linux/suspend.h>
14
15#include <asm/init.h>
14#include <asm/proto.h> 16#include <asm/proto.h>
15#include <asm/page.h> 17#include <asm/page.h>
16#include <asm/pgtable.h> 18#include <asm/pgtable.h>
@@ -39,41 +41,21 @@ pgd_t *temp_level4_pgt;
39 41
40void *relocated_restore_code; 42void *relocated_restore_code;
41 43
42static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) 44static void *alloc_pgt_page(void *context)
43{ 45{
44 long i, j; 46 return (void *)get_safe_page(GFP_ATOMIC);
45
46 i = pud_index(address);
47 pud = pud + i;
48 for (; i < PTRS_PER_PUD; pud++, i++) {
49 unsigned long paddr;
50 pmd_t *pmd;
51
52 paddr = address + i*PUD_SIZE;
53 if (paddr >= end)
54 break;
55
56 pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
57 if (!pmd)
58 return -ENOMEM;
59 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
60 for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
61 unsigned long pe;
62
63 if (paddr >= end)
64 break;
65 pe = __PAGE_KERNEL_LARGE_EXEC | paddr;
66 pe &= __supported_pte_mask;
67 set_pmd(pmd, __pmd(pe));
68 }
69 }
70 return 0;
71} 47}
72 48
73static int set_up_temporary_mappings(void) 49static int set_up_temporary_mappings(void)
74{ 50{
75 unsigned long start, end, next; 51 struct x86_mapping_info info = {
76 int error; 52 .alloc_pgt_page = alloc_pgt_page,
53 .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
54 .kernel_mapping = true,
55 };
56 unsigned long mstart, mend;
57 int result;
58 int i;
77 59
78 temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); 60 temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
79 if (!temp_level4_pgt) 61 if (!temp_level4_pgt)
@@ -84,21 +66,17 @@ static int set_up_temporary_mappings(void)
84 init_level4_pgt[pgd_index(__START_KERNEL_map)]); 66 init_level4_pgt[pgd_index(__START_KERNEL_map)]);
85 67
86 /* Set up the direct mapping from scratch */ 68 /* Set up the direct mapping from scratch */
87 start = (unsigned long)pfn_to_kaddr(0); 69 for (i = 0; i < nr_pfn_mapped; i++) {
88 end = (unsigned long)pfn_to_kaddr(max_pfn); 70 mstart = pfn_mapped[i].start << PAGE_SHIFT;
89 71 mend = pfn_mapped[i].end << PAGE_SHIFT;
90 for (; start < end; start = next) { 72
91 pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); 73 result = kernel_ident_mapping_init(&info, temp_level4_pgt,
92 if (!pud) 74 mstart, mend);
93 return -ENOMEM; 75
94 next = start + PGDIR_SIZE; 76 if (result)
95 if (next > end) 77 return result;
96 next = end;
97 if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
98 return error;
99 set_pgd(temp_level4_pgt + pgd_index(start),
100 mk_kernel_pgd(__pa(pud)));
101 } 78 }
79
102 return 0; 80 return 0;
103} 81}
104 82
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 80450261215c..a44f457e70a1 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -8,9 +8,26 @@
8struct real_mode_header *real_mode_header; 8struct real_mode_header *real_mode_header;
9u32 *trampoline_cr4_features; 9u32 *trampoline_cr4_features;
10 10
11void __init setup_real_mode(void) 11void __init reserve_real_mode(void)
12{ 12{
13 phys_addr_t mem; 13 phys_addr_t mem;
14 unsigned char *base;
15 size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
16
17 /* Has to be under 1M so we can execute real-mode AP code. */
18 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
19 if (!mem)
20 panic("Cannot allocate trampoline\n");
21
22 base = __va(mem);
23 memblock_reserve(mem, size);
24 real_mode_header = (struct real_mode_header *) base;
25 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
26 base, (unsigned long long)mem, size);
27}
28
29void __init setup_real_mode(void)
30{
14 u16 real_mode_seg; 31 u16 real_mode_seg;
15 u32 *rel; 32 u32 *rel;
16 u32 count; 33 u32 count;
@@ -25,16 +42,7 @@ void __init setup_real_mode(void)
25 u64 efer; 42 u64 efer;
26#endif 43#endif
27 44
28 /* Has to be in very low memory so we can execute real-mode AP code. */ 45 base = (unsigned char *)real_mode_header;
29 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
30 if (!mem)
31 panic("Cannot allocate trampoline\n");
32
33 base = __va(mem);
34 memblock_reserve(mem, size);
35 real_mode_header = (struct real_mode_header *) base;
36 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
37 base, (unsigned long long)mem, size);
38 46
39 memcpy(base, real_mode_blob, size); 47 memcpy(base, real_mode_blob, size);
40 48
@@ -78,16 +86,18 @@ void __init setup_real_mode(void)
78 *trampoline_cr4_features = read_cr4(); 86 *trampoline_cr4_features = read_cr4();
79 87
80 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); 88 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
81 trampoline_pgd[0] = __pa_symbol(level3_ident_pgt) + _KERNPG_TABLE; 89 trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
82 trampoline_pgd[511] = __pa_symbol(level3_kernel_pgt) + _KERNPG_TABLE; 90 trampoline_pgd[511] = init_level4_pgt[511].pgd;
83#endif 91#endif
84} 92}
85 93
86/* 94/*
87 * set_real_mode_permissions() gets called very early, to guarantee the 95 * reserve_real_mode() gets called very early, to guarantee the
88 * availability of low memory. This is before the proper kernel page 96 * availability of low memory. This is before the proper kernel page
89 * tables are set up, so we cannot set page permissions in that 97 * tables are set up, so we cannot set page permissions in that
90 * function. Thus, we use an arch_initcall instead. 98 * function. Also, the trampoline code will be executed by APs, so it
99 * must be marked executable no later than do_pre_smp_initcalls();
100 * thus we run this as an early_initcall().
91 */ 101 */
92static int __init set_real_mode_permissions(void) 102static int __init set_real_mode_permissions(void)
93{ 103{
@@ -111,5 +121,4 @@ static int __init set_real_mode_permissions(void)
111 121
112 return 0; 122 return 0;
113} 123}
114 124early_initcall(set_real_mode_permissions);
115arch_initcall(set_real_mode_permissions);
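
The net effect of the realmode/init.c change is a three-stage sequence in place of the old two-stage one: reserve the sub-1M region while memblock can still guarantee it, populate it once the final page tables exist, and make it executable before any AP runs. As an ordering sketch of what this patch implies (the actual call sites live in setup_arch() and the initcall machinery, outside this hunk):

/* Ordering sketch only; not a real function in this patch. */
static void realmode_boot_order_sketch(void)
{
	reserve_real_mode();		/* very early: reserve memory under 1M */
	/* ... direct mapping / init_level4_pgt set up in between ... */
	setup_real_mode();		/* copy the blob, patch trampoline_pgd */
	/* ... early_initcall() phase runs via do_pre_smp_initcalls() ... */
	set_real_mode_permissions();	/* mark trampoline text executable */
}
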
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 5a1847d61930..79d67bd507fa 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -814,12 +814,14 @@ int main(int argc, char **argv)
814 read_relocs(fp); 814 read_relocs(fp);
815 if (show_absolute_syms) { 815 if (show_absolute_syms) {
816 print_absolute_symbols(); 816 print_absolute_symbols();
817 return 0; 817 goto out;
818 } 818 }
819 if (show_absolute_relocs) { 819 if (show_absolute_relocs) {
820 print_absolute_relocs(); 820 print_absolute_relocs();
821 return 0; 821 goto out;
822 } 822 }
823 emit_relocs(as_text, use_real_mode); 823 emit_relocs(as_text, use_real_mode);
824out:
825 fclose(fp);
824 return 0; 826 return 0;
825} 827}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 01de35c77221..f5e86eee4e0e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1178,20 +1178,6 @@ static void xen_exit_mmap(struct mm_struct *mm)
1178 1178
1179static void xen_post_allocator_init(void); 1179static void xen_post_allocator_init(void);
1180 1180
1181static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
1182{
1183 /* reserve the range used */
1184 native_pagetable_reserve(start, end);
1185
1186 /* set as RW the rest */
1187 printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
1188 PFN_PHYS(pgt_buf_top));
1189 while (end < PFN_PHYS(pgt_buf_top)) {
1190 make_lowmem_page_readwrite(__va(end));
1191 end += PAGE_SIZE;
1192 }
1193}
1194
1195#ifdef CONFIG_X86_64 1181#ifdef CONFIG_X86_64
1196static void __init xen_cleanhighmap(unsigned long vaddr, 1182static void __init xen_cleanhighmap(unsigned long vaddr,
1197 unsigned long vaddr_end) 1183 unsigned long vaddr_end)
@@ -1503,19 +1489,6 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1503#else /* CONFIG_X86_64 */ 1489#else /* CONFIG_X86_64 */
1504static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) 1490static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1505{ 1491{
1506 unsigned long pfn = pte_pfn(pte);
1507
1508 /*
1509 * If the new pfn is within the range of the newly allocated
1510 * kernel pagetable, and it isn't being mapped into an
1511 * early_ioremap fixmap slot as a freshly allocated page, make sure
1512 * it is RO.
1513 */
1514 if (((!is_early_ioremap_ptep(ptep) &&
1515 pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
1516 (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
1517 pte = pte_wrprotect(pte);
1518
1519 return pte; 1492 return pte;
1520} 1493}
1521#endif /* CONFIG_X86_64 */ 1494#endif /* CONFIG_X86_64 */
@@ -2197,7 +2170,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2197 2170
2198void __init xen_init_mmu_ops(void) 2171void __init xen_init_mmu_ops(void)
2199{ 2172{
2200 x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
2201 x86_init.paging.pagetable_init = xen_pagetable_init; 2173 x86_init.paging.pagetable_init = xen_pagetable_init;
2202 pv_mmu_ops = xen_mmu_ops; 2174 pv_mmu_ops = xen_mmu_ops;
2203 2175