author     Linus Torvalds <torvalds@linux-foundation.org>    2013-02-21 21:06:55 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2013-02-21 21:06:55 -0500
commit     2ef14f465b9e096531343f5b734cffc5f759f4a6 (patch)
tree       07b504d7105842a4b1a74cf1e153023a02fb9c1e /arch/x86
parent     cb715a836642e0ec69350670d1c2f800f3e2d2e4 (diff)
parent     0da3e7f526fde7a6522a3038b7ce609fc50f6707 (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm changes from Peter Anvin:
 "This is a huge set of several partly interrelated (and concurrently
  developed) changes, which is why the branch history is messier than
  one would like.

  The *really* big items are two humongous patchsets, mostly developed
  by Yinghai Lu at my request, which completely revamp the way we
  create the initial page tables.  In particular, rather than estimating
  how much memory we will need for page tables and then building them
  into that memory -- a calculation that has proven to be incredibly
  fragile -- we now build them (on 64 bits) with the aid of a
  "pseudo-linear mode" -- a #PF handler which creates temporary page
  tables on demand.

  This has several advantages:

  1. It makes it much easier to support things that need access to data
     very early (a follow-on patchset uses this to load microcode very
     early in kernel startup).

  2. It allows the kernel and all the kernel data objects to be loaded
     and run from above the 4 GB limit.  This allows kdump to work on
     very large systems.

  3. It greatly reduces the difference between Xen and native (Xen's
     equivalent of the #PF handler is the temporary page tables created
     by the domain builder), eliminating a bunch of fragile hooks.

  The patch series also gets us a bit closer to W^X.

  Additional work in this pull is the 64-bit get_user() work, which you
  were also involved with, and a bunch of cleanups/speedups to
  __phys_addr()/__pa()."

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (105 commits)
  x86, mm: Move reserving low memory later in initialization
  x86, doc: Clarify the use of asm("%edx") in uaccess.h
  x86, mm: Redesign get_user with a __builtin_choose_expr hack
  x86: Be consistent with data size in getuser.S
  x86, mm: Use a bitfield to mask nuisance get_user() warnings
  x86/kvm: Fix compile warning in kvm_register_steal_time()
  x86-32: Add support for 64bit get_user()
  x86-32, mm: Remove reference to alloc_remap()
  x86-32, mm: Remove reference to resume_map_numa_kva()
  x86-32, mm: Rip out x86_32 NUMA remapping code
  x86/numa: Use __pa_nodebug() instead
  x86: Don't panic if can not alloc buffer for swiotlb
  mm: Add alloc_bootmem_low_pages_nopanic()
  x86, 64bit, mm: hibernate use generic mapping_init
  x86, 64bit, mm: Mark data/bss/brk to nx
  x86: Merge early kernel reserve for 32bit and 64bit
  x86: Add Crash kernel low reservation
  x86, kdump: Remove crashkernel range find limit for 64bit
  memblock: Add memblock_mem_size()
  x86, boot: Not need to check setup_header version for setup_data
  ...
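[Editor's note] The "Redesign get_user with a __builtin_choose_expr hack" entry above refers to the __inttype() helper added to <asm/uaccess.h> further down in this diff: get_user() declares its temporary in a type just wide enough for the access size, so a single asm output specifier ("%edx", becoming %ecx:%edx for 8-byte values on 32-bit) works for every size. The following is a standalone userspace illustration of the type-selection trick only, not kernel code; the demo_ names are invented.

        #include <stdio.h>

        /*
         * Same idea as the kernel's __inttype() in <asm/uaccess.h> (see the
         * uaccess.h hunk below): pick "unsigned long" when the value fits in a
         * register-sized word, otherwise "unsigned long long", decided at
         * compile time via the GCC extension __builtin_choose_expr().
         */
        #define demo_inttype(x) \
                __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))

        int main(void)
        {
                int i = 0;
                long long ll = 0;

                demo_inttype(i)  a = 0;  /* unsigned long */
                demo_inttype(ll) b = 0;  /* unsigned long long on 32-bit,
                                            unsigned long on 64-bit */

                printf("a: %zu bytes, b: %zu bytes\n", sizeof(a), sizeof(b));
                return 0;
        }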
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 4
-rw-r--r--  arch/x86/boot/boot.h | 18
-rw-r--r--  arch/x86/boot/cmdline.c | 12
-rw-r--r--  arch/x86/boot/compressed/cmdline.c | 12
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 48
-rw-r--r--  arch/x86/boot/header.S | 10
-rw-r--r--  arch/x86/include/asm/init.h | 28
-rw-r--r--  arch/x86/include/asm/kexec.h | 6
-rw-r--r--  arch/x86/include/asm/mmzone_32.h | 6
-rw-r--r--  arch/x86/include/asm/numa.h | 2
-rw-r--r--  arch/x86/include/asm/numa_64.h | 6
-rw-r--r--  arch/x86/include/asm/page.h | 7
-rw-r--r--  arch/x86/include/asm/page_32.h | 1
-rw-r--r--  arch/x86/include/asm/page_64.h | 36
-rw-r--r--  arch/x86/include/asm/page_64_types.h | 22
-rw-r--r--  arch/x86/include/asm/page_types.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable.h | 17
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 5
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 4
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 4
-rw-r--r--  arch/x86/include/asm/processor.h | 1
-rw-r--r--  arch/x86/include/asm/realmode.h | 3
-rw-r--r--  arch/x86/include/asm/uaccess.h | 55
-rw-r--r--  arch/x86/include/asm/x86_init.h | 12
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 1
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 2
-rw-r--r--  arch/x86/kernel/amd_gart_64.c | 5
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c | 1
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 9
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 3
-rw-r--r--  arch/x86/kernel/e820.c | 16
-rw-r--r--  arch/x86/kernel/ftrace.c | 4
-rw-r--r--  arch/x86/kernel/head32.c | 20
-rw-r--r--  arch/x86/kernel/head64.c | 131
-rw-r--r--  arch/x86/kernel/head_64.S | 210
-rw-r--r--  arch/x86/kernel/i386_ksyms_32.c | 1
-rw-r--r--  arch/x86/kernel/kvm.c | 11
-rw-r--r--  arch/x86/kernel/kvmclock.c | 4
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 171
-rw-r--r--  arch/x86/kernel/setup.c | 260
-rw-r--r--  arch/x86/kernel/traps.c | 9
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 3
-rw-r--r--  arch/x86/kernel/x86_init.c | 4
-rw-r--r--  arch/x86/lguest/boot.c | 3
-rw-r--r--  arch/x86/lib/getuser.S | 43
-rw-r--r--  arch/x86/mm/init.c | 459
-rw-r--r--  arch/x86/mm/init_32.c | 106
-rw-r--r--  arch/x86/mm/init_64.c | 255
-rw-r--r--  arch/x86/mm/mm_internal.h | 19
-rw-r--r--  arch/x86/mm/numa.c | 32
-rw-r--r--  arch/x86/mm/numa_32.c | 161
-rw-r--r--  arch/x86/mm/numa_64.c | 13
-rw-r--r--  arch/x86/mm/numa_internal.h | 6
-rw-r--r--  arch/x86/mm/pageattr.c | 66
-rw-r--r--  arch/x86/mm/pat.c | 4
-rw-r--r--  arch/x86/mm/pgtable.c | 7
-rw-r--r--  arch/x86/mm/physaddr.c | 60
-rw-r--r--  arch/x86/platform/efi/efi.c | 11
-rw-r--r--  arch/x86/power/hibernate_32.c | 2
-rw-r--r--  arch/x86/power/hibernate_64.c | 66
-rw-r--r--  arch/x86/realmode/init.c | 49
-rw-r--r--  arch/x86/xen/mmu.c | 28
62 files changed, 1384 insertions, 1192 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b44c0b50e569..ff0e5f3c844e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1277,10 +1277,6 @@ config NODES_SHIFT
1277 Specify the maximum number of NUMA Nodes available on the target 1277 Specify the maximum number of NUMA Nodes available on the target
1278 system. Increases memory reserved to accommodate various tables. 1278 system. Increases memory reserved to accommodate various tables.
1279 1279
1280config HAVE_ARCH_ALLOC_REMAP
1281 def_bool y
1282 depends on X86_32 && NUMA
1283
1284config ARCH_HAVE_MEMORY_PRESENT 1280config ARCH_HAVE_MEMORY_PRESENT
1285 def_bool y 1281 def_bool y
1286 depends on X86_32 && DISCONTIGMEM 1282 depends on X86_32 && DISCONTIGMEM
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 18997e5a1053..5b7531966b84 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -285,16 +285,26 @@ struct biosregs {
285void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); 285void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
286 286
287/* cmdline.c */ 287/* cmdline.c */
288int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize); 288int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize);
289int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option); 289int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option);
290static inline int cmdline_find_option(const char *option, char *buffer, int bufsize) 290static inline int cmdline_find_option(const char *option, char *buffer, int bufsize)
291{ 291{
292 return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize); 292 unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
293
294 if (cmd_line_ptr >= 0x100000)
295 return -1; /* inaccessible */
296
297 return __cmdline_find_option(cmd_line_ptr, option, buffer, bufsize);
293} 298}
294 299
295static inline int cmdline_find_option_bool(const char *option) 300static inline int cmdline_find_option_bool(const char *option)
296{ 301{
297 return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option); 302 unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
303
304 if (cmd_line_ptr >= 0x100000)
305 return -1; /* inaccessible */
306
307 return __cmdline_find_option_bool(cmd_line_ptr, option);
298} 308}
299 309
300 310
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index 6b3b6f708c04..625d21b0cd3f 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -27,7 +27,7 @@ static inline int myisspace(u8 c)
27 * Returns the length of the argument (regardless of if it was 27 * Returns the length of the argument (regardless of if it was
28 * truncated to fit in the buffer), or -1 on not found. 28 * truncated to fit in the buffer), or -1 on not found.
29 */ 29 */
30int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize) 30int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize)
31{ 31{
32 addr_t cptr; 32 addr_t cptr;
33 char c; 33 char c;
@@ -41,8 +41,8 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int
41 st_bufcpy /* Copying this to buffer */ 41 st_bufcpy /* Copying this to buffer */
42 } state = st_wordstart; 42 } state = st_wordstart;
43 43
44 if (!cmdline_ptr || cmdline_ptr >= 0x100000) 44 if (!cmdline_ptr)
45 return -1; /* No command line, or inaccessible */ 45 return -1; /* No command line */
46 46
47 cptr = cmdline_ptr & 0xf; 47 cptr = cmdline_ptr & 0xf;
48 set_fs(cmdline_ptr >> 4); 48 set_fs(cmdline_ptr >> 4);
@@ -99,7 +99,7 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int
99 * Returns the position of that option (starts counting with 1) 99 * Returns the position of that option (starts counting with 1)
100 * or 0 on not found 100 * or 0 on not found
101 */ 101 */
102int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option) 102int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option)
103{ 103{
104 addr_t cptr; 104 addr_t cptr;
105 char c; 105 char c;
@@ -111,8 +111,8 @@ int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
111 st_wordskip, /* Miscompare, skip */ 111 st_wordskip, /* Miscompare, skip */
112 } state = st_wordstart; 112 } state = st_wordstart;
113 113
114 if (!cmdline_ptr || cmdline_ptr >= 0x100000) 114 if (!cmdline_ptr)
115 return -1; /* No command line, or inaccessible */ 115 return -1; /* No command line */
116 116
117 cptr = cmdline_ptr & 0xf; 117 cptr = cmdline_ptr & 0xf;
118 set_fs(cmdline_ptr >> 4); 118 set_fs(cmdline_ptr >> 4);
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index 10f6b1178c68..bffd73b45b1f 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -13,13 +13,21 @@ static inline char rdfs8(addr_t addr)
13 return *((char *)(fs + addr)); 13 return *((char *)(fs + addr));
14} 14}
15#include "../cmdline.c" 15#include "../cmdline.c"
16static unsigned long get_cmd_line_ptr(void)
17{
18 unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr;
19
20 cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32;
21
22 return cmd_line_ptr;
23}
16int cmdline_find_option(const char *option, char *buffer, int bufsize) 24int cmdline_find_option(const char *option, char *buffer, int bufsize)
17{ 25{
18 return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize); 26 return __cmdline_find_option(get_cmd_line_ptr(), option, buffer, bufsize);
19} 27}
20int cmdline_find_option_bool(const char *option) 28int cmdline_find_option_bool(const char *option)
21{ 29{
22 return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option); 30 return __cmdline_find_option_bool(get_cmd_line_ptr(), option);
23} 31}
24 32
25#endif 33#endif
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index f5d1aaa0dec8..c1d383d1fb7e 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -37,6 +37,12 @@
37 __HEAD 37 __HEAD
38 .code32 38 .code32
39ENTRY(startup_32) 39ENTRY(startup_32)
40 /*
41 * 32bit entry is 0 and it is ABI so immutable!
42 * If we come here directly from a bootloader,
43 * kernel(text+data+bss+brk) ramdisk, zero_page, command line
44 * all need to be under the 4G limit.
45 */
40 cld 46 cld
41 /* 47 /*
42 * Test KEEP_SEGMENTS flag to see if the bootloader is asking 48 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
@@ -154,6 +160,12 @@ ENTRY(startup_32)
154 btsl $_EFER_LME, %eax 160 btsl $_EFER_LME, %eax
155 wrmsr 161 wrmsr
156 162
163 /* After gdt is loaded */
164 xorl %eax, %eax
165 lldt %ax
166 movl $0x20, %eax
167 ltr %ax
168
157 /* 169 /*
158 * Setup for the jump to 64bit mode 170 * Setup for the jump to 64bit mode
159 * 171 *
@@ -176,28 +188,18 @@ ENTRY(startup_32)
176 lret 188 lret
177ENDPROC(startup_32) 189ENDPROC(startup_32)
178 190
179no_longmode:
180 /* This isn't an x86-64 CPU so hang */
1811:
182 hlt
183 jmp 1b
184
185#include "../../kernel/verify_cpu.S"
186
187 /*
188 * Be careful here startup_64 needs to be at a predictable
189 * address so I can export it in an ELF header. Bootloaders
190 * should look at the ELF header to find this address, as
191 * it may change in the future.
192 */
193 .code64 191 .code64
194 .org 0x200 192 .org 0x200
195ENTRY(startup_64) 193ENTRY(startup_64)
196 /* 194 /*
195 * 64bit entry is 0x200 and it is ABI so immutable!
197 * We come here either from startup_32 or directly from a 196 * We come here either from startup_32 or directly from a
198 * 64bit bootloader. If we come here from a bootloader we depend on 197 * 64bit bootloader.
199 * an identity mapped page table being provied that maps our 198 * If we come here from a bootloader, kernel(text+data+bss+brk),
200 * entire text+data+bss and hopefully all of memory. 199 * ramdisk, zero_page, command line could be above 4G.
200 * We depend on an identity mapped page table being provided
201 * that maps our entire kernel(text+data+bss+brk), zero page
202 * and command line.
201 */ 203 */
202#ifdef CONFIG_EFI_STUB 204#ifdef CONFIG_EFI_STUB
203 /* 205 /*
@@ -247,9 +249,6 @@ preferred_addr:
247 movl %eax, %ss 249 movl %eax, %ss
248 movl %eax, %fs 250 movl %eax, %fs
249 movl %eax, %gs 251 movl %eax, %gs
250 lldt %ax
251 movl $0x20, %eax
252 ltr %ax
253 252
254 /* 253 /*
255 * Compute the decompressed kernel start address. It is where 254 * Compute the decompressed kernel start address. It is where
@@ -349,6 +348,15 @@ relocated:
349 */ 348 */
350 jmp *%rbp 349 jmp *%rbp
351 350
351 .code32
352no_longmode:
353 /* This isn't an x86-64 CPU so hang */
3541:
355 hlt
356 jmp 1b
357
358#include "../../kernel/verify_cpu.S"
359
352 .data 360 .data
353gdt: 361gdt:
354 .word gdt_end - gdt 362 .word gdt_end - gdt
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 944ce595f767..9ec06a1f6d61 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -374,6 +374,14 @@ xloadflags:
374#else 374#else
375# define XLF0 0 375# define XLF0 0
376#endif 376#endif
377
378#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64)
379 /* kernel/boot_param/ramdisk could be loaded above 4g */
380# define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G
381#else
382# define XLF1 0
383#endif
384
377#ifdef CONFIG_EFI_STUB 385#ifdef CONFIG_EFI_STUB
378# ifdef CONFIG_X86_64 386# ifdef CONFIG_X86_64
379# define XLF23 XLF_EFI_HANDOVER_64 /* 64-bit EFI handover ok */ 387# define XLF23 XLF_EFI_HANDOVER_64 /* 64-bit EFI handover ok */
@@ -383,7 +391,7 @@ xloadflags:
383#else 391#else
384# define XLF23 0 392# define XLF23 0
385#endif 393#endif
386 .word XLF0 | XLF23 394 .word XLF0 | XLF1 | XLF23
387 395
388cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, 396cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
389 #added with boot protocol 397 #added with boot protocol
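[Editor's note] The header.S hunk above advertises XLF_CAN_BE_LOADED_ABOVE_4G (bit 1 of the xloadflags field) when the kernel is relocatable and 64-bit, telling boot loaders that the kernel, ramdisk and boot_params may be placed above 4 GiB. A rough, hypothetical sketch of how a loader might test that bit follows; the demo_ names are invented, and the 0x236 field offset is my reading of the boot protocol (Documentation/x86/boot.txt), so verify it before relying on it.

        #include <stdint.h>
        #include <stdbool.h>

        /* Mirrors XLF_CAN_BE_LOADED_ABOVE_4G (bit 1) from the kernel's bootparam.h */
        #define DEMO_XLF_CAN_BE_LOADED_ABOVE_4G  (1 << 1)

        /* Assumed offset of the 16-bit xloadflags field within the bzImage
         * (boot protocol 2.12+); verify against Documentation/x86/boot.txt. */
        #define DEMO_XLOADFLAGS_OFFSET  0x236

        /*
         * 'image' points at the start of the bzImage in memory.  Returns true
         * if the kernel advertises that it, its ramdisk and boot_params may be
         * placed above the 4 GiB boundary.  A real loader would also check
         * that the boot protocol version is at least 2.12 before trusting
         * this field.
         */
        static bool demo_kernel_loadable_above_4g(const uint8_t *image)
        {
                uint16_t xloadflags =
                        (uint16_t)image[DEMO_XLOADFLAGS_OFFSET] |
                        ((uint16_t)image[DEMO_XLOADFLAGS_OFFSET + 1] << 8);

                return xloadflags & DEMO_XLF_CAN_BE_LOADED_ABOVE_4G;
        }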
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index adcc0ae73d09..223042086f4e 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -1,20 +1,14 @@
1#ifndef _ASM_X86_INIT_32_H 1#ifndef _ASM_X86_INIT_H
2#define _ASM_X86_INIT_32_H 2#define _ASM_X86_INIT_H
3 3
4#ifdef CONFIG_X86_32 4struct x86_mapping_info {
5extern void __init early_ioremap_page_table_range_init(void); 5 void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
6#endif 6 void *context; /* context for alloc_pgt_page */
7 unsigned long pmd_flag; /* page flag for PMD entry */
8 bool kernel_mapping; /* kernel mapping or ident mapping */
9};
7 10
8extern void __init zone_sizes_init(void); 11int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
12 unsigned long addr, unsigned long end);
9 13
10extern unsigned long __init 14#endif /* _ASM_X86_INIT_H */
11kernel_physical_mapping_init(unsigned long start,
12 unsigned long end,
13 unsigned long page_size_mask);
14
15
16extern unsigned long __initdata pgt_buf_start;
17extern unsigned long __meminitdata pgt_buf_end;
18extern unsigned long __meminitdata pgt_buf_top;
19
20#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 6080d2694bad..17483a492f18 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -48,11 +48,11 @@
48# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64) 48# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
49#else 49#else
50/* Maximum physical address we can use pages from */ 50/* Maximum physical address we can use pages from */
51# define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL) 51# define KEXEC_SOURCE_MEMORY_LIMIT (MAXMEM-1)
52/* Maximum address we can reach in physical address mode */ 52/* Maximum address we can reach in physical address mode */
53# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL) 53# define KEXEC_DESTINATION_MEMORY_LIMIT (MAXMEM-1)
54/* Maximum address we can use for the control pages */ 54/* Maximum address we can use for the control pages */
55# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL) 55# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
56 56
57/* Allocate one page for the pdp and the second for the code */ 57/* Allocate one page for the pdp and the second for the code */
58# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL) 58# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index eb05fb3b02fb..8a9b3e288cb4 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -14,12 +14,6 @@ extern struct pglist_data *node_data[];
14 14
15#include <asm/numaq.h> 15#include <asm/numaq.h>
16 16
17extern void resume_map_numa_kva(pgd_t *pgd);
18
19#else /* !CONFIG_NUMA */
20
21static inline void resume_map_numa_kva(pgd_t *pgd) {}
22
23#endif /* CONFIG_NUMA */ 17#endif /* CONFIG_NUMA */
24 18
25#ifdef CONFIG_DISCONTIGMEM 19#ifdef CONFIG_DISCONTIGMEM
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 49119fcea2dc..52560a2038e1 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -54,8 +54,6 @@ static inline int numa_cpu_node(int cpu)
54 54
55#ifdef CONFIG_X86_32 55#ifdef CONFIG_X86_32
56# include <asm/numa_32.h> 56# include <asm/numa_32.h>
57#else
58# include <asm/numa_64.h>
59#endif 57#endif
60 58
61#ifdef CONFIG_NUMA 59#ifdef CONFIG_NUMA
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
deleted file mode 100644
index 0c05f7ae46e8..000000000000
--- a/arch/x86/include/asm/numa_64.h
+++ /dev/null
@@ -1,6 +0,0 @@
1#ifndef _ASM_X86_NUMA_64_H
2#define _ASM_X86_NUMA_64_H
3
4extern unsigned long numa_free_all_bootmem(void);
5
6#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 8ca82839288a..c87892442e53 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -17,6 +17,10 @@
17 17
18struct page; 18struct page;
19 19
20#include <linux/range.h>
21extern struct range pfn_mapped[];
22extern int nr_pfn_mapped;
23
20static inline void clear_user_page(void *page, unsigned long vaddr, 24static inline void clear_user_page(void *page, unsigned long vaddr,
21 struct page *pg) 25 struct page *pg)
22{ 26{
@@ -44,7 +48,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
44 * case properly. Once all supported versions of gcc understand it, we can 48 * case properly. Once all supported versions of gcc understand it, we can
45 * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated) 49 * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated)
46 */ 50 */
47#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x))) 51#define __pa_symbol(x) \
52 __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))
48 53
49#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) 54#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
50 55
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index da4e762406f7..4d550d04b609 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -15,6 +15,7 @@ extern unsigned long __phys_addr(unsigned long);
15#else 15#else
16#define __phys_addr(x) __phys_addr_nodebug(x) 16#define __phys_addr(x) __phys_addr_nodebug(x)
17#endif 17#endif
18#define __phys_addr_symbol(x) __phys_addr(x)
18#define __phys_reloc_hide(x) RELOC_HIDE((x), 0) 19#define __phys_reloc_hide(x) RELOC_HIDE((x), 0)
19 20
20#ifdef CONFIG_FLATMEM 21#ifdef CONFIG_FLATMEM
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 072694ed81a5..0f1ddee6a0ce 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -3,4 +3,40 @@
3 3
4#include <asm/page_64_types.h> 4#include <asm/page_64_types.h>
5 5
6#ifndef __ASSEMBLY__
7
8/* duplicated to the one in bootmem.h */
9extern unsigned long max_pfn;
10extern unsigned long phys_base;
11
12static inline unsigned long __phys_addr_nodebug(unsigned long x)
13{
14 unsigned long y = x - __START_KERNEL_map;
15
16 /* use the carry flag to determine if x was < __START_KERNEL_map */
17 x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET));
18
19 return x;
20}
21
22#ifdef CONFIG_DEBUG_VIRTUAL
23extern unsigned long __phys_addr(unsigned long);
24extern unsigned long __phys_addr_symbol(unsigned long);
25#else
26#define __phys_addr(x) __phys_addr_nodebug(x)
27#define __phys_addr_symbol(x) \
28 ((unsigned long)(x) - __START_KERNEL_map + phys_base)
29#endif
30
31#define __phys_reloc_hide(x) (x)
32
33#ifdef CONFIG_FLATMEM
34#define pfn_valid(pfn) ((pfn) < max_pfn)
35#endif
36
37void clear_page(void *page);
38void copy_page(void *to, void *from);
39
40#endif /* !__ASSEMBLY__ */
41
6#endif /* _ASM_X86_PAGE_64_H */ 42#endif /* _ASM_X86_PAGE_64_H */
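[Editor's note] The new __phys_addr_nodebug() above handles both kinds of kernel virtual address with a single branch-free expression: y = x - __START_KERNEL_map does not wrap for a kernel-text address, so x > y holds and phys_base is added back; for a direct-map address the subtraction wraps, x > y fails, and __START_KERNEL_map - PAGE_OFFSET is added instead, which nets out to x - PAGE_OFFSET. A small userspace sketch of the same arithmetic follows, using this era's x86-64 layout constants as illustrative values (phys_base is made up).

        #include <stdio.h>

        /* Illustrative layout constants; the real ones live in
         * asm/page_64_types.h / asm/page_types.h. */
        #define DEMO_START_KERNEL_MAP  0xffffffff80000000UL
        #define DEMO_PAGE_OFFSET       0xffff880000000000UL

        static unsigned long demo_phys_base = 0x1000000UL;  /* e.g. kernel at 16 MB */

        /* Same expression as the new __phys_addr_nodebug() in the hunk above. */
        static unsigned long demo_phys_addr_nodebug(unsigned long x)
        {
                unsigned long y = x - DEMO_START_KERNEL_MAP;

                /* x > y iff the subtraction did not wrap, i.e. x was a
                 * __START_KERNEL_map (kernel text) address. */
                x = y + ((x > y) ? demo_phys_base
                                 : (DEMO_START_KERNEL_MAP - DEMO_PAGE_OFFSET));
                return x;
        }

        int main(void)
        {
                /* A kernel-text address and a direct-map address that refer to
                 * the same physical byte when phys_base is 16 MB: */
                unsigned long text_va   = DEMO_START_KERNEL_MAP + 0x12345UL;
                unsigned long linear_va = DEMO_PAGE_OFFSET + demo_phys_base + 0x12345UL;

                printf("%#lx -> %#lx\n", text_va,   demo_phys_addr_nodebug(text_va));
                printf("%#lx -> %#lx\n", linear_va, demo_phys_addr_nodebug(linear_va));
                return 0;  /* both lines print 0x1012345 */
        }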
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 320f7bb95f76..8b491e66eaa8 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -50,26 +50,4 @@
50#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) 50#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
51#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL) 51#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
52 52
53#ifndef __ASSEMBLY__
54void clear_page(void *page);
55void copy_page(void *to, void *from);
56
57/* duplicated to the one in bootmem.h */
58extern unsigned long max_pfn;
59extern unsigned long phys_base;
60
61extern unsigned long __phys_addr(unsigned long);
62#define __phys_reloc_hide(x) (x)
63
64#define vmemmap ((struct page *)VMEMMAP_START)
65
66extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
67extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
68
69#endif /* !__ASSEMBLY__ */
70
71#ifdef CONFIG_FLATMEM
72#define pfn_valid(pfn) ((pfn) < max_pfn)
73#endif
74
75#endif /* _ASM_X86_PAGE_64_DEFS_H */ 53#endif /* _ASM_X86_PAGE_64_DEFS_H */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index e21fdd10479f..54c97879195e 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,6 +51,8 @@ static inline phys_addr_t get_max_mapped(void)
51 return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; 51 return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
52} 52}
53 53
54bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
55
54extern unsigned long init_memory_mapping(unsigned long start, 56extern unsigned long init_memory_mapping(unsigned long start,
55 unsigned long end); 57 unsigned long end);
56 58
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index fc304279b559..1e672234c4ff 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -395,6 +395,7 @@ pte_t *populate_extra_pte(unsigned long vaddr);
395 395
396#ifndef __ASSEMBLY__ 396#ifndef __ASSEMBLY__
397#include <linux/mm_types.h> 397#include <linux/mm_types.h>
398#include <linux/log2.h>
398 399
399static inline int pte_none(pte_t pte) 400static inline int pte_none(pte_t pte)
400{ 401{
@@ -620,6 +621,8 @@ static inline int pgd_none(pgd_t pgd)
620#ifndef __ASSEMBLY__ 621#ifndef __ASSEMBLY__
621 622
622extern int direct_gbpages; 623extern int direct_gbpages;
624void init_mem_mapping(void);
625void early_alloc_pgt_buf(void);
623 626
624/* local pte updates need not use xchg for locking */ 627/* local pte updates need not use xchg for locking */
625static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) 628static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
@@ -786,6 +789,20 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
786 memcpy(dst, src, count * sizeof(pgd_t)); 789 memcpy(dst, src, count * sizeof(pgd_t));
787} 790}
788 791
792#define PTE_SHIFT ilog2(PTRS_PER_PTE)
793static inline int page_level_shift(enum pg_level level)
794{
795 return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT;
796}
797static inline unsigned long page_level_size(enum pg_level level)
798{
799 return 1UL << page_level_shift(level);
800}
801static inline unsigned long page_level_mask(enum pg_level level)
802{
803 return ~(page_level_size(level) - 1);
804}
805
789/* 806/*
790 * The x86 doesn't have any external MMU info: the kernel page 807 * The x86 doesn't have any external MMU info: the kernel page
791 * tables contain all the necessary information. 808 * tables contain all the necessary information.
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 615b0c78449f..e22c1dbf7feb 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -180,6 +180,11 @@ extern void cleanup_highmap(void);
180 180
181#define __HAVE_ARCH_PTE_SAME 181#define __HAVE_ARCH_PTE_SAME
182 182
183#define vmemmap ((struct page *)VMEMMAP_START)
184
185extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
186extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
187
183#endif /* !__ASSEMBLY__ */ 188#endif /* !__ASSEMBLY__ */
184 189
185#endif /* _ASM_X86_PGTABLE_64_H */ 190#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 766ea16fbbbd..2d883440cb9a 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_PGTABLE_64_DEFS_H 1#ifndef _ASM_X86_PGTABLE_64_DEFS_H
2#define _ASM_X86_PGTABLE_64_DEFS_H 2#define _ASM_X86_PGTABLE_64_DEFS_H
3 3
4#include <asm/sparsemem.h>
5
4#ifndef __ASSEMBLY__ 6#ifndef __ASSEMBLY__
5#include <linux/types.h> 7#include <linux/types.h>
6 8
@@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t;
60#define MODULES_END _AC(0xffffffffff000000, UL) 62#define MODULES_END _AC(0xffffffffff000000, UL)
61#define MODULES_LEN (MODULES_END - MODULES_VADDR) 63#define MODULES_LEN (MODULES_END - MODULES_VADDR)
62 64
65#define EARLY_DYNAMIC_PAGE_TABLES 64
66
63#endif /* _ASM_X86_PGTABLE_64_DEFS_H */ 67#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 3c32db8c539d..e6423002c10b 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -321,7 +321,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
321/* Install a pte for a particular vaddr in kernel space. */ 321/* Install a pte for a particular vaddr in kernel space. */
322void set_pte_vaddr(unsigned long vaddr, pte_t pte); 322void set_pte_vaddr(unsigned long vaddr, pte_t pte);
323 323
324extern void native_pagetable_reserve(u64 start, u64 end);
325#ifdef CONFIG_X86_32 324#ifdef CONFIG_X86_32
326extern void native_pagetable_init(void); 325extern void native_pagetable_init(void);
327#else 326#else
@@ -331,7 +330,7 @@ extern void native_pagetable_init(void);
331struct seq_file; 330struct seq_file;
332extern void arch_report_meminfo(struct seq_file *m); 331extern void arch_report_meminfo(struct seq_file *m);
333 332
334enum { 333enum pg_level {
335 PG_LEVEL_NONE, 334 PG_LEVEL_NONE,
336 PG_LEVEL_4K, 335 PG_LEVEL_4K,
337 PG_LEVEL_2M, 336 PG_LEVEL_2M,
@@ -352,6 +351,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
352 * as a pte too. 351 * as a pte too.
353 */ 352 */
354extern pte_t *lookup_address(unsigned long address, unsigned int *level); 353extern pte_t *lookup_address(unsigned long address, unsigned int *level);
354extern phys_addr_t slow_virt_to_phys(void *__address);
355 355
356#endif /* !__ASSEMBLY__ */ 356#endif /* !__ASSEMBLY__ */
357 357
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index d172588efae5..8277941cbe99 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -721,6 +721,7 @@ extern void enable_sep_cpu(void);
721extern int sysenter_setup(void); 721extern int sysenter_setup(void);
722 722
723extern void early_trap_init(void); 723extern void early_trap_init(void);
724void early_trap_pf_init(void);
724 725
725/* Defined in head.S */ 726/* Defined in head.S */
726extern struct desc_ptr early_gdt_descr; 727extern struct desc_ptr early_gdt_descr;
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index fe1ec5bcd846..9c6b890d5e7a 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -58,6 +58,7 @@ extern unsigned char boot_gdt[];
58extern unsigned char secondary_startup_64[]; 58extern unsigned char secondary_startup_64[];
59#endif 59#endif
60 60
61extern void __init setup_real_mode(void); 61void reserve_real_mode(void);
62void setup_real_mode(void);
62 63
63#endif /* _ARCH_X86_REALMODE_H */ 64#endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 1709801d18ec..5ee26875baea 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -125,13 +125,12 @@ extern int __get_user_4(void);
125extern int __get_user_8(void); 125extern int __get_user_8(void);
126extern int __get_user_bad(void); 126extern int __get_user_bad(void);
127 127
128#define __get_user_x(size, ret, x, ptr) \ 128/*
129 asm volatile("call __get_user_" #size \ 129 * This is a type: either unsigned long, if the argument fits into
130 : "=a" (ret), "=d" (x) \ 130 * that type, or otherwise unsigned long long.
131 : "0" (ptr)) \ 131 */
132 132#define __inttype(x) \
133/* Careful: we have to cast the result to the type of the pointer 133__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
134 * for sign reasons */
135 134
136/** 135/**
137 * get_user: - Get a simple variable from user space. 136 * get_user: - Get a simple variable from user space.
@@ -150,38 +149,26 @@ extern int __get_user_bad(void);
150 * Returns zero on success, or -EFAULT on error. 149 * Returns zero on success, or -EFAULT on error.
151 * On error, the variable @x is set to zero. 150 * On error, the variable @x is set to zero.
152 */ 151 */
153#ifdef CONFIG_X86_32 152/*
154#define __get_user_8(__ret_gu, __val_gu, ptr) \ 153 * Careful: we have to cast the result to the type of the pointer
155 __get_user_x(X, __ret_gu, __val_gu, ptr) 154 * for sign reasons.
156#else 155 *
157#define __get_user_8(__ret_gu, __val_gu, ptr) \ 156 * The use of %edx as the register specifier is a bit of a
158 __get_user_x(8, __ret_gu, __val_gu, ptr) 157 * simplification, as gcc only cares about it as the starting point
159#endif 158 * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
160 159 * (%ecx being the next register in gcc's x86 register sequence), and
160 * %rdx on 64 bits.
161 */
161#define get_user(x, ptr) \ 162#define get_user(x, ptr) \
162({ \ 163({ \
163 int __ret_gu; \ 164 int __ret_gu; \
164 unsigned long __val_gu; \ 165 register __inttype(*(ptr)) __val_gu asm("%edx"); \
165 __chk_user_ptr(ptr); \ 166 __chk_user_ptr(ptr); \
166 might_fault(); \ 167 might_fault(); \
167 switch (sizeof(*(ptr))) { \ 168 asm volatile("call __get_user_%P3" \
168 case 1: \ 169 : "=a" (__ret_gu), "=r" (__val_gu) \
169 __get_user_x(1, __ret_gu, __val_gu, ptr); \ 170 : "0" (ptr), "i" (sizeof(*(ptr)))); \
170 break; \ 171 (x) = (__typeof__(*(ptr))) __val_gu; \
171 case 2: \
172 __get_user_x(2, __ret_gu, __val_gu, ptr); \
173 break; \
174 case 4: \
175 __get_user_x(4, __ret_gu, __val_gu, ptr); \
176 break; \
177 case 8: \
178 __get_user_8(__ret_gu, __val_gu, ptr); \
179 break; \
180 default: \
181 __get_user_x(X, __ret_gu, __val_gu, ptr); \
182 break; \
183 } \
184 (x) = (__typeof__(*(ptr)))__val_gu; \
185 __ret_gu; \ 172 __ret_gu; \
186}) 173})
187 174
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 7669941cc9d2..d8d99222b36a 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -69,17 +69,6 @@ struct x86_init_oem {
69}; 69};
70 70
71/** 71/**
72 * struct x86_init_mapping - platform specific initial kernel pagetable setup
73 * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
74 *
75 * For more details on the purpose of this hook, look in
76 * init_memory_mapping and the commit that added it.
77 */
78struct x86_init_mapping {
79 void (*pagetable_reserve)(u64 start, u64 end);
80};
81
82/**
83 * struct x86_init_paging - platform specific paging functions 72 * struct x86_init_paging - platform specific paging functions
84 * @pagetable_init: platform specific paging initialization call to setup 73 * @pagetable_init: platform specific paging initialization call to setup
85 * the kernel pagetables and prepare accessors functions. 74 * the kernel pagetables and prepare accessors functions.
@@ -136,7 +125,6 @@ struct x86_init_ops {
136 struct x86_init_mpparse mpparse; 125 struct x86_init_mpparse mpparse;
137 struct x86_init_irqs irqs; 126 struct x86_init_irqs irqs;
138 struct x86_init_oem oem; 127 struct x86_init_oem oem;
139 struct x86_init_mapping mapping;
140 struct x86_init_paging paging; 128 struct x86_init_paging paging;
141 struct x86_init_timers timers; 129 struct x86_init_timers timers;
142 struct x86_init_iommu iommu; 130 struct x86_init_iommu iommu;
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index bacf4b0d91f4..cfc755dc1607 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled);
51 51
52#ifdef CONFIG_X86_64 52#ifdef CONFIG_X86_64
53# include <asm/proto.h> 53# include <asm/proto.h>
54# include <asm/numa_64.h>
55#endif /* X86 */ 54#endif /* X86 */
56 55
57#define BAD_MADT_ENTRY(entry, end) ( \ 56#define BAD_MADT_ENTRY(entry, end) ( \
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index d5e0d717005a..0532f5d6e4ef 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -69,7 +69,7 @@ int acpi_suspend_lowlevel(void)
69 69
70#ifndef CONFIG_64BIT 70#ifndef CONFIG_64BIT
71 header->pmode_entry = (u32)&wakeup_pmode_return; 71 header->pmode_entry = (u32)&wakeup_pmode_return;
72 header->pmode_cr3 = (u32)__pa(&initial_page_table); 72 header->pmode_cr3 = (u32)__pa_symbol(initial_page_table);
73 saved_magic = 0x12345678; 73 saved_magic = 0x12345678;
74#else /* CONFIG_64BIT */ 74#else /* CONFIG_64BIT */
75#ifdef CONFIG_SMP 75#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index e66311200cbd..b574b295a2f9 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -768,10 +768,9 @@ int __init gart_iommu_init(void)
768 aper_base = info.aper_base; 768 aper_base = info.aper_base;
769 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); 769 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
770 770
771 if (end_pfn > max_low_pfn_mapped) { 771 start_pfn = PFN_DOWN(aper_base);
772 start_pfn = (aper_base>>PAGE_SHIFT); 772 if (!pfn_range_is_mapped(start_pfn, end_pfn))
773 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); 773 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
774 }
775 774
776 pr_info("PCI-DMA: using GART IOMMU.\n"); 775 pr_info("PCI-DMA: using GART IOMMU.\n");
777 iommu_size = check_iommu_size(info.aper_base, aper_size); 776 iommu_size = check_iommu_size(info.aper_base, aper_size);
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 9c2aa89a11cb..9a9110918ca7 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -28,6 +28,7 @@
28#include <asm/apic.h> 28#include <asm/apic.h>
29#include <asm/ipi.h> 29#include <asm/ipi.h>
30#include <asm/apic_flat_64.h> 30#include <asm/apic_flat_64.h>
31#include <asm/pgtable.h>
31 32
32static int numachip_system __read_mostly; 33static int numachip_system __read_mostly;
33 34
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 84bee67141ad..edd77e7508b3 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -12,7 +12,6 @@
12#include <asm/pci-direct.h> 12#include <asm/pci-direct.h>
13 13
14#ifdef CONFIG_X86_64 14#ifdef CONFIG_X86_64
15# include <asm/numa_64.h>
16# include <asm/mmconfig.h> 15# include <asm/mmconfig.h>
17# include <asm/cacheflush.h> 16# include <asm/cacheflush.h>
18#endif 17#endif
@@ -680,12 +679,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
680 * benefit in doing so. 679 * benefit in doing so.
681 */ 680 */
682 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { 681 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
682 unsigned long pfn = tseg >> PAGE_SHIFT;
683
683 printk(KERN_DEBUG "tseg: %010llx\n", tseg); 684 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
684 if ((tseg>>PMD_SHIFT) < 685 if (pfn_range_is_mapped(pfn, pfn + 1))
685 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
686 ((tseg>>PMD_SHIFT) <
687 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
688 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
689 set_memory_4k((unsigned long)__va(tseg), 1); 686 set_memory_4k((unsigned long)__va(tseg), 1);
690 } 687 }
691 } 688 }
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fcaabd0432c5..1905ce98bee0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -17,7 +17,6 @@
17 17
18#ifdef CONFIG_X86_64 18#ifdef CONFIG_X86_64
19#include <linux/topology.h> 19#include <linux/topology.h>
20#include <asm/numa_64.h>
21#endif 20#endif
22 21
23#include "cpu.h" 22#include "cpu.h"
@@ -168,7 +167,7 @@ int __cpuinit ppro_with_ram_bug(void)
168#ifdef CONFIG_X86_F00F_BUG 167#ifdef CONFIG_X86_F00F_BUG
169static void __cpuinit trap_init_f00f_bug(void) 168static void __cpuinit trap_init_f00f_bug(void)
170{ 169{
171 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); 170 __set_fixmap(FIX_F00F_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
172 171
173 /* 172 /*
174 * Update the IDT descriptor and reload the IDT so that 173 * Update the IDT descriptor and reload the IDT so that
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index df06ade26bef..d32abeabbda5 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -835,7 +835,7 @@ static int __init parse_memopt(char *p)
835} 835}
836early_param("mem", parse_memopt); 836early_param("mem", parse_memopt);
837 837
838static int __init parse_memmap_opt(char *p) 838static int __init parse_memmap_one(char *p)
839{ 839{
840 char *oldp; 840 char *oldp;
841 u64 start_at, mem_size; 841 u64 start_at, mem_size;
@@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p)
877 877
878 return *p == '\0' ? 0 : -EINVAL; 878 return *p == '\0' ? 0 : -EINVAL;
879} 879}
880static int __init parse_memmap_opt(char *str)
881{
882 while (str) {
883 char *k = strchr(str, ',');
884
885 if (k)
886 *k++ = 0;
887
888 parse_memmap_one(str);
889 str = k;
890 }
891
892 return 0;
893}
880early_param("memmap", parse_memmap_opt); 894early_param("memmap", parse_memmap_opt);
881 895
882void __init finish_e820_parsing(void) 896void __init finish_e820_parsing(void)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 1d414029f1d8..42a392a9fd02 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -89,7 +89,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
89 * kernel identity mapping to modify code. 89 * kernel identity mapping to modify code.
90 */ 90 */
91 if (within(ip, (unsigned long)_text, (unsigned long)_etext)) 91 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
92 ip = (unsigned long)__va(__pa(ip)); 92 ip = (unsigned long)__va(__pa_symbol(ip));
93 93
94 return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); 94 return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
95} 95}
@@ -279,7 +279,7 @@ static int ftrace_write(unsigned long ip, const char *val, int size)
279 * kernel identity mapping to modify code. 279 * kernel identity mapping to modify code.
280 */ 280 */
281 if (within(ip, (unsigned long)_text, (unsigned long)_etext)) 281 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
282 ip = (unsigned long)__va(__pa(ip)); 282 ip = (unsigned long)__va(__pa_symbol(ip));
283 283
284 return probe_kernel_write((void *)ip, val, size); 284 return probe_kernel_write((void *)ip, val, size);
285} 285}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 6773c918b8cc..138463a24877 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -33,20 +33,6 @@ void __init i386_start_kernel(void)
33{ 33{
34 sanitize_boot_params(&boot_params); 34 sanitize_boot_params(&boot_params);
35 35
36 memblock_reserve(__pa_symbol(&_text),
37 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
38
39#ifdef CONFIG_BLK_DEV_INITRD
40 /* Reserve INITRD */
41 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
42 /* Assume only end is not page aligned */
43 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
44 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
45 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
46 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
47 }
48#endif
49
50 /* Call the subarch specific early setup function */ 36 /* Call the subarch specific early setup function */
51 switch (boot_params.hdr.hardware_subarch) { 37 switch (boot_params.hdr.hardware_subarch) {
52 case X86_SUBARCH_MRST: 38 case X86_SUBARCH_MRST:
@@ -60,11 +46,5 @@ void __init i386_start_kernel(void)
60 break; 46 break;
61 } 47 }
62 48
63 /*
64 * At this point everything still needed from the boot loader
65 * or BIOS or kernel text should be early reserved or marked not
66 * RAM in e820. All other memory is free game.
67 */
68
69 start_kernel(); 49 start_kernel();
70} 50}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 849fc9e63c2f..57334f4cd3af 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,11 +27,81 @@
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/bootparam_utils.h> 28#include <asm/bootparam_utils.h>
29 29
30static void __init zap_identity_mappings(void) 30/*
31 * Manage page tables very early on.
32 */
33extern pgd_t early_level4_pgt[PTRS_PER_PGD];
34extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
35static unsigned int __initdata next_early_pgt = 2;
36
37/* Wipe all early page tables except for the kernel symbol map */
38static void __init reset_early_page_tables(void)
39{
40 unsigned long i;
41
42 for (i = 0; i < PTRS_PER_PGD-1; i++)
43 early_level4_pgt[i].pgd = 0;
44
45 next_early_pgt = 0;
46
47 write_cr3(__pa(early_level4_pgt));
48}
49
50/* Create a new PMD entry */
51int __init early_make_pgtable(unsigned long address)
31{ 52{
32 pgd_t *pgd = pgd_offset_k(0UL); 53 unsigned long physaddr = address - __PAGE_OFFSET;
33 pgd_clear(pgd); 54 unsigned long i;
34 __flush_tlb_all(); 55 pgdval_t pgd, *pgd_p;
56 pudval_t pud, *pud_p;
57 pmdval_t pmd, *pmd_p;
58
59 /* Invalid address or early pgt is done ? */
60 if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
61 return -1;
62
63again:
64 pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
65 pgd = *pgd_p;
66
67 /*
68 * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
69 * critical -- __PAGE_OFFSET would point us back into the dynamic
70 * range and we might end up looping forever...
71 */
72 if (pgd)
73 pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
74 else {
75 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
76 reset_early_page_tables();
77 goto again;
78 }
79
80 pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
81 for (i = 0; i < PTRS_PER_PUD; i++)
82 pud_p[i] = 0;
83 *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
84 }
85 pud_p += pud_index(address);
86 pud = *pud_p;
87
88 if (pud)
89 pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
90 else {
91 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
92 reset_early_page_tables();
93 goto again;
94 }
95
96 pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
97 for (i = 0; i < PTRS_PER_PMD; i++)
98 pmd_p[i] = 0;
99 *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
100 }
101 pmd = (physaddr & PMD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
102 pmd_p[pmd_index(address)] = pmd;
103
104 return 0;
35} 105}
36 106
37/* Don't add a printk in there. printk relies on the PDA which is not initialized 107/* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -42,14 +112,25 @@ static void __init clear_bss(void)
42 (unsigned long) __bss_stop - (unsigned long) __bss_start); 112 (unsigned long) __bss_stop - (unsigned long) __bss_start);
43} 113}
44 114
115static unsigned long get_cmd_line_ptr(void)
116{
117 unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
118
119 cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32;
120
121 return cmd_line_ptr;
122}
123
45static void __init copy_bootdata(char *real_mode_data) 124static void __init copy_bootdata(char *real_mode_data)
46{ 125{
47 char * command_line; 126 char * command_line;
127 unsigned long cmd_line_ptr;
48 128
49 memcpy(&boot_params, real_mode_data, sizeof boot_params); 129 memcpy(&boot_params, real_mode_data, sizeof boot_params);
50 sanitize_boot_params(&boot_params); 130 sanitize_boot_params(&boot_params);
51 if (boot_params.hdr.cmd_line_ptr) { 131 cmd_line_ptr = get_cmd_line_ptr();
52 command_line = __va(boot_params.hdr.cmd_line_ptr); 132 if (cmd_line_ptr) {
133 command_line = __va(cmd_line_ptr);
53 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); 134 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
54 } 135 }
55} 136}
@@ -72,14 +153,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
72 (__START_KERNEL & PGDIR_MASK))); 153 (__START_KERNEL & PGDIR_MASK)));
73 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); 154 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
74 155
156 /* Kill off the identity-map trampoline */
157 reset_early_page_tables();
158
75 /* clear bss before set_intr_gate with early_idt_handler */ 159 /* clear bss before set_intr_gate with early_idt_handler */
76 clear_bss(); 160 clear_bss();
77 161
78 /* Make NULL pointers segfault */
79 zap_identity_mappings();
80
81 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
82
83 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { 162 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
84#ifdef CONFIG_EARLY_PRINTK 163#ifdef CONFIG_EARLY_PRINTK
85 set_intr_gate(i, &early_idt_handlers[i]); 164 set_intr_gate(i, &early_idt_handlers[i]);
@@ -89,37 +168,25 @@ void __init x86_64_start_kernel(char * real_mode_data)
89 } 168 }
90 load_idt((const struct desc_ptr *)&idt_descr); 169 load_idt((const struct desc_ptr *)&idt_descr);
91 170
171 copy_bootdata(__va(real_mode_data));
172
92 if (console_loglevel == 10) 173 if (console_loglevel == 10)
93 early_printk("Kernel alive\n"); 174 early_printk("Kernel alive\n");
94 175
176 clear_page(init_level4_pgt);
177 /* set init_level4_pgt kernel high mapping*/
178 init_level4_pgt[511] = early_level4_pgt[511];
179
95 x86_64_start_reservations(real_mode_data); 180 x86_64_start_reservations(real_mode_data);
96} 181}
97 182
98void __init x86_64_start_reservations(char *real_mode_data) 183void __init x86_64_start_reservations(char *real_mode_data)
99{ 184{
100 copy_bootdata(__va(real_mode_data)); 185 /* version is always not zero if it is copied */
101 186 if (!boot_params.hdr.version)
102 memblock_reserve(__pa_symbol(&_text), 187 copy_bootdata(__va(real_mode_data));
103 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
104
105#ifdef CONFIG_BLK_DEV_INITRD
106 /* Reserve INITRD */
107 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
108 /* Assume only end is not page aligned */
109 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
110 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
111 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
112 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
113 }
114#endif
115 188
116 reserve_ebda_region(); 189 reserve_ebda_region();
117 190
118 /*
119 * At this point everything still needed from the boot loader
120 * or BIOS or kernel text should be early reserved or marked not
121 * RAM in e820. All other memory is free game.
122 */
123
124 start_kernel(); 191 start_kernel();
125} 192}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 980053c4b9cc..d94f6d68be2a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
47 .code64 47 .code64
48 .globl startup_64 48 .globl startup_64
49startup_64: 49startup_64:
50
51 /* 50 /*
52 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, 51 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
53 * and someone has loaded an identity mapped page table 52 * and someone has loaded an identity mapped page table
54 * for us. These identity mapped page tables map all of the 53 * for us. These identity mapped page tables map all of the
55 * kernel pages and possibly all of memory. 54 * kernel pages and possibly all of memory.
56 * 55 *
57 * %esi holds a physical pointer to real_mode_data. 56 * %rsi holds a physical pointer to real_mode_data.
58 * 57 *
59 * We come here either directly from a 64bit bootloader, or from 58 * We come here either directly from a 64bit bootloader, or from
60 * arch/x86_64/boot/compressed/head.S. 59 * arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
66 * tables and then reload them. 65 * tables and then reload them.
67 */ 66 */
68 67
69 /* Compute the delta between the address I am compiled to run at and the 68 /*
69 * Compute the delta between the address I am compiled to run at and the
70 * address I am actually running at. 70 * address I am actually running at.
71 */ 71 */
72 leaq _text(%rip), %rbp 72 leaq _text(%rip), %rbp
@@ -78,45 +78,62 @@ startup_64:
78 testl %eax, %eax 78 testl %eax, %eax
79 jnz bad_address 79 jnz bad_address
80 80
81 /* Is the address too large? */ 81 /*
82 leaq _text(%rip), %rdx 82 * Is the address too large?
83 movq $PGDIR_SIZE, %rax
84 cmpq %rax, %rdx
85 jae bad_address
86
87 /* Fixup the physical addresses in the page table
88 */ 83 */
89 addq %rbp, init_level4_pgt + 0(%rip) 84 leaq _text(%rip), %rax
90 addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip) 85 shrq $MAX_PHYSMEM_BITS, %rax
91 addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip) 86 jnz bad_address
92 87
93 addq %rbp, level3_ident_pgt + 0(%rip) 88 /*
89 * Fixup the physical addresses in the page table
90 */
91 addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
94 92
95 addq %rbp, level3_kernel_pgt + (510*8)(%rip) 93 addq %rbp, level3_kernel_pgt + (510*8)(%rip)
96 addq %rbp, level3_kernel_pgt + (511*8)(%rip) 94 addq %rbp, level3_kernel_pgt + (511*8)(%rip)
97 95
98 addq %rbp, level2_fixmap_pgt + (506*8)(%rip) 96 addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
99 97
100 /* Add an Identity mapping if I am above 1G */ 98 /*
99 * Set up the identity mapping for the switchover. These
100 * entries should *NOT* have the global bit set! This also
101 * creates a bunch of nonsense entries but that is fine --
102 * it avoids problems around wraparound.
103 */
101 leaq _text(%rip), %rdi 104 leaq _text(%rip), %rdi
102 andq $PMD_PAGE_MASK, %rdi 105 leaq early_level4_pgt(%rip), %rbx
103 106
104 movq %rdi, %rax 107 movq %rdi, %rax
105 shrq $PUD_SHIFT, %rax 108 shrq $PGDIR_SHIFT, %rax
106 andq $(PTRS_PER_PUD - 1), %rax
107 jz ident_complete
108 109
109 leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx 110 leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx
110 leaq level3_ident_pgt(%rip), %rbx 111 movq %rdx, 0(%rbx,%rax,8)
111 movq %rdx, 0(%rbx, %rax, 8) 112 movq %rdx, 8(%rbx,%rax,8)
112 113
114 addq $4096, %rdx
113 movq %rdi, %rax 115 movq %rdi, %rax
114 shrq $PMD_SHIFT, %rax 116 shrq $PUD_SHIFT, %rax
115 andq $(PTRS_PER_PMD - 1), %rax 117 andl $(PTRS_PER_PUD-1), %eax
116 leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx 118 movq %rdx, (4096+0)(%rbx,%rax,8)
117 leaq level2_spare_pgt(%rip), %rbx 119 movq %rdx, (4096+8)(%rbx,%rax,8)
118 movq %rdx, 0(%rbx, %rax, 8) 120
119ident_complete: 121 addq $8192, %rbx
122 movq %rdi, %rax
123 shrq $PMD_SHIFT, %rdi
124 addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
125 leaq (_end - 1)(%rip), %rcx
126 shrq $PMD_SHIFT, %rcx
127 subq %rdi, %rcx
128 incl %ecx
129
1301:
131 andq $(PTRS_PER_PMD - 1), %rdi
132 movq %rax, (%rbx,%rdi,8)
133 incq %rdi
134 addq $PMD_SIZE, %rax
135 decl %ecx
136 jnz 1b
120 137
121 /* 138 /*
122 * Fixup the kernel text+data virtual addresses. Note that 139 * Fixup the kernel text+data virtual addresses. Note that
@@ -124,7 +141,6 @@ ident_complete:
124 * cleanup_highmap() fixes this up along with the mappings 141 * cleanup_highmap() fixes this up along with the mappings
125 * beyond _end. 142 * beyond _end.
126 */ 143 */
127
128 leaq level2_kernel_pgt(%rip), %rdi 144 leaq level2_kernel_pgt(%rip), %rdi
129 leaq 4096(%rdi), %r8 145 leaq 4096(%rdi), %r8
130 /* See if it is a valid page table entry */ 146 /* See if it is a valid page table entry */
@@ -139,17 +155,14 @@ ident_complete:
139 /* Fixup phys_base */ 155 /* Fixup phys_base */
140 addq %rbp, phys_base(%rip) 156 addq %rbp, phys_base(%rip)
141 157
142 /* Due to ENTRY(), sometimes the empty space gets filled with 158 movq $(early_level4_pgt - __START_KERNEL_map), %rax
143 * zeros. Better take a jmp than relying on empty space being 159 jmp 1f
144 * filled with 0x90 (nop)
145 */
146 jmp secondary_startup_64
147ENTRY(secondary_startup_64) 160ENTRY(secondary_startup_64)
148 /* 161 /*
149 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, 162 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
150 * and someone has loaded a mapped page table. 163 * and someone has loaded a mapped page table.
151 * 164 *
152 * %esi holds a physical pointer to real_mode_data. 165 * %rsi holds a physical pointer to real_mode_data.
153 * 166 *
154 * We come here either from startup_64 (using physical addresses) 167 * We come here either from startup_64 (using physical addresses)
155 * or from trampoline.S (using virtual addresses). 168 * or from trampoline.S (using virtual addresses).
@@ -159,12 +172,14 @@ ENTRY(secondary_startup_64)
159 * after the boot processor executes this code. 172 * after the boot processor executes this code.
160 */ 173 */
161 174
175 movq $(init_level4_pgt - __START_KERNEL_map), %rax
1761:
177
162 /* Enable PAE mode and PGE */ 178 /* Enable PAE mode and PGE */
163 movl $(X86_CR4_PAE | X86_CR4_PGE), %eax 179 movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
164 movq %rax, %cr4 180 movq %rcx, %cr4
165 181
166 /* Setup early boot stage 4 level pagetables. */ 182 /* Setup early boot stage 4 level pagetables. */
167 movq $(init_level4_pgt - __START_KERNEL_map), %rax
168 addq phys_base(%rip), %rax 183 addq phys_base(%rip), %rax
169 movq %rax, %cr3 184 movq %rax, %cr3
170 185
@@ -196,7 +211,7 @@ ENTRY(secondary_startup_64)
196 movq %rax, %cr0 211 movq %rax, %cr0
197 212
198 /* Setup a boot time stack */ 213 /* Setup a boot time stack */
199 movq stack_start(%rip),%rsp 214 movq stack_start(%rip), %rsp
200 215
201 /* zero EFLAGS after setting rsp */ 216 /* zero EFLAGS after setting rsp */
202 pushq $0 217 pushq $0
@@ -236,15 +251,33 @@ ENTRY(secondary_startup_64)
236 movl initial_gs+4(%rip),%edx 251 movl initial_gs+4(%rip),%edx
237 wrmsr 252 wrmsr
238 253
239 /* esi is pointer to real mode structure with interesting info. 254 /* rsi is pointer to real mode structure with interesting info.
240 pass it to C */ 255 pass it to C */
241 movl %esi, %edi 256 movq %rsi, %rdi
242 257
243 /* Finally jump to run C code and to be on real kernel address 258 /* Finally jump to run C code and to be on real kernel address
244 * Since we are running on identity-mapped space we have to jump 259 * Since we are running on identity-mapped space we have to jump
245 * to the full 64bit address, this is only possible as indirect 260 * to the full 64bit address, this is only possible as indirect
246 * jump. In addition we need to ensure %cs is set so we make this 261 * jump. In addition we need to ensure %cs is set so we make this
247 * a far return. 262 * a far return.
263 *
264 * Note: do not change to far jump indirect with 64bit offset.
265 *
266 * AMD does not support far jump indirect with 64bit offset.
267 * AMD64 Architecture Programmer's Manual, Volume 3: states only
268 * JMP FAR mem16:16 FF /5 Far jump indirect,
269 * with the target specified by a far pointer in memory.
270 * JMP FAR mem16:32 FF /5 Far jump indirect,
271 * with the target specified by a far pointer in memory.
272 *
273 * Intel64 does support 64bit offset.
274 * Software Developer Manual Vol 2: states:
275 * FF /5 JMP m16:16 Jump far, absolute indirect,
276 * address given in m16:16
277 * FF /5 JMP m16:32 Jump far, absolute indirect,
278 * address given in m16:32.
279 * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
280 * address given in m16:64.
248 */ 281 */
249 movq initial_code(%rip),%rax 282 movq initial_code(%rip),%rax
250 pushq $0 # fake return address to stop unwinder 283 pushq $0 # fake return address to stop unwinder
@@ -270,13 +303,13 @@ ENDPROC(start_cpu0)
270 303
271 /* SMP bootup changes these two */ 304 /* SMP bootup changes these two */
272 __REFDATA 305 __REFDATA
273 .align 8 306 .balign 8
274 ENTRY(initial_code) 307 GLOBAL(initial_code)
275 .quad x86_64_start_kernel 308 .quad x86_64_start_kernel
276 ENTRY(initial_gs) 309 GLOBAL(initial_gs)
277 .quad INIT_PER_CPU_VAR(irq_stack_union) 310 .quad INIT_PER_CPU_VAR(irq_stack_union)
278 311
279 ENTRY(stack_start) 312 GLOBAL(stack_start)
280 .quad init_thread_union+THREAD_SIZE-8 313 .quad init_thread_union+THREAD_SIZE-8
281 .word 0 314 .word 0
282 __FINITDATA 315 __FINITDATA
@@ -284,7 +317,7 @@ ENDPROC(start_cpu0)
284bad_address: 317bad_address:
285 jmp bad_address 318 jmp bad_address
286 319
287 .section ".init.text","ax" 320 __INIT
288 .globl early_idt_handlers 321 .globl early_idt_handlers
289early_idt_handlers: 322early_idt_handlers:
290 # 104(%rsp) %rflags 323 # 104(%rsp) %rflags
@@ -321,14 +354,22 @@ ENTRY(early_idt_handler)
321 pushq %r11 # 0(%rsp) 354 pushq %r11 # 0(%rsp)
322 355
323 cmpl $__KERNEL_CS,96(%rsp) 356 cmpl $__KERNEL_CS,96(%rsp)
324 jne 10f 357 jne 11f
358
359 cmpl $14,72(%rsp) # Page fault?
360 jnz 10f
361 GET_CR2_INTO(%rdi) # can clobber any volatile register if pv
362 call early_make_pgtable
363 andl %eax,%eax
364 jz 20f # All good
325 365
36610:
326 leaq 88(%rsp),%rdi # Pointer to %rip 367 leaq 88(%rsp),%rdi # Pointer to %rip
327 call early_fixup_exception 368 call early_fixup_exception
328 andl %eax,%eax 369 andl %eax,%eax
329 jnz 20f # Found an exception entry 370 jnz 20f # Found an exception entry
330 371
33110: 37211:
332#ifdef CONFIG_EARLY_PRINTK 373#ifdef CONFIG_EARLY_PRINTK
333 GET_CR2_INTO(%r9) # can clobber any volatile register if pv 374 GET_CR2_INTO(%r9) # can clobber any volatile register if pv
334 movl 80(%rsp),%r8d # error code 375 movl 80(%rsp),%r8d # error code
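
The new branch above handles vector 14 by reading CR2 and calling early_make_pgtable(), so early page faults are answered by building page-table entries on demand; early_make_pgtable() itself is outside this hunk. As a loose user-space analogy of that idea only, not the kernel code, a SIGSEGV handler can mmap() the faulting page and let the instruction retry:

#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

#define PAGE_SZ 4096UL

/* Map the faulting page on demand, then return so the faulting
 * instruction is restarted; loosely analogous to creating a page-table
 * entry from the early #PF handler. */
static void on_segv(int sig, siginfo_t *si, void *ctx)
{
	uintptr_t page = (uintptr_t)si->si_addr & ~(PAGE_SZ - 1);

	(void)sig; (void)ctx;
	if (mmap((void *)page, PAGE_SZ, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED)
		_exit(1);
}

int main(void)
{
	struct sigaction sa = { 0 };
	volatile char *p = (volatile char *)0x40000000UL; /* unmapped, illustrative */

	sa.sa_sigaction = on_segv;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGSEGV, &sa, NULL);

	p[0] = 42;		/* faults once, the handler maps the page */
	printf("read back %d\n", p[0]);
	return 0;
}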
@@ -350,7 +391,7 @@ ENTRY(early_idt_handler)
3501: hlt 3911: hlt
351 jmp 1b 392 jmp 1b
352 393
35320: # Exception table entry found 39420: # Exception table entry found or page table generated
354 popq %r11 395 popq %r11
355 popq %r10 396 popq %r10
356 popq %r9 397 popq %r9
@@ -364,6 +405,8 @@ ENTRY(early_idt_handler)
364 decl early_recursion_flag(%rip) 405 decl early_recursion_flag(%rip)
365 INTERRUPT_RETURN 406 INTERRUPT_RETURN
366 407
408 __INITDATA
409
367 .balign 4 410 .balign 4
368early_recursion_flag: 411early_recursion_flag:
369 .long 0 412 .long 0
@@ -374,11 +417,10 @@ early_idt_msg:
374early_idt_ripmsg: 417early_idt_ripmsg:
375 .asciz "RIP %s\n" 418 .asciz "RIP %s\n"
376#endif /* CONFIG_EARLY_PRINTK */ 419#endif /* CONFIG_EARLY_PRINTK */
377 .previous
378 420
379#define NEXT_PAGE(name) \ 421#define NEXT_PAGE(name) \
380 .balign PAGE_SIZE; \ 422 .balign PAGE_SIZE; \
381ENTRY(name) 423GLOBAL(name)
382 424
383/* Automate the creation of 1 to 1 mapping pmd entries */ 425/* Automate the creation of 1 to 1 mapping pmd entries */
384#define PMDS(START, PERM, COUNT) \ 426#define PMDS(START, PERM, COUNT) \
@@ -388,24 +430,37 @@ ENTRY(name)
388 i = i + 1 ; \ 430 i = i + 1 ; \
389 .endr 431 .endr
390 432
433 __INITDATA
434NEXT_PAGE(early_level4_pgt)
435 .fill 511,8,0
436 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
437
438NEXT_PAGE(early_dynamic_pgts)
439 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
440
391 .data 441 .data
392 /* 442
393 * This default setting generates an ident mapping at address 0x100000 443#ifndef CONFIG_XEN
394 * and a mapping for the kernel that precisely maps virtual address
395 * 0xffffffff80000000 to physical address 0x000000. (always using
396 * 2Mbyte large pages provided by PAE mode)
397 */
398NEXT_PAGE(init_level4_pgt) 444NEXT_PAGE(init_level4_pgt)
399 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 445 .fill 512,8,0
400 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 446#else
401 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 447NEXT_PAGE(init_level4_pgt)
402 .org init_level4_pgt + L4_START_KERNEL*8, 0 448 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
449 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
450 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
451 .org init_level4_pgt + L4_START_KERNEL*8, 0
403 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 452 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
404 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 453 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
405 454
406NEXT_PAGE(level3_ident_pgt) 455NEXT_PAGE(level3_ident_pgt)
407 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 456 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
408 .fill 511,8,0 457 .fill 511, 8, 0
458NEXT_PAGE(level2_ident_pgt)
459 /* Since I easily can, map the first 1G.
460 * Don't set NX because code runs from these pages.
461 */
462 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
463#endif
409 464
410NEXT_PAGE(level3_kernel_pgt) 465NEXT_PAGE(level3_kernel_pgt)
411 .fill L3_START_KERNEL,8,0 466 .fill L3_START_KERNEL,8,0
@@ -413,21 +468,6 @@ NEXT_PAGE(level3_kernel_pgt)
413 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE 468 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
414 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE 469 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
415 470
416NEXT_PAGE(level2_fixmap_pgt)
417 .fill 506,8,0
418 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
419 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
420 .fill 5,8,0
421
422NEXT_PAGE(level1_fixmap_pgt)
423 .fill 512,8,0
424
425NEXT_PAGE(level2_ident_pgt)
426 /* Since I easily can, map the first 1G.
427 * Don't set NX because code runs from these pages.
428 */
429 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
430
431NEXT_PAGE(level2_kernel_pgt) 471NEXT_PAGE(level2_kernel_pgt)
432 /* 472 /*
433 * 512 MB kernel mapping. We spend a full page on this pagetable 473 * 512 MB kernel mapping. We spend a full page on this pagetable
@@ -442,11 +482,16 @@ NEXT_PAGE(level2_kernel_pgt)
442 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, 482 PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
443 KERNEL_IMAGE_SIZE/PMD_SIZE) 483 KERNEL_IMAGE_SIZE/PMD_SIZE)
444 484
445NEXT_PAGE(level2_spare_pgt) 485NEXT_PAGE(level2_fixmap_pgt)
446 .fill 512, 8, 0 486 .fill 506,8,0
487 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
488 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
489 .fill 5,8,0
490
491NEXT_PAGE(level1_fixmap_pgt)
492 .fill 512,8,0
447 493
448#undef PMDS 494#undef PMDS
449#undef NEXT_PAGE
450 495
451 .data 496 .data
452 .align 16 497 .align 16
@@ -472,6 +517,5 @@ ENTRY(nmi_idt_table)
472 .skip IDT_ENTRIES * 16 517 .skip IDT_ENTRIES * 16
473 518
474 __PAGE_ALIGNED_BSS 519 __PAGE_ALIGNED_BSS
475 .align PAGE_SIZE 520NEXT_PAGE(empty_zero_page)
476ENTRY(empty_zero_page)
477 .skip PAGE_SIZE 521 .skip PAGE_SIZE
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 9c3bd4a2050e..0fa69127209a 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(csum_partial_copy_generic);
26EXPORT_SYMBOL(__get_user_1); 26EXPORT_SYMBOL(__get_user_1);
27EXPORT_SYMBOL(__get_user_2); 27EXPORT_SYMBOL(__get_user_2);
28EXPORT_SYMBOL(__get_user_4); 28EXPORT_SYMBOL(__get_user_4);
29EXPORT_SYMBOL(__get_user_8);
29 30
30EXPORT_SYMBOL(__put_user_1); 31EXPORT_SYMBOL(__put_user_1);
31EXPORT_SYMBOL(__put_user_2); 32EXPORT_SYMBOL(__put_user_2);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 2b44ea5f269d..b686a904d7c3 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -297,9 +297,9 @@ static void kvm_register_steal_time(void)
297 297
298 memset(st, 0, sizeof(*st)); 298 memset(st, 0, sizeof(*st));
299 299
300 wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED)); 300 wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
301 printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n", 301 pr_info("kvm-stealtime: cpu %d, msr %llx\n",
302 cpu, __pa(st)); 302 cpu, (unsigned long long) slow_virt_to_phys(st));
303} 303}
304 304
305static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; 305static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
@@ -324,7 +324,7 @@ void __cpuinit kvm_guest_cpu_init(void)
324 return; 324 return;
325 325
326 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { 326 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
327 u64 pa = __pa(&__get_cpu_var(apf_reason)); 327 u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason));
328 328
329#ifdef CONFIG_PREEMPT 329#ifdef CONFIG_PREEMPT
330 pa |= KVM_ASYNC_PF_SEND_ALWAYS; 330 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
@@ -340,7 +340,8 @@ void __cpuinit kvm_guest_cpu_init(void)
340 /* Size alignment is implied but just to make it explicit. */ 340 /* Size alignment is implied but just to make it explicit. */
341 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); 341 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
342 __get_cpu_var(kvm_apic_eoi) = 0; 342 __get_cpu_var(kvm_apic_eoi) = 0;
343 pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; 343 pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi))
344 | KVM_MSR_ENABLED;
344 wrmsrl(MSR_KVM_PV_EOI_EN, pa); 345 wrmsrl(MSR_KVM_PV_EOI_EN, pa);
345 } 346 }
346 347
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 220a360010f8..9f966dc0b9e4 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -162,8 +162,8 @@ int kvm_register_clock(char *txt)
162 int low, high, ret; 162 int low, high, ret;
163 struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; 163 struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
164 164
165 low = (int)__pa(src) | 1; 165 low = (int)slow_virt_to_phys(src) | 1;
166 high = ((u64)__pa(src) >> 32); 166 high = ((u64)slow_virt_to_phys(src) >> 32);
167 ret = native_write_msr_safe(msr_kvm_system_time, low, high); 167 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
168 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 168 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
169 cpu, high, low, txt); 169 cpu, high, low, txt);
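
The registration above ORs the enable bit into the low bit of the physical address and splits the result into the two 32-bit halves a wrmsr takes. A small stand-alone sketch of that packing, with a made-up address value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t phys = 0x123456789000ULL;	/* made-up page-aligned address */
	uint64_t msrval = phys | 1;		/* low bit marks it enabled */
	uint32_t low = (uint32_t)msrval;	/* goes into %eax */
	uint32_t high = (uint32_t)(msrval >> 32); /* goes into %edx */

	printf("msr=%#llx low=%#x high=%#x\n",
	       (unsigned long long)msrval, (unsigned)low, (unsigned)high);
	return 0;
}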
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index b3ea9db39db6..4eabc160696f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -16,125 +16,12 @@
16#include <linux/io.h> 16#include <linux/io.h>
17#include <linux/suspend.h> 17#include <linux/suspend.h>
18 18
19#include <asm/init.h>
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/mmu_context.h> 22#include <asm/mmu_context.h>
22#include <asm/debugreg.h> 23#include <asm/debugreg.h>
23 24
24static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
25 unsigned long addr)
26{
27 pud_t *pud;
28 pmd_t *pmd;
29 struct page *page;
30 int result = -ENOMEM;
31
32 addr &= PMD_MASK;
33 pgd += pgd_index(addr);
34 if (!pgd_present(*pgd)) {
35 page = kimage_alloc_control_pages(image, 0);
36 if (!page)
37 goto out;
38 pud = (pud_t *)page_address(page);
39 clear_page(pud);
40 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
41 }
42 pud = pud_offset(pgd, addr);
43 if (!pud_present(*pud)) {
44 page = kimage_alloc_control_pages(image, 0);
45 if (!page)
46 goto out;
47 pmd = (pmd_t *)page_address(page);
48 clear_page(pmd);
49 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
50 }
51 pmd = pmd_offset(pud, addr);
52 if (!pmd_present(*pmd))
53 set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
54 result = 0;
55out:
56 return result;
57}
58
59static void init_level2_page(pmd_t *level2p, unsigned long addr)
60{
61 unsigned long end_addr;
62
63 addr &= PAGE_MASK;
64 end_addr = addr + PUD_SIZE;
65 while (addr < end_addr) {
66 set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
67 addr += PMD_SIZE;
68 }
69}
70
71static int init_level3_page(struct kimage *image, pud_t *level3p,
72 unsigned long addr, unsigned long last_addr)
73{
74 unsigned long end_addr;
75 int result;
76
77 result = 0;
78 addr &= PAGE_MASK;
79 end_addr = addr + PGDIR_SIZE;
80 while ((addr < last_addr) && (addr < end_addr)) {
81 struct page *page;
82 pmd_t *level2p;
83
84 page = kimage_alloc_control_pages(image, 0);
85 if (!page) {
86 result = -ENOMEM;
87 goto out;
88 }
89 level2p = (pmd_t *)page_address(page);
90 init_level2_page(level2p, addr);
91 set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
92 addr += PUD_SIZE;
93 }
94 /* clear the unused entries */
95 while (addr < end_addr) {
96 pud_clear(level3p++);
97 addr += PUD_SIZE;
98 }
99out:
100 return result;
101}
102
103
104static int init_level4_page(struct kimage *image, pgd_t *level4p,
105 unsigned long addr, unsigned long last_addr)
106{
107 unsigned long end_addr;
108 int result;
109
110 result = 0;
111 addr &= PAGE_MASK;
112 end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
113 while ((addr < last_addr) && (addr < end_addr)) {
114 struct page *page;
115 pud_t *level3p;
116
117 page = kimage_alloc_control_pages(image, 0);
118 if (!page) {
119 result = -ENOMEM;
120 goto out;
121 }
122 level3p = (pud_t *)page_address(page);
123 result = init_level3_page(image, level3p, addr, last_addr);
124 if (result)
125 goto out;
126 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
127 addr += PGDIR_SIZE;
128 }
129 /* clear the unused entries */
130 while (addr < end_addr) {
131 pgd_clear(level4p++);
132 addr += PGDIR_SIZE;
133 }
134out:
135 return result;
136}
137
138static void free_transition_pgtable(struct kimage *image) 25static void free_transition_pgtable(struct kimage *image)
139{ 26{
140 free_page((unsigned long)image->arch.pud); 27 free_page((unsigned long)image->arch.pud);
@@ -184,22 +71,62 @@ err:
184 return result; 71 return result;
185} 72}
186 73
74static void *alloc_pgt_page(void *data)
75{
76 struct kimage *image = (struct kimage *)data;
77 struct page *page;
78 void *p = NULL;
79
80 page = kimage_alloc_control_pages(image, 0);
81 if (page) {
82 p = page_address(page);
83 clear_page(p);
84 }
85
86 return p;
87}
187 88
188static int init_pgtable(struct kimage *image, unsigned long start_pgtable) 89static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
189{ 90{
91 struct x86_mapping_info info = {
92 .alloc_pgt_page = alloc_pgt_page,
93 .context = image,
94 .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
95 };
96 unsigned long mstart, mend;
190 pgd_t *level4p; 97 pgd_t *level4p;
191 int result; 98 int result;
99 int i;
100
192 level4p = (pgd_t *)__va(start_pgtable); 101 level4p = (pgd_t *)__va(start_pgtable);
193 result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); 102 clear_page(level4p);
194 if (result) 103 for (i = 0; i < nr_pfn_mapped; i++) {
195 return result; 104 mstart = pfn_mapped[i].start << PAGE_SHIFT;
105 mend = pfn_mapped[i].end << PAGE_SHIFT;
106
107 result = kernel_ident_mapping_init(&info,
108 level4p, mstart, mend);
109 if (result)
110 return result;
111 }
112
196 /* 113 /*
197 * image->start may be outside 0 ~ max_pfn, for example when 114 * segments's mem ranges could be outside 0 ~ max_pfn,
198 * jump back to original kernel from kexeced kernel 115 * for example when jump back to original kernel from kexeced kernel.
116 * or first kernel is booted with user mem map, and second kernel
117 * could be loaded out of that range.
199 */ 118 */
200 result = init_one_level2_page(image, level4p, image->start); 119 for (i = 0; i < image->nr_segments; i++) {
201 if (result) 120 mstart = image->segment[i].mem;
202 return result; 121 mend = mstart + image->segment[i].memsz;
122
123 result = kernel_ident_mapping_init(&info,
124 level4p, mstart, mend);
125
126 if (result)
127 return result;
128 }
129
203 return init_transition_pgtable(image, level4p); 130 return init_transition_pgtable(image, level4p);
204} 131}
205 132
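
The rewritten init_pgtable() hands kernel_ident_mapping_init() an allocator callback plus an opaque context instead of open-coding the page-table walk. A minimal user-space sketch of that callback shape, with made-up names and the walk itself stubbed out:

#include <stdio.h>
#include <stdlib.h>

struct mapping_info {
	void *(*alloc_pgt_page)(void *context);
	void *context;
};

struct pool {
	int used;
};

static void *pool_alloc(void *context)
{
	struct pool *p = context;

	p->used++;
	return calloc(1, 4096);		/* stands in for one zeroed table page */
}

static int build_mapping(struct mapping_info *info, unsigned long start,
			 unsigned long end)
{
	/* A real walker would loop over the PGD/PUD/PMD slots covering
	 * [start, end) and call alloc_pgt_page() for each missing table;
	 * here the callback is exercised just once. */
	void *table = info->alloc_pgt_page(info->context);

	(void)start;
	(void)end;
	return table ? 0 : -1;
}

int main(void)
{
	struct pool pool = { 0 };
	struct mapping_info info = {
		.alloc_pgt_page	= pool_alloc,
		.context	= &pool,
	};

	if (build_mapping(&info, 0x100000, 0x200000))
		return 1;
	printf("allocated %d page table page(s)\n", pool.used);
	return 0;
}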
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 8b24289cc10c..915f5efefcf5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -108,17 +108,16 @@
108#include <asm/topology.h> 108#include <asm/topology.h>
109#include <asm/apicdef.h> 109#include <asm/apicdef.h>
110#include <asm/amd_nb.h> 110#include <asm/amd_nb.h>
111#ifdef CONFIG_X86_64
112#include <asm/numa_64.h>
113#endif
114#include <asm/mce.h> 111#include <asm/mce.h>
115#include <asm/alternative.h> 112#include <asm/alternative.h>
116#include <asm/prom.h> 113#include <asm/prom.h>
117 114
118/* 115/*
119 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 116 * max_low_pfn_mapped: highest direct mapped pfn under 4GB
120 * The direct mapping extends to max_pfn_mapped, so that we can directly access 117 * max_pfn_mapped: highest direct mapped pfn over 4GB
121 * apertures, ACPI and other tables without having to play with fixmaps. 118 *
119 * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
120 * represented by pfn_mapped
122 */ 121 */
123unsigned long max_low_pfn_mapped; 122unsigned long max_low_pfn_mapped;
124unsigned long max_pfn_mapped; 123unsigned long max_pfn_mapped;
@@ -276,18 +275,7 @@ void * __init extend_brk(size_t size, size_t align)
276 return ret; 275 return ret;
277} 276}
278 277
279#ifdef CONFIG_X86_64 278#ifdef CONFIG_X86_32
280static void __init init_gbpages(void)
281{
282 if (direct_gbpages && cpu_has_gbpages)
283 printk(KERN_INFO "Using GB pages for direct mapping\n");
284 else
285 direct_gbpages = 0;
286}
287#else
288static inline void init_gbpages(void)
289{
290}
291static void __init cleanup_highmap(void) 279static void __init cleanup_highmap(void)
292{ 280{
293} 281}
@@ -296,8 +284,8 @@ static void __init cleanup_highmap(void)
296static void __init reserve_brk(void) 284static void __init reserve_brk(void)
297{ 285{
298 if (_brk_end > _brk_start) 286 if (_brk_end > _brk_start)
299 memblock_reserve(__pa(_brk_start), 287 memblock_reserve(__pa_symbol(_brk_start),
300 __pa(_brk_end) - __pa(_brk_start)); 288 _brk_end - _brk_start);
301 289
302 /* Mark brk area as locked down and no longer taking any 290 /* Mark brk area as locked down and no longer taking any
303 new allocations */ 291 new allocations */
@@ -306,27 +294,43 @@ static void __init reserve_brk(void)
306 294
307#ifdef CONFIG_BLK_DEV_INITRD 295#ifdef CONFIG_BLK_DEV_INITRD
308 296
297static u64 __init get_ramdisk_image(void)
298{
299 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
300
301 ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
302
303 return ramdisk_image;
304}
305static u64 __init get_ramdisk_size(void)
306{
307 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
308
309 ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
310
311 return ramdisk_size;
312}
313
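
The two helpers above widen the bootloader-supplied initrd address and size to 64 bits by pulling the high halves from the ext_* fields. A stand-alone sketch of that combination, using a made-up stand-in structure:

#include <stdint.h>
#include <stdio.h>

/* Made-up stand-in for the handful of boot_params fields used above. */
struct fake_boot_params {
	uint32_t ramdisk_image;		/* hdr.ramdisk_image: low 32 bits */
	uint32_t ramdisk_size;		/* hdr.ramdisk_size: low 32 bits */
	uint32_t ext_ramdisk_image;	/* high 32 bits */
	uint32_t ext_ramdisk_size;	/* high 32 bits */
};

static uint64_t ramdisk_image(const struct fake_boot_params *bp)
{
	return (uint64_t)bp->ext_ramdisk_image << 32 | bp->ramdisk_image;
}

int main(void)
{
	/* An initrd placed at 0x137000000, i.e. above the 4 GiB line. */
	struct fake_boot_params bp = {
		.ramdisk_image = 0x37000000,
		.ext_ramdisk_image = 0x1,
	};

	printf("initrd at %#llx\n", (unsigned long long)ramdisk_image(&bp));
	return 0;
}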
309#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) 314#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
310static void __init relocate_initrd(void) 315static void __init relocate_initrd(void)
311{ 316{
312 /* Assume only end is not page aligned */ 317 /* Assume only end is not page aligned */
313 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 318 u64 ramdisk_image = get_ramdisk_image();
314 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 319 u64 ramdisk_size = get_ramdisk_size();
315 u64 area_size = PAGE_ALIGN(ramdisk_size); 320 u64 area_size = PAGE_ALIGN(ramdisk_size);
316 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
317 u64 ramdisk_here; 321 u64 ramdisk_here;
318 unsigned long slop, clen, mapaddr; 322 unsigned long slop, clen, mapaddr;
319 char *p, *q; 323 char *p, *q;
320 324
321 /* We need to move the initrd down into lowmem */ 325 /* We need to move the initrd down into directly mapped mem */
322 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, 326 ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
323 PAGE_SIZE); 327 area_size, PAGE_SIZE);
324 328
325 if (!ramdisk_here) 329 if (!ramdisk_here)
326 panic("Cannot find place for new RAMDISK of size %lld\n", 330 panic("Cannot find place for new RAMDISK of size %lld\n",
327 ramdisk_size); 331 ramdisk_size);
328 332
329 /* Note: this includes all the lowmem currently occupied by 333 /* Note: this includes all the mem currently occupied by
330 the initrd, we rely on that fact to keep the data intact. */ 334 the initrd, we rely on that fact to keep the data intact. */
331 memblock_reserve(ramdisk_here, area_size); 335 memblock_reserve(ramdisk_here, area_size);
332 initrd_start = ramdisk_here + PAGE_OFFSET; 336 initrd_start = ramdisk_here + PAGE_OFFSET;
@@ -336,17 +340,7 @@ static void __init relocate_initrd(void)
336 340
337 q = (char *)initrd_start; 341 q = (char *)initrd_start;
338 342
339 /* Copy any lowmem portion of the initrd */ 343 /* Copy the initrd */
340 if (ramdisk_image < end_of_lowmem) {
341 clen = end_of_lowmem - ramdisk_image;
342 p = (char *)__va(ramdisk_image);
343 memcpy(q, p, clen);
344 q += clen;
345 ramdisk_image += clen;
346 ramdisk_size -= clen;
347 }
348
349 /* Copy the highmem portion of the initrd */
350 while (ramdisk_size) { 344 while (ramdisk_size) {
351 slop = ramdisk_image & ~PAGE_MASK; 345 slop = ramdisk_image & ~PAGE_MASK;
352 clen = ramdisk_size; 346 clen = ramdisk_size;
@@ -360,22 +354,35 @@ static void __init relocate_initrd(void)
360 ramdisk_image += clen; 354 ramdisk_image += clen;
361 ramdisk_size -= clen; 355 ramdisk_size -= clen;
362 } 356 }
363 /* high pages is not converted by early_res_to_bootmem */ 357
364 ramdisk_image = boot_params.hdr.ramdisk_image; 358 ramdisk_image = get_ramdisk_image();
365 ramdisk_size = boot_params.hdr.ramdisk_size; 359 ramdisk_size = get_ramdisk_size();
366 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" 360 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
367 " [mem %#010llx-%#010llx]\n", 361 " [mem %#010llx-%#010llx]\n",
368 ramdisk_image, ramdisk_image + ramdisk_size - 1, 362 ramdisk_image, ramdisk_image + ramdisk_size - 1,
369 ramdisk_here, ramdisk_here + ramdisk_size - 1); 363 ramdisk_here, ramdisk_here + ramdisk_size - 1);
370} 364}
371 365
366static void __init early_reserve_initrd(void)
367{
368 /* Assume only end is not page aligned */
369 u64 ramdisk_image = get_ramdisk_image();
370 u64 ramdisk_size = get_ramdisk_size();
371 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
372
373 if (!boot_params.hdr.type_of_loader ||
374 !ramdisk_image || !ramdisk_size)
375 return; /* No initrd provided by bootloader */
376
377 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
378}
372static void __init reserve_initrd(void) 379static void __init reserve_initrd(void)
373{ 380{
374 /* Assume only end is not page aligned */ 381 /* Assume only end is not page aligned */
375 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 382 u64 ramdisk_image = get_ramdisk_image();
376 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 383 u64 ramdisk_size = get_ramdisk_size();
377 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 384 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
378 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 385 u64 mapped_size;
379 386
380 if (!boot_params.hdr.type_of_loader || 387 if (!boot_params.hdr.type_of_loader ||
381 !ramdisk_image || !ramdisk_size) 388 !ramdisk_image || !ramdisk_size)
@@ -383,22 +390,18 @@ static void __init reserve_initrd(void)
383 390
384 initrd_start = 0; 391 initrd_start = 0;
385 392
386 if (ramdisk_size >= (end_of_lowmem>>1)) { 393 mapped_size = memblock_mem_size(max_pfn_mapped);
394 if (ramdisk_size >= (mapped_size>>1))
387 panic("initrd too large to handle, " 395 panic("initrd too large to handle, "
388 "disabling initrd (%lld needed, %lld available)\n", 396 "disabling initrd (%lld needed, %lld available)\n",
389 ramdisk_size, end_of_lowmem>>1); 397 ramdisk_size, mapped_size>>1);
390 }
391 398
392 printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, 399 printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
393 ramdisk_end - 1); 400 ramdisk_end - 1);
394 401
395 402 if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
396 if (ramdisk_end <= end_of_lowmem) { 403 PFN_DOWN(ramdisk_end))) {
397 /* All in lowmem, easy case */ 404 /* All are mapped, easy case */
398 /*
399 * don't need to reserve again, already reserved early
400 * in i386_start_kernel
401 */
402 initrd_start = ramdisk_image + PAGE_OFFSET; 405 initrd_start = ramdisk_image + PAGE_OFFSET;
403 initrd_end = initrd_start + ramdisk_size; 406 initrd_end = initrd_start + ramdisk_size;
404 return; 407 return;
@@ -409,6 +412,9 @@ static void __init reserve_initrd(void)
409 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); 412 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
410} 413}
411#else 414#else
415static void __init early_reserve_initrd(void)
416{
417}
412static void __init reserve_initrd(void) 418static void __init reserve_initrd(void)
413{ 419{
414} 420}
@@ -419,8 +425,6 @@ static void __init parse_setup_data(void)
419 struct setup_data *data; 425 struct setup_data *data;
420 u64 pa_data; 426 u64 pa_data;
421 427
422 if (boot_params.hdr.version < 0x0209)
423 return;
424 pa_data = boot_params.hdr.setup_data; 428 pa_data = boot_params.hdr.setup_data;
425 while (pa_data) { 429 while (pa_data) {
426 u32 data_len, map_len; 430 u32 data_len, map_len;
@@ -456,8 +460,6 @@ static void __init e820_reserve_setup_data(void)
456 u64 pa_data; 460 u64 pa_data;
457 int found = 0; 461 int found = 0;
458 462
459 if (boot_params.hdr.version < 0x0209)
460 return;
461 pa_data = boot_params.hdr.setup_data; 463 pa_data = boot_params.hdr.setup_data;
462 while (pa_data) { 464 while (pa_data) {
463 data = early_memremap(pa_data, sizeof(*data)); 465 data = early_memremap(pa_data, sizeof(*data));
@@ -481,8 +483,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
481 struct setup_data *data; 483 struct setup_data *data;
482 u64 pa_data; 484 u64 pa_data;
483 485
484 if (boot_params.hdr.version < 0x0209)
485 return;
486 pa_data = boot_params.hdr.setup_data; 486 pa_data = boot_params.hdr.setup_data;
487 while (pa_data) { 487 while (pa_data) {
488 data = early_memremap(pa_data, sizeof(*data)); 488 data = early_memremap(pa_data, sizeof(*data));
@@ -501,17 +501,51 @@ static void __init memblock_x86_reserve_range_setup_data(void)
501/* 501/*
502 * Keep the crash kernel below this limit. On 32 bits earlier kernels 502 * Keep the crash kernel below this limit. On 32 bits earlier kernels
503 * would limit the kernel to the low 512 MiB due to mapping restrictions. 503 * would limit the kernel to the low 512 MiB due to mapping restrictions.
504 * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
505 * limit once kexec-tools are fixed.
506 */ 504 */
507#ifdef CONFIG_X86_32 505#ifdef CONFIG_X86_32
508# define CRASH_KERNEL_ADDR_MAX (512 << 20) 506# define CRASH_KERNEL_ADDR_MAX (512 << 20)
509#else 507#else
510# define CRASH_KERNEL_ADDR_MAX (896 << 20) 508# define CRASH_KERNEL_ADDR_MAX MAXMEM
509#endif
510
511static void __init reserve_crashkernel_low(void)
512{
513#ifdef CONFIG_X86_64
514 const unsigned long long alignment = 16<<20; /* 16M */
515 unsigned long long low_base = 0, low_size = 0;
516 unsigned long total_low_mem;
517 unsigned long long base;
518 int ret;
519
520 total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT));
521 ret = parse_crashkernel_low(boot_command_line, total_low_mem,
522 &low_size, &base);
523 if (ret != 0 || low_size <= 0)
524 return;
525
526 low_base = memblock_find_in_range(low_size, (1ULL<<32),
527 low_size, alignment);
528
529 if (!low_base) {
530 pr_info("crashkernel low reservation failed - No suitable area found.\n");
531
532 return;
533 }
534
535 memblock_reserve(low_base, low_size);
536 pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n",
537 (unsigned long)(low_size >> 20),
538 (unsigned long)(low_base >> 20),
539 (unsigned long)(total_low_mem >> 20));
540 crashk_low_res.start = low_base;
541 crashk_low_res.end = low_base + low_size - 1;
542 insert_resource(&iomem_resource, &crashk_low_res);
511#endif 543#endif
544}
512 545
513static void __init reserve_crashkernel(void) 546static void __init reserve_crashkernel(void)
514{ 547{
548 const unsigned long long alignment = 16<<20; /* 16M */
515 unsigned long long total_mem; 549 unsigned long long total_mem;
516 unsigned long long crash_size, crash_base; 550 unsigned long long crash_size, crash_base;
517 int ret; 551 int ret;
@@ -525,8 +559,6 @@ static void __init reserve_crashkernel(void)
525 559
526 /* 0 means: find the address automatically */ 560 /* 0 means: find the address automatically */
527 if (crash_base <= 0) { 561 if (crash_base <= 0) {
528 const unsigned long long alignment = 16<<20; /* 16M */
529
530 /* 562 /*
531 * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX 563 * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
532 */ 564 */
@@ -537,6 +569,7 @@ static void __init reserve_crashkernel(void)
537 pr_info("crashkernel reservation failed - No suitable area found.\n"); 569 pr_info("crashkernel reservation failed - No suitable area found.\n");
538 return; 570 return;
539 } 571 }
572
540 } else { 573 } else {
541 unsigned long long start; 574 unsigned long long start;
542 575
@@ -558,6 +591,9 @@ static void __init reserve_crashkernel(void)
558 crashk_res.start = crash_base; 591 crashk_res.start = crash_base;
559 crashk_res.end = crash_base + crash_size - 1; 592 crashk_res.end = crash_base + crash_size - 1;
560 insert_resource(&iomem_resource, &crashk_res); 593 insert_resource(&iomem_resource, &crashk_res);
594
595 if (crash_base >= (1ULL<<32))
596 reserve_crashkernel_low();
561} 597}
562#else 598#else
563static void __init reserve_crashkernel(void) 599static void __init reserve_crashkernel(void)
@@ -608,8 +644,6 @@ static __init void reserve_ibft_region(void)
608 memblock_reserve(addr, size); 644 memblock_reserve(addr, size);
609} 645}
610 646
611static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
612
613static bool __init snb_gfx_workaround_needed(void) 647static bool __init snb_gfx_workaround_needed(void)
614{ 648{
615#ifdef CONFIG_PCI 649#ifdef CONFIG_PCI
@@ -698,8 +732,7 @@ static void __init trim_bios_range(void)
698 * since some BIOSes are known to corrupt low memory. See the 732 * since some BIOSes are known to corrupt low memory. See the
699 * Kconfig help text for X86_RESERVE_LOW. 733 * Kconfig help text for X86_RESERVE_LOW.
700 */ 734 */
701 e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE), 735 e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
702 E820_RAM, E820_RESERVED);
703 736
704 /* 737 /*
705 * special case: Some BIOSen report the PC BIOS 738 * special case: Some BIOSen report the PC BIOS
@@ -711,6 +744,29 @@ static void __init trim_bios_range(void)
711 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 744 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
712} 745}
713 746
747/* called before trim_bios_range() to spare extra sanitize */
748static void __init e820_add_kernel_range(void)
749{
750 u64 start = __pa_symbol(_text);
751 u64 size = __pa_symbol(_end) - start;
752
753 /*
754 * Complain if .text .data and .bss are not marked as E820_RAM and
755 * attempt to fix it by adding the range. We may have a confused BIOS,
756 * or the user may have used memmap=exactmap or memmap=xxM$yyM to
 757 * exclude the kernel range. If we really are running on top of non-RAM,
 758 * we will crash later anyway.
759 */
760 if (e820_all_mapped(start, start + size, E820_RAM))
761 return;
762
763 pr_warn(".text .data .bss are not marked as E820_RAM!\n");
764 e820_remove_range(start, size, E820_RAM, 0);
765 e820_add_region(start, size, E820_RAM);
766}
767
768static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
769
714static int __init parse_reservelow(char *p) 770static int __init parse_reservelow(char *p)
715{ 771{
716 unsigned long long size; 772 unsigned long long size;
@@ -733,6 +789,11 @@ static int __init parse_reservelow(char *p)
733 789
734early_param("reservelow", parse_reservelow); 790early_param("reservelow", parse_reservelow);
735 791
792static void __init trim_low_memory_range(void)
793{
794 memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
795}
796
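
trim_low_memory_range() rounds the requested low reservation up to a whole page before passing it to memblock_reserve(). A tiny sketch of that rounding, with illustrative sizes:

#include <stdio.h>

#define PAGE_SZ		4096UL
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* 64 KiB is already page aligned; an odd byte count rounds up to
	 * the next 4 KiB boundary before being reserved. */
	printf("%#lx\n", ALIGN_UP(64UL << 10, PAGE_SZ));	/* 0x10000 */
	printf("%#lx\n", ALIGN_UP(0x9c01UL, PAGE_SZ));		/* 0xa000 */
	return 0;
}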
736/* 797/*
737 * Determine if we were loaded by an EFI loader. If so, then we have also been 798 * Determine if we were loaded by an EFI loader. If so, then we have also been
738 * passed the efi memmap, systab, etc., so we should use these data structures 799 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -748,6 +809,17 @@ early_param("reservelow", parse_reservelow);
748 809
749void __init setup_arch(char **cmdline_p) 810void __init setup_arch(char **cmdline_p)
750{ 811{
812 memblock_reserve(__pa_symbol(_text),
813 (unsigned long)__bss_stop - (unsigned long)_text);
814
815 early_reserve_initrd();
816
817 /*
818 * At this point everything still needed from the boot loader
819 * or BIOS or kernel text should be early reserved or marked not
820 * RAM in e820. All other memory is free game.
821 */
822
751#ifdef CONFIG_X86_32 823#ifdef CONFIG_X86_32
752 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 824 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
753 visws_early_detect(); 825 visws_early_detect();
@@ -835,12 +907,12 @@ void __init setup_arch(char **cmdline_p)
835 init_mm.end_data = (unsigned long) _edata; 907 init_mm.end_data = (unsigned long) _edata;
836 init_mm.brk = _brk_end; 908 init_mm.brk = _brk_end;
837 909
838 code_resource.start = virt_to_phys(_text); 910 code_resource.start = __pa_symbol(_text);
839 code_resource.end = virt_to_phys(_etext)-1; 911 code_resource.end = __pa_symbol(_etext)-1;
840 data_resource.start = virt_to_phys(_etext); 912 data_resource.start = __pa_symbol(_etext);
841 data_resource.end = virt_to_phys(_edata)-1; 913 data_resource.end = __pa_symbol(_edata)-1;
842 bss_resource.start = virt_to_phys(&__bss_start); 914 bss_resource.start = __pa_symbol(__bss_start);
843 bss_resource.end = virt_to_phys(&__bss_stop)-1; 915 bss_resource.end = __pa_symbol(__bss_stop)-1;
844 916
845#ifdef CONFIG_CMDLINE_BOOL 917#ifdef CONFIG_CMDLINE_BOOL
846#ifdef CONFIG_CMDLINE_OVERRIDE 918#ifdef CONFIG_CMDLINE_OVERRIDE
@@ -906,6 +978,7 @@ void __init setup_arch(char **cmdline_p)
906 insert_resource(&iomem_resource, &data_resource); 978 insert_resource(&iomem_resource, &data_resource);
907 insert_resource(&iomem_resource, &bss_resource); 979 insert_resource(&iomem_resource, &bss_resource);
908 980
981 e820_add_kernel_range();
909 trim_bios_range(); 982 trim_bios_range();
910#ifdef CONFIG_X86_32 983#ifdef CONFIG_X86_32
911 if (ppro_with_ram_bug()) { 984 if (ppro_with_ram_bug()) {
@@ -955,6 +1028,8 @@ void __init setup_arch(char **cmdline_p)
955 1028
956 reserve_ibft_region(); 1029 reserve_ibft_region();
957 1030
1031 early_alloc_pgt_buf();
1032
958 /* 1033 /*
959 * Need to conclude brk, before memblock_x86_fill() 1034 * Need to conclude brk, before memblock_x86_fill()
960 * it could use memblock_find_in_range, could overlap with 1035 * it could use memblock_find_in_range, could overlap with
@@ -964,7 +1039,7 @@ void __init setup_arch(char **cmdline_p)
964 1039
965 cleanup_highmap(); 1040 cleanup_highmap();
966 1041
967 memblock.current_limit = get_max_mapped(); 1042 memblock.current_limit = ISA_END_ADDRESS;
968 memblock_x86_fill(); 1043 memblock_x86_fill();
969 1044
970 /* 1045 /*
@@ -981,41 +1056,22 @@ void __init setup_arch(char **cmdline_p)
981 setup_bios_corruption_check(); 1056 setup_bios_corruption_check();
982#endif 1057#endif
983 1058
1059#ifdef CONFIG_X86_32
984 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", 1060 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
985 (max_pfn_mapped<<PAGE_SHIFT) - 1); 1061 (max_pfn_mapped<<PAGE_SHIFT) - 1);
1062#endif
986 1063
987 setup_real_mode(); 1064 reserve_real_mode();
988 1065
989 trim_platform_memory_ranges(); 1066 trim_platform_memory_ranges();
1067 trim_low_memory_range();
990 1068
991 init_gbpages(); 1069 init_mem_mapping();
992
993 /* max_pfn_mapped is updated here */
994 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
995 max_pfn_mapped = max_low_pfn_mapped;
996
997#ifdef CONFIG_X86_64
998 if (max_pfn > max_low_pfn) {
999 int i;
1000 unsigned long start, end;
1001 unsigned long start_pfn, end_pfn;
1002
1003 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn,
1004 NULL) {
1005 1070
1006 end = PFN_PHYS(end_pfn); 1071 early_trap_pf_init();
1007 if (end <= (1UL<<32))
1008 continue;
1009 1072
1010 start = PFN_PHYS(start_pfn); 1073 setup_real_mode();
1011 max_pfn_mapped = init_memory_mapping(
1012 max((1UL<<32), start), end);
1013 }
1014 1074
1015 /* can we preseve max_low_pfn ?*/
1016 max_low_pfn = max_pfn;
1017 }
1018#endif
1019 memblock.current_limit = get_max_mapped(); 1075 memblock.current_limit = get_max_mapped();
1020 dma_contiguous_reserve(0); 1076 dma_contiguous_reserve(0);
1021 1077
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ecffca11f4e9..68bda7a84159 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -688,10 +688,19 @@ void __init early_trap_init(void)
688 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); 688 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
689 /* int3 can be called from all */ 689 /* int3 can be called from all */
690 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); 690 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
691#ifdef CONFIG_X86_32
691 set_intr_gate(X86_TRAP_PF, &page_fault); 692 set_intr_gate(X86_TRAP_PF, &page_fault);
693#endif
692 load_idt(&idt_descr); 694 load_idt(&idt_descr);
693} 695}
694 696
697void __init early_trap_pf_init(void)
698{
699#ifdef CONFIG_X86_64
700 set_intr_gate(X86_TRAP_PF, &page_fault);
701#endif
702}
703
695void __init trap_init(void) 704void __init trap_init(void)
696{ 705{
697 int i; 706 int i;
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1330dd102950..b014d9414d08 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -59,6 +59,9 @@ EXPORT_SYMBOL(memcpy);
59EXPORT_SYMBOL(__memcpy); 59EXPORT_SYMBOL(__memcpy);
60EXPORT_SYMBOL(memmove); 60EXPORT_SYMBOL(memmove);
61 61
62#ifndef CONFIG_DEBUG_VIRTUAL
63EXPORT_SYMBOL(phys_base);
64#endif
62EXPORT_SYMBOL(empty_zero_page); 65EXPORT_SYMBOL(empty_zero_page);
63#ifndef CONFIG_PARAVIRT 66#ifndef CONFIG_PARAVIRT
64EXPORT_SYMBOL(native_load_gs_index); 67EXPORT_SYMBOL(native_load_gs_index);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index d065d67c2672..45a14dbbddaf 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -63,10 +63,6 @@ struct x86_init_ops x86_init __initdata = {
63 .banner = default_banner, 63 .banner = default_banner,
64 }, 64 },
65 65
66 .mapping = {
67 .pagetable_reserve = native_pagetable_reserve,
68 },
69
70 .paging = { 66 .paging = {
71 .pagetable_init = native_pagetable_init, 67 .pagetable_init = native_pagetable_init,
72 }, 68 },
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index df4176cdbb32..1cbd89ca5569 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -552,7 +552,8 @@ static void lguest_write_cr3(unsigned long cr3)
552 current_cr3 = cr3; 552 current_cr3 = cr3;
553 553
554 /* These two page tables are simple, linear, and used during boot */ 554 /* These two page tables are simple, linear, and used during boot */
555 if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) 555 if (cr3 != __pa_symbol(swapper_pg_dir) &&
556 cr3 != __pa_symbol(initial_page_table))
556 cr3_changed = true; 557 cr3_changed = true;
557} 558}
558 559
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 156b9c804670..a4512359656a 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -15,11 +15,10 @@
15 * __get_user_X 15 * __get_user_X
16 * 16 *
17 * Inputs: %[r|e]ax contains the address. 17 * Inputs: %[r|e]ax contains the address.
18 * The register is modified, but all changes are undone
19 * before returning because the C code doesn't know about it.
20 * 18 *
21 * Outputs: %[r|e]ax is error code (0 or -EFAULT) 19 * Outputs: %[r|e]ax is error code (0 or -EFAULT)
22 * %[r|e]dx contains zero-extended value 20 * %[r|e]dx contains zero-extended value
21 * %ecx contains the high half for 32-bit __get_user_8
23 * 22 *
24 * 23 *
25 * These functions should not modify any other registers, 24 * These functions should not modify any other registers,
@@ -42,7 +41,7 @@ ENTRY(__get_user_1)
42 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX 41 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
43 jae bad_get_user 42 jae bad_get_user
44 ASM_STAC 43 ASM_STAC
451: movzb (%_ASM_AX),%edx 441: movzbl (%_ASM_AX),%edx
46 xor %eax,%eax 45 xor %eax,%eax
47 ASM_CLAC 46 ASM_CLAC
48 ret 47 ret
@@ -72,29 +71,42 @@ ENTRY(__get_user_4)
72 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX 71 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
73 jae bad_get_user 72 jae bad_get_user
74 ASM_STAC 73 ASM_STAC
753: mov -3(%_ASM_AX),%edx 743: movl -3(%_ASM_AX),%edx
76 xor %eax,%eax 75 xor %eax,%eax
77 ASM_CLAC 76 ASM_CLAC
78 ret 77 ret
79 CFI_ENDPROC 78 CFI_ENDPROC
80ENDPROC(__get_user_4) 79ENDPROC(__get_user_4)
81 80
82#ifdef CONFIG_X86_64
83ENTRY(__get_user_8) 81ENTRY(__get_user_8)
84 CFI_STARTPROC 82 CFI_STARTPROC
83#ifdef CONFIG_X86_64
85 add $7,%_ASM_AX 84 add $7,%_ASM_AX
86 jc bad_get_user 85 jc bad_get_user
87 GET_THREAD_INFO(%_ASM_DX) 86 GET_THREAD_INFO(%_ASM_DX)
88 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX 87 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
89 jae bad_get_user 88 jae bad_get_user
90 ASM_STAC 89 ASM_STAC
914: movq -7(%_ASM_AX),%_ASM_DX 904: movq -7(%_ASM_AX),%rdx
92 xor %eax,%eax 91 xor %eax,%eax
93 ASM_CLAC 92 ASM_CLAC
94 ret 93 ret
94#else
95 add $7,%_ASM_AX
96 jc bad_get_user_8
97 GET_THREAD_INFO(%_ASM_DX)
98 cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
99 jae bad_get_user_8
100 ASM_STAC
1014: movl -7(%_ASM_AX),%edx
1025: movl -3(%_ASM_AX),%ecx
103 xor %eax,%eax
104 ASM_CLAC
105 ret
106#endif
95 CFI_ENDPROC 107 CFI_ENDPROC
96ENDPROC(__get_user_8) 108ENDPROC(__get_user_8)
97#endif 109
98 110
99bad_get_user: 111bad_get_user:
100 CFI_STARTPROC 112 CFI_STARTPROC
@@ -105,9 +117,24 @@ bad_get_user:
105 CFI_ENDPROC 117 CFI_ENDPROC
106END(bad_get_user) 118END(bad_get_user)
107 119
120#ifdef CONFIG_X86_32
121bad_get_user_8:
122 CFI_STARTPROC
123 xor %edx,%edx
124 xor %ecx,%ecx
125 mov $(-EFAULT),%_ASM_AX
126 ASM_CLAC
127 ret
128 CFI_ENDPROC
129END(bad_get_user_8)
130#endif
131
108 _ASM_EXTABLE(1b,bad_get_user) 132 _ASM_EXTABLE(1b,bad_get_user)
109 _ASM_EXTABLE(2b,bad_get_user) 133 _ASM_EXTABLE(2b,bad_get_user)
110 _ASM_EXTABLE(3b,bad_get_user) 134 _ASM_EXTABLE(3b,bad_get_user)
111#ifdef CONFIG_X86_64 135#ifdef CONFIG_X86_64
112 _ASM_EXTABLE(4b,bad_get_user) 136 _ASM_EXTABLE(4b,bad_get_user)
137#else
138 _ASM_EXTABLE(4b,bad_get_user_8)
139 _ASM_EXTABLE(5b,bad_get_user_8)
113#endif 140#endif
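
The 32-bit __get_user_8 path added above checks that addr+7 neither wraps nor reaches the address limit, then returns the low half in %edx and the high half in %ecx. A stand-alone C sketch of the range test and of how a caller would reassemble the value (names are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Mirror of the "add $7; jc; cmp addr_limit; jae" sequence above: the
 * 8-byte access is allowed only if addr+7 does not wrap and stays below
 * the limit. */
static int range_ok_8(uint32_t addr, uint32_t limit)
{
	uint32_t end = addr + 7;

	if (end < addr)		/* carry set: wrapped around */
		return 0;
	return end < limit;	/* jae bad_get_user_8 when end >= limit */
}

/* How a 32-bit caller reassembles the value from %edx (low) and %ecx (high). */
static uint64_t combine(uint32_t lo, uint32_t hi)
{
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	printf("ok=%d\n", range_ok_8(0xbffffff0u, 0xc0000000u));
	printf("val=%#llx\n",
	       (unsigned long long)combine(0x89abcdefu, 0x01234567u));
	return 0;
}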
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d7aea41563b3..d41815265a0b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -17,86 +17,132 @@
17#include <asm/proto.h> 17#include <asm/proto.h>
18#include <asm/dma.h> /* for MAX_DMA_PFN */ 18#include <asm/dma.h> /* for MAX_DMA_PFN */
19 19
20unsigned long __initdata pgt_buf_start; 20#include "mm_internal.h"
21unsigned long __meminitdata pgt_buf_end;
22unsigned long __meminitdata pgt_buf_top;
23 21
24int after_bootmem; 22static unsigned long __initdata pgt_buf_start;
23static unsigned long __initdata pgt_buf_end;
24static unsigned long __initdata pgt_buf_top;
25 25
26int direct_gbpages 26static unsigned long min_pfn_mapped;
27#ifdef CONFIG_DIRECT_GBPAGES
28 = 1
29#endif
30;
31 27
32struct map_range { 28static bool __initdata can_use_brk_pgt = true;
33 unsigned long start;
34 unsigned long end;
35 unsigned page_size_mask;
36};
37 29
38/* 30/*
39 * First calculate space needed for kernel direct mapping page tables to cover 31 * Pages returned are already directly mapped.
40 * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB 32 *
41 * pages. Then find enough contiguous space for those page tables. 33 * Changing that is likely to break Xen, see commit:
34 *
35 * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
36 *
37 * for detailed information.
42 */ 38 */
43static void __init find_early_table_space(struct map_range *mr, int nr_range) 39__ref void *alloc_low_pages(unsigned int num)
44{ 40{
41 unsigned long pfn;
45 int i; 42 int i;
46 unsigned long puds = 0, pmds = 0, ptes = 0, tables;
47 unsigned long start = 0, good_end;
48 phys_addr_t base;
49 43
50 for (i = 0; i < nr_range; i++) { 44 if (after_bootmem) {
51 unsigned long range, extra; 45 unsigned int order;
52 46
53 range = mr[i].end - mr[i].start; 47 order = get_order((unsigned long)num << PAGE_SHIFT);
54 puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; 48 return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
49 __GFP_ZERO, order);
50 }
55 51
56 if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { 52 if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
57 extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); 53 unsigned long ret;
58 pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; 54 if (min_pfn_mapped >= max_pfn_mapped)
59 } else { 55 panic("alloc_low_page: ran out of memory");
60 pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; 56 ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
61 } 57 max_pfn_mapped << PAGE_SHIFT,
58 PAGE_SIZE * num , PAGE_SIZE);
59 if (!ret)
60 panic("alloc_low_page: can not alloc memory");
61 memblock_reserve(ret, PAGE_SIZE * num);
62 pfn = ret >> PAGE_SHIFT;
63 } else {
64 pfn = pgt_buf_end;
65 pgt_buf_end += num;
66 printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
67 pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
68 }
62 69
63 if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { 70 for (i = 0; i < num; i++) {
64 extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); 71 void *adr;
65#ifdef CONFIG_X86_32 72
66 extra += PMD_SIZE; 73 adr = __va((pfn + i) << PAGE_SHIFT);
67#endif 74 clear_page(adr);
68 ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
69 } else {
70 ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
71 }
72 } 75 }
73 76
74 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); 77 return __va(pfn << PAGE_SHIFT);
75 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); 78}
76 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
77 79
78#ifdef CONFIG_X86_32 80/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
79 /* for fixmap */ 81#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE)
80 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); 82RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
81#endif 83void __init early_alloc_pgt_buf(void)
82 good_end = max_pfn_mapped << PAGE_SHIFT; 84{
85 unsigned long tables = INIT_PGT_BUF_SIZE;
86 phys_addr_t base;
83 87
84 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); 88 base = __pa(extend_brk(tables, PAGE_SIZE));
85 if (!base)
86 panic("Cannot find space for the kernel page tables");
87 89
88 pgt_buf_start = base >> PAGE_SHIFT; 90 pgt_buf_start = base >> PAGE_SHIFT;
89 pgt_buf_end = pgt_buf_start; 91 pgt_buf_end = pgt_buf_start;
90 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); 92 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
93}
94
95int after_bootmem;
96
97int direct_gbpages
98#ifdef CONFIG_DIRECT_GBPAGES
99 = 1
100#endif
101;
91 102
92 printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", 103static void __init init_gbpages(void)
93 mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, 104{
94 (pgt_buf_top << PAGE_SHIFT) - 1); 105#ifdef CONFIG_X86_64
106 if (direct_gbpages && cpu_has_gbpages)
107 printk(KERN_INFO "Using GB pages for direct mapping\n");
108 else
109 direct_gbpages = 0;
110#endif
95} 111}
96 112
97void __init native_pagetable_reserve(u64 start, u64 end) 113struct map_range {
114 unsigned long start;
115 unsigned long end;
116 unsigned page_size_mask;
117};
118
119static int page_size_mask;
120
121static void __init probe_page_size_mask(void)
98{ 122{
99 memblock_reserve(start, end - start); 123 init_gbpages();
124
125#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
126 /*
127 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
128 * This will simplify cpa(), which otherwise needs to support splitting
129 * large pages into small in interrupt context, etc.
130 */
131 if (direct_gbpages)
132 page_size_mask |= 1 << PG_LEVEL_1G;
133 if (cpu_has_pse)
134 page_size_mask |= 1 << PG_LEVEL_2M;
135#endif
136
137 /* Enable PSE if available */
138 if (cpu_has_pse)
139 set_in_cr4(X86_CR4_PSE);
140
141 /* Enable PGE if available */
142 if (cpu_has_pge) {
143 set_in_cr4(X86_CR4_PGE);
144 __supported_pte_mask |= _PAGE_GLOBAL;
145 }
100} 146}
101 147
102#ifdef CONFIG_X86_32 148#ifdef CONFIG_X86_32
@@ -122,58 +168,51 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
122} 168}
123 169
124/* 170/*
125 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 171 * adjust the page_size_mask for small range to go with
126 * This runs before bootmem is initialized and gets pages directly from 172 * big page size instead small one if nearby are ram too.
127 * the physical memory. To access them they are temporarily mapped.
128 */ 173 */
129unsigned long __init_refok init_memory_mapping(unsigned long start, 174static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
130 unsigned long end) 175 int nr_range)
131{ 176{
132 unsigned long page_size_mask = 0; 177 int i;
133 unsigned long start_pfn, end_pfn;
134 unsigned long ret = 0;
135 unsigned long pos;
136
137 struct map_range mr[NR_RANGE_MR];
138 int nr_range, i;
139 int use_pse, use_gbpages;
140 178
141 printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n", 179 for (i = 0; i < nr_range; i++) {
142 start, end - 1); 180 if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
181 !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
182 unsigned long start = round_down(mr[i].start, PMD_SIZE);
183 unsigned long end = round_up(mr[i].end, PMD_SIZE);
143 184
144#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) 185#ifdef CONFIG_X86_32
145 /* 186 if ((end >> PAGE_SHIFT) > max_low_pfn)
146 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. 187 continue;
147 * This will simplify cpa(), which otherwise needs to support splitting
148 * large pages into small in interrupt context, etc.
149 */
150 use_pse = use_gbpages = 0;
151#else
152 use_pse = cpu_has_pse;
153 use_gbpages = direct_gbpages;
154#endif 188#endif
155 189
156 /* Enable PSE if available */ 190 if (memblock_is_region_memory(start, end - start))
157 if (cpu_has_pse) 191 mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
158 set_in_cr4(X86_CR4_PSE); 192 }
193 if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
194 !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
195 unsigned long start = round_down(mr[i].start, PUD_SIZE);
196 unsigned long end = round_up(mr[i].end, PUD_SIZE);
159 197
160 /* Enable PGE if available */ 198 if (memblock_is_region_memory(start, end - start))
161 if (cpu_has_pge) { 199 mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
162 set_in_cr4(X86_CR4_PGE); 200 }
163 __supported_pte_mask |= _PAGE_GLOBAL;
164 } 201 }
202}
165 203
166 if (use_gbpages) 204static int __meminit split_mem_range(struct map_range *mr, int nr_range,
167 page_size_mask |= 1 << PG_LEVEL_1G; 205 unsigned long start,
168 if (use_pse) 206 unsigned long end)
169 page_size_mask |= 1 << PG_LEVEL_2M; 207{
208 unsigned long start_pfn, end_pfn, limit_pfn;
209 unsigned long pfn;
210 int i;
170 211
171 memset(mr, 0, sizeof(mr)); 212 limit_pfn = PFN_DOWN(end);
172 nr_range = 0;
173 213
174 /* head if not big page alignment ? */ 214 /* head if not big page alignment ? */
175 start_pfn = start >> PAGE_SHIFT; 215 pfn = start_pfn = PFN_DOWN(start);
176 pos = start_pfn << PAGE_SHIFT;
177#ifdef CONFIG_X86_32 216#ifdef CONFIG_X86_32
178 /* 217 /*
179 * Don't use a large page for the first 2/4MB of memory 218 * Don't use a large page for the first 2/4MB of memory
@@ -181,66 +220,60 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
181 * and overlapping MTRRs into large pages can cause 220 * and overlapping MTRRs into large pages can cause
182 * slowdowns. 221 * slowdowns.
183 */ 222 */
184 if (pos == 0) 223 if (pfn == 0)
185 end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); 224 end_pfn = PFN_DOWN(PMD_SIZE);
186 else 225 else
187 end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) 226 end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
188 << (PMD_SHIFT - PAGE_SHIFT);
189#else /* CONFIG_X86_64 */ 227#else /* CONFIG_X86_64 */
190 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) 228 end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
191 << (PMD_SHIFT - PAGE_SHIFT);
192#endif 229#endif
193 if (end_pfn > (end >> PAGE_SHIFT)) 230 if (end_pfn > limit_pfn)
194 end_pfn = end >> PAGE_SHIFT; 231 end_pfn = limit_pfn;
195 if (start_pfn < end_pfn) { 232 if (start_pfn < end_pfn) {
196 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); 233 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
197 pos = end_pfn << PAGE_SHIFT; 234 pfn = end_pfn;
198 } 235 }
199 236
200 /* big page (2M) range */ 237 /* big page (2M) range */
201 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) 238 start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
202 << (PMD_SHIFT - PAGE_SHIFT);
203#ifdef CONFIG_X86_32 239#ifdef CONFIG_X86_32
204 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); 240 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
205#else /* CONFIG_X86_64 */ 241#else /* CONFIG_X86_64 */
206 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) 242 end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
207 << (PUD_SHIFT - PAGE_SHIFT); 243 if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
208 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) 244 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
209 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
210#endif 245#endif
211 246
212 if (start_pfn < end_pfn) { 247 if (start_pfn < end_pfn) {
213 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 248 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
214 page_size_mask & (1<<PG_LEVEL_2M)); 249 page_size_mask & (1<<PG_LEVEL_2M));
215 pos = end_pfn << PAGE_SHIFT; 250 pfn = end_pfn;
216 } 251 }
217 252
218#ifdef CONFIG_X86_64 253#ifdef CONFIG_X86_64
219 /* big page (1G) range */ 254 /* big page (1G) range */
220 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) 255 start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
221 << (PUD_SHIFT - PAGE_SHIFT); 256 end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
222 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
223 if (start_pfn < end_pfn) { 257 if (start_pfn < end_pfn) {
224 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 258 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
225 page_size_mask & 259 page_size_mask &
226 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); 260 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
227 pos = end_pfn << PAGE_SHIFT; 261 pfn = end_pfn;
228 } 262 }
229 263
230 /* tail is not big page (1G) alignment */ 264 /* tail is not big page (1G) alignment */
231 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) 265 start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
232 << (PMD_SHIFT - PAGE_SHIFT); 266 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
233 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
234 if (start_pfn < end_pfn) { 267 if (start_pfn < end_pfn) {
235 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 268 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
236 page_size_mask & (1<<PG_LEVEL_2M)); 269 page_size_mask & (1<<PG_LEVEL_2M));
237 pos = end_pfn << PAGE_SHIFT; 270 pfn = end_pfn;
238 } 271 }
239#endif 272#endif
240 273
241 /* tail is not big page (2M) alignment */ 274 /* tail is not big page (2M) alignment */
242 start_pfn = pos>>PAGE_SHIFT; 275 start_pfn = pfn;
243 end_pfn = end>>PAGE_SHIFT; 276 end_pfn = limit_pfn;
244 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); 277 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
245 278
246 /* try to merge same page size and continuous */ 279 /* try to merge same page size and continuous */
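The head/2M/1G/tail carving that split_mem_range() performs above is easier to follow outside diff form. Below is a minimal stand-alone sketch of the same partitioning — user-space C, 64-bit only, with PFN_DOWN, PMD_SIZE, round_up and round_down redefined locally, and the X86_32 "first 2/4MB" special case left out — so it is the arithmetic only, not kernel code:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SIZE	(1ULL << 21)
#define PUD_SIZE	(1ULL << 30)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
#define round_up(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define round_down(x, a)	((x) & ~((a) - 1))

/* print one range and advance the cursor, like save_mr() plus "pfn = end_pfn" */
static void save(unsigned long long s_pfn, unsigned long long e_pfn,
		 unsigned long long *pfn, const char *size)
{
	if (s_pfn >= e_pfn)
		return;
	printf(" [mem %#012llx-%#012llx] page %s\n",
	       s_pfn << PAGE_SHIFT, (e_pfn << PAGE_SHIFT) - 1, size);
	*pfn = e_pfn;
}

int main(void)
{
	unsigned long long start = 0x00123000ULL, end = 0x87654000ULL;
	unsigned long long limit = PFN_DOWN(end);
	unsigned long long pfn = PFN_DOWN(start);
	unsigned long long e;

	/* 4k head up to the first 2M boundary */
	e = round_up(pfn, PFN_DOWN(PMD_SIZE));
	if (e > limit)
		e = limit;
	save(pfn, e, &pfn, "4k");

	/* 2M range up to the first 1G boundary */
	e = round_up(pfn, PFN_DOWN(PUD_SIZE));
	if (e > round_down(limit, PFN_DOWN(PMD_SIZE)))
		e = round_down(limit, PFN_DOWN(PMD_SIZE));
	save(round_up(pfn, PFN_DOWN(PMD_SIZE)), e, &pfn, "2M");

	/* 1G body */
	save(round_up(pfn, PFN_DOWN(PUD_SIZE)),
	     round_down(limit, PFN_DOWN(PUD_SIZE)), &pfn, "1G");

	/* 2M tail, then 4k tail */
	save(round_up(pfn, PFN_DOWN(PMD_SIZE)),
	     round_down(limit, PFN_DOWN(PMD_SIZE)), &pfn, "2M");
	save(pfn, limit, &pfn, "4k");

	return 0;
}

Fed an unaligned start such as 0x123000, it prints a 4k head, a 2M run up to the first 1G boundary, a 1G body, then 2M and 4k tails — the same shape init_memory_mapping() logs at boot.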
@@ -257,59 +290,169 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
257 nr_range--; 290 nr_range--;
258 } 291 }
259 292
293 if (!after_bootmem)
294 adjust_range_page_size_mask(mr, nr_range);
295
260 for (i = 0; i < nr_range; i++) 296 for (i = 0; i < nr_range; i++)
261 printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", 297 printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
262 mr[i].start, mr[i].end - 1, 298 mr[i].start, mr[i].end - 1,
263 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( 299 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
264 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); 300 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
265 301
266 /* 302 return nr_range;
267 * Find space for the kernel direct mapping tables. 303}
268 * 304
269 * Later we should allocate these tables in the local node of the 305struct range pfn_mapped[E820_X_MAX];
270 * memory mapped. Unfortunately this is done currently before the 306int nr_pfn_mapped;
271 * nodes are discovered. 307
272 */ 308static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
273 if (!after_bootmem) 309{
274 find_early_table_space(mr, nr_range); 310 nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
311 nr_pfn_mapped, start_pfn, end_pfn);
312 nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
313
314 max_pfn_mapped = max(max_pfn_mapped, end_pfn);
315
316 if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
317 max_low_pfn_mapped = max(max_low_pfn_mapped,
318 min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
319}
320
321bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
322{
323 int i;
324
325 for (i = 0; i < nr_pfn_mapped; i++)
326 if ((start_pfn >= pfn_mapped[i].start) &&
327 (end_pfn <= pfn_mapped[i].end))
328 return true;
329
330 return false;
331}
332
333/*
334 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
335 * This runs before bootmem is initialized and gets pages directly from
336 * the physical memory. To access them they are temporarily mapped.
337 */
338unsigned long __init_refok init_memory_mapping(unsigned long start,
339 unsigned long end)
340{
341 struct map_range mr[NR_RANGE_MR];
342 unsigned long ret = 0;
343 int nr_range, i;
344
345 pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
346 start, end - 1);
347
348 memset(mr, 0, sizeof(mr));
349 nr_range = split_mem_range(mr, 0, start, end);
275 350
276 for (i = 0; i < nr_range; i++) 351 for (i = 0; i < nr_range; i++)
277 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, 352 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
278 mr[i].page_size_mask); 353 mr[i].page_size_mask);
279 354
280#ifdef CONFIG_X86_32 355 add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
281 early_ioremap_page_table_range_init();
282 356
283 load_cr3(swapper_pg_dir); 357 return ret >> PAGE_SHIFT;
284#endif 358}
285 359
286 __flush_tlb_all(); 360/*
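361 * The range may have holes in the middle or at the ends; only the RAM parts will be mapped.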
362 */
363static unsigned long __init init_range_memory_mapping(
364 unsigned long r_start,
365 unsigned long r_end)
366{
367 unsigned long start_pfn, end_pfn;
368 unsigned long mapped_ram_size = 0;
369 int i;
287 370
288 /* 371 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
289 * Reserve the kernel pagetable pages we used (pgt_buf_start - 372 u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
290 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) 373 u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
291 * so that they can be reused for other purposes. 374 if (start >= end)
292 * 375 continue;
293 * On native it just means calling memblock_reserve, on Xen it also
294 * means marking RW the pagetable pages that we allocated before
295 * but that haven't been used.
296 *
297 * In fact on xen we mark RO the whole range pgt_buf_start -
298 * pgt_buf_top, because we have to make sure that when
299 * init_memory_mapping reaches the pagetable pages area, it maps
300 * RO all the pagetable pages, including the ones that are beyond
301 * pgt_buf_end at that time.
302 */
303 if (!after_bootmem && pgt_buf_end > pgt_buf_start)
304 x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
305 PFN_PHYS(pgt_buf_end));
306 376
307 if (!after_bootmem) 377 /*
 308 early_memtest(start, end); 378 * if it overlaps the brk pgt area, we need to
 379 * alloc the pgt buf from memblock instead.
380 */
381 can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
382 min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
383 init_memory_mapping(start, end);
384 mapped_ram_size += end - start;
385 can_use_brk_pgt = true;
386 }
309 387
310 return ret >> PAGE_SHIFT; 388 return mapped_ram_size;
311} 389}
312 390
391/* (PUD_SHIFT-PMD_SHIFT)/2 */
392#define STEP_SIZE_SHIFT 5
393void __init init_mem_mapping(void)
394{
395 unsigned long end, real_end, start, last_start;
396 unsigned long step_size;
397 unsigned long addr;
398 unsigned long mapped_ram_size = 0;
399 unsigned long new_mapped_ram_size;
400
401 probe_page_size_mask();
402
403#ifdef CONFIG_X86_64
404 end = max_pfn << PAGE_SHIFT;
405#else
406 end = max_low_pfn << PAGE_SHIFT;
407#endif
408
409 /* the ISA range is always mapped regardless of memory holes */
410 init_memory_mapping(0, ISA_END_ADDRESS);
411
 412 /* Xen reserves a big range near the end of RAM; skip it at first */
413 addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE,
414 PAGE_SIZE);
415 real_end = addr + PMD_SIZE;
416
 417 /* step_size needs to be small so the pgt_buf from BRK can cover it */
418 step_size = PMD_SIZE;
419 max_pfn_mapped = 0; /* will get exact value next */
420 min_pfn_mapped = real_end >> PAGE_SHIFT;
421 last_start = start = real_end;
422 while (last_start > ISA_END_ADDRESS) {
423 if (last_start > step_size) {
424 start = round_down(last_start - 1, step_size);
425 if (start < ISA_END_ADDRESS)
426 start = ISA_END_ADDRESS;
427 } else
428 start = ISA_END_ADDRESS;
429 new_mapped_ram_size = init_range_memory_mapping(start,
430 last_start);
431 last_start = start;
432 min_pfn_mapped = last_start >> PAGE_SHIFT;
 433 /* only increase step_size after a big range gets mapped */
434 if (new_mapped_ram_size > mapped_ram_size)
435 step_size <<= STEP_SIZE_SHIFT;
436 mapped_ram_size += new_mapped_ram_size;
437 }
438
439 if (real_end < end)
440 init_range_memory_mapping(real_end, end);
441
442#ifdef CONFIG_X86_64
443 if (max_pfn > max_low_pfn) {
 444 /* can we preserve max_low_pfn? */
445 max_low_pfn = max_pfn;
446 }
447#else
448 early_ioremap_page_table_range_init();
449#endif
450
451 load_cr3(swapper_pg_dir);
452 __flush_tlb_all();
453
454 early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
455}
313 456
314/* 457/*
315 * devmem_is_allowed() checks to see if /dev/mem access to a certain address 458 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
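init_mem_mapping() above maps RAM top-down so that the page-table pages for each chunk can always be taken from memory that is already mapped; the step starts at one PMD (small enough for the BRK-based pgt_buf) and grows by 2^STEP_SIZE_SHIFT once a bigger chunk has been mapped. A rough user-space model of just that loop, with the real mapping calls stubbed out and purely illustrative constants (a pretend 4G real_end):

#include <stdio.h>

#define ISA_END		0x100000ULL		/* 1M, mapped unconditionally first */
#define PMD_SIZE	(1ULL << 21)
#define STEP_SIZE_SHIFT	5			/* (PUD_SHIFT - PMD_SHIFT) / 2 */

static unsigned long long round_down_to(unsigned long long x, unsigned long long a)
{
	return x & ~(a - 1);
}

int main(void)
{
	unsigned long long real_end = 0x100000000ULL;	/* pretend RAM ends at 4G */
	unsigned long long step_size = PMD_SIZE;
	unsigned long long last_start = real_end, start;
	unsigned long long mapped = 0, new_mapped;

	while (last_start > ISA_END) {
		if (last_start > step_size) {
			start = round_down_to(last_start - 1, step_size);
			if (start < ISA_END)
				start = ISA_END;
		} else {
			start = ISA_END;
		}

		/* stands in for init_range_memory_mapping(start, last_start) */
		new_mapped = last_start - start;
		printf("map [%#llx-%#llx), step_size %#llx\n",
		       start, last_start, step_size);

		last_start = start;
		/* only grow the step once a bigger chunk than before got mapped */
		if (new_mapped > mapped)
			step_size <<= STEP_SIZE_SHIFT;
		mapped += new_mapped;
	}

	return 0;
}

Running it shows the chunk sizes climbing from 2M to roughly 62M, then ~2G, then the remainder, which is why the while loop finishes in a handful of iterations even on large machines.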
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 745d66b843c8..b299724f6e34 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -53,25 +53,14 @@
53#include <asm/page_types.h> 53#include <asm/page_types.h>
54#include <asm/init.h> 54#include <asm/init.h>
55 55
56#include "mm_internal.h"
57
56unsigned long highstart_pfn, highend_pfn; 58unsigned long highstart_pfn, highend_pfn;
57 59
58static noinline int do_test_wp_bit(void); 60static noinline int do_test_wp_bit(void);
59 61
60bool __read_mostly __vmalloc_start_set = false; 62bool __read_mostly __vmalloc_start_set = false;
61 63
62static __init void *alloc_low_page(void)
63{
64 unsigned long pfn = pgt_buf_end++;
65 void *adr;
66
67 if (pfn >= pgt_buf_top)
68 panic("alloc_low_page: ran out of memory");
69
70 adr = __va(pfn * PAGE_SIZE);
71 clear_page(adr);
72 return adr;
73}
74
75/* 64/*
76 * Creates a middle page table and puts a pointer to it in the 65 * Creates a middle page table and puts a pointer to it in the
77 * given global directory entry. This only returns the gd entry 66 * given global directory entry. This only returns the gd entry
@@ -84,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
84 73
85#ifdef CONFIG_X86_PAE 74#ifdef CONFIG_X86_PAE
86 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 75 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
87 if (after_bootmem) 76 pmd_table = (pmd_t *)alloc_low_page();
88 pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
89 else
90 pmd_table = (pmd_t *)alloc_low_page();
91 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 77 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
92 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 78 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
93 pud = pud_offset(pgd, 0); 79 pud = pud_offset(pgd, 0);
@@ -109,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
109static pte_t * __init one_page_table_init(pmd_t *pmd) 95static pte_t * __init one_page_table_init(pmd_t *pmd)
110{ 96{
111 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 97 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
112 pte_t *page_table = NULL; 98 pte_t *page_table = (pte_t *)alloc_low_page();
113
114 if (after_bootmem) {
115#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
116 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
117#endif
118 if (!page_table)
119 page_table =
120 (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
121 } else
122 page_table = (pte_t *)alloc_low_page();
123 99
124 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); 100 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
125 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); 101 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -146,8 +122,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr)
146 return one_page_table_init(pmd) + pte_idx; 122 return one_page_table_init(pmd) + pte_idx;
147} 123}
148 124
125static unsigned long __init
126page_table_range_init_count(unsigned long start, unsigned long end)
127{
128 unsigned long count = 0;
129#ifdef CONFIG_HIGHMEM
130 int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
131 int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
132 int pgd_idx, pmd_idx;
133 unsigned long vaddr;
134
135 if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
136 return 0;
137
138 vaddr = start;
139 pgd_idx = pgd_index(vaddr);
140
141 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
142 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
143 pmd_idx++) {
144 if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
145 (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
146 count++;
147 vaddr += PMD_SIZE;
148 }
149 pmd_idx = 0;
150 }
151#endif
152 return count;
153}
154
149static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, 155static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
150 unsigned long vaddr, pte_t *lastpte) 156 unsigned long vaddr, pte_t *lastpte,
157 void **adr)
151{ 158{
152#ifdef CONFIG_HIGHMEM 159#ifdef CONFIG_HIGHMEM
153 /* 160 /*
@@ -161,16 +168,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
161 168
162 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 169 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
163 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 170 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
164 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 171 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
165 && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
166 || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
167 pte_t *newpte; 172 pte_t *newpte;
168 int i; 173 int i;
169 174
170 BUG_ON(after_bootmem); 175 BUG_ON(after_bootmem);
171 newpte = alloc_low_page(); 176 newpte = *adr;
172 for (i = 0; i < PTRS_PER_PTE; i++) 177 for (i = 0; i < PTRS_PER_PTE; i++)
173 set_pte(newpte + i, pte[i]); 178 set_pte(newpte + i, pte[i]);
179 *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
174 180
175 paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); 181 paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
176 set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); 182 set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
@@ -204,6 +210,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
204 pgd_t *pgd; 210 pgd_t *pgd;
205 pmd_t *pmd; 211 pmd_t *pmd;
206 pte_t *pte = NULL; 212 pte_t *pte = NULL;
213 unsigned long count = page_table_range_init_count(start, end);
214 void *adr = NULL;
215
216 if (count)
217 adr = alloc_low_pages(count);
207 218
208 vaddr = start; 219 vaddr = start;
209 pgd_idx = pgd_index(vaddr); 220 pgd_idx = pgd_index(vaddr);
@@ -216,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
216 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); 227 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
217 pmd++, pmd_idx++) { 228 pmd++, pmd_idx++) {
218 pte = page_table_kmap_check(one_page_table_init(pmd), 229 pte = page_table_kmap_check(one_page_table_init(pmd),
219 pmd, vaddr, pte); 230 pmd, vaddr, pte, &adr);
220 231
221 vaddr += PMD_SIZE; 232 vaddr += PMD_SIZE;
222 } 233 }
@@ -310,6 +321,7 @@ repeat:
310 __pgprot(PTE_IDENT_ATTR | 321 __pgprot(PTE_IDENT_ATTR |
311 _PAGE_PSE); 322 _PAGE_PSE);
312 323
324 pfn &= PMD_MASK >> PAGE_SHIFT;
313 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + 325 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
314 PAGE_OFFSET + PAGE_SIZE-1; 326 PAGE_OFFSET + PAGE_SIZE-1;
315 327
@@ -455,9 +467,14 @@ void __init native_pagetable_init(void)
455 467
456 /* 468 /*
457 * Remove any mappings which extend past the end of physical 469 * Remove any mappings which extend past the end of physical
458 * memory from the boot time page table: 470 * memory from the boot time page table.
 471 * In the virtual address space there should be at least two pages
 472 * between VMALLOC_END and pkmap or fixmap, by the definition of
 473 * VMALLOC_END, and max_low_pfn is set to the physical address of
 474 * VMALLOC_END. If the initial memory mapping did its job, then near
 475 * max_low_pfn either a pte is in use or the pmd is not present.
459 */ 476 */
460 for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) { 477 for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
461 va = PAGE_OFFSET + (pfn<<PAGE_SHIFT); 478 va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
462 pgd = base + pgd_index(va); 479 pgd = base + pgd_index(va);
463 if (!pgd_present(*pgd)) 480 if (!pgd_present(*pgd))
@@ -468,10 +485,19 @@ void __init native_pagetable_init(void)
468 if (!pmd_present(*pmd)) 485 if (!pmd_present(*pmd))
469 break; 486 break;
470 487
 488 /* should not be a large page here */
489 if (pmd_large(*pmd)) {
490 pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n",
491 pfn, pmd, __pa(pmd));
492 BUG_ON(1);
493 }
494
471 pte = pte_offset_kernel(pmd, va); 495 pte = pte_offset_kernel(pmd, va);
472 if (!pte_present(*pte)) 496 if (!pte_present(*pte))
473 break; 497 break;
474 498
499 printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
500 pfn, pmd, __pa(pmd), pte, __pa(pte));
475 pte_clear(NULL, va, pte); 501 pte_clear(NULL, va, pte);
476 } 502 }
477 paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); 503 paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
@@ -550,7 +576,7 @@ early_param("highmem", parse_highmem);
550 * artificially via the highmem=x boot parameter then create 576 * artificially via the highmem=x boot parameter then create
551 * it: 577 * it:
552 */ 578 */
553void __init lowmem_pfn_init(void) 579static void __init lowmem_pfn_init(void)
554{ 580{
555 /* max_low_pfn is 0, we already have early_res support */ 581 /* max_low_pfn is 0, we already have early_res support */
556 max_low_pfn = max_pfn; 582 max_low_pfn = max_pfn;
@@ -586,7 +612,7 @@ void __init lowmem_pfn_init(void)
586 * We have more RAM than fits into lowmem - we try to put it into 612 * We have more RAM than fits into lowmem - we try to put it into
587 * highmem, also taking the highmem=x boot parameter into account: 613 * highmem, also taking the highmem=x boot parameter into account:
588 */ 614 */
589void __init highmem_pfn_init(void) 615static void __init highmem_pfn_init(void)
590{ 616{
591 max_low_pfn = MAXMEM_PFN; 617 max_low_pfn = MAXMEM_PFN;
592 618
@@ -669,8 +695,6 @@ void __init setup_bootmem_allocator(void)
669 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 695 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
670 max_pfn_mapped<<PAGE_SHIFT); 696 max_pfn_mapped<<PAGE_SHIFT);
671 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); 697 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
672
673 after_bootmem = 1;
674} 698}
675 699
676/* 700/*
@@ -753,6 +777,8 @@ void __init mem_init(void)
753 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 777 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
754 reservedpages++; 778 reservedpages++;
755 779
780 after_bootmem = 1;
781
756 codesize = (unsigned long) &_etext - (unsigned long) &_text; 782 codesize = (unsigned long) &_etext - (unsigned long) &_text;
757 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 783 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
758 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 784 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
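With the 32-bit alloc_low_page() removed, page_table_range_init() now counts the kmap pte pages it will need via page_table_range_init_count(), allocates them in a single alloc_low_pages() call, and lets page_table_kmap_check() consume them through the *adr cursor. A small user-space sketch of that count-once, hand-out-one-page-at-a-time pattern — alloc_pages_batch() and take_one_page() are hypothetical stand-ins, not kernel APIs:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE	4096UL

/* hypothetical stand-in for alloc_low_pages(num): one allocation, zeroed */
static void *alloc_pages_batch(unsigned int num)
{
	void *p = aligned_alloc(PAGE_SIZE, num * PAGE_SIZE);

	if (p)
		memset(p, 0, num * PAGE_SIZE);
	return p;
}

/* mirrors the "*adr" cursor that page_table_kmap_check() bumps by one page */
static void *take_one_page(void **adr)
{
	void *page = *adr;

	*adr = (char *)*adr + PAGE_SIZE;
	return page;
}

int main(void)
{
	unsigned int count = 3;	/* what page_table_range_init_count() would report */
	void *batch = alloc_pages_batch(count);
	void *adr = batch;
	unsigned int i;

	if (!batch)
		return 1;
	for (i = 0; i < count; i++)
		printf("kmap pte page %u at %p\n", i, take_one_page(&adr));
	free(batch);

	return 0;
}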
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d6eeead43758..3eba7f429880 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -54,6 +54,82 @@
54#include <asm/uv/uv.h> 54#include <asm/uv/uv.h>
55#include <asm/setup.h> 55#include <asm/setup.h>
56 56
57#include "mm_internal.h"
58
59static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
60 unsigned long addr, unsigned long end)
61{
62 addr &= PMD_MASK;
63 for (; addr < end; addr += PMD_SIZE) {
64 pmd_t *pmd = pmd_page + pmd_index(addr);
65
66 if (!pmd_present(*pmd))
67 set_pmd(pmd, __pmd(addr | pmd_flag));
68 }
69}
70static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
71 unsigned long addr, unsigned long end)
72{
73 unsigned long next;
74
75 for (; addr < end; addr = next) {
76 pud_t *pud = pud_page + pud_index(addr);
77 pmd_t *pmd;
78
79 next = (addr & PUD_MASK) + PUD_SIZE;
80 if (next > end)
81 next = end;
82
83 if (pud_present(*pud)) {
84 pmd = pmd_offset(pud, 0);
85 ident_pmd_init(info->pmd_flag, pmd, addr, next);
86 continue;
87 }
88 pmd = (pmd_t *)info->alloc_pgt_page(info->context);
89 if (!pmd)
90 return -ENOMEM;
91 ident_pmd_init(info->pmd_flag, pmd, addr, next);
92 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
93 }
94
95 return 0;
96}
97
98int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
99 unsigned long addr, unsigned long end)
100{
101 unsigned long next;
102 int result;
103 int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
104
105 for (; addr < end; addr = next) {
106 pgd_t *pgd = pgd_page + pgd_index(addr) + off;
107 pud_t *pud;
108
109 next = (addr & PGDIR_MASK) + PGDIR_SIZE;
110 if (next > end)
111 next = end;
112
113 if (pgd_present(*pgd)) {
114 pud = pud_offset(pgd, 0);
115 result = ident_pud_init(info, pud, addr, next);
116 if (result)
117 return result;
118 continue;
119 }
120
121 pud = (pud_t *)info->alloc_pgt_page(info->context);
122 if (!pud)
123 return -ENOMEM;
124 result = ident_pud_init(info, pud, addr, next);
125 if (result)
126 return result;
127 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
128 }
129
130 return 0;
131}
132
57static int __init parse_direct_gbpages_off(char *arg) 133static int __init parse_direct_gbpages_off(char *arg)
58{ 134{
59 direct_gbpages = 0; 135 direct_gbpages = 0;
@@ -302,10 +378,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
302void __init cleanup_highmap(void) 378void __init cleanup_highmap(void)
303{ 379{
304 unsigned long vaddr = __START_KERNEL_map; 380 unsigned long vaddr = __START_KERNEL_map;
305 unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); 381 unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
306 unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; 382 unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
307 pmd_t *pmd = level2_kernel_pgt; 383 pmd_t *pmd = level2_kernel_pgt;
308 384
385 /*
386 * Native path, max_pfn_mapped is not set yet.
387 * Xen has valid max_pfn_mapped set in
388 * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
389 */
390 if (max_pfn_mapped)
391 vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
392
309 for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { 393 for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
310 if (pmd_none(*pmd)) 394 if (pmd_none(*pmd))
311 continue; 395 continue;
@@ -314,69 +398,24 @@ void __init cleanup_highmap(void)
314 } 398 }
315} 399}
316 400
317static __ref void *alloc_low_page(unsigned long *phys)
318{
319 unsigned long pfn = pgt_buf_end++;
320 void *adr;
321
322 if (after_bootmem) {
323 adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
324 *phys = __pa(adr);
325
326 return adr;
327 }
328
329 if (pfn >= pgt_buf_top)
330 panic("alloc_low_page: ran out of memory");
331
332 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
333 clear_page(adr);
334 *phys = pfn * PAGE_SIZE;
335 return adr;
336}
337
338static __ref void *map_low_page(void *virt)
339{
340 void *adr;
341 unsigned long phys, left;
342
343 if (after_bootmem)
344 return virt;
345
346 phys = __pa(virt);
347 left = phys & (PAGE_SIZE - 1);
348 adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
349 adr = (void *)(((unsigned long)adr) | left);
350
351 return adr;
352}
353
354static __ref void unmap_low_page(void *adr)
355{
356 if (after_bootmem)
357 return;
358
359 early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
360}
361
362static unsigned long __meminit 401static unsigned long __meminit
363phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, 402phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
364 pgprot_t prot) 403 pgprot_t prot)
365{ 404{
366 unsigned pages = 0; 405 unsigned long pages = 0, next;
367 unsigned long last_map_addr = end; 406 unsigned long last_map_addr = end;
368 int i; 407 int i;
369 408
370 pte_t *pte = pte_page + pte_index(addr); 409 pte_t *pte = pte_page + pte_index(addr);
371 410
372 for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { 411 for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
373 412 next = (addr & PAGE_MASK) + PAGE_SIZE;
374 if (addr >= end) { 413 if (addr >= end) {
375 if (!after_bootmem) { 414 if (!after_bootmem &&
376 for(; i < PTRS_PER_PTE; i++, pte++) 415 !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
377 set_pte(pte, __pte(0)); 416 !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
378 } 417 set_pte(pte, __pte(0));
379 break; 418 continue;
380 } 419 }
381 420
382 /* 421 /*
@@ -414,28 +453,25 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
414 int i = pmd_index(address); 453 int i = pmd_index(address);
415 454
416 for (; i < PTRS_PER_PMD; i++, address = next) { 455 for (; i < PTRS_PER_PMD; i++, address = next) {
417 unsigned long pte_phys;
418 pmd_t *pmd = pmd_page + pmd_index(address); 456 pmd_t *pmd = pmd_page + pmd_index(address);
419 pte_t *pte; 457 pte_t *pte;
420 pgprot_t new_prot = prot; 458 pgprot_t new_prot = prot;
421 459
460 next = (address & PMD_MASK) + PMD_SIZE;
422 if (address >= end) { 461 if (address >= end) {
423 if (!after_bootmem) { 462 if (!after_bootmem &&
424 for (; i < PTRS_PER_PMD; i++, pmd++) 463 !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
425 set_pmd(pmd, __pmd(0)); 464 !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
426 } 465 set_pmd(pmd, __pmd(0));
427 break; 466 continue;
428 } 467 }
429 468
430 next = (address & PMD_MASK) + PMD_SIZE;
431
432 if (pmd_val(*pmd)) { 469 if (pmd_val(*pmd)) {
433 if (!pmd_large(*pmd)) { 470 if (!pmd_large(*pmd)) {
434 spin_lock(&init_mm.page_table_lock); 471 spin_lock(&init_mm.page_table_lock);
435 pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd)); 472 pte = (pte_t *)pmd_page_vaddr(*pmd);
436 last_map_addr = phys_pte_init(pte, address, 473 last_map_addr = phys_pte_init(pte, address,
437 end, prot); 474 end, prot);
438 unmap_low_page(pte);
439 spin_unlock(&init_mm.page_table_lock); 475 spin_unlock(&init_mm.page_table_lock);
440 continue; 476 continue;
441 } 477 }
@@ -464,19 +500,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
464 pages++; 500 pages++;
465 spin_lock(&init_mm.page_table_lock); 501 spin_lock(&init_mm.page_table_lock);
466 set_pte((pte_t *)pmd, 502 set_pte((pte_t *)pmd,
467 pfn_pte(address >> PAGE_SHIFT, 503 pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
468 __pgprot(pgprot_val(prot) | _PAGE_PSE))); 504 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
469 spin_unlock(&init_mm.page_table_lock); 505 spin_unlock(&init_mm.page_table_lock);
470 last_map_addr = next; 506 last_map_addr = next;
471 continue; 507 continue;
472 } 508 }
473 509
474 pte = alloc_low_page(&pte_phys); 510 pte = alloc_low_page();
475 last_map_addr = phys_pte_init(pte, address, end, new_prot); 511 last_map_addr = phys_pte_init(pte, address, end, new_prot);
476 unmap_low_page(pte);
477 512
478 spin_lock(&init_mm.page_table_lock); 513 spin_lock(&init_mm.page_table_lock);
479 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); 514 pmd_populate_kernel(&init_mm, pmd, pte);
480 spin_unlock(&init_mm.page_table_lock); 515 spin_unlock(&init_mm.page_table_lock);
481 } 516 }
482 update_page_count(PG_LEVEL_2M, pages); 517 update_page_count(PG_LEVEL_2M, pages);
@@ -492,27 +527,24 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
492 int i = pud_index(addr); 527 int i = pud_index(addr);
493 528
494 for (; i < PTRS_PER_PUD; i++, addr = next) { 529 for (; i < PTRS_PER_PUD; i++, addr = next) {
495 unsigned long pmd_phys;
496 pud_t *pud = pud_page + pud_index(addr); 530 pud_t *pud = pud_page + pud_index(addr);
497 pmd_t *pmd; 531 pmd_t *pmd;
498 pgprot_t prot = PAGE_KERNEL; 532 pgprot_t prot = PAGE_KERNEL;
499 533
500 if (addr >= end)
501 break;
502
503 next = (addr & PUD_MASK) + PUD_SIZE; 534 next = (addr & PUD_MASK) + PUD_SIZE;
504 535 if (addr >= end) {
505 if (!after_bootmem && !e820_any_mapped(addr, next, 0)) { 536 if (!after_bootmem &&
506 set_pud(pud, __pud(0)); 537 !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
538 !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
539 set_pud(pud, __pud(0));
507 continue; 540 continue;
508 } 541 }
509 542
510 if (pud_val(*pud)) { 543 if (pud_val(*pud)) {
511 if (!pud_large(*pud)) { 544 if (!pud_large(*pud)) {
512 pmd = map_low_page(pmd_offset(pud, 0)); 545 pmd = pmd_offset(pud, 0);
513 last_map_addr = phys_pmd_init(pmd, addr, end, 546 last_map_addr = phys_pmd_init(pmd, addr, end,
514 page_size_mask, prot); 547 page_size_mask, prot);
515 unmap_low_page(pmd);
516 __flush_tlb_all(); 548 __flush_tlb_all();
517 continue; 549 continue;
518 } 550 }
@@ -541,19 +573,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
541 pages++; 573 pages++;
542 spin_lock(&init_mm.page_table_lock); 574 spin_lock(&init_mm.page_table_lock);
543 set_pte((pte_t *)pud, 575 set_pte((pte_t *)pud,
544 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 576 pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
577 PAGE_KERNEL_LARGE));
545 spin_unlock(&init_mm.page_table_lock); 578 spin_unlock(&init_mm.page_table_lock);
546 last_map_addr = next; 579 last_map_addr = next;
547 continue; 580 continue;
548 } 581 }
549 582
550 pmd = alloc_low_page(&pmd_phys); 583 pmd = alloc_low_page();
551 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, 584 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
552 prot); 585 prot);
553 unmap_low_page(pmd);
554 586
555 spin_lock(&init_mm.page_table_lock); 587 spin_lock(&init_mm.page_table_lock);
556 pud_populate(&init_mm, pud, __va(pmd_phys)); 588 pud_populate(&init_mm, pud, pmd);
557 spin_unlock(&init_mm.page_table_lock); 589 spin_unlock(&init_mm.page_table_lock);
558 } 590 }
559 __flush_tlb_all(); 591 __flush_tlb_all();
@@ -578,28 +610,23 @@ kernel_physical_mapping_init(unsigned long start,
578 610
579 for (; start < end; start = next) { 611 for (; start < end; start = next) {
580 pgd_t *pgd = pgd_offset_k(start); 612 pgd_t *pgd = pgd_offset_k(start);
581 unsigned long pud_phys;
582 pud_t *pud; 613 pud_t *pud;
583 614
584 next = (start + PGDIR_SIZE) & PGDIR_MASK; 615 next = (start & PGDIR_MASK) + PGDIR_SIZE;
585 if (next > end)
586 next = end;
587 616
588 if (pgd_val(*pgd)) { 617 if (pgd_val(*pgd)) {
589 pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); 618 pud = (pud_t *)pgd_page_vaddr(*pgd);
590 last_map_addr = phys_pud_init(pud, __pa(start), 619 last_map_addr = phys_pud_init(pud, __pa(start),
591 __pa(end), page_size_mask); 620 __pa(end), page_size_mask);
592 unmap_low_page(pud);
593 continue; 621 continue;
594 } 622 }
595 623
596 pud = alloc_low_page(&pud_phys); 624 pud = alloc_low_page();
597 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), 625 last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),
598 page_size_mask); 626 page_size_mask);
599 unmap_low_page(pud);
600 627
601 spin_lock(&init_mm.page_table_lock); 628 spin_lock(&init_mm.page_table_lock);
602 pgd_populate(&init_mm, pgd, __va(pud_phys)); 629 pgd_populate(&init_mm, pgd, pud);
603 spin_unlock(&init_mm.page_table_lock); 630 spin_unlock(&init_mm.page_table_lock);
604 pgd_changed = true; 631 pgd_changed = true;
605 } 632 }
@@ -664,13 +691,11 @@ int arch_add_memory(int nid, u64 start, u64 size)
664{ 691{
665 struct pglist_data *pgdat = NODE_DATA(nid); 692 struct pglist_data *pgdat = NODE_DATA(nid);
666 struct zone *zone = pgdat->node_zones + ZONE_NORMAL; 693 struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
667 unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; 694 unsigned long start_pfn = start >> PAGE_SHIFT;
668 unsigned long nr_pages = size >> PAGE_SHIFT; 695 unsigned long nr_pages = size >> PAGE_SHIFT;
669 int ret; 696 int ret;
670 697
671 last_mapped_pfn = init_memory_mapping(start, start + size); 698 init_memory_mapping(start, start + size);
672 if (last_mapped_pfn > max_pfn_mapped)
673 max_pfn_mapped = last_mapped_pfn;
674 699
675 ret = __add_pages(nid, zone, start_pfn, nr_pages); 700 ret = __add_pages(nid, zone, start_pfn, nr_pages);
676 WARN_ON_ONCE(ret); 701 WARN_ON_ONCE(ret);
@@ -686,6 +711,16 @@ EXPORT_SYMBOL_GPL(arch_add_memory);
686 711
687static struct kcore_list kcore_vsyscall; 712static struct kcore_list kcore_vsyscall;
688 713
714static void __init register_page_bootmem_info(void)
715{
716#ifdef CONFIG_NUMA
717 int i;
718
719 for_each_online_node(i)
720 register_page_bootmem_info_node(NODE_DATA(i));
721#endif
722}
723
689void __init mem_init(void) 724void __init mem_init(void)
690{ 725{
691 long codesize, reservedpages, datasize, initsize; 726 long codesize, reservedpages, datasize, initsize;
@@ -698,11 +733,8 @@ void __init mem_init(void)
698 reservedpages = 0; 733 reservedpages = 0;
699 734
700 /* this will put all low memory onto the freelists */ 735 /* this will put all low memory onto the freelists */
701#ifdef CONFIG_NUMA 736 register_page_bootmem_info();
702 totalram_pages = numa_free_all_bootmem();
703#else
704 totalram_pages = free_all_bootmem(); 737 totalram_pages = free_all_bootmem();
705#endif
706 738
707 absent_pages = absent_pages_in_range(0, max_pfn); 739 absent_pages = absent_pages_in_range(0, max_pfn);
708 reservedpages = max_pfn - totalram_pages - absent_pages; 740 reservedpages = max_pfn - totalram_pages - absent_pages;
@@ -772,12 +804,11 @@ void set_kernel_text_ro(void)
772void mark_rodata_ro(void) 804void mark_rodata_ro(void)
773{ 805{
774 unsigned long start = PFN_ALIGN(_text); 806 unsigned long start = PFN_ALIGN(_text);
775 unsigned long rodata_start = 807 unsigned long rodata_start = PFN_ALIGN(__start_rodata);
776 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
777 unsigned long end = (unsigned long) &__end_rodata_hpage_align; 808 unsigned long end = (unsigned long) &__end_rodata_hpage_align;
778 unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); 809 unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
779 unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); 810 unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
780 unsigned long data_start = (unsigned long) &_sdata; 811 unsigned long all_end = PFN_ALIGN(&_end);
781 812
782 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 813 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
783 (end - start) >> 10); 814 (end - start) >> 10);
@@ -786,10 +817,10 @@ void mark_rodata_ro(void)
786 kernel_set_to_readonly = 1; 817 kernel_set_to_readonly = 1;
787 818
788 /* 819 /*
789 * The rodata section (but not the kernel text!) should also be 820 * The rodata/data/bss/brk section (but not the kernel text!)
790 * not-executable. 821 * should also be not-executable.
791 */ 822 */
792 set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); 823 set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
793 824
794 rodata_test(); 825 rodata_test();
795 826
@@ -802,12 +833,12 @@ void mark_rodata_ro(void)
802#endif 833#endif
803 834
804 free_init_pages("unused kernel memory", 835 free_init_pages("unused kernel memory",
805 (unsigned long) page_address(virt_to_page(text_end)), 836 (unsigned long) __va(__pa_symbol(text_end)),
806 (unsigned long) 837 (unsigned long) __va(__pa_symbol(rodata_start)));
807 page_address(virt_to_page(rodata_start))); 838
808 free_init_pages("unused kernel memory", 839 free_init_pages("unused kernel memory",
809 (unsigned long) page_address(virt_to_page(rodata_end)), 840 (unsigned long) __va(__pa_symbol(rodata_end)),
810 (unsigned long) page_address(virt_to_page(data_start))); 841 (unsigned long) __va(__pa_symbol(_sdata)));
811} 842}
812 843
813#endif 844#endif
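One easy-to-miss detail in the init_64.c hunks: phys_pmd_init() and phys_pud_init() now mask the address with PMD_MASK/PUD_MASK before deriving the pfn for a large-page entry, so a start address that lands inside a 2M or 1G page still yields an entry aligned to that page size. The arithmetic, restated as a tiny stand-alone C example with the constants redefined locally:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21
#define PMD_MASK	(~((1ULL << PMD_SHIFT) - 1))

int main(void)
{
	unsigned long long address = 0x40001000ULL;	/* inside a 2M page, not 2M aligned */

	/* old code: pfn taken from the raw address */
	printf("unmasked pfn: %#llx\n", address >> PAGE_SHIFT);			/* 0x40001 */
	/* new code: pfn truncated to the 2M boundary that the PSE entry maps */
	printf("masked pfn:   %#llx\n", (address & PMD_MASK) >> PAGE_SHIFT);	/* 0x40000 */

	return 0;
}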
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
new file mode 100644
index 000000000000..6b563a118891
--- /dev/null
+++ b/arch/x86/mm/mm_internal.h
@@ -0,0 +1,19 @@
1#ifndef __X86_MM_INTERNAL_H
2#define __X86_MM_INTERNAL_H
3
4void *alloc_low_pages(unsigned int num);
5static inline void *alloc_low_page(void)
6{
7 return alloc_low_pages(1);
8}
9
10void early_ioremap_page_table_range_init(void);
11
12unsigned long kernel_physical_mapping_init(unsigned long start,
13 unsigned long end,
14 unsigned long page_size_mask);
15void zone_sizes_init(void);
16
17extern int after_bootmem;
18
19#endif /* __X86_MM_INTERNAL_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 2d125be1bae9..8504f3698753 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -193,7 +193,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
193static void __init setup_node_data(int nid, u64 start, u64 end) 193static void __init setup_node_data(int nid, u64 start, u64 end)
194{ 194{
195 const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); 195 const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
196 bool remapped = false;
197 u64 nd_pa; 196 u64 nd_pa;
198 void *nd; 197 void *nd;
199 int tnid; 198 int tnid;
@@ -205,37 +204,28 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
205 if (end && (end - start) < NODE_MIN_SIZE) 204 if (end && (end - start) < NODE_MIN_SIZE)
206 return; 205 return;
207 206
208 /* initialize remap allocator before aligning to ZONE_ALIGN */
209 init_alloc_remap(nid, start, end);
210
211 start = roundup(start, ZONE_ALIGN); 207 start = roundup(start, ZONE_ALIGN);
212 208
213 printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", 209 printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
214 nid, start, end - 1); 210 nid, start, end - 1);
215 211
216 /* 212 /*
217 * Allocate node data. Try remap allocator first, node-local 213 * Allocate node data. Try node-local memory and then any node.
218 * memory and then any node. Never allocate in DMA zone. 214 * Never allocate in DMA zone.
219 */ 215 */
220 nd = alloc_remap(nid, nd_size); 216 nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
221 if (nd) { 217 if (!nd_pa) {
222 nd_pa = __pa(nd); 218 pr_err("Cannot find %zu bytes in node %d\n",
223 remapped = true; 219 nd_size, nid);
224 } else { 220 return;
225 nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
226 if (!nd_pa) {
227 pr_err("Cannot find %zu bytes in node %d\n",
228 nd_size, nid);
229 return;
230 }
231 nd = __va(nd_pa);
232 } 221 }
222 nd = __va(nd_pa);
233 223
234 /* report and initialize */ 224 /* report and initialize */
235 printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]%s\n", 225 printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n",
236 nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : ""); 226 nd_pa, nd_pa + nd_size - 1);
237 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); 227 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
238 if (!remapped && tnid != nid) 228 if (tnid != nid)
239 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); 229 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
240 230
241 node_data[nid] = nd; 231 node_data[nid] = nd;
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 534255a36b6b..73a6d7395bd3 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -73,167 +73,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
73 73
74extern unsigned long highend_pfn, highstart_pfn; 74extern unsigned long highend_pfn, highstart_pfn;
75 75
76#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
77
78static void *node_remap_start_vaddr[MAX_NUMNODES];
79void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
80
81/*
82 * Remap memory allocator
83 */
84static unsigned long node_remap_start_pfn[MAX_NUMNODES];
85static void *node_remap_end_vaddr[MAX_NUMNODES];
86static void *node_remap_alloc_vaddr[MAX_NUMNODES];
87
88/**
89 * alloc_remap - Allocate remapped memory
90 * @nid: NUMA node to allocate memory from
91 * @size: The size of allocation
92 *
93 * Allocate @size bytes from the remap area of NUMA node @nid. The
94 * size of the remap area is predetermined by init_alloc_remap() and
95 * only the callers considered there should call this function. For
96 * more info, please read the comment on top of init_alloc_remap().
97 *
98 * The caller must be ready to handle allocation failure from this
99 * function and fall back to regular memory allocator in such cases.
100 *
101 * CONTEXT:
102 * Single CPU early boot context.
103 *
104 * RETURNS:
105 * Pointer to the allocated memory on success, %NULL on failure.
106 */
107void *alloc_remap(int nid, unsigned long size)
108{
109 void *allocation = node_remap_alloc_vaddr[nid];
110
111 size = ALIGN(size, L1_CACHE_BYTES);
112
113 if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
114 return NULL;
115
116 node_remap_alloc_vaddr[nid] += size;
117 memset(allocation, 0, size);
118
119 return allocation;
120}
121
122#ifdef CONFIG_HIBERNATION
123/**
124 * resume_map_numa_kva - add KVA mapping to the temporary page tables created
125 * during resume from hibernation
126 * @pgd_base - temporary resume page directory
127 */
128void resume_map_numa_kva(pgd_t *pgd_base)
129{
130 int node;
131
132 for_each_online_node(node) {
133 unsigned long start_va, start_pfn, nr_pages, pfn;
134
135 start_va = (unsigned long)node_remap_start_vaddr[node];
136 start_pfn = node_remap_start_pfn[node];
137 nr_pages = (node_remap_end_vaddr[node] -
138 node_remap_start_vaddr[node]) >> PAGE_SHIFT;
139
140 printk(KERN_DEBUG "%s: node %d\n", __func__, node);
141
142 for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
143 unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
144 pgd_t *pgd = pgd_base + pgd_index(vaddr);
145 pud_t *pud = pud_offset(pgd, vaddr);
146 pmd_t *pmd = pmd_offset(pud, vaddr);
147
148 set_pmd(pmd, pfn_pmd(start_pfn + pfn,
149 PAGE_KERNEL_LARGE_EXEC));
150
151 printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
152 __func__, vaddr, start_pfn + pfn);
153 }
154 }
155}
156#endif
157
158/**
159 * init_alloc_remap - Initialize remap allocator for a NUMA node
160 * @nid: NUMA node to initizlie remap allocator for
161 *
162 * NUMA nodes may end up without any lowmem. As allocating pgdat and
163 * memmap on a different node with lowmem is inefficient, a special
164 * remap allocator is implemented which can be used by alloc_remap().
165 *
166 * For each node, the amount of memory which will be necessary for
167 * pgdat and memmap is calculated and two memory areas of the size are
168 * allocated - one in the node and the other in lowmem; then, the area
169 * in the node is remapped to the lowmem area.
170 *
171 * As pgdat and memmap must be allocated in lowmem anyway, this
172 * doesn't waste lowmem address space; however, the actual lowmem
173 * which gets remapped over is wasted. The amount shouldn't be
174 * problematic on machines this feature will be used.
175 *
176 * Initialization failure isn't fatal. alloc_remap() is used
177 * opportunistically and the callers will fall back to other memory
178 * allocation mechanisms on failure.
179 */
180void __init init_alloc_remap(int nid, u64 start, u64 end)
181{
182 unsigned long start_pfn = start >> PAGE_SHIFT;
183 unsigned long end_pfn = end >> PAGE_SHIFT;
184 unsigned long size, pfn;
185 u64 node_pa, remap_pa;
186 void *remap_va;
187
188 /*
189 * The acpi/srat node info can show hot-add memroy zones where
190 * memory could be added but not currently present.
191 */
192 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
193 nid, start_pfn, end_pfn);
194
195 /* calculate the necessary space aligned to large page size */
196 size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
197 size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
198 size = ALIGN(size, LARGE_PAGE_BYTES);
199
200 /* allocate node memory and the lowmem remap area */
201 node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
202 if (!node_pa) {
203 pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
204 size, nid);
205 return;
206 }
207 memblock_reserve(node_pa, size);
208
209 remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
210 max_low_pfn << PAGE_SHIFT,
211 size, LARGE_PAGE_BYTES);
212 if (!remap_pa) {
213 pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
214 size, nid);
215 memblock_free(node_pa, size);
216 return;
217 }
218 memblock_reserve(remap_pa, size);
219 remap_va = phys_to_virt(remap_pa);
220
221 /* perform actual remap */
222 for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
223 set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
224 (node_pa >> PAGE_SHIFT) + pfn,
225 PAGE_KERNEL_LARGE);
226
227 /* initialize remap allocator parameters */
228 node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
229 node_remap_start_vaddr[nid] = remap_va;
230 node_remap_end_vaddr[nid] = remap_va + size;
231 node_remap_alloc_vaddr[nid] = remap_va;
232
233 printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
234 nid, node_pa, node_pa + size, remap_va, remap_va + size);
235}
236
237void __init initmem_init(void) 76void __init initmem_init(void)
238{ 77{
239 x86_numa_init(); 78 x86_numa_init();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 92e27119ee1a..9405ffc91502 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -10,16 +10,3 @@ void __init initmem_init(void)
10{ 10{
11 x86_numa_init(); 11 x86_numa_init();
12} 12}
13
14unsigned long __init numa_free_all_bootmem(void)
15{
16 unsigned long pages = 0;
17 int i;
18
19 for_each_online_node(i)
20 pages += free_all_bootmem_node(NODE_DATA(i));
21
22 pages += free_low_memory_core_early(MAX_NUMNODES);
23
24 return pages;
25}
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index 7178c3afe05e..ad86ec91e640 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -21,12 +21,6 @@ void __init numa_reset_distance(void);
21 21
22void __init x86_numa_init(void); 22void __init x86_numa_init(void);
23 23
24#ifdef CONFIG_X86_64
25static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
26#else
27void __init init_alloc_remap(int nid, u64 start, u64 end);
28#endif
29
30#ifdef CONFIG_NUMA_EMU 24#ifdef CONFIG_NUMA_EMU
31void __init numa_emulation(struct numa_meminfo *numa_meminfo, 25void __init numa_emulation(struct numa_meminfo *numa_meminfo,
32 int numa_dist_cnt); 26 int numa_dist_cnt);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a718e0d23503..a1b1c88f9caf 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -94,12 +94,12 @@ static inline void split_page_count(int level) { }
94 94
95static inline unsigned long highmap_start_pfn(void) 95static inline unsigned long highmap_start_pfn(void)
96{ 96{
97 return __pa(_text) >> PAGE_SHIFT; 97 return __pa_symbol(_text) >> PAGE_SHIFT;
98} 98}
99 99
100static inline unsigned long highmap_end_pfn(void) 100static inline unsigned long highmap_end_pfn(void)
101{ 101{
102 return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; 102 return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
103} 103}
104 104
105#endif 105#endif
@@ -276,8 +276,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
276 * The .rodata section needs to be read-only. Using the pfn 276 * The .rodata section needs to be read-only. Using the pfn
277 * catches all aliases. 277 * catches all aliases.
278 */ 278 */
279 if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, 279 if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
280 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) 280 __pa_symbol(__end_rodata) >> PAGE_SHIFT))
281 pgprot_val(forbidden) |= _PAGE_RW; 281 pgprot_val(forbidden) |= _PAGE_RW;
282 282
283#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) 283#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
@@ -364,6 +364,37 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
364EXPORT_SYMBOL_GPL(lookup_address); 364EXPORT_SYMBOL_GPL(lookup_address);
365 365
366/* 366/*
367 * This is necessary because __pa() does not work on some
368 * kinds of memory, like vmalloc() or the alloc_remap()
369 * areas on 32-bit NUMA systems. The percpu areas can
370 * end up in this kind of memory, for instance.
371 *
372 * This could be optimized, but it is only intended to be
 373 * used at initialization time, and keeping it
374 * unoptimized should increase the testing coverage for
375 * the more obscure platforms.
376 */
377phys_addr_t slow_virt_to_phys(void *__virt_addr)
378{
379 unsigned long virt_addr = (unsigned long)__virt_addr;
380 phys_addr_t phys_addr;
381 unsigned long offset;
382 enum pg_level level;
383 unsigned long psize;
384 unsigned long pmask;
385 pte_t *pte;
386
387 pte = lookup_address(virt_addr, &level);
388 BUG_ON(!pte);
389 psize = page_level_size(level);
390 pmask = page_level_mask(level);
391 offset = virt_addr & ~pmask;
392 phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
393 return (phys_addr | offset);
394}
395EXPORT_SYMBOL_GPL(slow_virt_to_phys);
396
397/*
367 * Set the new pmd in all the pgds we know about: 398 * Set the new pmd in all the pgds we know about:
368 */ 399 */
369static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 400static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
@@ -396,7 +427,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
396 pte_t new_pte, old_pte, *tmp; 427 pte_t new_pte, old_pte, *tmp;
397 pgprot_t old_prot, new_prot, req_prot; 428 pgprot_t old_prot, new_prot, req_prot;
398 int i, do_split = 1; 429 int i, do_split = 1;
399 unsigned int level; 430 enum pg_level level;
400 431
401 if (cpa->force_split) 432 if (cpa->force_split)
402 return 1; 433 return 1;
@@ -412,15 +443,12 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
412 443
413 switch (level) { 444 switch (level) {
414 case PG_LEVEL_2M: 445 case PG_LEVEL_2M:
415 psize = PMD_PAGE_SIZE;
416 pmask = PMD_PAGE_MASK;
417 break;
418#ifdef CONFIG_X86_64 446#ifdef CONFIG_X86_64
419 case PG_LEVEL_1G: 447 case PG_LEVEL_1G:
420 psize = PUD_PAGE_SIZE;
421 pmask = PUD_PAGE_MASK;
422 break;
423#endif 448#endif
449 psize = page_level_size(level);
450 pmask = page_level_mask(level);
451 break;
424 default: 452 default:
425 do_split = -EINVAL; 453 do_split = -EINVAL;
426 goto out_unlock; 454 goto out_unlock;
@@ -551,16 +579,10 @@ static int split_large_page(pte_t *kpte, unsigned long address)
551 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 579 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
552 set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); 580 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
553 581
554 if (address >= (unsigned long)__va(0) && 582 if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
555 address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT)) 583 PFN_DOWN(__pa(address)) + 1))
556 split_page_count(level); 584 split_page_count(level);
557 585
558#ifdef CONFIG_X86_64
559 if (address >= (unsigned long)__va(1UL<<32) &&
560 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
561 split_page_count(level);
562#endif
563
564 /* 586 /*
565 * Install the new, split up pagetable. 587 * Install the new, split up pagetable.
566 * 588 *
@@ -729,13 +751,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
729 unsigned long vaddr; 751 unsigned long vaddr;
730 int ret; 752 int ret;
731 753
732 if (cpa->pfn >= max_pfn_mapped) 754 if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
733 return 0; 755 return 0;
734 756
735#ifdef CONFIG_X86_64
736 if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
737 return 0;
738#endif
739 /* 757 /*
740 * No need to redo, when the primary call touched the direct 758 * No need to redo, when the primary call touched the direct
741 * mapping already: 759 * mapping already:
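slow_virt_to_phys(), added above, resolves a physical address by walking the page tables with lookup_address() and then combining the entry's pfn with the offset inside whatever page size that level maps. For a 2M mapping the arithmetic looks like this (all values made up for illustration):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21

int main(void)
{
	unsigned long long virt    = 0xffff880000212345ULL;	 /* made-up direct-map address */
	unsigned long long pte_pfn = 0x100000ULL;		 /* pfn found in the 2M pmd entry */
	unsigned long long pmask   = ~((1ULL << PMD_SHIFT) - 1); /* page_level_mask(PG_LEVEL_2M) */
	unsigned long long offset  = virt & ~pmask;		 /* offset inside the 2M page */
	unsigned long long phys    = (pte_pfn << PAGE_SHIFT) | offset;

	printf("offset in 2M page: %#llx\n", offset);	/* 0x12345 */
	printf("physical address:  %#llx\n", phys);	/* 0x100012345 */

	return 0;
}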
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 0eb572eda406..2610bd93c896 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -560,10 +560,10 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
560{ 560{
561 unsigned long id_sz; 561 unsigned long id_sz;
562 562
563 if (base >= __pa(high_memory)) 563 if (base > __pa(high_memory-1))
564 return 0; 564 return 0;
565 565
566 id_sz = (__pa(high_memory) < base + size) ? 566 id_sz = (__pa(high_memory-1) <= base + size) ?
567 __pa(high_memory) - base : 567 __pa(high_memory) - base :
568 size; 568 size;
569 569
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e27fbf887f3b..193350b51f90 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -334,7 +334,12 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
334 if (changed && dirty) { 334 if (changed && dirty) {
335 *pmdp = entry; 335 *pmdp = entry;
336 pmd_update_defer(vma->vm_mm, address, pmdp); 336 pmd_update_defer(vma->vm_mm, address, pmdp);
337 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 337 /*
338 * We had a write-protection fault here and changed the pmd
 339 * to be more permissive. No need to flush the TLB for that,
340 * #PF is architecturally guaranteed to do that and in the
341 * worst-case we'll generate a spurious fault.
342 */
338 } 343 }
339 344
340 return changed; 345 return changed;
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index d2e2735327b4..e666cbbb9261 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -1,3 +1,4 @@
1#include <linux/bootmem.h>
1#include <linux/mmdebug.h> 2#include <linux/mmdebug.h>
2#include <linux/module.h> 3#include <linux/module.h>
3#include <linux/mm.h> 4#include <linux/mm.h>
@@ -8,33 +9,54 @@
8 9
9#ifdef CONFIG_X86_64 10#ifdef CONFIG_X86_64
10 11
12#ifdef CONFIG_DEBUG_VIRTUAL
11unsigned long __phys_addr(unsigned long x) 13unsigned long __phys_addr(unsigned long x)
12{ 14{
13 if (x >= __START_KERNEL_map) { 15 unsigned long y = x - __START_KERNEL_map;
14 x -= __START_KERNEL_map; 16
15 VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); 17 /* use the carry flag to determine if x was < __START_KERNEL_map */
16 x += phys_base; 18 if (unlikely(x > y)) {
19 x = y + phys_base;
20
21 VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
17 } else { 22 } else {
18 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 23 x = y + (__START_KERNEL_map - PAGE_OFFSET);
19 x -= PAGE_OFFSET; 24
20 VIRTUAL_BUG_ON(!phys_addr_valid(x)); 25 /* carry flag will be set if starting x was >= PAGE_OFFSET */
26 VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
21 } 27 }
28
22 return x; 29 return x;
23} 30}
24EXPORT_SYMBOL(__phys_addr); 31EXPORT_SYMBOL(__phys_addr);
25 32
33unsigned long __phys_addr_symbol(unsigned long x)
34{
35 unsigned long y = x - __START_KERNEL_map;
36
37 /* only check upper bounds since lower bounds will trigger carry */
38 VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
39
40 return y + phys_base;
41}
42EXPORT_SYMBOL(__phys_addr_symbol);
43#endif
44
26bool __virt_addr_valid(unsigned long x) 45bool __virt_addr_valid(unsigned long x)
27{ 46{
28 if (x >= __START_KERNEL_map) { 47 unsigned long y = x - __START_KERNEL_map;
29 x -= __START_KERNEL_map; 48
30 if (x >= KERNEL_IMAGE_SIZE) 49 /* use the carry flag to determine if x was < __START_KERNEL_map */
50 if (unlikely(x > y)) {
51 x = y + phys_base;
52
53 if (y >= KERNEL_IMAGE_SIZE)
31 return false; 54 return false;
32 x += phys_base;
33 } else { 55 } else {
34 if (x < PAGE_OFFSET) 56 x = y + (__START_KERNEL_map - PAGE_OFFSET);
35 return false; 57
36 x -= PAGE_OFFSET; 58 /* carry flag will be set if starting x was >= PAGE_OFFSET */
37 if (!phys_addr_valid(x)) 59 if ((x > y) || !phys_addr_valid(x))
38 return false; 60 return false;
39 } 61 }
40 62
@@ -47,10 +69,16 @@ EXPORT_SYMBOL(__virt_addr_valid);
47#ifdef CONFIG_DEBUG_VIRTUAL 69#ifdef CONFIG_DEBUG_VIRTUAL
48unsigned long __phys_addr(unsigned long x) 70unsigned long __phys_addr(unsigned long x)
49{ 71{
72 unsigned long phys_addr = x - PAGE_OFFSET;
50 /* VMALLOC_* aren't constants */ 73 /* VMALLOC_* aren't constants */
51 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 74 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
52 VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); 75 VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
53 return x - PAGE_OFFSET; 76 /* max_low_pfn is set early, but not _that_ early */
77 if (max_low_pfn) {
78 VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
79 BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
80 }
81 return phys_addr;
54} 82}
55EXPORT_SYMBOL(__phys_addr); 83EXPORT_SYMBOL(__phys_addr);
56#endif 84#endif
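The rewritten __phys_addr()/__virt_addr_valid() rely on unsigned wrap-around: after y = x - __START_KERNEL_map, the comparison x > y holds exactly when the subtraction did not wrap, i.e. when x sits in the kernel text mapping rather than the direct mapping. A tiny demonstration with the classic (pre-KASLR) x86-64 constant redefined locally:

#include <stdio.h>

#define START_KERNEL_MAP	0xffffffff80000000ULL	/* __START_KERNEL_map */

static const char *classify(unsigned long long x)
{
	unsigned long long y = x - START_KERNEL_MAP;

	/* no wrap (x > y) means x was >= __START_KERNEL_map */
	return (x > y) ? "kernel text mapping" : "direct mapping";
}

int main(void)
{
	printf("%#llx -> %s\n", 0xffffffff81000000ULL,
	       classify(0xffffffff81000000ULL));
	printf("%#llx -> %s\n", 0xffff880012345000ULL,
	       classify(0xffff880012345000ULL));

	return 0;
}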
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 928bf837040a..70b2a3a305d6 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -416,8 +416,8 @@ void __init efi_reserve_boot_services(void)
416 * - Not within any part of the kernel 416 * - Not within any part of the kernel
417 * - Not the bios reserved area 417 * - Not the bios reserved area
418 */ 418 */
419 if ((start+size >= virt_to_phys(_text) 419 if ((start+size >= __pa_symbol(_text)
420 && start <= virt_to_phys(_end)) || 420 && start <= __pa_symbol(_end)) ||
421 !e820_all_mapped(start, start+size, E820_RAM) || 421 !e820_all_mapped(start, start+size, E820_RAM) ||
422 memblock_is_region_reserved(start, size)) { 422 memblock_is_region_reserved(start, size)) {
423 /* Could not reserve, skip it */ 423 /* Could not reserve, skip it */
@@ -843,7 +843,7 @@ void __init efi_enter_virtual_mode(void)
 	efi_memory_desc_t *md, *prev_md = NULL;
 	efi_status_t status;
 	unsigned long size;
-	u64 end, systab, end_pfn;
+	u64 end, systab, start_pfn, end_pfn;
 	void *p, *va, *new_memmap = NULL;
 	int count = 0;
 
@@ -896,10 +896,9 @@ void __init efi_enter_virtual_mode(void)
 		size = md->num_pages << EFI_PAGE_SHIFT;
 		end = md->phys_addr + size;
 
+		start_pfn = PFN_DOWN(md->phys_addr);
 		end_pfn = PFN_UP(end);
-		if (end_pfn <= max_low_pfn_mapped
-		    || (end_pfn > (1UL << (32 - PAGE_SHIFT))
-			&& end_pfn <= max_pfn_mapped)) {
+		if (pfn_range_is_mapped(start_pfn, end_pfn)) {
 			va = __va(md->phys_addr);
 
 			if (!(md->attribute & EFI_MEMORY_WB))
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index 74202c1910cd..7d28c885d238 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -129,8 +129,6 @@ static int resume_physical_mapping_init(pgd_t *pgd_base)
 		}
 	}
 
-	resume_map_numa_kva(pgd_base);
-
 	return 0;
 }
 
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 460f314d13e5..a0fde91c16cf 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -11,6 +11,8 @@
 #include <linux/gfp.h>
 #include <linux/smp.h>
 #include <linux/suspend.h>
+
+#include <asm/init.h>
 #include <asm/proto.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -39,41 +41,21 @@ pgd_t *temp_level4_pgt;
 
 void *relocated_restore_code;
 
-static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+static void *alloc_pgt_page(void *context)
 {
-	long i, j;
-
-	i = pud_index(address);
-	pud = pud + i;
-	for (; i < PTRS_PER_PUD; pud++, i++) {
-		unsigned long paddr;
-		pmd_t *pmd;
-
-		paddr = address + i*PUD_SIZE;
-		if (paddr >= end)
-			break;
-
-		pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
-		if (!pmd)
-			return -ENOMEM;
-		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
-		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
-			unsigned long pe;
-
-			if (paddr >= end)
-				break;
-			pe = __PAGE_KERNEL_LARGE_EXEC | paddr;
-			pe &= __supported_pte_mask;
-			set_pmd(pmd, __pmd(pe));
-		}
-	}
-	return 0;
+	return (void *)get_safe_page(GFP_ATOMIC);
 }
 
 static int set_up_temporary_mappings(void)
 {
-	unsigned long start, end, next;
-	int error;
+	struct x86_mapping_info info = {
+		.alloc_pgt_page = alloc_pgt_page,
+		.pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
+		.kernel_mapping = true,
+	};
+	unsigned long mstart, mend;
+	int result;
+	int i;
 
 	temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
 	if (!temp_level4_pgt)
@@ -84,21 +66,17 @@ static int set_up_temporary_mappings(void)
 				init_level4_pgt[pgd_index(__START_KERNEL_map)]);
 
 	/* Set up the direct mapping from scratch */
-	start = (unsigned long)pfn_to_kaddr(0);
-	end = (unsigned long)pfn_to_kaddr(max_pfn);
-
-	for (; start < end; start = next) {
-		pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
-		if (!pud)
-			return -ENOMEM;
-		next = start + PGDIR_SIZE;
-		if (next > end)
-			next = end;
-		if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
-			return error;
-		set_pgd(temp_level4_pgt + pgd_index(start),
-			mk_kernel_pgd(__pa(pud)));
+	for (i = 0; i < nr_pfn_mapped; i++) {
+		mstart = pfn_mapped[i].start << PAGE_SHIFT;
+		mend = pfn_mapped[i].end << PAGE_SHIFT;
+
+		result = kernel_ident_mapping_init(&info, temp_level4_pgt,
+						   mstart, mend);
+
+		if (result)
+			return result;
 	}
+
 	return 0;
 }
 
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index cbca565af5bd..a44f457e70a1 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -8,9 +8,26 @@
 struct real_mode_header *real_mode_header;
 u32 *trampoline_cr4_features;
 
-void __init setup_real_mode(void)
+void __init reserve_real_mode(void)
 {
 	phys_addr_t mem;
+	unsigned char *base;
+	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
+
+	/* Has to be under 1M so we can execute real-mode AP code. */
+	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
+	if (!mem)
+		panic("Cannot allocate trampoline\n");
+
+	base = __va(mem);
+	memblock_reserve(mem, size);
+	real_mode_header = (struct real_mode_header *) base;
+	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
+	       base, (unsigned long long)mem, size);
+}
+
+void __init setup_real_mode(void)
+{
 	u16 real_mode_seg;
 	u32 *rel;
 	u32 count;
@@ -25,16 +42,7 @@ void __init setup_real_mode(void)
 	u64 efer;
 #endif
 
-	/* Has to be in very low memory so we can execute real-mode AP code. */
-	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
-	if (!mem)
-		panic("Cannot allocate trampoline\n");
-
-	base = __va(mem);
-	memblock_reserve(mem, size);
-	real_mode_header = (struct real_mode_header *) base;
-	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
-	       base, (unsigned long long)mem, size);
+	base = (unsigned char *)real_mode_header;
 
 	memcpy(base, real_mode_blob, size);
 
@@ -62,9 +70,9 @@ void __init setup_real_mode(void)
 		__va(real_mode_header->trampoline_header);
 
 #ifdef CONFIG_X86_32
-	trampoline_header->start = __pa(startup_32_smp);
+	trampoline_header->start = __pa_symbol(startup_32_smp);
 	trampoline_header->gdt_limit = __BOOT_DS + 7;
-	trampoline_header->gdt_base = __pa(boot_gdt);
+	trampoline_header->gdt_base = __pa_symbol(boot_gdt);
 #else
 	/*
 	 * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR
@@ -78,16 +86,18 @@ void __init setup_real_mode(void)
 	*trampoline_cr4_features = read_cr4();
 
 	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
-	trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE;
-	trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE;
+	trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
+	trampoline_pgd[511] = init_level4_pgt[511].pgd;
 #endif
 }
 
 /*
- * set_real_mode_permissions() gets called very early, to guarantee the
+ * reserve_real_mode() gets called very early, to guarantee the
  * availability of low memory. This is before the proper kernel page
  * tables are set up, so we cannot set page permissions in that
- * function. Thus, we use an arch_initcall instead.
+ * function. Also trampoline code will be executed by APs so we
+ * need to mark it executable at do_pre_smp_initcalls() at least,
+ * thus run it as a early_initcall().
  */
 static int __init set_real_mode_permissions(void)
 {
@@ -111,5 +121,4 @@ static int __init set_real_mode_permissions(void)
 
 	return 0;
 }
-
-arch_initcall(set_real_mode_permissions);
+early_initcall(set_real_mode_permissions);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 01de35c77221..f5e86eee4e0e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1178,20 +1178,6 @@ static void xen_exit_mmap(struct mm_struct *mm)
 
 static void xen_post_allocator_init(void);
 
-static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
-{
-	/* reserve the range used */
-	native_pagetable_reserve(start, end);
-
-	/* set as RW the rest */
-	printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
-			PFN_PHYS(pgt_buf_top));
-	while (end < PFN_PHYS(pgt_buf_top)) {
-		make_lowmem_page_readwrite(__va(end));
-		end += PAGE_SIZE;
-	}
-}
-
 #ifdef CONFIG_X86_64
 static void __init xen_cleanhighmap(unsigned long vaddr,
 				    unsigned long vaddr_end)
@@ -1503,19 +1489,6 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 #else /* CONFIG_X86_64 */
 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 {
-	unsigned long pfn = pte_pfn(pte);
-
-	/*
-	 * If the new pfn is within the range of the newly allocated
-	 * kernel pagetable, and it isn't being mapped into an
-	 * early_ioremap fixmap slot as a freshly allocated page, make sure
-	 * it is RO.
-	 */
-	if (((!is_early_ioremap_ptep(ptep) &&
-			pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
-			(is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
-		pte = pte_wrprotect(pte);
-
 	return pte;
 }
 #endif /* CONFIG_X86_64 */
@@ -2197,7 +2170,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
 
 void __init xen_init_mmu_ops(void)
 {
-	x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
 	x86_init.paging.pagetable_init = xen_pagetable_init;
 	pv_mmu_ops = xen_mmu_ops;
 