Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/acpi/boot.c            |   1
-rw-r--r--  arch/x86/kernel/acpi/sleep.c           |   2
-rw-r--r--  arch/x86/kernel/amd_gart_64.c          |   5
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c   |   1
-rw-r--r--  arch/x86/kernel/cpu/amd.c              |   9
-rw-r--r--  arch/x86/kernel/cpu/intel.c            |   3
-rw-r--r--  arch/x86/kernel/e820.c                 |  16
-rw-r--r--  arch/x86/kernel/ftrace.c               |   4
-rw-r--r--  arch/x86/kernel/head32.c               |  20
-rw-r--r--  arch/x86/kernel/head64.c               | 131
-rw-r--r--  arch/x86/kernel/head_64.S              | 210
-rw-r--r--  arch/x86/kernel/i386_ksyms_32.c        |   1
-rw-r--r--  arch/x86/kernel/kvm.c                  |  11
-rw-r--r--  arch/x86/kernel/kvmclock.c             |   4
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c     | 171
-rw-r--r--  arch/x86/kernel/setup.c                | 260
-rw-r--r--  arch/x86/kernel/traps.c                |   9
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c       |   3
-rw-r--r--  arch/x86/kernel/x86_init.c             |   4
19 files changed, 479 insertions, 386 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index bacf4b0d91f4..cfc755dc1607 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled);
51 51
52#ifdef CONFIG_X86_64 52#ifdef CONFIG_X86_64
53# include <asm/proto.h> 53# include <asm/proto.h>
54# include <asm/numa_64.h>
55#endif /* X86 */ 54#endif /* X86 */
56 55
57#define BAD_MADT_ENTRY(entry, end) ( \ 56#define BAD_MADT_ENTRY(entry, end) ( \
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index d5e0d717005a..0532f5d6e4ef 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -69,7 +69,7 @@ int acpi_suspend_lowlevel(void)
69 69
70#ifndef CONFIG_64BIT 70#ifndef CONFIG_64BIT
71 header->pmode_entry = (u32)&wakeup_pmode_return; 71 header->pmode_entry = (u32)&wakeup_pmode_return;
72 header->pmode_cr3 = (u32)__pa(&initial_page_table); 72 header->pmode_cr3 = (u32)__pa_symbol(initial_page_table);
73 saved_magic = 0x12345678; 73 saved_magic = 0x12345678;
74#else /* CONFIG_64BIT */ 74#else /* CONFIG_64BIT */
75#ifdef CONFIG_SMP 75#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index e66311200cbd..b574b295a2f9 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -768,10 +768,9 @@ int __init gart_iommu_init(void)
768 aper_base = info.aper_base; 768 aper_base = info.aper_base;
769 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); 769 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
770 770
771 if (end_pfn > max_low_pfn_mapped) { 771 start_pfn = PFN_DOWN(aper_base);
772 start_pfn = (aper_base>>PAGE_SHIFT); 772 if (!pfn_range_is_mapped(start_pfn, end_pfn))
773 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); 773 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
774 }
775 774
776 pr_info("PCI-DMA: using GART IOMMU.\n"); 775 pr_info("PCI-DMA: using GART IOMMU.\n");
777 iommu_size = check_iommu_size(info.aper_base, aper_size); 776 iommu_size = check_iommu_size(info.aper_base, aper_size);
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 9c2aa89a11cb..9a9110918ca7 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -28,6 +28,7 @@
28#include <asm/apic.h> 28#include <asm/apic.h>
29#include <asm/ipi.h> 29#include <asm/ipi.h>
30#include <asm/apic_flat_64.h> 30#include <asm/apic_flat_64.h>
31#include <asm/pgtable.h>
31 32
32static int numachip_system __read_mostly; 33static int numachip_system __read_mostly;
33 34
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 84bee67141ad..edd77e7508b3 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -12,7 +12,6 @@
12#include <asm/pci-direct.h> 12#include <asm/pci-direct.h>
13 13
14#ifdef CONFIG_X86_64 14#ifdef CONFIG_X86_64
15# include <asm/numa_64.h>
16# include <asm/mmconfig.h> 15# include <asm/mmconfig.h>
17# include <asm/cacheflush.h> 16# include <asm/cacheflush.h>
18#endif 17#endif
@@ -680,12 +679,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
680 * benefit in doing so. 679 * benefit in doing so.
681 */ 680 */
682 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { 681 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
682 unsigned long pfn = tseg >> PAGE_SHIFT;
683
683 printk(KERN_DEBUG "tseg: %010llx\n", tseg); 684 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
684 if ((tseg>>PMD_SHIFT) < 685 if (pfn_range_is_mapped(pfn, pfn + 1))
685 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
686 ((tseg>>PMD_SHIFT) <
687 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
688 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
689 set_memory_4k((unsigned long)__va(tseg), 1); 686 set_memory_4k((unsigned long)__va(tseg), 1);
690 } 687 }
691 } 688 }
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fcaabd0432c5..1905ce98bee0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -17,7 +17,6 @@
17 17
18#ifdef CONFIG_X86_64 18#ifdef CONFIG_X86_64
19#include <linux/topology.h> 19#include <linux/topology.h>
20#include <asm/numa_64.h>
21#endif 20#endif
22 21
23#include "cpu.h" 22#include "cpu.h"
@@ -168,7 +167,7 @@ int __cpuinit ppro_with_ram_bug(void)
168#ifdef CONFIG_X86_F00F_BUG 167#ifdef CONFIG_X86_F00F_BUG
169static void __cpuinit trap_init_f00f_bug(void) 168static void __cpuinit trap_init_f00f_bug(void)
170{ 169{
171 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); 170 __set_fixmap(FIX_F00F_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
172 171
173 /* 172 /*
174 * Update the IDT descriptor and reload the IDT so that 173 * Update the IDT descriptor and reload the IDT so that
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index df06ade26bef..d32abeabbda5 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -835,7 +835,7 @@ static int __init parse_memopt(char *p)
835} 835}
836early_param("mem", parse_memopt); 836early_param("mem", parse_memopt);
837 837
838static int __init parse_memmap_opt(char *p) 838static int __init parse_memmap_one(char *p)
839{ 839{
840 char *oldp; 840 char *oldp;
841 u64 start_at, mem_size; 841 u64 start_at, mem_size;
@@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p)
877 877
878 return *p == '\0' ? 0 : -EINVAL; 878 return *p == '\0' ? 0 : -EINVAL;
879} 879}
880static int __init parse_memmap_opt(char *str)
881{
882 while (str) {
883 char *k = strchr(str, ',');
884
885 if (k)
886 *k++ = 0;
887
888 parse_memmap_one(str);
889 str = k;
890 }
891
892 return 0;
893}
880early_param("memmap", parse_memmap_opt); 894early_param("memmap", parse_memmap_opt);
881 895
882void __init finish_e820_parsing(void) 896void __init finish_e820_parsing(void)
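
The reworked parse_memmap_opt() above only splits the argument on commas and hands each token to parse_memmap_one(), so a single memmap= option can now describe several regions. A minimal user-space sketch of that splitting loop (handle_one() is a hypothetical stand-in for parse_memmap_one(); the region string is invented):

/* memmap_split.c -- user-space sketch, not kernel code */
#include <stdio.h>
#include <string.h>

/* hypothetical stand-in for parse_memmap_one() */
static void handle_one(char *p)
{
	printf("region spec: %s\n", p);
}

static void parse_memmap(char *str)
{
	while (str) {
		char *k = strchr(str, ',');

		if (k)
			*k++ = 0;	/* terminate this token, remember the rest */
		handle_one(str);
		str = k;		/* NULL once no comma is left */
	}
}

int main(void)
{
	char arg[] = "64K$0xa0000,512M@1G";	/* invented example argument */

	parse_memmap(arg);
	return 0;
}
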
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 1d414029f1d8..42a392a9fd02 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -89,7 +89,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
89 * kernel identity mapping to modify code. 89 * kernel identity mapping to modify code.
90 */ 90 */
91 if (within(ip, (unsigned long)_text, (unsigned long)_etext)) 91 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
92 ip = (unsigned long)__va(__pa(ip)); 92 ip = (unsigned long)__va(__pa_symbol(ip));
93 93
94 return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); 94 return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
95} 95}
@@ -279,7 +279,7 @@ static int ftrace_write(unsigned long ip, const char *val, int size)
279 * kernel identity mapping to modify code. 279 * kernel identity mapping to modify code.
280 */ 280 */
281 if (within(ip, (unsigned long)_text, (unsigned long)_etext)) 281 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
282 ip = (unsigned long)__va(__pa(ip)); 282 ip = (unsigned long)__va(__pa_symbol(ip));
283 283
284 return probe_kernel_write((void *)ip, val, size); 284 return probe_kernel_write((void *)ip, val, size);
285} 285}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 6773c918b8cc..138463a24877 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -33,20 +33,6 @@ void __init i386_start_kernel(void)
33{ 33{
34 sanitize_boot_params(&boot_params); 34 sanitize_boot_params(&boot_params);
35 35
36 memblock_reserve(__pa_symbol(&_text),
37 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
38
39#ifdef CONFIG_BLK_DEV_INITRD
40 /* Reserve INITRD */
41 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
42 /* Assume only end is not page aligned */
43 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
44 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
45 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
46 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
47 }
48#endif
49
50 /* Call the subarch specific early setup function */ 36 /* Call the subarch specific early setup function */
51 switch (boot_params.hdr.hardware_subarch) { 37 switch (boot_params.hdr.hardware_subarch) {
52 case X86_SUBARCH_MRST: 38 case X86_SUBARCH_MRST:
@@ -60,11 +46,5 @@ void __init i386_start_kernel(void)
60 break; 46 break;
61 } 47 }
62 48
63 /*
64 * At this point everything still needed from the boot loader
65 * or BIOS or kernel text should be early reserved or marked not
66 * RAM in e820. All other memory is free game.
67 */
68
69 start_kernel(); 49 start_kernel();
70} 50}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 849fc9e63c2f..57334f4cd3af 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,11 +27,81 @@
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/bootparam_utils.h> 28#include <asm/bootparam_utils.h>
29 29
30static void __init zap_identity_mappings(void) 30/*
31 * Manage page tables very early on.
32 */
33extern pgd_t early_level4_pgt[PTRS_PER_PGD];
34extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
35static unsigned int __initdata next_early_pgt = 2;
36
37/* Wipe all early page tables except for the kernel symbol map */
38static void __init reset_early_page_tables(void)
39{
40 unsigned long i;
41
42 for (i = 0; i < PTRS_PER_PGD-1; i++)
43 early_level4_pgt[i].pgd = 0;
44
45 next_early_pgt = 0;
46
47 write_cr3(__pa(early_level4_pgt));
48}
49
50/* Create a new PMD entry */
51int __init early_make_pgtable(unsigned long address)
31{ 52{
32 pgd_t *pgd = pgd_offset_k(0UL); 53 unsigned long physaddr = address - __PAGE_OFFSET;
33 pgd_clear(pgd); 54 unsigned long i;
34 __flush_tlb_all(); 55 pgdval_t pgd, *pgd_p;
56 pudval_t pud, *pud_p;
57 pmdval_t pmd, *pmd_p;
58
59 /* Invalid address or early pgt is done ? */
60 if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
61 return -1;
62
63again:
64 pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
65 pgd = *pgd_p;
66
67 /*
68 * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
69 * critical -- __PAGE_OFFSET would point us back into the dynamic
70 * range and we might end up looping forever...
71 */
72 if (pgd)
73 pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
74 else {
75 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
76 reset_early_page_tables();
77 goto again;
78 }
79
80 pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
81 for (i = 0; i < PTRS_PER_PUD; i++)
82 pud_p[i] = 0;
83 *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
84 }
85 pud_p += pud_index(address);
86 pud = *pud_p;
87
88 if (pud)
89 pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
90 else {
91 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
92 reset_early_page_tables();
93 goto again;
94 }
95
96 pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
97 for (i = 0; i < PTRS_PER_PMD; i++)
98 pmd_p[i] = 0;
99 *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
100 }
101 pmd = (physaddr & PMD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
102 pmd_p[pmd_index(address)] = pmd;
103
104 return 0;
35} 105}
36 106
37/* Don't add a printk in there. printk relies on the PDA which is not initialized 107/* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -42,14 +112,25 @@ static void __init clear_bss(void)
42 (unsigned long) __bss_stop - (unsigned long) __bss_start); 112 (unsigned long) __bss_stop - (unsigned long) __bss_start);
43} 113}
44 114
115static unsigned long get_cmd_line_ptr(void)
116{
117 unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
118
119 cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32;
120
121 return cmd_line_ptr;
122}
123
45static void __init copy_bootdata(char *real_mode_data) 124static void __init copy_bootdata(char *real_mode_data)
46{ 125{
47 char * command_line; 126 char * command_line;
127 unsigned long cmd_line_ptr;
48 128
49 memcpy(&boot_params, real_mode_data, sizeof boot_params); 129 memcpy(&boot_params, real_mode_data, sizeof boot_params);
50 sanitize_boot_params(&boot_params); 130 sanitize_boot_params(&boot_params);
51 if (boot_params.hdr.cmd_line_ptr) { 131 cmd_line_ptr = get_cmd_line_ptr();
52 command_line = __va(boot_params.hdr.cmd_line_ptr); 132 if (cmd_line_ptr) {
133 command_line = __va(cmd_line_ptr);
53 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); 134 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
54 } 135 }
55} 136}
@@ -72,14 +153,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
72 (__START_KERNEL & PGDIR_MASK))); 153 (__START_KERNEL & PGDIR_MASK)));
73 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); 154 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
74 155
156 /* Kill off the identity-map trampoline */
157 reset_early_page_tables();
158
75 /* clear bss before set_intr_gate with early_idt_handler */ 159 /* clear bss before set_intr_gate with early_idt_handler */
76 clear_bss(); 160 clear_bss();
77 161
78 /* Make NULL pointers segfault */
79 zap_identity_mappings();
80
81 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
82
83 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { 162 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
84#ifdef CONFIG_EARLY_PRINTK 163#ifdef CONFIG_EARLY_PRINTK
85 set_intr_gate(i, &early_idt_handlers[i]); 164 set_intr_gate(i, &early_idt_handlers[i]);
@@ -89,37 +168,25 @@ void __init x86_64_start_kernel(char * real_mode_data)
89 } 168 }
90 load_idt((const struct desc_ptr *)&idt_descr); 169 load_idt((const struct desc_ptr *)&idt_descr);
91 170
171 copy_bootdata(__va(real_mode_data));
172
92 if (console_loglevel == 10) 173 if (console_loglevel == 10)
93 early_printk("Kernel alive\n"); 174 early_printk("Kernel alive\n");
94 175
176 clear_page(init_level4_pgt);
177 /* set init_level4_pgt kernel high mapping*/
178 init_level4_pgt[511] = early_level4_pgt[511];
179
95 x86_64_start_reservations(real_mode_data); 180 x86_64_start_reservations(real_mode_data);
96} 181}
97 182
98void __init x86_64_start_reservations(char *real_mode_data) 183void __init x86_64_start_reservations(char *real_mode_data)
99{ 184{
100 copy_bootdata(__va(real_mode_data)); 185 /* version is always not zero if it is copied */
101 186 if (!boot_params.hdr.version)
102 memblock_reserve(__pa_symbol(&_text), 187 copy_bootdata(__va(real_mode_data));
103 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
104
105#ifdef CONFIG_BLK_DEV_INITRD
106 /* Reserve INITRD */
107 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
108 /* Assume only end is not page aligned */
109 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
110 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
111 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
112 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
113 }
114#endif
115 188
116 reserve_ebda_region(); 189 reserve_ebda_region();
117 190
118 /*
119 * At this point everything still needed from the boot loader
120 * or BIOS or kernel text should be early reserved or marked not
121 * RAM in e820. All other memory is free game.
122 */
123
124 start_kernel(); 191 start_kernel();
125} 192}
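
early_make_pgtable() above is called from the early page-fault path and fills in direct-mapping entries on demand out of the small early_dynamic_pgts pool, resetting the pool and retrying once it is exhausted. The index arithmetic it relies on can be shown with a stand-alone C sketch (shift constants are hard-coded here for the 4-level x86-64 layout; the address is an arbitrary example):

/* early_pgt_index.c -- stand-alone illustration, not kernel code */
#include <stdio.h>

#define PMD_SHIFT	21
#define PUD_SHIFT	30
#define PGDIR_SHIFT	39
#define PTRS_PER_TABLE	512

int main(void)
{
	/* an arbitrary direct-mapping (__PAGE_OFFSET) address */
	unsigned long addr = 0xffff880123456789UL;

	printf("pgd index: %lu\n", (addr >> PGDIR_SHIFT) & (PTRS_PER_TABLE - 1));
	printf("pud index: %lu\n", (addr >> PUD_SHIFT)   & (PTRS_PER_TABLE - 1));
	printf("pmd index: %lu\n", (addr >> PMD_SHIFT)   & (PTRS_PER_TABLE - 1));
	return 0;
}
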
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 980053c4b9cc..d94f6d68be2a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
47 .code64 47 .code64
48 .globl startup_64 48 .globl startup_64
49startup_64: 49startup_64:
50
51 /* 50 /*
52 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, 51 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
53 * and someone has loaded an identity mapped page table 52 * and someone has loaded an identity mapped page table
54 * for us. These identity mapped page tables map all of the 53 * for us. These identity mapped page tables map all of the
55 * kernel pages and possibly all of memory. 54 * kernel pages and possibly all of memory.
56 * 55 *
57 * %esi holds a physical pointer to real_mode_data. 56 * %rsi holds a physical pointer to real_mode_data.
58 * 57 *
59 * We come here either directly from a 64bit bootloader, or from 58 * We come here either directly from a 64bit bootloader, or from
60 * arch/x86_64/boot/compressed/head.S. 59 * arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
66 * tables and then reload them. 65 * tables and then reload them.
67 */ 66 */
68 67
69 /* Compute the delta between the address I am compiled to run at and the 68 /*
69 * Compute the delta between the address I am compiled to run at and the
70 * address I am actually running at. 70 * address I am actually running at.
71 */ 71 */
72 leaq _text(%rip), %rbp 72 leaq _text(%rip), %rbp
@@ -78,45 +78,62 @@ startup_64:
78 testl %eax, %eax 78 testl %eax, %eax
79 jnz bad_address 79 jnz bad_address
80 80
81 /* Is the address too large? */ 81 /*
82 leaq _text(%rip), %rdx 82 * Is the address too large?
83 movq $PGDIR_SIZE, %rax
84 cmpq %rax, %rdx
85 jae bad_address
86
87 /* Fixup the physical addresses in the page table
88 */ 83 */
89 addq %rbp, init_level4_pgt + 0(%rip) 84 leaq _text(%rip), %rax
90 addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip) 85 shrq $MAX_PHYSMEM_BITS, %rax
91 addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip) 86 jnz bad_address
92 87
93 addq %rbp, level3_ident_pgt + 0(%rip) 88 /*
89 * Fixup the physical addresses in the page table
90 */
91 addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
94 92
95 addq %rbp, level3_kernel_pgt + (510*8)(%rip) 93 addq %rbp, level3_kernel_pgt + (510*8)(%rip)
96 addq %rbp, level3_kernel_pgt + (511*8)(%rip) 94 addq %rbp, level3_kernel_pgt + (511*8)(%rip)
97 95
98 addq %rbp, level2_fixmap_pgt + (506*8)(%rip) 96 addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
99 97
100 /* Add an Identity mapping if I am above 1G */ 98 /*
99 * Set up the identity mapping for the switchover. These
100 * entries should *NOT* have the global bit set! This also
101 * creates a bunch of nonsense entries but that is fine --
102 * it avoids problems around wraparound.
103 */
101 leaq _text(%rip), %rdi 104 leaq _text(%rip), %rdi
102 andq $PMD_PAGE_MASK, %rdi 105 leaq early_level4_pgt(%rip), %rbx
103 106
104 movq %rdi, %rax 107 movq %rdi, %rax
105 shrq $PUD_SHIFT, %rax 108 shrq $PGDIR_SHIFT, %rax
106 andq $(PTRS_PER_PUD - 1), %rax
107 jz ident_complete
108 109
109 leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx 110 leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx
110 leaq level3_ident_pgt(%rip), %rbx 111 movq %rdx, 0(%rbx,%rax,8)
111 movq %rdx, 0(%rbx, %rax, 8) 112 movq %rdx, 8(%rbx,%rax,8)
112 113
114 addq $4096, %rdx
113 movq %rdi, %rax 115 movq %rdi, %rax
114 shrq $PMD_SHIFT, %rax 116 shrq $PUD_SHIFT, %rax
115 andq $(PTRS_PER_PMD - 1), %rax 117 andl $(PTRS_PER_PUD-1), %eax
116 leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx 118 movq %rdx, (4096+0)(%rbx,%rax,8)
117 leaq level2_spare_pgt(%rip), %rbx 119 movq %rdx, (4096+8)(%rbx,%rax,8)
118 movq %rdx, 0(%rbx, %rax, 8) 120
119ident_complete: 121 addq $8192, %rbx
122 movq %rdi, %rax
123 shrq $PMD_SHIFT, %rdi
124 addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
125 leaq (_end - 1)(%rip), %rcx
126 shrq $PMD_SHIFT, %rcx
127 subq %rdi, %rcx
128 incl %ecx
129
1301:
131 andq $(PTRS_PER_PMD - 1), %rdi
132 movq %rax, (%rbx,%rdi,8)
133 incq %rdi
134 addq $PMD_SIZE, %rax
135 decl %ecx
136 jnz 1b
120 137
121 /* 138 /*
122 * Fixup the kernel text+data virtual addresses. Note that 139 * Fixup the kernel text+data virtual addresses. Note that
@@ -124,7 +141,6 @@ ident_complete:
124 * cleanup_highmap() fixes this up along with the mappings 141 * cleanup_highmap() fixes this up along with the mappings
125 * beyond _end. 142 * beyond _end.
126 */ 143 */
127
128 leaq level2_kernel_pgt(%rip), %rdi 144 leaq level2_kernel_pgt(%rip), %rdi
129 leaq 4096(%rdi), %r8 145 leaq 4096(%rdi), %r8
130 /* See if it is a valid page table entry */ 146 /* See if it is a valid page table entry */
@@ -139,17 +155,14 @@ ident_complete:
139 /* Fixup phys_base */ 155 /* Fixup phys_base */
140 addq %rbp, phys_base(%rip) 156 addq %rbp, phys_base(%rip)
141 157
142 /* Due to ENTRY(), sometimes the empty space gets filled with 158 movq $(early_level4_pgt - __START_KERNEL_map), %rax
143 * zeros. Better take a jmp than relying on empty space being 159 jmp 1f
144 * filled with 0x90 (nop)
145 */
146 jmp secondary_startup_64
147ENTRY(secondary_startup_64) 160ENTRY(secondary_startup_64)
148 /* 161 /*
149 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, 162 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
150 * and someone has loaded a mapped page table. 163 * and someone has loaded a mapped page table.
151 * 164 *
152 * %esi holds a physical pointer to real_mode_data. 165 * %rsi holds a physical pointer to real_mode_data.
153 * 166 *
154 * We come here either from startup_64 (using physical addresses) 167 * We come here either from startup_64 (using physical addresses)
155 * or from trampoline.S (using virtual addresses). 168 * or from trampoline.S (using virtual addresses).
@@ -159,12 +172,14 @@ ENTRY(secondary_startup_64)
159 * after the boot processor executes this code. 172 * after the boot processor executes this code.
160 */ 173 */
161 174
175 movq $(init_level4_pgt - __START_KERNEL_map), %rax
1761:
177
162 /* Enable PAE mode and PGE */ 178 /* Enable PAE mode and PGE */
163 movl $(X86_CR4_PAE | X86_CR4_PGE), %eax 179 movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
164 movq %rax, %cr4 180 movq %rcx, %cr4
165 181
166 /* Setup early boot stage 4 level pagetables. */ 182 /* Setup early boot stage 4 level pagetables. */
167 movq $(init_level4_pgt - __START_KERNEL_map), %rax
168 addq phys_base(%rip), %rax 183 addq phys_base(%rip), %rax
169 movq %rax, %cr3 184 movq %rax, %cr3
170 185
@@ -196,7 +211,7 @@ ENTRY(secondary_startup_64)
196 movq %rax, %cr0 211 movq %rax, %cr0
197 212
198 /* Setup a boot time stack */ 213 /* Setup a boot time stack */
199 movq stack_start(%rip),%rsp 214 movq stack_start(%rip), %rsp
200 215
201 /* zero EFLAGS after setting rsp */ 216 /* zero EFLAGS after setting rsp */
202 pushq $0 217 pushq $0
@@ -236,15 +251,33 @@ ENTRY(secondary_startup_64)
236 movl initial_gs+4(%rip),%edx 251 movl initial_gs+4(%rip),%edx
237 wrmsr 252 wrmsr
238 253
239 /* esi is pointer to real mode structure with interesting info. 254 /* rsi is pointer to real mode structure with interesting info.
240 pass it to C */ 255 pass it to C */
241 movl %esi, %edi 256 movq %rsi, %rdi
242 257
243 /* Finally jump to run C code and to be on real kernel address 258 /* Finally jump to run C code and to be on real kernel address
244 * Since we are running on identity-mapped space we have to jump 259 * Since we are running on identity-mapped space we have to jump
245 * to the full 64bit address, this is only possible as indirect 260 * to the full 64bit address, this is only possible as indirect
246 * jump. In addition we need to ensure %cs is set so we make this 261 * jump. In addition we need to ensure %cs is set so we make this
247 * a far return. 262 * a far return.
263 *
264 * Note: do not change to far jump indirect with 64bit offset.
265 *
266 * AMD does not support far jump indirect with 64bit offset.
267 * AMD64 Architecture Programmer's Manual, Volume 3: states only
268 * JMP FAR mem16:16 FF /5 Far jump indirect,
269 * with the target specified by a far pointer in memory.
270 * JMP FAR mem16:32 FF /5 Far jump indirect,
271 * with the target specified by a far pointer in memory.
272 *
273 * Intel64 does support 64bit offset.
274 * Software Developer Manual Vol 2: states:
275 * FF /5 JMP m16:16 Jump far, absolute indirect,
276 * address given in m16:16
277 * FF /5 JMP m16:32 Jump far, absolute indirect,
278 * address given in m16:32.
279 * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
280 * address given in m16:64.
248 */ 281 */
249 movq initial_code(%rip),%rax 282 movq initial_code(%rip),%rax
250 pushq $0 # fake return address to stop unwinder 283 pushq $0 # fake return address to stop unwinder
@@ -270,13 +303,13 @@ ENDPROC(start_cpu0)
270 303
271 /* SMP bootup changes these two */ 304 /* SMP bootup changes these two */
272 __REFDATA 305 __REFDATA
273 .align 8 306 .balign 8
274 ENTRY(initial_code) 307 GLOBAL(initial_code)
275 .quad x86_64_start_kernel 308 .quad x86_64_start_kernel
276 ENTRY(initial_gs) 309 GLOBAL(initial_gs)
277 .quad INIT_PER_CPU_VAR(irq_stack_union) 310 .quad INIT_PER_CPU_VAR(irq_stack_union)
278 311
279 ENTRY(stack_start) 312 GLOBAL(stack_start)
280 .quad init_thread_union+THREAD_SIZE-8 313 .quad init_thread_union+THREAD_SIZE-8
281 .word 0 314 .word 0
282 __FINITDATA 315 __FINITDATA
@@ -284,7 +317,7 @@ ENDPROC(start_cpu0)
284bad_address: 317bad_address:
285 jmp bad_address 318 jmp bad_address
286 319
287 .section ".init.text","ax" 320 __INIT
288 .globl early_idt_handlers 321 .globl early_idt_handlers
289early_idt_handlers: 322early_idt_handlers:
290 # 104(%rsp) %rflags 323 # 104(%rsp) %rflags
@@ -321,14 +354,22 @@ ENTRY(early_idt_handler)
321 pushq %r11 # 0(%rsp) 354 pushq %r11 # 0(%rsp)
322 355
323 cmpl $__KERNEL_CS,96(%rsp) 356 cmpl $__KERNEL_CS,96(%rsp)
324 jne 10f 357 jne 11f
358
359 cmpl $14,72(%rsp) # Page fault?
360 jnz 10f
361 GET_CR2_INTO(%rdi) # can clobber any volatile register if pv
362 call early_make_pgtable
363 andl %eax,%eax
364 jz 20f # All good
325 365
36610:
326 leaq 88(%rsp),%rdi # Pointer to %rip 367 leaq 88(%rsp),%rdi # Pointer to %rip
327 call early_fixup_exception 368 call early_fixup_exception
328 andl %eax,%eax 369 andl %eax,%eax
329 jnz 20f # Found an exception entry 370 jnz 20f # Found an exception entry
330 371
33110: 37211:
332#ifdef CONFIG_EARLY_PRINTK 373#ifdef CONFIG_EARLY_PRINTK
333 GET_CR2_INTO(%r9) # can clobber any volatile register if pv 374 GET_CR2_INTO(%r9) # can clobber any volatile register if pv
334 movl 80(%rsp),%r8d # error code 375 movl 80(%rsp),%r8d # error code
@@ -350,7 +391,7 @@ ENTRY(early_idt_handler)
3501: hlt 3911: hlt
351 jmp 1b 392 jmp 1b
352 393
35320: # Exception table entry found 39420: # Exception table entry found or page table generated
354 popq %r11 395 popq %r11
355 popq %r10 396 popq %r10
356 popq %r9 397 popq %r9
@@ -364,6 +405,8 @@ ENTRY(early_idt_handler)
364 decl early_recursion_flag(%rip) 405 decl early_recursion_flag(%rip)
365 INTERRUPT_RETURN 406 INTERRUPT_RETURN
366 407
408 __INITDATA
409
367 .balign 4 410 .balign 4
368early_recursion_flag: 411early_recursion_flag:
369 .long 0 412 .long 0
@@ -374,11 +417,10 @@ early_idt_msg:
374early_idt_ripmsg: 417early_idt_ripmsg:
375 .asciz "RIP %s\n" 418 .asciz "RIP %s\n"
376#endif /* CONFIG_EARLY_PRINTK */ 419#endif /* CONFIG_EARLY_PRINTK */
377 .previous
378 420
379#define NEXT_PAGE(name) \ 421#define NEXT_PAGE(name) \
380 .balign PAGE_SIZE; \ 422 .balign PAGE_SIZE; \
381ENTRY(name) 423GLOBAL(name)
382 424
383/* Automate the creation of 1 to 1 mapping pmd entries */ 425/* Automate the creation of 1 to 1 mapping pmd entries */
384#define PMDS(START, PERM, COUNT) \ 426#define PMDS(START, PERM, COUNT) \
@@ -388,24 +430,37 @@ ENTRY(name)
388 i = i + 1 ; \ 430 i = i + 1 ; \
389 .endr 431 .endr
390 432
433 __INITDATA
434NEXT_PAGE(early_level4_pgt)
435 .fill 511,8,0
436 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
437
438NEXT_PAGE(early_dynamic_pgts)
439 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
440
391 .data 441 .data
392 /* 442
393 * This default setting generates an ident mapping at address 0x100000 443#ifndef CONFIG_XEN
394 * and a mapping for the kernel that precisely maps virtual address
395 * 0xffffffff80000000 to physical address 0x000000. (always using
396 * 2Mbyte large pages provided by PAE mode)
397 */
398NEXT_PAGE(init_level4_pgt) 444NEXT_PAGE(init_level4_pgt)
399 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 445 .fill 512,8,0
400 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 446#else
401 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 447NEXT_PAGE(init_level4_pgt)
402 .org init_level4_pgt + L4_START_KERNEL*8, 0 448 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
449 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
450 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
451 .org init_level4_pgt + L4_START_KERNEL*8, 0
403 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 452 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
404 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 453 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
405 454
406NEXT_PAGE(level3_ident_pgt) 455NEXT_PAGE(level3_ident_pgt)
407 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 456 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
408 .fill 511,8,0 457 .fill 511, 8, 0
458NEXT_PAGE(level2_ident_pgt)
459 /* Since I easily can, map the first 1G.
460 * Don't set NX because code runs from these pages.
461 */
462 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
463#endif
409 464
410NEXT_PAGE(level3_kernel_pgt) 465NEXT_PAGE(level3_kernel_pgt)
411 .fill L3_START_KERNEL,8,0 466 .fill L3_START_KERNEL,8,0
@@ -413,21 +468,6 @@ NEXT_PAGE(level3_kernel_pgt)
413 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE 468 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
414 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE 469 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
415 470
416NEXT_PAGE(level2_fixmap_pgt)
417 .fill 506,8,0
418 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
419 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
420 .fill 5,8,0
421
422NEXT_PAGE(level1_fixmap_pgt)
423 .fill 512,8,0
424
425NEXT_PAGE(level2_ident_pgt)
426 /* Since I easily can, map the first 1G.
427 * Don't set NX because code runs from these pages.
428 */
429 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
430
431NEXT_PAGE(level2_kernel_pgt) 471NEXT_PAGE(level2_kernel_pgt)
432 /* 472 /*
433 * 512 MB kernel mapping. We spend a full page on this pagetable 473 * 512 MB kernel mapping. We spend a full page on this pagetable
@@ -442,11 +482,16 @@ NEXT_PAGE(level2_kernel_pgt)
442 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, 482 PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
443 KERNEL_IMAGE_SIZE/PMD_SIZE) 483 KERNEL_IMAGE_SIZE/PMD_SIZE)
444 484
445NEXT_PAGE(level2_spare_pgt) 485NEXT_PAGE(level2_fixmap_pgt)
446 .fill 512, 8, 0 486 .fill 506,8,0
487 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
488 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
489 .fill 5,8,0
490
491NEXT_PAGE(level1_fixmap_pgt)
492 .fill 512,8,0
447 493
448#undef PMDS 494#undef PMDS
449#undef NEXT_PAGE
450 495
451 .data 496 .data
452 .align 16 497 .align 16
@@ -472,6 +517,5 @@ ENTRY(nmi_idt_table)
472 .skip IDT_ENTRIES * 16 517 .skip IDT_ENTRIES * 16
473 518
474 __PAGE_ALIGNED_BSS 519 __PAGE_ALIGNED_BSS
475 .align PAGE_SIZE 520NEXT_PAGE(empty_zero_page)
476ENTRY(empty_zero_page)
477 .skip PAGE_SIZE 521 .skip PAGE_SIZE
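
The "1:" loop added to startup_64 above fills one 2 MiB PMD entry per iteration until the whole [_text, _end) range is identity-mapped for the switchover. A rough user-space sketch of the entry count it computes (the addresses are made-up placeholders):

/* pmd_count.c -- user-space sketch of the entry count, not kernel code */
#include <stdio.h>

#define PMD_SHIFT	21
#define PMD_SIZE	(1UL << PMD_SHIFT)

int main(void)
{
	unsigned long text = 0x1000000;		/* made-up load address of _text */
	unsigned long end  = 0x1f43210;		/* made-up _end */
	unsigned long first = text >> PMD_SHIFT;
	unsigned long last  = (end - 1) >> PMD_SHIFT;
	unsigned long count = last - first + 1;	/* the subq/incl in the asm above */

	printf("%lu PMD entries of %lu MiB each\n", count, PMD_SIZE >> 20);
	return 0;
}
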
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 9c3bd4a2050e..0fa69127209a 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(csum_partial_copy_generic);
26EXPORT_SYMBOL(__get_user_1); 26EXPORT_SYMBOL(__get_user_1);
27EXPORT_SYMBOL(__get_user_2); 27EXPORT_SYMBOL(__get_user_2);
28EXPORT_SYMBOL(__get_user_4); 28EXPORT_SYMBOL(__get_user_4);
29EXPORT_SYMBOL(__get_user_8);
29 30
30EXPORT_SYMBOL(__put_user_1); 31EXPORT_SYMBOL(__put_user_1);
31EXPORT_SYMBOL(__put_user_2); 32EXPORT_SYMBOL(__put_user_2);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 2b44ea5f269d..b686a904d7c3 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -297,9 +297,9 @@ static void kvm_register_steal_time(void)
297 297
298 memset(st, 0, sizeof(*st)); 298 memset(st, 0, sizeof(*st));
299 299
300 wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED)); 300 wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
301 printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n", 301 pr_info("kvm-stealtime: cpu %d, msr %llx\n",
302 cpu, __pa(st)); 302 cpu, (unsigned long long) slow_virt_to_phys(st));
303} 303}
304 304
305static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; 305static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
@@ -324,7 +324,7 @@ void __cpuinit kvm_guest_cpu_init(void)
324 return; 324 return;
325 325
326 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { 326 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
327 u64 pa = __pa(&__get_cpu_var(apf_reason)); 327 u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason));
328 328
329#ifdef CONFIG_PREEMPT 329#ifdef CONFIG_PREEMPT
330 pa |= KVM_ASYNC_PF_SEND_ALWAYS; 330 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
@@ -340,7 +340,8 @@ void __cpuinit kvm_guest_cpu_init(void)
340 /* Size alignment is implied but just to make it explicit. */ 340 /* Size alignment is implied but just to make it explicit. */
341 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); 341 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
342 __get_cpu_var(kvm_apic_eoi) = 0; 342 __get_cpu_var(kvm_apic_eoi) = 0;
343 pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; 343 pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi))
344 | KVM_MSR_ENABLED;
344 wrmsrl(MSR_KVM_PV_EOI_EN, pa); 345 wrmsrl(MSR_KVM_PV_EOI_EN, pa);
345 } 346 }
346 347
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 220a360010f8..9f966dc0b9e4 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -162,8 +162,8 @@ int kvm_register_clock(char *txt)
162 int low, high, ret; 162 int low, high, ret;
163 struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; 163 struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
164 164
165 low = (int)__pa(src) | 1; 165 low = (int)slow_virt_to_phys(src) | 1;
166 high = ((u64)__pa(src) >> 32); 166 high = ((u64)slow_virt_to_phys(src) >> 32);
167 ret = native_write_msr_safe(msr_kvm_system_time, low, high); 167 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
168 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 168 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
169 cpu, high, low, txt); 169 cpu, high, low, txt);
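
As before, kvm_register_clock() writes the 64-bit physical address of the per-cpu pvti (with bit 0 as the enable flag) split into the two 32-bit MSR halves; only the address is now obtained via slow_virt_to_phys(). A trivial stand-alone sketch of that split, with an invented address:

/* msr_split.c -- trivial illustration, not kernel code */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t phys = 0x12345678abcULL;		/* invented physical address */
	unsigned int low  = (unsigned int)phys | 1;	/* bit 0 = enable */
	unsigned int high = (unsigned int)(phys >> 32);

	printf("low=%#x high=%#x\n", low, high);
	return 0;
}
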
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index b3ea9db39db6..4eabc160696f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -16,125 +16,12 @@
16#include <linux/io.h> 16#include <linux/io.h>
17#include <linux/suspend.h> 17#include <linux/suspend.h>
18 18
19#include <asm/init.h>
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/mmu_context.h> 22#include <asm/mmu_context.h>
22#include <asm/debugreg.h> 23#include <asm/debugreg.h>
23 24
24static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
25 unsigned long addr)
26{
27 pud_t *pud;
28 pmd_t *pmd;
29 struct page *page;
30 int result = -ENOMEM;
31
32 addr &= PMD_MASK;
33 pgd += pgd_index(addr);
34 if (!pgd_present(*pgd)) {
35 page = kimage_alloc_control_pages(image, 0);
36 if (!page)
37 goto out;
38 pud = (pud_t *)page_address(page);
39 clear_page(pud);
40 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
41 }
42 pud = pud_offset(pgd, addr);
43 if (!pud_present(*pud)) {
44 page = kimage_alloc_control_pages(image, 0);
45 if (!page)
46 goto out;
47 pmd = (pmd_t *)page_address(page);
48 clear_page(pmd);
49 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
50 }
51 pmd = pmd_offset(pud, addr);
52 if (!pmd_present(*pmd))
53 set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
54 result = 0;
55out:
56 return result;
57}
58
59static void init_level2_page(pmd_t *level2p, unsigned long addr)
60{
61 unsigned long end_addr;
62
63 addr &= PAGE_MASK;
64 end_addr = addr + PUD_SIZE;
65 while (addr < end_addr) {
66 set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
67 addr += PMD_SIZE;
68 }
69}
70
71static int init_level3_page(struct kimage *image, pud_t *level3p,
72 unsigned long addr, unsigned long last_addr)
73{
74 unsigned long end_addr;
75 int result;
76
77 result = 0;
78 addr &= PAGE_MASK;
79 end_addr = addr + PGDIR_SIZE;
80 while ((addr < last_addr) && (addr < end_addr)) {
81 struct page *page;
82 pmd_t *level2p;
83
84 page = kimage_alloc_control_pages(image, 0);
85 if (!page) {
86 result = -ENOMEM;
87 goto out;
88 }
89 level2p = (pmd_t *)page_address(page);
90 init_level2_page(level2p, addr);
91 set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
92 addr += PUD_SIZE;
93 }
94 /* clear the unused entries */
95 while (addr < end_addr) {
96 pud_clear(level3p++);
97 addr += PUD_SIZE;
98 }
99out:
100 return result;
101}
102
103
104static int init_level4_page(struct kimage *image, pgd_t *level4p,
105 unsigned long addr, unsigned long last_addr)
106{
107 unsigned long end_addr;
108 int result;
109
110 result = 0;
111 addr &= PAGE_MASK;
112 end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
113 while ((addr < last_addr) && (addr < end_addr)) {
114 struct page *page;
115 pud_t *level3p;
116
117 page = kimage_alloc_control_pages(image, 0);
118 if (!page) {
119 result = -ENOMEM;
120 goto out;
121 }
122 level3p = (pud_t *)page_address(page);
123 result = init_level3_page(image, level3p, addr, last_addr);
124 if (result)
125 goto out;
126 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
127 addr += PGDIR_SIZE;
128 }
129 /* clear the unused entries */
130 while (addr < end_addr) {
131 pgd_clear(level4p++);
132 addr += PGDIR_SIZE;
133 }
134out:
135 return result;
136}
137
138static void free_transition_pgtable(struct kimage *image) 25static void free_transition_pgtable(struct kimage *image)
139{ 26{
140 free_page((unsigned long)image->arch.pud); 27 free_page((unsigned long)image->arch.pud);
@@ -184,22 +71,62 @@ err:
184 return result; 71 return result;
185} 72}
186 73
74static void *alloc_pgt_page(void *data)
75{
76 struct kimage *image = (struct kimage *)data;
77 struct page *page;
78 void *p = NULL;
79
80 page = kimage_alloc_control_pages(image, 0);
81 if (page) {
82 p = page_address(page);
83 clear_page(p);
84 }
85
86 return p;
87}
187 88
188static int init_pgtable(struct kimage *image, unsigned long start_pgtable) 89static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
189{ 90{
91 struct x86_mapping_info info = {
92 .alloc_pgt_page = alloc_pgt_page,
93 .context = image,
94 .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
95 };
96 unsigned long mstart, mend;
190 pgd_t *level4p; 97 pgd_t *level4p;
191 int result; 98 int result;
99 int i;
100
192 level4p = (pgd_t *)__va(start_pgtable); 101 level4p = (pgd_t *)__va(start_pgtable);
193 result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); 102 clear_page(level4p);
194 if (result) 103 for (i = 0; i < nr_pfn_mapped; i++) {
195 return result; 104 mstart = pfn_mapped[i].start << PAGE_SHIFT;
105 mend = pfn_mapped[i].end << PAGE_SHIFT;
106
107 result = kernel_ident_mapping_init(&info,
108 level4p, mstart, mend);
109 if (result)
110 return result;
111 }
112
196 /* 113 /*
197 * image->start may be outside 0 ~ max_pfn, for example when 114 * segments's mem ranges could be outside 0 ~ max_pfn,
198 * jump back to original kernel from kexeced kernel 115 * for example when jump back to original kernel from kexeced kernel.
116 * or first kernel is booted with user mem map, and second kernel
117 * could be loaded out of that range.
199 */ 118 */
200 result = init_one_level2_page(image, level4p, image->start); 119 for (i = 0; i < image->nr_segments; i++) {
201 if (result) 120 mstart = image->segment[i].mem;
202 return result; 121 mend = mstart + image->segment[i].memsz;
122
123 result = kernel_ident_mapping_init(&info,
124 level4p, mstart, mend);
125
126 if (result)
127 return result;
128 }
129
203 return init_transition_pgtable(image, level4p); 130 return init_transition_pgtable(image, level4p);
204} 131}
205 132
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 8b24289cc10c..915f5efefcf5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -108,17 +108,16 @@
108#include <asm/topology.h> 108#include <asm/topology.h>
109#include <asm/apicdef.h> 109#include <asm/apicdef.h>
110#include <asm/amd_nb.h> 110#include <asm/amd_nb.h>
111#ifdef CONFIG_X86_64
112#include <asm/numa_64.h>
113#endif
114#include <asm/mce.h> 111#include <asm/mce.h>
115#include <asm/alternative.h> 112#include <asm/alternative.h>
116#include <asm/prom.h> 113#include <asm/prom.h>
117 114
118/* 115/*
119 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 116 * max_low_pfn_mapped: highest direct mapped pfn under 4GB
120 * The direct mapping extends to max_pfn_mapped, so that we can directly access 117 * max_pfn_mapped: highest direct mapped pfn over 4GB
121 * apertures, ACPI and other tables without having to play with fixmaps. 118 *
119 * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
120 * represented by pfn_mapped
122 */ 121 */
123unsigned long max_low_pfn_mapped; 122unsigned long max_low_pfn_mapped;
124unsigned long max_pfn_mapped; 123unsigned long max_pfn_mapped;
@@ -276,18 +275,7 @@ void * __init extend_brk(size_t size, size_t align)
276 return ret; 275 return ret;
277} 276}
278 277
279#ifdef CONFIG_X86_64 278#ifdef CONFIG_X86_32
280static void __init init_gbpages(void)
281{
282 if (direct_gbpages && cpu_has_gbpages)
283 printk(KERN_INFO "Using GB pages for direct mapping\n");
284 else
285 direct_gbpages = 0;
286}
287#else
288static inline void init_gbpages(void)
289{
290}
291static void __init cleanup_highmap(void) 279static void __init cleanup_highmap(void)
292{ 280{
293} 281}
@@ -296,8 +284,8 @@ static void __init cleanup_highmap(void)
296static void __init reserve_brk(void) 284static void __init reserve_brk(void)
297{ 285{
298 if (_brk_end > _brk_start) 286 if (_brk_end > _brk_start)
299 memblock_reserve(__pa(_brk_start), 287 memblock_reserve(__pa_symbol(_brk_start),
300 __pa(_brk_end) - __pa(_brk_start)); 288 _brk_end - _brk_start);
301 289
302 /* Mark brk area as locked down and no longer taking any 290 /* Mark brk area as locked down and no longer taking any
303 new allocations */ 291 new allocations */
@@ -306,27 +294,43 @@ static void __init reserve_brk(void)
306 294
307#ifdef CONFIG_BLK_DEV_INITRD 295#ifdef CONFIG_BLK_DEV_INITRD
308 296
297static u64 __init get_ramdisk_image(void)
298{
299 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
300
301 ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
302
303 return ramdisk_image;
304}
305static u64 __init get_ramdisk_size(void)
306{
307 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
308
309 ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
310
311 return ramdisk_size;
312}
313
309#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) 314#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
310static void __init relocate_initrd(void) 315static void __init relocate_initrd(void)
311{ 316{
312 /* Assume only end is not page aligned */ 317 /* Assume only end is not page aligned */
313 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 318 u64 ramdisk_image = get_ramdisk_image();
314 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 319 u64 ramdisk_size = get_ramdisk_size();
315 u64 area_size = PAGE_ALIGN(ramdisk_size); 320 u64 area_size = PAGE_ALIGN(ramdisk_size);
316 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
317 u64 ramdisk_here; 321 u64 ramdisk_here;
318 unsigned long slop, clen, mapaddr; 322 unsigned long slop, clen, mapaddr;
319 char *p, *q; 323 char *p, *q;
320 324
321 /* We need to move the initrd down into lowmem */ 325 /* We need to move the initrd down into directly mapped mem */
322 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, 326 ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
323 PAGE_SIZE); 327 area_size, PAGE_SIZE);
324 328
325 if (!ramdisk_here) 329 if (!ramdisk_here)
326 panic("Cannot find place for new RAMDISK of size %lld\n", 330 panic("Cannot find place for new RAMDISK of size %lld\n",
327 ramdisk_size); 331 ramdisk_size);
328 332
329 /* Note: this includes all the lowmem currently occupied by 333 /* Note: this includes all the mem currently occupied by
330 the initrd, we rely on that fact to keep the data intact. */ 334 the initrd, we rely on that fact to keep the data intact. */
331 memblock_reserve(ramdisk_here, area_size); 335 memblock_reserve(ramdisk_here, area_size);
332 initrd_start = ramdisk_here + PAGE_OFFSET; 336 initrd_start = ramdisk_here + PAGE_OFFSET;
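
get_ramdisk_image() and get_ramdisk_size() above widen the 32-bit setup-header fields with the ext_ramdisk_* fields so an initrd placed above 4 GiB can be described. A tiny illustration of that widening, with invented values:

/* ramdisk_addr.c -- tiny illustration with invented values, not kernel code */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t hdr_ramdisk_image = 0x37000000;	/* low 32 bits from the setup header */
	uint32_t ext_ramdisk_image = 0x1;		/* bits 63..32 from the ext field */
	uint64_t ramdisk_image = hdr_ramdisk_image;

	ramdisk_image |= (uint64_t)ext_ramdisk_image << 32;
	printf("initrd at %#llx\n", (unsigned long long)ramdisk_image);
	return 0;
}
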
@@ -336,17 +340,7 @@ static void __init relocate_initrd(void)
336 340
337 q = (char *)initrd_start; 341 q = (char *)initrd_start;
338 342
339 /* Copy any lowmem portion of the initrd */ 343 /* Copy the initrd */
340 if (ramdisk_image < end_of_lowmem) {
341 clen = end_of_lowmem - ramdisk_image;
342 p = (char *)__va(ramdisk_image);
343 memcpy(q, p, clen);
344 q += clen;
345 ramdisk_image += clen;
346 ramdisk_size -= clen;
347 }
348
349 /* Copy the highmem portion of the initrd */
350 while (ramdisk_size) { 344 while (ramdisk_size) {
351 slop = ramdisk_image & ~PAGE_MASK; 345 slop = ramdisk_image & ~PAGE_MASK;
352 clen = ramdisk_size; 346 clen = ramdisk_size;
@@ -360,22 +354,35 @@ static void __init relocate_initrd(void)
360 ramdisk_image += clen; 354 ramdisk_image += clen;
361 ramdisk_size -= clen; 355 ramdisk_size -= clen;
362 } 356 }
363 /* high pages is not converted by early_res_to_bootmem */ 357
364 ramdisk_image = boot_params.hdr.ramdisk_image; 358 ramdisk_image = get_ramdisk_image();
365 ramdisk_size = boot_params.hdr.ramdisk_size; 359 ramdisk_size = get_ramdisk_size();
366 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" 360 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
367 " [mem %#010llx-%#010llx]\n", 361 " [mem %#010llx-%#010llx]\n",
368 ramdisk_image, ramdisk_image + ramdisk_size - 1, 362 ramdisk_image, ramdisk_image + ramdisk_size - 1,
369 ramdisk_here, ramdisk_here + ramdisk_size - 1); 363 ramdisk_here, ramdisk_here + ramdisk_size - 1);
370} 364}
371 365
366static void __init early_reserve_initrd(void)
367{
368 /* Assume only end is not page aligned */
369 u64 ramdisk_image = get_ramdisk_image();
370 u64 ramdisk_size = get_ramdisk_size();
371 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
372
373 if (!boot_params.hdr.type_of_loader ||
374 !ramdisk_image || !ramdisk_size)
375 return; /* No initrd provided by bootloader */
376
377 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
378}
372static void __init reserve_initrd(void) 379static void __init reserve_initrd(void)
373{ 380{
374 /* Assume only end is not page aligned */ 381 /* Assume only end is not page aligned */
375 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 382 u64 ramdisk_image = get_ramdisk_image();
376 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 383 u64 ramdisk_size = get_ramdisk_size();
377 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 384 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
378 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 385 u64 mapped_size;
379 386
380 if (!boot_params.hdr.type_of_loader || 387 if (!boot_params.hdr.type_of_loader ||
381 !ramdisk_image || !ramdisk_size) 388 !ramdisk_image || !ramdisk_size)
@@ -383,22 +390,18 @@ static void __init reserve_initrd(void)
383 390
384 initrd_start = 0; 391 initrd_start = 0;
385 392
386 if (ramdisk_size >= (end_of_lowmem>>1)) { 393 mapped_size = memblock_mem_size(max_pfn_mapped);
394 if (ramdisk_size >= (mapped_size>>1))
387 panic("initrd too large to handle, " 395 panic("initrd too large to handle, "
388 "disabling initrd (%lld needed, %lld available)\n", 396 "disabling initrd (%lld needed, %lld available)\n",
389 ramdisk_size, end_of_lowmem>>1); 397 ramdisk_size, mapped_size>>1);
390 }
391 398
392 printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, 399 printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
393 ramdisk_end - 1); 400 ramdisk_end - 1);
394 401
395 402 if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
396 if (ramdisk_end <= end_of_lowmem) { 403 PFN_DOWN(ramdisk_end))) {
397 /* All in lowmem, easy case */ 404 /* All are mapped, easy case */
398 /*
399 * don't need to reserve again, already reserved early
400 * in i386_start_kernel
401 */
402 initrd_start = ramdisk_image + PAGE_OFFSET; 405 initrd_start = ramdisk_image + PAGE_OFFSET;
403 initrd_end = initrd_start + ramdisk_size; 406 initrd_end = initrd_start + ramdisk_size;
404 return; 407 return;
@@ -409,6 +412,9 @@ static void __init reserve_initrd(void)
409 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); 412 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
410} 413}
411#else 414#else
415static void __init early_reserve_initrd(void)
416{
417}
412static void __init reserve_initrd(void) 418static void __init reserve_initrd(void)
413{ 419{
414} 420}
@@ -419,8 +425,6 @@ static void __init parse_setup_data(void)
419 struct setup_data *data; 425 struct setup_data *data;
420 u64 pa_data; 426 u64 pa_data;
421 427
422 if (boot_params.hdr.version < 0x0209)
423 return;
424 pa_data = boot_params.hdr.setup_data; 428 pa_data = boot_params.hdr.setup_data;
425 while (pa_data) { 429 while (pa_data) {
426 u32 data_len, map_len; 430 u32 data_len, map_len;
@@ -456,8 +460,6 @@ static void __init e820_reserve_setup_data(void)
456 u64 pa_data; 460 u64 pa_data;
457 int found = 0; 461 int found = 0;
458 462
459 if (boot_params.hdr.version < 0x0209)
460 return;
461 pa_data = boot_params.hdr.setup_data; 463 pa_data = boot_params.hdr.setup_data;
462 while (pa_data) { 464 while (pa_data) {
463 data = early_memremap(pa_data, sizeof(*data)); 465 data = early_memremap(pa_data, sizeof(*data));
@@ -481,8 +483,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
481 struct setup_data *data; 483 struct setup_data *data;
482 u64 pa_data; 484 u64 pa_data;
483 485
484 if (boot_params.hdr.version < 0x0209)
485 return;
486 pa_data = boot_params.hdr.setup_data; 486 pa_data = boot_params.hdr.setup_data;
487 while (pa_data) { 487 while (pa_data) {
488 data = early_memremap(pa_data, sizeof(*data)); 488 data = early_memremap(pa_data, sizeof(*data));
@@ -501,17 +501,51 @@ static void __init memblock_x86_reserve_range_setup_data(void)
501/* 501/*
502 * Keep the crash kernel below this limit. On 32 bits earlier kernels 502 * Keep the crash kernel below this limit. On 32 bits earlier kernels
503 * would limit the kernel to the low 512 MiB due to mapping restrictions. 503 * would limit the kernel to the low 512 MiB due to mapping restrictions.
504 * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
505 * limit once kexec-tools are fixed.
506 */ 504 */
507#ifdef CONFIG_X86_32 505#ifdef CONFIG_X86_32
508# define CRASH_KERNEL_ADDR_MAX (512 << 20) 506# define CRASH_KERNEL_ADDR_MAX (512 << 20)
509#else 507#else
510# define CRASH_KERNEL_ADDR_MAX (896 << 20) 508# define CRASH_KERNEL_ADDR_MAX MAXMEM
509#endif
510
511static void __init reserve_crashkernel_low(void)
512{
513#ifdef CONFIG_X86_64
514 const unsigned long long alignment = 16<<20; /* 16M */
515 unsigned long long low_base = 0, low_size = 0;
516 unsigned long total_low_mem;
517 unsigned long long base;
518 int ret;
519
520 total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT));
521 ret = parse_crashkernel_low(boot_command_line, total_low_mem,
522 &low_size, &base);
523 if (ret != 0 || low_size <= 0)
524 return;
525
526 low_base = memblock_find_in_range(low_size, (1ULL<<32),
527 low_size, alignment);
528
529 if (!low_base) {
530 pr_info("crashkernel low reservation failed - No suitable area found.\n");
531
532 return;
533 }
534
535 memblock_reserve(low_base, low_size);
536 pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n",
537 (unsigned long)(low_size >> 20),
538 (unsigned long)(low_base >> 20),
539 (unsigned long)(total_low_mem >> 20));
540 crashk_low_res.start = low_base;
541 crashk_low_res.end = low_base + low_size - 1;
542 insert_resource(&iomem_resource, &crashk_low_res);
511#endif 543#endif
544}
512 545
513static void __init reserve_crashkernel(void) 546static void __init reserve_crashkernel(void)
514{ 547{
548 const unsigned long long alignment = 16<<20; /* 16M */
515 unsigned long long total_mem; 549 unsigned long long total_mem;
516 unsigned long long crash_size, crash_base; 550 unsigned long long crash_size, crash_base;
517 int ret; 551 int ret;
@@ -525,8 +559,6 @@ static void __init reserve_crashkernel(void)
525 559
526 /* 0 means: find the address automatically */ 560 /* 0 means: find the address automatically */
527 if (crash_base <= 0) { 561 if (crash_base <= 0) {
528 const unsigned long long alignment = 16<<20; /* 16M */
529
530 /* 562 /*
531 * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX 563 * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
532 */ 564 */
@@ -537,6 +569,7 @@ static void __init reserve_crashkernel(void)
537 pr_info("crashkernel reservation failed - No suitable area found.\n"); 569 pr_info("crashkernel reservation failed - No suitable area found.\n");
538 return; 570 return;
539 } 571 }
572
540 } else { 573 } else {
541 unsigned long long start; 574 unsigned long long start;
542 575
@@ -558,6 +591,9 @@ static void __init reserve_crashkernel(void)
558 crashk_res.start = crash_base; 591 crashk_res.start = crash_base;
559 crashk_res.end = crash_base + crash_size - 1; 592 crashk_res.end = crash_base + crash_size - 1;
560 insert_resource(&iomem_resource, &crashk_res); 593 insert_resource(&iomem_resource, &crashk_res);
594
595 if (crash_base >= (1ULL<<32))
596 reserve_crashkernel_low();
561} 597}
562#else 598#else
563static void __init reserve_crashkernel(void) 599static void __init reserve_crashkernel(void)
@@ -608,8 +644,6 @@ static __init void reserve_ibft_region(void)
608 memblock_reserve(addr, size); 644 memblock_reserve(addr, size);
609} 645}
610 646
611static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
612
613static bool __init snb_gfx_workaround_needed(void) 647static bool __init snb_gfx_workaround_needed(void)
614{ 648{
615#ifdef CONFIG_PCI 649#ifdef CONFIG_PCI
@@ -698,8 +732,7 @@ static void __init trim_bios_range(void)
698 * since some BIOSes are known to corrupt low memory. See the 732 * since some BIOSes are known to corrupt low memory. See the
699 * Kconfig help text for X86_RESERVE_LOW. 733 * Kconfig help text for X86_RESERVE_LOW.
700 */ 734 */
701 e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE), 735 e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
702 E820_RAM, E820_RESERVED);
703 736
704 /* 737 /*
705 * special case: Some BIOSen report the PC BIOS 738 * special case: Some BIOSen report the PC BIOS
@@ -711,6 +744,29 @@ static void __init trim_bios_range(void)
711 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 744 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
712} 745}
713 746
747/* called before trim_bios_range() to spare extra sanitize */
748static void __init e820_add_kernel_range(void)
749{
750 u64 start = __pa_symbol(_text);
751 u64 size = __pa_symbol(_end) - start;
752
753 /*
754 * Complain if .text .data and .bss are not marked as E820_RAM and
755 * attempt to fix it by adding the range. We may have a confused BIOS,
756 * or the user may have used memmap=exactmap or memmap=xxM$yyM to
757 * exclude kernel range. If we really are running on top non-RAM,
758 * we will crash later anyways.
759 */
760 if (e820_all_mapped(start, start + size, E820_RAM))
761 return;
762
763 pr_warn(".text .data .bss are not marked as E820_RAM!\n");
764 e820_remove_range(start, size, E820_RAM, 0);
765 e820_add_region(start, size, E820_RAM);
766}
767
768static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
769
714static int __init parse_reservelow(char *p) 770static int __init parse_reservelow(char *p)
715{ 771{
716 unsigned long long size; 772 unsigned long long size;
@@ -733,6 +789,11 @@ static int __init parse_reservelow(char *p)
733 789
734early_param("reservelow", parse_reservelow); 790early_param("reservelow", parse_reservelow);
735 791
792static void __init trim_low_memory_range(void)
793{
794 memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
795}
796
736/* 797/*
737 * Determine if we were loaded by an EFI loader. If so, then we have also been 798 * Determine if we were loaded by an EFI loader. If so, then we have also been
738 * passed the efi memmap, systab, etc., so we should use these data structures 799 * passed the efi memmap, systab, etc., so we should use these data structures
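Together with the trim_bios_range() change above, the low-memory quarantine moves out of e820 (only the first page is still removed there) and into a plain memblock reservation done by the new trim_low_memory_range(). The size defaults to CONFIG_X86_RESERVE_LOW KiB and can still be overridden on the command line; assuming parse_reservelow() accepts the usual memparse() suffixes, booting with

	reservelow=640k

would keep the whole classic sub-640K area away from the allocator.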
@@ -748,6 +809,17 @@ early_param("reservelow", parse_reservelow);
748 809
749void __init setup_arch(char **cmdline_p) 810void __init setup_arch(char **cmdline_p)
750{ 811{
812 memblock_reserve(__pa_symbol(_text),
813 (unsigned long)__bss_stop - (unsigned long)_text);
814
815 early_reserve_initrd();
816
817 /*
818 * At this point everything still needed from the boot loader
819 * or BIOS or kernel text should be early reserved or marked not
820 * RAM in e820. All other memory is free game.
821 */
822
751#ifdef CONFIG_X86_32 823#ifdef CONFIG_X86_32
752 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 824 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
753 visws_early_detect(); 825 visws_early_detect();
@@ -835,12 +907,12 @@ void __init setup_arch(char **cmdline_p)
835 init_mm.end_data = (unsigned long) _edata; 907 init_mm.end_data = (unsigned long) _edata;
836 init_mm.brk = _brk_end; 908 init_mm.brk = _brk_end;
837 909
838 code_resource.start = virt_to_phys(_text); 910 code_resource.start = __pa_symbol(_text);
839 code_resource.end = virt_to_phys(_etext)-1; 911 code_resource.end = __pa_symbol(_etext)-1;
840 data_resource.start = virt_to_phys(_etext); 912 data_resource.start = __pa_symbol(_etext);
841 data_resource.end = virt_to_phys(_edata)-1; 913 data_resource.end = __pa_symbol(_edata)-1;
842 bss_resource.start = virt_to_phys(&__bss_start); 914 bss_resource.start = __pa_symbol(__bss_start);
843 bss_resource.end = virt_to_phys(&__bss_stop)-1; 915 bss_resource.end = __pa_symbol(__bss_stop)-1;
844 916
845#ifdef CONFIG_CMDLINE_BOOL 917#ifdef CONFIG_CMDLINE_BOOL
846#ifdef CONFIG_CMDLINE_OVERRIDE 918#ifdef CONFIG_CMDLINE_OVERRIDE
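virt_to_phys()/__pa() is meant for direct-mapped addresses, while _text, _etext, __bss_start and friends live in the kernel-image mapping, so the resource setup is switched to __pa_symbol(). A hedged sketch of what __pa_symbol() boils down to on x86-64 when CONFIG_DEBUG_VIRTUAL is off (the macro in the tree is spelled differently):

/* Sketch, not the kernel's literal definition. */
#define PA_SYMBOL_SKETCH(x) \
	((unsigned long)(x) - __START_KERNEL_map + phys_base)

With CONFIG_DEBUG_VIRTUAL the same translation goes through an out-of-line helper that can range-check the address, which is presumably part of why the image symbols are moved to the dedicated helper here.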
@@ -906,6 +978,7 @@ void __init setup_arch(char **cmdline_p)
906 insert_resource(&iomem_resource, &data_resource); 978 insert_resource(&iomem_resource, &data_resource);
907 insert_resource(&iomem_resource, &bss_resource); 979 insert_resource(&iomem_resource, &bss_resource);
908 980
981 e820_add_kernel_range();
909 trim_bios_range(); 982 trim_bios_range();
910#ifdef CONFIG_X86_32 983#ifdef CONFIG_X86_32
911 if (ppro_with_ram_bug()) { 984 if (ppro_with_ram_bug()) {
@@ -955,6 +1028,8 @@ void __init setup_arch(char **cmdline_p)
955 1028
956 reserve_ibft_region(); 1029 reserve_ibft_region();
957 1030
1031 early_alloc_pgt_buf();
1032
958 /* 1033 /*
959 * Need to conclude brk, before memblock_x86_fill() 1034 * Need to conclude brk, before memblock_x86_fill()
960 * it could use memblock_find_in_range, could overlap with 1035 * it could use memblock_find_in_range, could overlap with
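early_alloc_pgt_buf() is introduced elsewhere in this series; given the comment that follows (brk has to be concluded before memblock_x86_fill()), it evidently carves the initial page-table buffer out of the brk area while that is still possible. A heavily hedged sketch of that idea (buffer size and variable names are assumptions):

/* Hedged sketch: set aside a fixed chunk of brk for early page tables. */
void __init early_alloc_pgt_buf_sketch(void)
{
	unsigned long tables = 6 * PAGE_SIZE;		/* assumed size */
	phys_addr_t base = __pa(extend_brk(tables, PAGE_SIZE));

	pgt_buf_start = base >> PAGE_SHIFT;
	pgt_buf_end   = pgt_buf_start;
	pgt_buf_top   = pgt_buf_start + (tables >> PAGE_SHIFT);
}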
@@ -964,7 +1039,7 @@ void __init setup_arch(char **cmdline_p)
964 1039
965 cleanup_highmap(); 1040 cleanup_highmap();
966 1041
967 memblock.current_limit = get_max_mapped(); 1042 memblock.current_limit = ISA_END_ADDRESS;
968 memblock_x86_fill(); 1043 memblock_x86_fill();
969 1044
970 /* 1045 /*
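Dropping memblock.current_limit to ISA_END_ADDRESS before memblock_x86_fill() keeps every memblock allocation made between here and init_mem_mapping() inside the first megabyte, which the early page tables are expected to cover; the limit is raised back to get_max_mapped() further down once the direct mapping exists. For reference, the constant is simply the 1 MiB mark:

#define ISA_END_ADDRESS		0x100000	/* 1 MiB */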
@@ -981,41 +1056,22 @@ void __init setup_arch(char **cmdline_p)
981 setup_bios_corruption_check(); 1056 setup_bios_corruption_check();
982#endif 1057#endif
983 1058
1059#ifdef CONFIG_X86_32
984 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", 1060 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
985 (max_pfn_mapped<<PAGE_SHIFT) - 1); 1061 (max_pfn_mapped<<PAGE_SHIFT) - 1);
1062#endif
986 1063
987 setup_real_mode(); 1064 reserve_real_mode();
988 1065
989 trim_platform_memory_ranges(); 1066 trim_platform_memory_ranges();
1067 trim_low_memory_range();
990 1068
991 init_gbpages(); 1069 init_mem_mapping();
992
993 /* max_pfn_mapped is updated here */
994 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
995 max_pfn_mapped = max_low_pfn_mapped;
996
997#ifdef CONFIG_X86_64
998 if (max_pfn > max_low_pfn) {
999 int i;
1000 unsigned long start, end;
1001 unsigned long start_pfn, end_pfn;
1002
1003 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn,
1004 NULL) {
1005 1070
1006 end = PFN_PHYS(end_pfn); 1071 early_trap_pf_init();
1007 if (end <= (1UL<<32))
1008 continue;
1009 1072
1010 start = PFN_PHYS(start_pfn); 1073 setup_real_mode();
1011 max_pfn_mapped = init_memory_mapping(
1012 max((1UL<<32), start), end);
1013 }
1014 1074
1015 /* can we preseve max_low_pfn ?*/
1016 max_low_pfn = max_pfn;
1017 }
1018#endif
1019 memblock.current_limit = get_max_mapped(); 1075 memblock.current_limit = get_max_mapped();
1020 dma_contiguous_reserve(0); 1076 dma_contiguous_reserve(0);
1021 1077
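This is the core of the rework: init_gbpages() plus the hand-rolled init_memory_mapping() calls (including the above-4G walk on 64-bit) collapse into a single init_mem_mapping(), which now owns the max_pfn_mapped updates, and the remaining calls are reordered. As visible in the hunk, the sequence becomes roughly:

	reserve_real_mode();		/* only reserves the trampoline memory */
	trim_platform_memory_ranges();
	trim_low_memory_range();	/* new, see above */
	init_mem_mapping();		/* builds the direct mapping */
	early_trap_pf_init();		/* 64-bit: install the final #PF handler */
	setup_real_mode();		/* now done after the mapping exists */
	memblock.current_limit = get_max_mapped();

The reserve_real_mode()/setup_real_mode() split (reserve early, populate late) is an inference from the call names and their new positions, not something spelled out in this diff.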
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ecffca11f4e9..68bda7a84159 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -688,10 +688,19 @@ void __init early_trap_init(void)
688 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); 688 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
689 /* int3 can be called from all */ 689 /* int3 can be called from all */
690 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); 690 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
691#ifdef CONFIG_X86_32
691 set_intr_gate(X86_TRAP_PF, &page_fault); 692 set_intr_gate(X86_TRAP_PF, &page_fault);
693#endif
692 load_idt(&idt_descr); 694 load_idt(&idt_descr);
693} 695}
694 696
697void __init early_trap_pf_init(void)
698{
699#ifdef CONFIG_X86_64
700 set_intr_gate(X86_TRAP_PF, &page_fault);
701#endif
702}
703
695void __init trap_init(void) 704void __init trap_init(void)
696{ 705{
697 int i; 706 int i;
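Combined with the setup_arch() hunk above, the effect is: 32-bit still gets its #PF gate from early_trap_init(), while 64-bit defers it to the new early_trap_pf_init(), which setup_arch() only calls once init_mem_mapping() has run; presumably an early boot-time page-fault handler installed elsewhere in this series covers the window in between. The grounded part is just the ordering visible above:

	/* setup_arch(), 64-bit path */
	init_mem_mapping();
	early_trap_pf_init();	/* safe now: kernel mappings are in place */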
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1330dd102950..b014d9414d08 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -59,6 +59,9 @@ EXPORT_SYMBOL(memcpy);
59EXPORT_SYMBOL(__memcpy); 59EXPORT_SYMBOL(__memcpy);
60EXPORT_SYMBOL(memmove); 60EXPORT_SYMBOL(memmove);
61 61
62#ifndef CONFIG_DEBUG_VIRTUAL
63EXPORT_SYMBOL(phys_base);
64#endif
62EXPORT_SYMBOL(empty_zero_page); 65EXPORT_SYMBOL(empty_zero_page);
63#ifndef CONFIG_PARAVIRT 66#ifndef CONFIG_PARAVIRT
64EXPORT_SYMBOL(native_load_gs_index); 67EXPORT_SYMBOL(native_load_gs_index);
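Making the phys_base export conditional fits the __pa_symbol() conversion earlier in this patch: without CONFIG_DEBUG_VIRTUAL the translation is inlined into each caller (modules included) and reads phys_base directly, whereas the debug configuration routes it through an out-of-line, range-checked helper that is exported in its place. That reading is inferred from the ifdef, not stated in the diff; roughly:

/* Hedged sketch of the two configurations. */
#ifndef CONFIG_DEBUG_VIRTUAL
/* Inline arithmetic in every caller -> modules need phys_base itself. */
# define __pa_symbol_sketch(x) \
	((unsigned long)(x) - __START_KERNEL_map + phys_base)
#else
/* The checked, out-of-line helper is exported instead. */
unsigned long __phys_addr_symbol(unsigned long x);
# define __pa_symbol_sketch(x) __phys_addr_symbol((unsigned long)(x))
#endif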
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index d065d67c2672..45a14dbbddaf 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -63,10 +63,6 @@ struct x86_init_ops x86_init __initdata = {
63 .banner = default_banner, 63 .banner = default_banner,
64 }, 64 },
65 65
66 .mapping = {
67 .pagetable_reserve = native_pagetable_reserve,
68 },
69
70 .paging = { 66 .paging = {
71 .pagetable_init = native_pagetable_init, 67 .pagetable_init = native_pagetable_init,
72 }, 68 },