author     H. Peter Anvin <hpa@linux.intel.com>	2013-02-15 12:25:08 -0500
committer  H. Peter Anvin <hpa@linux.intel.com>	2013-02-15 12:25:08 -0500
commit     0da3e7f526fde7a6522a3038b7ce609fc50f6707 (patch)
tree       c6e2dacd96fe7eac8312f3d7c22e0995dc423879 /arch/x86/kernel
parent     95c9608478d639dcffc14ea47b31bff021a99ed1 (diff)
parent     68d00bbebb5a48b7a9056a8c03476a71ecbc30a6 (diff)

Merge branch 'x86/mm2' into x86/mm

x86/mm2 is testing out fine, but has developed conflicts with x86/mm
due to patches in adjacent code.  Merge them so we can drop x86/mm2
and have a unified branch.

Resolved Conflicts:
	arch/x86/kernel/setup.c
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--   arch/x86/kernel/acpi/boot.c            1
-rw-r--r--   arch/x86/kernel/amd_gart_64.c          5
-rw-r--r--   arch/x86/kernel/cpu/amd.c              9
-rw-r--r--   arch/x86/kernel/cpu/intel.c            1
-rw-r--r--   arch/x86/kernel/e820.c                16
-rw-r--r--   arch/x86/kernel/head32.c              21
-rw-r--r--   arch/x86/kernel/head64.c             133
-rw-r--r--   arch/x86/kernel/head_64.S            210
-rw-r--r--   arch/x86/kernel/machine_kexec_64.c   171
-rw-r--r--   arch/x86/kernel/setup.c              231
-rw-r--r--   arch/x86/kernel/traps.c                9
-rw-r--r--   arch/x86/kernel/x86_init.c             4
12 files changed, 449 insertions(+), 362 deletions(-)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index bacf4b0d91f4..cfc755dc1607 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled);
 
 #ifdef CONFIG_X86_64
 # include <asm/proto.h>
-# include <asm/numa_64.h>
 #endif /* X86 */
 
 #define BAD_MADT_ENTRY(entry, end) ( \
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index e66311200cbd..b574b295a2f9 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -768,10 +768,9 @@ int __init gart_iommu_init(void)
 	aper_base	= info.aper_base;
 	end_pfn		= (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
 
-	if (end_pfn > max_low_pfn_mapped) {
-		start_pfn = (aper_base>>PAGE_SHIFT);
-		init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
-	}
+	start_pfn = PFN_DOWN(aper_base);
+	if (!pfn_range_is_mapped(start_pfn, end_pfn))
+		init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
 
 	pr_info("PCI-DMA: using GART IOMMU.\n");
 	iommu_size = check_iommu_size(info.aper_base, aper_size);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 15239fffd6fe..eafb084e80f8 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -12,7 +12,6 @@
 #include <asm/pci-direct.h>
 
 #ifdef CONFIG_X86_64
-# include <asm/numa_64.h>
 # include <asm/mmconfig.h>
 # include <asm/cacheflush.h>
 #endif
@@ -685,12 +684,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		 * benefit in doing so.
 		 */
 		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+			unsigned long pfn = tseg >> PAGE_SHIFT;
+
 			printk(KERN_DEBUG "tseg: %010llx\n", tseg);
-			if ((tseg>>PMD_SHIFT) <
-				(max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
-				((tseg>>PMD_SHIFT) <
-				 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
-				 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
+			if (pfn_range_is_mapped(pfn, pfn + 1))
 				set_memory_4k((unsigned long)__va(tseg), 1);
 		}
 	}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fdfefa27b948..1905ce98bee0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -17,7 +17,6 @@
 
 #ifdef CONFIG_X86_64
 #include <linux/topology.h>
-#include <asm/numa_64.h>
 #endif
 
 #include "cpu.h"
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index df06ade26bef..d32abeabbda5 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -835,7 +835,7 @@ static int __init parse_memopt(char *p)
 }
 early_param("mem", parse_memopt);
 
-static int __init parse_memmap_opt(char *p)
+static int __init parse_memmap_one(char *p)
 {
 	char *oldp;
 	u64 start_at, mem_size;
@@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p)
 
 	return *p == '\0' ? 0 : -EINVAL;
 }
+static int __init parse_memmap_opt(char *str)
+{
+	while (str) {
+		char *k = strchr(str, ',');
+
+		if (k)
+			*k++ = 0;
+
+		parse_memmap_one(str);
+		str = k;
+	}
+
+	return 0;
+}
 early_param("memmap", parse_memmap_opt);
 
 void __init finish_e820_parsing(void)
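
The new parse_memmap_opt() wrapper above lets one memmap= option carry several comma-separated regions, each handed to parse_memmap_one(). Below is a minimal standalone sketch of that same splitting loop; it is ordinary user-space C written for illustration only, and the region strings are hypothetical examples of a memmap= payload, not taken from this commit.

/* Standalone sketch of the comma-splitting wrapper added above.
 * Illustration only; the region specs below are hypothetical. */
#include <stdio.h>
#include <string.h>

static void parse_one(const char *p)
{
	printf("would parse region spec \"%s\"\n", p);
}

static void parse_list(char *str)
{
	while (str) {
		char *k = strchr(str, ',');

		if (k)
			*k++ = 0;	/* terminate this piece, step past ',' */
		parse_one(str);
		str = k;		/* NULL once the last piece is consumed */
	}
}

int main(void)
{
	char arg[] = "512M@0,64M$1G";	/* two regions in a single option */

	parse_list(arg);
	return 0;
}
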
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index e17554832991..138463a24877 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -18,6 +18,7 @@
 #include <asm/io_apic.h>
 #include <asm/bios_ebda.h>
 #include <asm/tlbflush.h>
+#include <asm/bootparam_utils.h>
 
 static void __init i386_default_early_setup(void)
 {
@@ -30,19 +31,7 @@ static void __init i386_default_early_setup(void)
 
 void __init i386_start_kernel(void)
 {
-	memblock_reserve(__pa_symbol(_text),
-			 (unsigned long)__bss_stop - (unsigned long)_text);
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	/* Reserve INITRD */
-	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
-		/* Assume only end is not page aligned */
-		u64 ramdisk_image = boot_params.hdr.ramdisk_image;
-		u64 ramdisk_size = boot_params.hdr.ramdisk_size;
-		u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
-		memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
-	}
-#endif
+	sanitize_boot_params(&boot_params);
 
 	/* Call the subarch specific early setup function */
 	switch (boot_params.hdr.hardware_subarch) {
@@ -57,11 +46,5 @@ void __init i386_start_kernel(void)
 		break;
 	}
 
-	/*
-	 * At this point everything still needed from the boot loader
-	 * or BIOS or kernel text should be early reserved or marked not
-	 * RAM in e820. All other memory is free game.
-	 */
-
 	start_kernel();
 }
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7b215a50ec1e..57334f4cd3af 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -25,12 +25,83 @@
25#include <asm/kdebug.h> 25#include <asm/kdebug.h>
26#include <asm/e820.h> 26#include <asm/e820.h>
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/bootparam_utils.h>
28 29
29static void __init zap_identity_mappings(void) 30/*
31 * Manage page tables very early on.
32 */
33extern pgd_t early_level4_pgt[PTRS_PER_PGD];
34extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
35static unsigned int __initdata next_early_pgt = 2;
36
37/* Wipe all early page tables except for the kernel symbol map */
38static void __init reset_early_page_tables(void)
39{
40 unsigned long i;
41
42 for (i = 0; i < PTRS_PER_PGD-1; i++)
43 early_level4_pgt[i].pgd = 0;
44
45 next_early_pgt = 0;
46
47 write_cr3(__pa(early_level4_pgt));
48}
49
50/* Create a new PMD entry */
51int __init early_make_pgtable(unsigned long address)
30{ 52{
31 pgd_t *pgd = pgd_offset_k(0UL); 53 unsigned long physaddr = address - __PAGE_OFFSET;
32 pgd_clear(pgd); 54 unsigned long i;
33 __flush_tlb_all(); 55 pgdval_t pgd, *pgd_p;
56 pudval_t pud, *pud_p;
57 pmdval_t pmd, *pmd_p;
58
59 /* Invalid address or early pgt is done ? */
60 if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
61 return -1;
62
63again:
64 pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
65 pgd = *pgd_p;
66
67 /*
68 * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
69 * critical -- __PAGE_OFFSET would point us back into the dynamic
70 * range and we might end up looping forever...
71 */
72 if (pgd)
73 pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
74 else {
75 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
76 reset_early_page_tables();
77 goto again;
78 }
79
80 pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
81 for (i = 0; i < PTRS_PER_PUD; i++)
82 pud_p[i] = 0;
83 *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
84 }
85 pud_p += pud_index(address);
86 pud = *pud_p;
87
88 if (pud)
89 pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
90 else {
91 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
92 reset_early_page_tables();
93 goto again;
94 }
95
96 pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
97 for (i = 0; i < PTRS_PER_PMD; i++)
98 pmd_p[i] = 0;
99 *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
100 }
101 pmd = (physaddr & PMD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
102 pmd_p[pmd_index(address)] = pmd;
103
104 return 0;
34} 105}
35 106
36/* Don't add a printk in there. printk relies on the PDA which is not initialized 107/* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -41,13 +112,25 @@ static void __init clear_bss(void)
41 (unsigned long) __bss_stop - (unsigned long) __bss_start); 112 (unsigned long) __bss_stop - (unsigned long) __bss_start);
42} 113}
43 114
115static unsigned long get_cmd_line_ptr(void)
116{
117 unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
118
119 cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32;
120
121 return cmd_line_ptr;
122}
123
44static void __init copy_bootdata(char *real_mode_data) 124static void __init copy_bootdata(char *real_mode_data)
45{ 125{
46 char * command_line; 126 char * command_line;
127 unsigned long cmd_line_ptr;
47 128
48 memcpy(&boot_params, real_mode_data, sizeof boot_params); 129 memcpy(&boot_params, real_mode_data, sizeof boot_params);
49 if (boot_params.hdr.cmd_line_ptr) { 130 sanitize_boot_params(&boot_params);
50 command_line = __va(boot_params.hdr.cmd_line_ptr); 131 cmd_line_ptr = get_cmd_line_ptr();
132 if (cmd_line_ptr) {
133 command_line = __va(cmd_line_ptr);
51 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); 134 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
52 } 135 }
53} 136}
@@ -70,14 +153,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
70 (__START_KERNEL & PGDIR_MASK))); 153 (__START_KERNEL & PGDIR_MASK)));
71 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); 154 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
72 155
156 /* Kill off the identity-map trampoline */
157 reset_early_page_tables();
158
73 /* clear bss before set_intr_gate with early_idt_handler */ 159 /* clear bss before set_intr_gate with early_idt_handler */
74 clear_bss(); 160 clear_bss();
75 161
76 /* Make NULL pointers segfault */
77 zap_identity_mappings();
78
79 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
80
81 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { 162 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
82#ifdef CONFIG_EARLY_PRINTK 163#ifdef CONFIG_EARLY_PRINTK
83 set_intr_gate(i, &early_idt_handlers[i]); 164 set_intr_gate(i, &early_idt_handlers[i]);
@@ -87,37 +168,25 @@ void __init x86_64_start_kernel(char * real_mode_data)
87 } 168 }
88 load_idt((const struct desc_ptr *)&idt_descr); 169 load_idt((const struct desc_ptr *)&idt_descr);
89 170
171 copy_bootdata(__va(real_mode_data));
172
90 if (console_loglevel == 10) 173 if (console_loglevel == 10)
91 early_printk("Kernel alive\n"); 174 early_printk("Kernel alive\n");
92 175
176 clear_page(init_level4_pgt);
177 /* set init_level4_pgt kernel high mapping*/
178 init_level4_pgt[511] = early_level4_pgt[511];
179
93 x86_64_start_reservations(real_mode_data); 180 x86_64_start_reservations(real_mode_data);
94} 181}
95 182
96void __init x86_64_start_reservations(char *real_mode_data) 183void __init x86_64_start_reservations(char *real_mode_data)
97{ 184{
98 copy_bootdata(__va(real_mode_data)); 185 /* version is always not zero if it is copied */
99 186 if (!boot_params.hdr.version)
100 memblock_reserve(__pa_symbol(_text), 187 copy_bootdata(__va(real_mode_data));
101 (unsigned long)__bss_stop - (unsigned long)_text);
102
103#ifdef CONFIG_BLK_DEV_INITRD
104 /* Reserve INITRD */
105 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
106 /* Assume only end is not page aligned */
107 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
108 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
109 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
110 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
111 }
112#endif
113 188
114 reserve_ebda_region(); 189 reserve_ebda_region();
115 190
116 /*
117 * At this point everything still needed from the boot loader
118 * or BIOS or kernel text should be early reserved or marked not
119 * RAM in e820. All other memory is free game.
120 */
121
122 start_kernel(); 191 start_kernel();
123} 192}
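
One detail worth calling out in the head64.c hunk above: get_cmd_line_ptr() now folds boot_params.ext_cmd_line_ptr into the high 32 bits, so a command line placed above 4GB is reachable. The sketch below recombines the two halves the same way; the struct and the address are purely illustrative and are not the real struct boot_params layout.

/* Sketch of the 64-bit command-line pointer recombination performed by
 * get_cmd_line_ptr() above. Fields and values are illustrative only. */
#include <stdint.h>
#include <stdio.h>

struct fake_boot_params {
	uint32_t cmd_line_ptr;		/* hdr.cmd_line_ptr: low 32 bits  */
	uint32_t ext_cmd_line_ptr;	/* ext_cmd_line_ptr: high 32 bits */
};

static uint64_t get_cmd_line_ptr(const struct fake_boot_params *bp)
{
	uint64_t cmd_line_ptr = bp->cmd_line_ptr;

	cmd_line_ptr |= (uint64_t)bp->ext_cmd_line_ptr << 32;

	return cmd_line_ptr;
}

int main(void)
{
	/* a command line parked just above 4GB, e.g. at 0x123456000 */
	struct fake_boot_params bp = {
		.cmd_line_ptr     = 0x23456000,
		.ext_cmd_line_ptr = 0x1,
	};

	printf("command line at %#llx\n",
	       (unsigned long long)get_cmd_line_ptr(&bp));
	return 0;
}
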
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 980053c4b9cc..d94f6d68be2a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
47 .code64 47 .code64
48 .globl startup_64 48 .globl startup_64
49startup_64: 49startup_64:
50
51 /* 50 /*
52 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, 51 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
53 * and someone has loaded an identity mapped page table 52 * and someone has loaded an identity mapped page table
54 * for us. These identity mapped page tables map all of the 53 * for us. These identity mapped page tables map all of the
55 * kernel pages and possibly all of memory. 54 * kernel pages and possibly all of memory.
56 * 55 *
57 * %esi holds a physical pointer to real_mode_data. 56 * %rsi holds a physical pointer to real_mode_data.
58 * 57 *
59 * We come here either directly from a 64bit bootloader, or from 58 * We come here either directly from a 64bit bootloader, or from
60 * arch/x86_64/boot/compressed/head.S. 59 * arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
66 * tables and then reload them. 65 * tables and then reload them.
67 */ 66 */
68 67
69 /* Compute the delta between the address I am compiled to run at and the 68 /*
69 * Compute the delta between the address I am compiled to run at and the
70 * address I am actually running at. 70 * address I am actually running at.
71 */ 71 */
72 leaq _text(%rip), %rbp 72 leaq _text(%rip), %rbp
@@ -78,45 +78,62 @@ startup_64:
78 testl %eax, %eax 78 testl %eax, %eax
79 jnz bad_address 79 jnz bad_address
80 80
81 /* Is the address too large? */ 81 /*
82 leaq _text(%rip), %rdx 82 * Is the address too large?
83 movq $PGDIR_SIZE, %rax
84 cmpq %rax, %rdx
85 jae bad_address
86
87 /* Fixup the physical addresses in the page table
88 */ 83 */
89 addq %rbp, init_level4_pgt + 0(%rip) 84 leaq _text(%rip), %rax
90 addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip) 85 shrq $MAX_PHYSMEM_BITS, %rax
91 addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip) 86 jnz bad_address
92 87
93 addq %rbp, level3_ident_pgt + 0(%rip) 88 /*
89 * Fixup the physical addresses in the page table
90 */
91 addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
94 92
95 addq %rbp, level3_kernel_pgt + (510*8)(%rip) 93 addq %rbp, level3_kernel_pgt + (510*8)(%rip)
96 addq %rbp, level3_kernel_pgt + (511*8)(%rip) 94 addq %rbp, level3_kernel_pgt + (511*8)(%rip)
97 95
98 addq %rbp, level2_fixmap_pgt + (506*8)(%rip) 96 addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
99 97
100 /* Add an Identity mapping if I am above 1G */ 98 /*
99 * Set up the identity mapping for the switchover. These
100 * entries should *NOT* have the global bit set! This also
101 * creates a bunch of nonsense entries but that is fine --
102 * it avoids problems around wraparound.
103 */
101 leaq _text(%rip), %rdi 104 leaq _text(%rip), %rdi
102 andq $PMD_PAGE_MASK, %rdi 105 leaq early_level4_pgt(%rip), %rbx
103 106
104 movq %rdi, %rax 107 movq %rdi, %rax
105 shrq $PUD_SHIFT, %rax 108 shrq $PGDIR_SHIFT, %rax
106 andq $(PTRS_PER_PUD - 1), %rax
107 jz ident_complete
108 109
109 leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx 110 leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx
110 leaq level3_ident_pgt(%rip), %rbx 111 movq %rdx, 0(%rbx,%rax,8)
111 movq %rdx, 0(%rbx, %rax, 8) 112 movq %rdx, 8(%rbx,%rax,8)
112 113
114 addq $4096, %rdx
113 movq %rdi, %rax 115 movq %rdi, %rax
114 shrq $PMD_SHIFT, %rax 116 shrq $PUD_SHIFT, %rax
115 andq $(PTRS_PER_PMD - 1), %rax 117 andl $(PTRS_PER_PUD-1), %eax
116 leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx 118 movq %rdx, (4096+0)(%rbx,%rax,8)
117 leaq level2_spare_pgt(%rip), %rbx 119 movq %rdx, (4096+8)(%rbx,%rax,8)
118 movq %rdx, 0(%rbx, %rax, 8) 120
119ident_complete: 121 addq $8192, %rbx
122 movq %rdi, %rax
123 shrq $PMD_SHIFT, %rdi
124 addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
125 leaq (_end - 1)(%rip), %rcx
126 shrq $PMD_SHIFT, %rcx
127 subq %rdi, %rcx
128 incl %ecx
129
1301:
131 andq $(PTRS_PER_PMD - 1), %rdi
132 movq %rax, (%rbx,%rdi,8)
133 incq %rdi
134 addq $PMD_SIZE, %rax
135 decl %ecx
136 jnz 1b
120 137
121 /* 138 /*
122 * Fixup the kernel text+data virtual addresses. Note that 139 * Fixup the kernel text+data virtual addresses. Note that
@@ -124,7 +141,6 @@ ident_complete:
124 * cleanup_highmap() fixes this up along with the mappings 141 * cleanup_highmap() fixes this up along with the mappings
125 * beyond _end. 142 * beyond _end.
126 */ 143 */
127
128 leaq level2_kernel_pgt(%rip), %rdi 144 leaq level2_kernel_pgt(%rip), %rdi
129 leaq 4096(%rdi), %r8 145 leaq 4096(%rdi), %r8
130 /* See if it is a valid page table entry */ 146 /* See if it is a valid page table entry */
@@ -139,17 +155,14 @@ ident_complete:
139 /* Fixup phys_base */ 155 /* Fixup phys_base */
140 addq %rbp, phys_base(%rip) 156 addq %rbp, phys_base(%rip)
141 157
142 /* Due to ENTRY(), sometimes the empty space gets filled with 158 movq $(early_level4_pgt - __START_KERNEL_map), %rax
143 * zeros. Better take a jmp than relying on empty space being 159 jmp 1f
144 * filled with 0x90 (nop)
145 */
146 jmp secondary_startup_64
147ENTRY(secondary_startup_64) 160ENTRY(secondary_startup_64)
148 /* 161 /*
149 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, 162 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
150 * and someone has loaded a mapped page table. 163 * and someone has loaded a mapped page table.
151 * 164 *
152 * %esi holds a physical pointer to real_mode_data. 165 * %rsi holds a physical pointer to real_mode_data.
153 * 166 *
154 * We come here either from startup_64 (using physical addresses) 167 * We come here either from startup_64 (using physical addresses)
155 * or from trampoline.S (using virtual addresses). 168 * or from trampoline.S (using virtual addresses).
@@ -159,12 +172,14 @@ ENTRY(secondary_startup_64)
159 * after the boot processor executes this code. 172 * after the boot processor executes this code.
160 */ 173 */
161 174
175 movq $(init_level4_pgt - __START_KERNEL_map), %rax
1761:
177
162 /* Enable PAE mode and PGE */ 178 /* Enable PAE mode and PGE */
163 movl $(X86_CR4_PAE | X86_CR4_PGE), %eax 179 movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
164 movq %rax, %cr4 180 movq %rcx, %cr4
165 181
166 /* Setup early boot stage 4 level pagetables. */ 182 /* Setup early boot stage 4 level pagetables. */
167 movq $(init_level4_pgt - __START_KERNEL_map), %rax
168 addq phys_base(%rip), %rax 183 addq phys_base(%rip), %rax
169 movq %rax, %cr3 184 movq %rax, %cr3
170 185
@@ -196,7 +211,7 @@ ENTRY(secondary_startup_64)
196 movq %rax, %cr0 211 movq %rax, %cr0
197 212
198 /* Setup a boot time stack */ 213 /* Setup a boot time stack */
199 movq stack_start(%rip),%rsp 214 movq stack_start(%rip), %rsp
200 215
201 /* zero EFLAGS after setting rsp */ 216 /* zero EFLAGS after setting rsp */
202 pushq $0 217 pushq $0
@@ -236,15 +251,33 @@ ENTRY(secondary_startup_64)
236 movl initial_gs+4(%rip),%edx 251 movl initial_gs+4(%rip),%edx
237 wrmsr 252 wrmsr
238 253
239 /* esi is pointer to real mode structure with interesting info. 254 /* rsi is pointer to real mode structure with interesting info.
240 pass it to C */ 255 pass it to C */
241 movl %esi, %edi 256 movq %rsi, %rdi
242 257
243 /* Finally jump to run C code and to be on real kernel address 258 /* Finally jump to run C code and to be on real kernel address
244 * Since we are running on identity-mapped space we have to jump 259 * Since we are running on identity-mapped space we have to jump
245 * to the full 64bit address, this is only possible as indirect 260 * to the full 64bit address, this is only possible as indirect
246 * jump. In addition we need to ensure %cs is set so we make this 261 * jump. In addition we need to ensure %cs is set so we make this
247 * a far return. 262 * a far return.
263 *
264 * Note: do not change to far jump indirect with 64bit offset.
265 *
266 * AMD does not support far jump indirect with 64bit offset.
267 * AMD64 Architecture Programmer's Manual, Volume 3: states only
268 * JMP FAR mem16:16 FF /5 Far jump indirect,
269 * with the target specified by a far pointer in memory.
270 * JMP FAR mem16:32 FF /5 Far jump indirect,
271 * with the target specified by a far pointer in memory.
272 *
273 * Intel64 does support 64bit offset.
274 * Software Developer Manual Vol 2: states:
275 * FF /5 JMP m16:16 Jump far, absolute indirect,
276 * address given in m16:16
277 * FF /5 JMP m16:32 Jump far, absolute indirect,
278 * address given in m16:32.
279 * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
280 * address given in m16:64.
248 */ 281 */
249 movq initial_code(%rip),%rax 282 movq initial_code(%rip),%rax
250 pushq $0 # fake return address to stop unwinder 283 pushq $0 # fake return address to stop unwinder
@@ -270,13 +303,13 @@ ENDPROC(start_cpu0)
270 303
271 /* SMP bootup changes these two */ 304 /* SMP bootup changes these two */
272 __REFDATA 305 __REFDATA
273 .align 8 306 .balign 8
274 ENTRY(initial_code) 307 GLOBAL(initial_code)
275 .quad x86_64_start_kernel 308 .quad x86_64_start_kernel
276 ENTRY(initial_gs) 309 GLOBAL(initial_gs)
277 .quad INIT_PER_CPU_VAR(irq_stack_union) 310 .quad INIT_PER_CPU_VAR(irq_stack_union)
278 311
279 ENTRY(stack_start) 312 GLOBAL(stack_start)
280 .quad init_thread_union+THREAD_SIZE-8 313 .quad init_thread_union+THREAD_SIZE-8
281 .word 0 314 .word 0
282 __FINITDATA 315 __FINITDATA
@@ -284,7 +317,7 @@ ENDPROC(start_cpu0)
284bad_address: 317bad_address:
285 jmp bad_address 318 jmp bad_address
286 319
287 .section ".init.text","ax" 320 __INIT
288 .globl early_idt_handlers 321 .globl early_idt_handlers
289early_idt_handlers: 322early_idt_handlers:
290 # 104(%rsp) %rflags 323 # 104(%rsp) %rflags
@@ -321,14 +354,22 @@ ENTRY(early_idt_handler)
321 pushq %r11 # 0(%rsp) 354 pushq %r11 # 0(%rsp)
322 355
323 cmpl $__KERNEL_CS,96(%rsp) 356 cmpl $__KERNEL_CS,96(%rsp)
324 jne 10f 357 jne 11f
358
359 cmpl $14,72(%rsp) # Page fault?
360 jnz 10f
361 GET_CR2_INTO(%rdi) # can clobber any volatile register if pv
362 call early_make_pgtable
363 andl %eax,%eax
364 jz 20f # All good
325 365
36610:
326 leaq 88(%rsp),%rdi # Pointer to %rip 367 leaq 88(%rsp),%rdi # Pointer to %rip
327 call early_fixup_exception 368 call early_fixup_exception
328 andl %eax,%eax 369 andl %eax,%eax
329 jnz 20f # Found an exception entry 370 jnz 20f # Found an exception entry
330 371
33110: 37211:
332#ifdef CONFIG_EARLY_PRINTK 373#ifdef CONFIG_EARLY_PRINTK
333 GET_CR2_INTO(%r9) # can clobber any volatile register if pv 374 GET_CR2_INTO(%r9) # can clobber any volatile register if pv
334 movl 80(%rsp),%r8d # error code 375 movl 80(%rsp),%r8d # error code
@@ -350,7 +391,7 @@ ENTRY(early_idt_handler)
3501: hlt 3911: hlt
351 jmp 1b 392 jmp 1b
352 393
35320: # Exception table entry found 39420: # Exception table entry found or page table generated
354 popq %r11 395 popq %r11
355 popq %r10 396 popq %r10
356 popq %r9 397 popq %r9
@@ -364,6 +405,8 @@ ENTRY(early_idt_handler)
364 decl early_recursion_flag(%rip) 405 decl early_recursion_flag(%rip)
365 INTERRUPT_RETURN 406 INTERRUPT_RETURN
366 407
408 __INITDATA
409
367 .balign 4 410 .balign 4
368early_recursion_flag: 411early_recursion_flag:
369 .long 0 412 .long 0
@@ -374,11 +417,10 @@ early_idt_msg:
374early_idt_ripmsg: 417early_idt_ripmsg:
375 .asciz "RIP %s\n" 418 .asciz "RIP %s\n"
376#endif /* CONFIG_EARLY_PRINTK */ 419#endif /* CONFIG_EARLY_PRINTK */
377 .previous
378 420
379#define NEXT_PAGE(name) \ 421#define NEXT_PAGE(name) \
380 .balign PAGE_SIZE; \ 422 .balign PAGE_SIZE; \
381ENTRY(name) 423GLOBAL(name)
382 424
383/* Automate the creation of 1 to 1 mapping pmd entries */ 425/* Automate the creation of 1 to 1 mapping pmd entries */
384#define PMDS(START, PERM, COUNT) \ 426#define PMDS(START, PERM, COUNT) \
@@ -388,24 +430,37 @@ ENTRY(name)
388 i = i + 1 ; \ 430 i = i + 1 ; \
389 .endr 431 .endr
390 432
433 __INITDATA
434NEXT_PAGE(early_level4_pgt)
435 .fill 511,8,0
436 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
437
438NEXT_PAGE(early_dynamic_pgts)
439 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
440
391 .data 441 .data
392 /* 442
393 * This default setting generates an ident mapping at address 0x100000 443#ifndef CONFIG_XEN
394 * and a mapping for the kernel that precisely maps virtual address
395 * 0xffffffff80000000 to physical address 0x000000. (always using
396 * 2Mbyte large pages provided by PAE mode)
397 */
398NEXT_PAGE(init_level4_pgt) 444NEXT_PAGE(init_level4_pgt)
399 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 445 .fill 512,8,0
400 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 446#else
401 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 447NEXT_PAGE(init_level4_pgt)
402 .org init_level4_pgt + L4_START_KERNEL*8, 0 448 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
449 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
450 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
451 .org init_level4_pgt + L4_START_KERNEL*8, 0
403 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 452 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
404 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 453 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
405 454
406NEXT_PAGE(level3_ident_pgt) 455NEXT_PAGE(level3_ident_pgt)
407 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 456 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
408 .fill 511,8,0 457 .fill 511, 8, 0
458NEXT_PAGE(level2_ident_pgt)
459 /* Since I easily can, map the first 1G.
460 * Don't set NX because code runs from these pages.
461 */
462 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
463#endif
409 464
410NEXT_PAGE(level3_kernel_pgt) 465NEXT_PAGE(level3_kernel_pgt)
411 .fill L3_START_KERNEL,8,0 466 .fill L3_START_KERNEL,8,0
@@ -413,21 +468,6 @@ NEXT_PAGE(level3_kernel_pgt)
413 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE 468 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
414 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE 469 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
415 470
416NEXT_PAGE(level2_fixmap_pgt)
417 .fill 506,8,0
418 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
419 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
420 .fill 5,8,0
421
422NEXT_PAGE(level1_fixmap_pgt)
423 .fill 512,8,0
424
425NEXT_PAGE(level2_ident_pgt)
426 /* Since I easily can, map the first 1G.
427 * Don't set NX because code runs from these pages.
428 */
429 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
430
431NEXT_PAGE(level2_kernel_pgt) 471NEXT_PAGE(level2_kernel_pgt)
432 /* 472 /*
433 * 512 MB kernel mapping. We spend a full page on this pagetable 473 * 512 MB kernel mapping. We spend a full page on this pagetable
@@ -442,11 +482,16 @@ NEXT_PAGE(level2_kernel_pgt)
442 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, 482 PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
443 KERNEL_IMAGE_SIZE/PMD_SIZE) 483 KERNEL_IMAGE_SIZE/PMD_SIZE)
444 484
445NEXT_PAGE(level2_spare_pgt) 485NEXT_PAGE(level2_fixmap_pgt)
446 .fill 512, 8, 0 486 .fill 506,8,0
487 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
488 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
489 .fill 5,8,0
490
491NEXT_PAGE(level1_fixmap_pgt)
492 .fill 512,8,0
447 493
448#undef PMDS 494#undef PMDS
449#undef NEXT_PAGE
450 495
451 .data 496 .data
452 .align 16 497 .align 16
@@ -472,6 +517,5 @@ ENTRY(nmi_idt_table)
472 .skip IDT_ENTRIES * 16 517 .skip IDT_ENTRIES * 16
473 518
474 __PAGE_ALIGNED_BSS 519 __PAGE_ALIGNED_BSS
475 .align PAGE_SIZE 520NEXT_PAGE(empty_zero_page)
476ENTRY(empty_zero_page)
477 .skip PAGE_SIZE 521 .skip PAGE_SIZE
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index b3ea9db39db6..4eabc160696f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -16,125 +16,12 @@
16#include <linux/io.h> 16#include <linux/io.h>
17#include <linux/suspend.h> 17#include <linux/suspend.h>
18 18
19#include <asm/init.h>
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/mmu_context.h> 22#include <asm/mmu_context.h>
22#include <asm/debugreg.h> 23#include <asm/debugreg.h>
23 24
24static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
25 unsigned long addr)
26{
27 pud_t *pud;
28 pmd_t *pmd;
29 struct page *page;
30 int result = -ENOMEM;
31
32 addr &= PMD_MASK;
33 pgd += pgd_index(addr);
34 if (!pgd_present(*pgd)) {
35 page = kimage_alloc_control_pages(image, 0);
36 if (!page)
37 goto out;
38 pud = (pud_t *)page_address(page);
39 clear_page(pud);
40 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
41 }
42 pud = pud_offset(pgd, addr);
43 if (!pud_present(*pud)) {
44 page = kimage_alloc_control_pages(image, 0);
45 if (!page)
46 goto out;
47 pmd = (pmd_t *)page_address(page);
48 clear_page(pmd);
49 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
50 }
51 pmd = pmd_offset(pud, addr);
52 if (!pmd_present(*pmd))
53 set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
54 result = 0;
55out:
56 return result;
57}
58
59static void init_level2_page(pmd_t *level2p, unsigned long addr)
60{
61 unsigned long end_addr;
62
63 addr &= PAGE_MASK;
64 end_addr = addr + PUD_SIZE;
65 while (addr < end_addr) {
66 set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
67 addr += PMD_SIZE;
68 }
69}
70
71static int init_level3_page(struct kimage *image, pud_t *level3p,
72 unsigned long addr, unsigned long last_addr)
73{
74 unsigned long end_addr;
75 int result;
76
77 result = 0;
78 addr &= PAGE_MASK;
79 end_addr = addr + PGDIR_SIZE;
80 while ((addr < last_addr) && (addr < end_addr)) {
81 struct page *page;
82 pmd_t *level2p;
83
84 page = kimage_alloc_control_pages(image, 0);
85 if (!page) {
86 result = -ENOMEM;
87 goto out;
88 }
89 level2p = (pmd_t *)page_address(page);
90 init_level2_page(level2p, addr);
91 set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
92 addr += PUD_SIZE;
93 }
94 /* clear the unused entries */
95 while (addr < end_addr) {
96 pud_clear(level3p++);
97 addr += PUD_SIZE;
98 }
99out:
100 return result;
101}
102
103
104static int init_level4_page(struct kimage *image, pgd_t *level4p,
105 unsigned long addr, unsigned long last_addr)
106{
107 unsigned long end_addr;
108 int result;
109
110 result = 0;
111 addr &= PAGE_MASK;
112 end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
113 while ((addr < last_addr) && (addr < end_addr)) {
114 struct page *page;
115 pud_t *level3p;
116
117 page = kimage_alloc_control_pages(image, 0);
118 if (!page) {
119 result = -ENOMEM;
120 goto out;
121 }
122 level3p = (pud_t *)page_address(page);
123 result = init_level3_page(image, level3p, addr, last_addr);
124 if (result)
125 goto out;
126 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
127 addr += PGDIR_SIZE;
128 }
129 /* clear the unused entries */
130 while (addr < end_addr) {
131 pgd_clear(level4p++);
132 addr += PGDIR_SIZE;
133 }
134out:
135 return result;
136}
137
138static void free_transition_pgtable(struct kimage *image) 25static void free_transition_pgtable(struct kimage *image)
139{ 26{
140 free_page((unsigned long)image->arch.pud); 27 free_page((unsigned long)image->arch.pud);
@@ -184,22 +71,62 @@ err:
184 return result; 71 return result;
185} 72}
186 73
74static void *alloc_pgt_page(void *data)
75{
76 struct kimage *image = (struct kimage *)data;
77 struct page *page;
78 void *p = NULL;
79
80 page = kimage_alloc_control_pages(image, 0);
81 if (page) {
82 p = page_address(page);
83 clear_page(p);
84 }
85
86 return p;
87}
187 88
188static int init_pgtable(struct kimage *image, unsigned long start_pgtable) 89static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
189{ 90{
91 struct x86_mapping_info info = {
92 .alloc_pgt_page = alloc_pgt_page,
93 .context = image,
94 .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
95 };
96 unsigned long mstart, mend;
190 pgd_t *level4p; 97 pgd_t *level4p;
191 int result; 98 int result;
99 int i;
100
192 level4p = (pgd_t *)__va(start_pgtable); 101 level4p = (pgd_t *)__va(start_pgtable);
193 result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); 102 clear_page(level4p);
194 if (result) 103 for (i = 0; i < nr_pfn_mapped; i++) {
195 return result; 104 mstart = pfn_mapped[i].start << PAGE_SHIFT;
105 mend = pfn_mapped[i].end << PAGE_SHIFT;
106
107 result = kernel_ident_mapping_init(&info,
108 level4p, mstart, mend);
109 if (result)
110 return result;
111 }
112
196 /* 113 /*
197 * image->start may be outside 0 ~ max_pfn, for example when 114 * segments's mem ranges could be outside 0 ~ max_pfn,
198 * jump back to original kernel from kexeced kernel 115 * for example when jump back to original kernel from kexeced kernel.
116 * or first kernel is booted with user mem map, and second kernel
117 * could be loaded out of that range.
199 */ 118 */
200 result = init_one_level2_page(image, level4p, image->start); 119 for (i = 0; i < image->nr_segments; i++) {
201 if (result) 120 mstart = image->segment[i].mem;
202 return result; 121 mend = mstart + image->segment[i].memsz;
122
123 result = kernel_ident_mapping_init(&info,
124 level4p, mstart, mend);
125
126 if (result)
127 return result;
128 }
129
203 return init_transition_pgtable(image, level4p); 130 return init_transition_pgtable(image, level4p);
204} 131}
205 132
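
The rewritten init_pgtable() above no longer builds page tables by hand: it only says where table pages come from (alloc_pgt_page() plus a context pointer) and lets the shared identity-mapping helper do the walking over pfn_mapped[] and the image segments. The toy below shows the shape of that allocator-callback interface only; every name and the fake one-page-per-chunk "mapping" are invented for illustration, not kernel code.

/* Toy illustration of the allocator-callback pattern used by the new
 * init_pgtable(): the mapping helper owns the walk, the caller owns the
 * memory. All names and the "table" layout here are invented. */
#include <stdio.h>
#include <stdlib.h>

struct mapping_info {
	void *(*alloc_pgt_page)(void *context);	/* caller-supplied allocator */
	void *context;				/* passed back verbatim       */
};

/* pretend "identity mapping" helper: one table page per 1MB of [start,end) */
static int ident_mapping_init(struct mapping_info *info,
			      unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += 1UL << 20) {
		void *pgt = info->alloc_pgt_page(info->context);

		if (!pgt)
			return -1;	/* allocation failure bubbles up */
		printf("chunk %#lx mapped via table at %p\n", addr, pgt);
		free(pgt);
	}
	return 0;
}

static void *alloc_from_pool(void *context)
{
	(void)context;		/* a real caller would carve pages out of
				 * its own pool (e.g. kexec control pages) */
	return calloc(1, 4096);
}

int main(void)
{
	struct mapping_info info = {
		.alloc_pgt_page	= alloc_from_pool,
		.context	= NULL,
	};

	return ident_mapping_init(&info, 0, 4UL << 20);
}
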
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0aebd776018e..878cf1d326e5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -108,17 +108,16 @@
108#include <asm/topology.h> 108#include <asm/topology.h>
109#include <asm/apicdef.h> 109#include <asm/apicdef.h>
110#include <asm/amd_nb.h> 110#include <asm/amd_nb.h>
111#ifdef CONFIG_X86_64
112#include <asm/numa_64.h>
113#endif
114#include <asm/mce.h> 111#include <asm/mce.h>
115#include <asm/alternative.h> 112#include <asm/alternative.h>
116#include <asm/prom.h> 113#include <asm/prom.h>
117 114
118/* 115/*
119 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 116 * max_low_pfn_mapped: highest direct mapped pfn under 4GB
120 * The direct mapping extends to max_pfn_mapped, so that we can directly access 117 * max_pfn_mapped: highest direct mapped pfn over 4GB
121 * apertures, ACPI and other tables without having to play with fixmaps. 118 *
119 * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
120 * represented by pfn_mapped
122 */ 121 */
123unsigned long max_low_pfn_mapped; 122unsigned long max_low_pfn_mapped;
124unsigned long max_pfn_mapped; 123unsigned long max_pfn_mapped;
@@ -276,18 +275,7 @@ void * __init extend_brk(size_t size, size_t align)
276 return ret; 275 return ret;
277} 276}
278 277
279#ifdef CONFIG_X86_64 278#ifdef CONFIG_X86_32
280static void __init init_gbpages(void)
281{
282 if (direct_gbpages && cpu_has_gbpages)
283 printk(KERN_INFO "Using GB pages for direct mapping\n");
284 else
285 direct_gbpages = 0;
286}
287#else
288static inline void init_gbpages(void)
289{
290}
291static void __init cleanup_highmap(void) 279static void __init cleanup_highmap(void)
292{ 280{
293} 281}
@@ -306,27 +294,43 @@ static void __init reserve_brk(void)
306 294
307#ifdef CONFIG_BLK_DEV_INITRD 295#ifdef CONFIG_BLK_DEV_INITRD
308 296
297static u64 __init get_ramdisk_image(void)
298{
299 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
300
301 ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
302
303 return ramdisk_image;
304}
305static u64 __init get_ramdisk_size(void)
306{
307 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
308
309 ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
310
311 return ramdisk_size;
312}
313
309#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) 314#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
310static void __init relocate_initrd(void) 315static void __init relocate_initrd(void)
311{ 316{
312 /* Assume only end is not page aligned */ 317 /* Assume only end is not page aligned */
313 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 318 u64 ramdisk_image = get_ramdisk_image();
314 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 319 u64 ramdisk_size = get_ramdisk_size();
315 u64 area_size = PAGE_ALIGN(ramdisk_size); 320 u64 area_size = PAGE_ALIGN(ramdisk_size);
316 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
317 u64 ramdisk_here; 321 u64 ramdisk_here;
318 unsigned long slop, clen, mapaddr; 322 unsigned long slop, clen, mapaddr;
319 char *p, *q; 323 char *p, *q;
320 324
321 /* We need to move the initrd down into lowmem */ 325 /* We need to move the initrd down into directly mapped mem */
322 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, 326 ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
323 PAGE_SIZE); 327 area_size, PAGE_SIZE);
324 328
325 if (!ramdisk_here) 329 if (!ramdisk_here)
326 panic("Cannot find place for new RAMDISK of size %lld\n", 330 panic("Cannot find place for new RAMDISK of size %lld\n",
327 ramdisk_size); 331 ramdisk_size);
328 332
329 /* Note: this includes all the lowmem currently occupied by 333 /* Note: this includes all the mem currently occupied by
330 the initrd, we rely on that fact to keep the data intact. */ 334 the initrd, we rely on that fact to keep the data intact. */
331 memblock_reserve(ramdisk_here, area_size); 335 memblock_reserve(ramdisk_here, area_size);
332 initrd_start = ramdisk_here + PAGE_OFFSET; 336 initrd_start = ramdisk_here + PAGE_OFFSET;
@@ -336,17 +340,7 @@ static void __init relocate_initrd(void)
336 340
337 q = (char *)initrd_start; 341 q = (char *)initrd_start;
338 342
339 /* Copy any lowmem portion of the initrd */ 343 /* Copy the initrd */
340 if (ramdisk_image < end_of_lowmem) {
341 clen = end_of_lowmem - ramdisk_image;
342 p = (char *)__va(ramdisk_image);
343 memcpy(q, p, clen);
344 q += clen;
345 ramdisk_image += clen;
346 ramdisk_size -= clen;
347 }
348
349 /* Copy the highmem portion of the initrd */
350 while (ramdisk_size) { 344 while (ramdisk_size) {
351 slop = ramdisk_image & ~PAGE_MASK; 345 slop = ramdisk_image & ~PAGE_MASK;
352 clen = ramdisk_size; 346 clen = ramdisk_size;
@@ -360,22 +354,35 @@ static void __init relocate_initrd(void)
360 ramdisk_image += clen; 354 ramdisk_image += clen;
361 ramdisk_size -= clen; 355 ramdisk_size -= clen;
362 } 356 }
363 /* high pages is not converted by early_res_to_bootmem */ 357
364 ramdisk_image = boot_params.hdr.ramdisk_image; 358 ramdisk_image = get_ramdisk_image();
365 ramdisk_size = boot_params.hdr.ramdisk_size; 359 ramdisk_size = get_ramdisk_size();
366 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" 360 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
367 " [mem %#010llx-%#010llx]\n", 361 " [mem %#010llx-%#010llx]\n",
368 ramdisk_image, ramdisk_image + ramdisk_size - 1, 362 ramdisk_image, ramdisk_image + ramdisk_size - 1,
369 ramdisk_here, ramdisk_here + ramdisk_size - 1); 363 ramdisk_here, ramdisk_here + ramdisk_size - 1);
370} 364}
371 365
366static void __init early_reserve_initrd(void)
367{
368 /* Assume only end is not page aligned */
369 u64 ramdisk_image = get_ramdisk_image();
370 u64 ramdisk_size = get_ramdisk_size();
371 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
372
373 if (!boot_params.hdr.type_of_loader ||
374 !ramdisk_image || !ramdisk_size)
375 return; /* No initrd provided by bootloader */
376
377 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
378}
372static void __init reserve_initrd(void) 379static void __init reserve_initrd(void)
373{ 380{
374 /* Assume only end is not page aligned */ 381 /* Assume only end is not page aligned */
375 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 382 u64 ramdisk_image = get_ramdisk_image();
376 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 383 u64 ramdisk_size = get_ramdisk_size();
377 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 384 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
378 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 385 u64 mapped_size;
379 386
380 if (!boot_params.hdr.type_of_loader || 387 if (!boot_params.hdr.type_of_loader ||
381 !ramdisk_image || !ramdisk_size) 388 !ramdisk_image || !ramdisk_size)
@@ -383,22 +390,18 @@ static void __init reserve_initrd(void)
383 390
384 initrd_start = 0; 391 initrd_start = 0;
385 392
386 if (ramdisk_size >= (end_of_lowmem>>1)) { 393 mapped_size = memblock_mem_size(max_pfn_mapped);
394 if (ramdisk_size >= (mapped_size>>1))
387 panic("initrd too large to handle, " 395 panic("initrd too large to handle, "
388 "disabling initrd (%lld needed, %lld available)\n", 396 "disabling initrd (%lld needed, %lld available)\n",
389 ramdisk_size, end_of_lowmem>>1); 397 ramdisk_size, mapped_size>>1);
390 }
391 398
392 printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, 399 printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
393 ramdisk_end - 1); 400 ramdisk_end - 1);
394 401
395 402 if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
396 if (ramdisk_end <= end_of_lowmem) { 403 PFN_DOWN(ramdisk_end))) {
397 /* All in lowmem, easy case */ 404 /* All are mapped, easy case */
398 /*
399 * don't need to reserve again, already reserved early
400 * in i386_start_kernel
401 */
402 initrd_start = ramdisk_image + PAGE_OFFSET; 405 initrd_start = ramdisk_image + PAGE_OFFSET;
403 initrd_end = initrd_start + ramdisk_size; 406 initrd_end = initrd_start + ramdisk_size;
404 return; 407 return;
@@ -409,6 +412,9 @@ static void __init reserve_initrd(void)
409 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); 412 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
410} 413}
411#else 414#else
415static void __init early_reserve_initrd(void)
416{
417}
412static void __init reserve_initrd(void) 418static void __init reserve_initrd(void)
413{ 419{
414} 420}
@@ -419,8 +425,6 @@ static void __init parse_setup_data(void)
419 struct setup_data *data; 425 struct setup_data *data;
420 u64 pa_data; 426 u64 pa_data;
421 427
422 if (boot_params.hdr.version < 0x0209)
423 return;
424 pa_data = boot_params.hdr.setup_data; 428 pa_data = boot_params.hdr.setup_data;
425 while (pa_data) { 429 while (pa_data) {
426 u32 data_len, map_len; 430 u32 data_len, map_len;
@@ -456,8 +460,6 @@ static void __init e820_reserve_setup_data(void)
456 u64 pa_data; 460 u64 pa_data;
457 int found = 0; 461 int found = 0;
458 462
459 if (boot_params.hdr.version < 0x0209)
460 return;
461 pa_data = boot_params.hdr.setup_data; 463 pa_data = boot_params.hdr.setup_data;
462 while (pa_data) { 464 while (pa_data) {
463 data = early_memremap(pa_data, sizeof(*data)); 465 data = early_memremap(pa_data, sizeof(*data));
@@ -481,8 +483,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
481 struct setup_data *data; 483 struct setup_data *data;
482 u64 pa_data; 484 u64 pa_data;
483 485
484 if (boot_params.hdr.version < 0x0209)
485 return;
486 pa_data = boot_params.hdr.setup_data; 486 pa_data = boot_params.hdr.setup_data;
487 while (pa_data) { 487 while (pa_data) {
488 data = early_memremap(pa_data, sizeof(*data)); 488 data = early_memremap(pa_data, sizeof(*data));
@@ -501,17 +501,51 @@ static void __init memblock_x86_reserve_range_setup_data(void)
501/* 501/*
502 * Keep the crash kernel below this limit. On 32 bits earlier kernels 502 * Keep the crash kernel below this limit. On 32 bits earlier kernels
503 * would limit the kernel to the low 512 MiB due to mapping restrictions. 503 * would limit the kernel to the low 512 MiB due to mapping restrictions.
504 * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
505 * limit once kexec-tools are fixed.
506 */ 504 */
507#ifdef CONFIG_X86_32 505#ifdef CONFIG_X86_32
508# define CRASH_KERNEL_ADDR_MAX (512 << 20) 506# define CRASH_KERNEL_ADDR_MAX (512 << 20)
509#else 507#else
510# define CRASH_KERNEL_ADDR_MAX (896 << 20) 508# define CRASH_KERNEL_ADDR_MAX MAXMEM
511#endif 509#endif
512 510
511static void __init reserve_crashkernel_low(void)
512{
513#ifdef CONFIG_X86_64
514 const unsigned long long alignment = 16<<20; /* 16M */
515 unsigned long long low_base = 0, low_size = 0;
516 unsigned long total_low_mem;
517 unsigned long long base;
518 int ret;
519
520 total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT));
521 ret = parse_crashkernel_low(boot_command_line, total_low_mem,
522 &low_size, &base);
523 if (ret != 0 || low_size <= 0)
524 return;
525
526 low_base = memblock_find_in_range(low_size, (1ULL<<32),
527 low_size, alignment);
528
529 if (!low_base) {
530 pr_info("crashkernel low reservation failed - No suitable area found.\n");
531
532 return;
533 }
534
535 memblock_reserve(low_base, low_size);
536 pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n",
537 (unsigned long)(low_size >> 20),
538 (unsigned long)(low_base >> 20),
539 (unsigned long)(total_low_mem >> 20));
540 crashk_low_res.start = low_base;
541 crashk_low_res.end = low_base + low_size - 1;
542 insert_resource(&iomem_resource, &crashk_low_res);
543#endif
544}
545
513static void __init reserve_crashkernel(void) 546static void __init reserve_crashkernel(void)
514{ 547{
548 const unsigned long long alignment = 16<<20; /* 16M */
515 unsigned long long total_mem; 549 unsigned long long total_mem;
516 unsigned long long crash_size, crash_base; 550 unsigned long long crash_size, crash_base;
517 int ret; 551 int ret;
@@ -525,8 +559,6 @@ static void __init reserve_crashkernel(void)
525 559
526 /* 0 means: find the address automatically */ 560 /* 0 means: find the address automatically */
527 if (crash_base <= 0) { 561 if (crash_base <= 0) {
528 const unsigned long long alignment = 16<<20; /* 16M */
529
530 /* 562 /*
531 * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX 563 * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
532 */ 564 */
@@ -537,6 +569,7 @@ static void __init reserve_crashkernel(void)
537 pr_info("crashkernel reservation failed - No suitable area found.\n"); 569 pr_info("crashkernel reservation failed - No suitable area found.\n");
538 return; 570 return;
539 } 571 }
572
540 } else { 573 } else {
541 unsigned long long start; 574 unsigned long long start;
542 575
@@ -558,6 +591,9 @@ static void __init reserve_crashkernel(void)
558 crashk_res.start = crash_base; 591 crashk_res.start = crash_base;
559 crashk_res.end = crash_base + crash_size - 1; 592 crashk_res.end = crash_base + crash_size - 1;
560 insert_resource(&iomem_resource, &crashk_res); 593 insert_resource(&iomem_resource, &crashk_res);
594
595 if (crash_base >= (1ULL<<32))
596 reserve_crashkernel_low();
561} 597}
562#else 598#else
563static void __init reserve_crashkernel(void) 599static void __init reserve_crashkernel(void)
@@ -708,6 +744,27 @@ static void __init trim_bios_range(void)
708 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 744 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
709} 745}
710 746
747/* called before trim_bios_range() to spare extra sanitize */
748static void __init e820_add_kernel_range(void)
749{
750 u64 start = __pa_symbol(_text);
751 u64 size = __pa_symbol(_end) - start;
752
753 /*
754 * Complain if .text .data and .bss are not marked as E820_RAM and
755 * attempt to fix it by adding the range. We may have a confused BIOS,
756 * or the user may have used memmap=exactmap or memmap=xxM$yyM to
757 * exclude kernel range. If we really are running on top non-RAM,
758 * we will crash later anyways.
759 */
760 if (e820_all_mapped(start, start + size, E820_RAM))
761 return;
762
763 pr_warn(".text .data .bss are not marked as E820_RAM!\n");
764 e820_remove_range(start, size, E820_RAM, 0);
765 e820_add_region(start, size, E820_RAM);
766}
767
711static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; 768static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
712 769
713static int __init parse_reservelow(char *p) 770static int __init parse_reservelow(char *p)
@@ -752,6 +809,17 @@ static void __init trim_low_memory_range(void)
752 809
753void __init setup_arch(char **cmdline_p) 810void __init setup_arch(char **cmdline_p)
754{ 811{
812 memblock_reserve(__pa_symbol(_text),
813 (unsigned long)__bss_stop - (unsigned long)_text);
814
815 early_reserve_initrd();
816
817 /*
818 * At this point everything still needed from the boot loader
819 * or BIOS or kernel text should be early reserved or marked not
820 * RAM in e820. All other memory is free game.
821 */
822
755#ifdef CONFIG_X86_32 823#ifdef CONFIG_X86_32
756 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 824 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
757 visws_early_detect(); 825 visws_early_detect();
@@ -910,6 +978,7 @@ void __init setup_arch(char **cmdline_p)
910 insert_resource(&iomem_resource, &data_resource); 978 insert_resource(&iomem_resource, &data_resource);
911 insert_resource(&iomem_resource, &bss_resource); 979 insert_resource(&iomem_resource, &bss_resource);
912 980
981 e820_add_kernel_range();
913 trim_bios_range(); 982 trim_bios_range();
914#ifdef CONFIG_X86_32 983#ifdef CONFIG_X86_32
915 if (ppro_with_ram_bug()) { 984 if (ppro_with_ram_bug()) {
@@ -959,6 +1028,8 @@ void __init setup_arch(char **cmdline_p)
959 1028
960 reserve_ibft_region(); 1029 reserve_ibft_region();
961 1030
1031 early_alloc_pgt_buf();
1032
962 /* 1033 /*
963 * Need to conclude brk, before memblock_x86_fill() 1034 * Need to conclude brk, before memblock_x86_fill()
964 * it could use memblock_find_in_range, could overlap with 1035 * it could use memblock_find_in_range, could overlap with
@@ -968,7 +1039,7 @@ void __init setup_arch(char **cmdline_p)
968 1039
969 cleanup_highmap(); 1040 cleanup_highmap();
970 1041
971 memblock.current_limit = get_max_mapped(); 1042 memblock.current_limit = ISA_END_ADDRESS;
972 memblock_x86_fill(); 1043 memblock_x86_fill();
973 1044
974 /* 1045 /*
@@ -985,42 +1056,22 @@ void __init setup_arch(char **cmdline_p)
985 setup_bios_corruption_check(); 1056 setup_bios_corruption_check();
986#endif 1057#endif
987 1058
1059#ifdef CONFIG_X86_32
988 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", 1060 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
989 (max_pfn_mapped<<PAGE_SHIFT) - 1); 1061 (max_pfn_mapped<<PAGE_SHIFT) - 1);
1062#endif
990 1063
991 setup_real_mode(); 1064 reserve_real_mode();
992 1065
993 trim_platform_memory_ranges(); 1066 trim_platform_memory_ranges();
994 trim_low_memory_range(); 1067 trim_low_memory_range();
995 1068
996 init_gbpages(); 1069 init_mem_mapping();
997
998 /* max_pfn_mapped is updated here */
999 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
1000 max_pfn_mapped = max_low_pfn_mapped;
1001
1002#ifdef CONFIG_X86_64
1003 if (max_pfn > max_low_pfn) {
1004 int i;
1005 unsigned long start, end;
1006 unsigned long start_pfn, end_pfn;
1007
1008 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn,
1009 NULL) {
1010 1070
1011 end = PFN_PHYS(end_pfn); 1071 early_trap_pf_init();
1012 if (end <= (1UL<<32))
1013 continue;
1014 1072
1015 start = PFN_PHYS(start_pfn); 1073 setup_real_mode();
1016 max_pfn_mapped = init_memory_mapping(
1017 max((1UL<<32), start), end);
1018 }
1019 1074
1020 /* can we preseve max_low_pfn ?*/
1021 max_low_pfn = max_pfn;
1022 }
1023#endif
1024 memblock.current_limit = get_max_mapped(); 1075 memblock.current_limit = get_max_mapped();
1025 dma_contiguous_reserve(0); 1076 dma_contiguous_reserve(0);
1026 1077
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ecffca11f4e9..68bda7a84159 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -688,10 +688,19 @@ void __init early_trap_init(void)
 	set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
 	/* int3 can be called from all */
 	set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+#ifdef CONFIG_X86_32
 	set_intr_gate(X86_TRAP_PF, &page_fault);
+#endif
 	load_idt(&idt_descr);
 }
 
+void __init early_trap_pf_init(void)
+{
+#ifdef CONFIG_X86_64
+	set_intr_gate(X86_TRAP_PF, &page_fault);
+#endif
+}
+
 void __init trap_init(void)
 {
 	int i;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 7a3d075a814a..50cf83ecd32e 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -62,10 +62,6 @@ struct x86_init_ops x86_init __initdata = {
 		.banner			= default_banner,
 	},
 
-	.mapping = {
-		.pagetable_reserve		= native_pagetable_reserve,
-	},
-
 	.paging = {
 		.pagetable_init		= native_pagetable_init,
 	},