author	Linus Torvalds <torvalds@linux-foundation.org>	2016-05-16 18:54:01 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-05-16 18:54:01 -0400
commit	9a45f036af363aec1efec08827c825d69c115a9a (patch)
tree	d9a81016dacbbcdf87d8e2ec3dcebed6b5029870 /arch/x86
parent	168f1a7163b37294a0ef33829e1ed54d41e33c42 (diff)
parent	d2d3462f9f08da364c8fbd41e8e32229d610d49d (diff)
Merge branch 'x86-boot-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 boot updates from Ingo Molnar:
 "The biggest changes in this cycle were:

   - prepare for more KASLR related changes, by restructuring, cleaning
     up and fixing the existing boot code.  (Kees Cook, Baoquan He,
     Yinghai Lu)

   - simplify/concentrate subarch handling code, eliminate
     paravirt_enabled() usage.  (Luis R Rodriguez)"

* 'x86-boot-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (50 commits)
  x86/KASLR: Clarify purpose of each get_random_long()
  x86/KASLR: Add virtual address choosing function
  x86/KASLR: Return earliest overlap when avoiding regions
  x86/KASLR: Add 'struct slot_area' to manage random_addr slots
  x86/boot: Add missing file header comments
  x86/KASLR: Initialize mapping_info every time
  x86/boot: Comment what finalize_identity_maps() does
  x86/KASLR: Build identity mappings on demand
  x86/boot: Split out kernel_ident_mapping_init()
  x86/boot: Clean up indenting for asm/boot.h
  x86/KASLR: Improve comments around the mem_avoid[] logic
  x86/boot: Simplify pointer casting in choose_random_location()
  x86/KASLR: Consolidate mem_avoid[] entries
  x86/boot: Clean up pointer casting
  x86/boot: Warn on future overlapping memcpy() use
  x86/boot: Extract error reporting functions
  x86/boot: Correctly bounds-check relocations
  x86/KASLR: Clean up unused code from old 'run_size' and rename it to 'kernel_total_size'
  x86/boot: Fix "run_size" calculation
  x86/boot: Calculate decompression size during boot not build
  ...
Diffstat (limited to 'arch/x86')
-rw-r--r--	arch/x86/Kconfig	72
-rw-r--r--	arch/x86/Makefile	3
-rw-r--r--	arch/x86/boot/Makefile	13
-rw-r--r--	arch/x86/boot/compressed/Makefile	23
-rw-r--r--	arch/x86/boot/compressed/aslr.c	339
-rw-r--r--	arch/x86/boot/compressed/cmdline.c	4
-rw-r--r--	arch/x86/boot/compressed/error.c	22
-rw-r--r--	arch/x86/boot/compressed/error.h	7
-rw-r--r--	arch/x86/boot/compressed/head_32.S	22
-rw-r--r--	arch/x86/boot/compressed/head_64.S	19
-rw-r--r--	arch/x86/boot/compressed/kaslr.c	510
-rw-r--r--	arch/x86/boot/compressed/misc.c	188
-rw-r--r--	arch/x86/boot/compressed/misc.h	27
-rw-r--r--	arch/x86/boot/compressed/mkpiggy.c	34
-rw-r--r--	arch/x86/boot/compressed/pagetable.c	129
-rw-r--r--	arch/x86/boot/compressed/string.c	37
-rw-r--r--	arch/x86/boot/compressed/vmlinux.lds.S	1
-rw-r--r--	arch/x86/boot/early_serial_console.c	4
-rw-r--r--	arch/x86/boot/header.S	109
-rw-r--r--	arch/x86/include/asm/boot.h	39
-rw-r--r--	arch/x86/include/asm/page.h	5
-rw-r--r--	arch/x86/include/asm/page_64_types.h	8
-rw-r--r--	arch/x86/include/asm/paravirt.h	11
-rw-r--r--	arch/x86/include/asm/paravirt_types.h	6
-rw-r--r--	arch/x86/include/asm/processor.h	2
-rw-r--r--	arch/x86/include/asm/x86_init.h	50
-rw-r--r--	arch/x86/include/uapi/asm/bootparam.h	41
-rw-r--r--	arch/x86/kernel/Makefile	6
-rw-r--r--	arch/x86/kernel/acpi/boot.c	9
-rw-r--r--	arch/x86/kernel/apm_32.c	2
-rw-r--r--	arch/x86/kernel/asm-offsets.c	1
-rw-r--r--	arch/x86/kernel/cpu/intel.c	2
-rw-r--r--	arch/x86/kernel/ebda.c (renamed from arch/x86/kernel/head.c)	2
-rw-r--r--	arch/x86/kernel/head32.c	2
-rw-r--r--	arch/x86/kernel/head64.c	1
-rw-r--r--	arch/x86/kernel/kvm.c	8
-rw-r--r--	arch/x86/kernel/paravirt.c	1
-rw-r--r--	arch/x86/kernel/platform-quirks.c	35
-rw-r--r--	arch/x86/kernel/rtc.c	18
-rw-r--r--	arch/x86/kernel/tboot.c	6
-rw-r--r--	arch/x86/kernel/vmlinux.lds.S	2
-rw-r--r--	arch/x86/lguest/boot.c	3
-rw-r--r--	arch/x86/mm/ident_map.c	79
-rw-r--r--	arch/x86/mm/init_32.c	3
-rw-r--r--	arch/x86/mm/init_64.c	74
-rw-r--r--	arch/x86/tools/calc_run_size.sh	42
-rw-r--r--	arch/x86/xen/enlighten.c	12
47 files changed, 1250 insertions, 783 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a494fa34713a..7bb15747fea2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1921,54 +1921,38 @@ config RELOCATABLE
 	  (CONFIG_PHYSICAL_START) is used as the minimum location.
 
 config RANDOMIZE_BASE
-	bool "Randomize the address of the kernel image"
+	bool "Randomize the address of the kernel image (KASLR)"
 	depends on RELOCATABLE
 	default n
 	---help---
-	  Randomizes the physical and virtual address at which the
-	  kernel image is decompressed, as a security feature that
-	  deters exploit attempts relying on knowledge of the location
-	  of kernel internals.
+	  In support of Kernel Address Space Layout Randomization (KASLR),
+	  this randomizes the physical address at which the kernel image
+	  is decompressed and the virtual address where the kernel
+	  image is mapped, as a security feature that deters exploit
+	  attempts relying on knowledge of the location of kernel
+	  code internals.
+
+	  The kernel physical and virtual address can be randomized
+	  from 16MB up to 1GB on 64-bit and 512MB on 32-bit. (Note that
+	  using RANDOMIZE_BASE reduces the memory space available to
+	  kernel modules from 1.5GB to 1GB.)
+
+	  Entropy is generated using the RDRAND instruction if it is
+	  supported. If RDTSC is supported, its value is mixed into
+	  the entropy pool as well. If neither RDRAND nor RDTSC are
+	  supported, then entropy is read from the i8254 timer.
+
+	  Since the kernel is built using 2GB addressing, and
+	  PHYSICAL_ALIGN must be at a minimum of 2MB, only 10 bits of
+	  entropy is theoretically possible. Currently, with the
+	  default value for PHYSICAL_ALIGN and due to page table
+	  layouts, 64-bit uses 9 bits of entropy and 32-bit uses 8 bits.
+
+	  If CONFIG_HIBERNATE is also enabled, KASLR is disabled at boot
+	  time. To enable it, boot with "kaslr" on the kernel command
+	  line (which will also disable hibernation).
 
-	  Entropy is generated using the RDRAND instruction if it is
-	  supported. If RDTSC is supported, it is used as well. If
-	  neither RDRAND nor RDTSC are supported, then randomness is
-	  read from the i8254 timer.
-
-	  The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET,
-	  and aligned according to PHYSICAL_ALIGN. Since the kernel is
-	  built using 2GiB addressing, and PHYSICAL_ALGIN must be at a
-	  minimum of 2MiB, only 10 bits of entropy is theoretically
-	  possible. At best, due to page table layouts, 64-bit can use
-	  9 bits of entropy and 32-bit uses 8 bits.
-
-	  If unsure, say N.
-
-config RANDOMIZE_BASE_MAX_OFFSET
-	hex "Maximum kASLR offset allowed" if EXPERT
-	depends on RANDOMIZE_BASE
-	range 0x0 0x20000000 if X86_32
-	default "0x20000000" if X86_32
-	range 0x0 0x40000000 if X86_64
-	default "0x40000000" if X86_64
-	---help---
-	  The lesser of RANDOMIZE_BASE_MAX_OFFSET and available physical
-	  memory is used to determine the maximal offset in bytes that will
-	  be applied to the kernel when kernel Address Space Layout
-	  Randomization (kASLR) is active. This must be a multiple of
-	  PHYSICAL_ALIGN.
-
-	  On 32-bit this is limited to 512MiB by page table layouts. The
-	  default is 512MiB.
-
-	  On 64-bit this is limited by how the kernel fixmap page table is
-	  positioned, so this cannot be larger than 1GiB currently. Without
-	  RANDOMIZE_BASE, there is a 512MiB to 1.5GiB split between kernel
-	  and modules. When RANDOMIZE_BASE_MAX_OFFSET is above 512MiB, the
-	  modules area will shrink to compensate, up to the current maximum
-	  1GiB to 1GiB split. The default is 1GiB.
-
-	  If unsure, leave at the default value.
+	  If unsure, say N.
 
 # Relocation on x86 needs some additional build support
 config X86_NEED_RELOCS
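[Note: the entropy figures in the new help text follow from slot counting: the number of CONFIG_PHYSICAL_ALIGN-sized slots in the randomization range bounds the usable bits. A minimal standalone sketch of that arithmetic, illustrative only and assuming the default 2 MiB alignment (not code from this series):

#include <stdio.h>

int main(void)
{
	unsigned long align   = 0x200000UL;   /* default CONFIG_PHYSICAL_ALIGN: 2 MiB */
	unsigned long range64 = 0x40000000UL; /* 1 GiB randomization range, 64-bit   */
	unsigned long range32 = 0x20000000UL; /* 512 MiB randomization range, 32-bit */

	/* 512 slots -> log2(512) = 9 bits; 256 slots -> 8 bits */
	printf("64-bit slots: %lu\n", range64 / align);
	printf("32-bit slots: %lu\n", range32 / align);
	return 0;
}
]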
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 4086abca0b32..6fce7f096b88 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -208,7 +208,8 @@ endif
 
 head-y := arch/x86/kernel/head_$(BITS).o
 head-y += arch/x86/kernel/head$(BITS).o
-head-y += arch/x86/kernel/head.o
+head-y += arch/x86/kernel/ebda.o
+head-y += arch/x86/kernel/platform-quirks.o
 
 libs-y += arch/x86/lib/
 
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index b1ef9e489084..700a9c6e6159 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -86,16 +86,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
 
 SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
 
-sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 0x\1/p'
-
-quiet_cmd_voffset = VOFFSET $@
-      cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
-
-targets += voffset.h
-$(obj)/voffset.h: vmlinux FORCE
-	$(call if_changed,voffset)
-
-sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p'
 
 quiet_cmd_zoffset = ZOFFSET $@
       cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
@@ -106,7 +97,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
 
 
 AFLAGS_header.o += -I$(obj)
-$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
+$(obj)/header.o: $(obj)/zoffset.h
 
 LDFLAGS_setup.elf	:= -T
 $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
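[Note: cmd_zoffset runs $(NM) over the compressed vmlinux and sed-zoffset rewrites each matching symbol line into a ZO_* define; the rule now also captures _ehead and _text. An illustrative zoffset.h, with made-up placeholder addresses rather than values from any real build:

#define ZO__ehead 0x0000000000000428
#define ZO__end 0x0000000000942000
#define ZO__text 0x00000000008ee000
#define ZO_input_data 0x0000000000000428
#define ZO_startup_32 0x0000000000000000
#define ZO_startup_64 0x0000000000000200
#define ZO_z_input_len 0x00000000008edf31
#define ZO_z_output_len 0x0000000002251958
]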
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 8774cb23064f..cfdd8c3f8af2 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -57,12 +57,27 @@ LDFLAGS_vmlinux := -T
 hostprogs-y	:= mkpiggy
 HOST_EXTRACFLAGS += -I$(srctree)/tools/include
 
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
+
+quiet_cmd_voffset = VOFFSET $@
+      cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
+
+targets += ../voffset.h
+
+$(obj)/../voffset.h: vmlinux FORCE
+	$(call if_changed,voffset)
+
+$(obj)/misc.o: $(obj)/../voffset.h
+
 vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
-	$(obj)/string.o $(obj)/cmdline.o \
+	$(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \
 	$(obj)/piggy.o $(obj)/cpuflags.o
 
 vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
-vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o
+vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
+ifdef CONFIG_X86_64
+	vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o
+endif
 
 $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
 
@@ -109,10 +124,8 @@ suffix-$(CONFIG_KERNEL_XZ) := xz
 suffix-$(CONFIG_KERNEL_LZO) := lzo
 suffix-$(CONFIG_KERNEL_LZ4) := lz4
 
-RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \
-	     $(CONFIG_SHELL) $(srctree)/arch/x86/tools/calc_run_size.sh)
 quiet_cmd_mkpiggy = MKPIGGY $@
-      cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false )
+      cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
 
 targets += piggy.S
 $(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
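[Note: the voffset.h rule moved here now feeds misc.o; the _AC() wrapper keeps the constants usable from both C and assembly. An illustrative result, with placeholder addresses:

#define VO___bss_start _AC(0xffffffff81f37000,UL)
#define VO__end _AC(0xffffffff82080000,UL)
#define VO__text _AC(0xffffffff81000000,UL)

misc.c uses these to compute kernel_total_size (VO__end - VO__text) and the relocation bound (VO___bss_start - VO__text), as seen in the misc.c hunks below.]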
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
deleted file mode 100644
index 6a9b96b4624d..000000000000
--- a/arch/x86/boot/compressed/aslr.c
+++ /dev/null
@@ -1,339 +0,0 @@
-#include "misc.h"
-
-#include <asm/msr.h>
-#include <asm/archrandom.h>
-#include <asm/e820.h>
-
-#include <generated/compile.h>
-#include <linux/module.h>
-#include <linux/uts.h>
-#include <linux/utsname.h>
-#include <generated/utsrelease.h>
-
-/* Simplified build-specific string for starting entropy. */
-static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
-		LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
-
-#define I8254_PORT_CONTROL	0x43
-#define I8254_PORT_COUNTER0	0x40
-#define I8254_CMD_READBACK	0xC0
-#define I8254_SELECT_COUNTER0	0x02
-#define I8254_STATUS_NOTREADY	0x40
-static inline u16 i8254(void)
-{
-	u16 status, timer;
-
-	do {
-		outb(I8254_PORT_CONTROL,
-		     I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
-		status = inb(I8254_PORT_COUNTER0);
-		timer = inb(I8254_PORT_COUNTER0);
-		timer |= inb(I8254_PORT_COUNTER0) << 8;
-	} while (status & I8254_STATUS_NOTREADY);
-
-	return timer;
-}
-
-static unsigned long rotate_xor(unsigned long hash, const void *area,
-				size_t size)
-{
-	size_t i;
-	unsigned long *ptr = (unsigned long *)area;
-
-	for (i = 0; i < size / sizeof(hash); i++) {
-		/* Rotate by odd number of bits and XOR. */
-		hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
-		hash ^= ptr[i];
-	}
-
-	return hash;
-}
-
-/* Attempt to create a simple but unpredictable starting entropy. */
-static unsigned long get_random_boot(void)
-{
-	unsigned long hash = 0;
-
-	hash = rotate_xor(hash, build_str, sizeof(build_str));
-	hash = rotate_xor(hash, real_mode, sizeof(*real_mode));
-
-	return hash;
-}
-
-static unsigned long get_random_long(void)
-{
-#ifdef CONFIG_X86_64
-	const unsigned long mix_const = 0x5d6008cbf3848dd3UL;
-#else
-	const unsigned long mix_const = 0x3f39e593UL;
-#endif
-	unsigned long raw, random = get_random_boot();
-	bool use_i8254 = true;
-
-	debug_putstr("KASLR using");
-
-	if (has_cpuflag(X86_FEATURE_RDRAND)) {
-		debug_putstr(" RDRAND");
-		if (rdrand_long(&raw)) {
-			random ^= raw;
-			use_i8254 = false;
-		}
-	}
-
-	if (has_cpuflag(X86_FEATURE_TSC)) {
-		debug_putstr(" RDTSC");
-		raw = rdtsc();
-
-		random ^= raw;
-		use_i8254 = false;
-	}
-
-	if (use_i8254) {
-		debug_putstr(" i8254");
-		random ^= i8254();
-	}
-
-	/* Circular multiply for better bit diffusion */
-	asm("mul %3"
-	    : "=a" (random), "=d" (raw)
-	    : "a" (random), "rm" (mix_const));
-	random += raw;
-
-	debug_putstr("...\n");
-
-	return random;
-}
-
-struct mem_vector {
-	unsigned long start;
-	unsigned long size;
-};
-
-#define MEM_AVOID_MAX 5
-static struct mem_vector mem_avoid[MEM_AVOID_MAX];
-
-static bool mem_contains(struct mem_vector *region, struct mem_vector *item)
-{
-	/* Item at least partially before region. */
-	if (item->start < region->start)
-		return false;
-	/* Item at least partially after region. */
-	if (item->start + item->size > region->start + region->size)
-		return false;
-	return true;
-}
-
-static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
-{
-	/* Item one is entirely before item two. */
-	if (one->start + one->size <= two->start)
-		return false;
-	/* Item one is entirely after item two. */
-	if (one->start >= two->start + two->size)
-		return false;
-	return true;
-}
-
-static void mem_avoid_init(unsigned long input, unsigned long input_size,
-			   unsigned long output, unsigned long output_size)
-{
-	u64 initrd_start, initrd_size;
-	u64 cmd_line, cmd_line_size;
-	unsigned long unsafe, unsafe_len;
-	char *ptr;
-
-	/*
-	 * Avoid the region that is unsafe to overlap during
-	 * decompression (see calculations at top of misc.c).
-	 */
-	unsafe_len = (output_size >> 12) + 32768 + 18;
-	unsafe = (unsigned long)input + input_size - unsafe_len;
-	mem_avoid[0].start = unsafe;
-	mem_avoid[0].size = unsafe_len;
-
-	/* Avoid initrd. */
-	initrd_start = (u64)real_mode->ext_ramdisk_image << 32;
-	initrd_start |= real_mode->hdr.ramdisk_image;
-	initrd_size = (u64)real_mode->ext_ramdisk_size << 32;
-	initrd_size |= real_mode->hdr.ramdisk_size;
-	mem_avoid[1].start = initrd_start;
-	mem_avoid[1].size = initrd_size;
-
-	/* Avoid kernel command line. */
-	cmd_line = (u64)real_mode->ext_cmd_line_ptr << 32;
-	cmd_line |= real_mode->hdr.cmd_line_ptr;
-	/* Calculate size of cmd_line. */
-	ptr = (char *)(unsigned long)cmd_line;
-	for (cmd_line_size = 0; ptr[cmd_line_size++]; )
-		;
-	mem_avoid[2].start = cmd_line;
-	mem_avoid[2].size = cmd_line_size;
-
-	/* Avoid heap memory. */
-	mem_avoid[3].start = (unsigned long)free_mem_ptr;
-	mem_avoid[3].size = BOOT_HEAP_SIZE;
-
-	/* Avoid stack memory. */
-	mem_avoid[4].start = (unsigned long)free_mem_end_ptr;
-	mem_avoid[4].size = BOOT_STACK_SIZE;
-}
-
-/* Does this memory vector overlap a known avoided area? */
-static bool mem_avoid_overlap(struct mem_vector *img)
-{
-	int i;
-	struct setup_data *ptr;
-
-	for (i = 0; i < MEM_AVOID_MAX; i++) {
-		if (mem_overlaps(img, &mem_avoid[i]))
-			return true;
-	}
-
-	/* Avoid all entries in the setup_data linked list. */
-	ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data;
-	while (ptr) {
-		struct mem_vector avoid;
-
-		avoid.start = (unsigned long)ptr;
-		avoid.size = sizeof(*ptr) + ptr->len;
-
-		if (mem_overlaps(img, &avoid))
-			return true;
-
-		ptr = (struct setup_data *)(unsigned long)ptr->next;
-	}
-
-	return false;
-}
-
-static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET /
-			   CONFIG_PHYSICAL_ALIGN];
-static unsigned long slot_max;
-
-static void slots_append(unsigned long addr)
-{
-	/* Overflowing the slots list should be impossible. */
-	if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET /
-			CONFIG_PHYSICAL_ALIGN)
-		return;
-
-	slots[slot_max++] = addr;
-}
-
-static unsigned long slots_fetch_random(void)
-{
-	/* Handle case of no slots stored. */
-	if (slot_max == 0)
-		return 0;
-
-	return slots[get_random_long() % slot_max];
-}
-
-static void process_e820_entry(struct e820entry *entry,
-			       unsigned long minimum,
-			       unsigned long image_size)
-{
-	struct mem_vector region, img;
-
-	/* Skip non-RAM entries. */
-	if (entry->type != E820_RAM)
-		return;
-
-	/* Ignore entries entirely above our maximum. */
-	if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET)
-		return;
-
-	/* Ignore entries entirely below our minimum. */
-	if (entry->addr + entry->size < minimum)
-		return;
-
-	region.start = entry->addr;
-	region.size = entry->size;
-
-	/* Potentially raise address to minimum location. */
-	if (region.start < minimum)
-		region.start = minimum;
-
-	/* Potentially raise address to meet alignment requirements. */
-	region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
-
-	/* Did we raise the address above the bounds of this e820 region? */
-	if (region.start > entry->addr + entry->size)
-		return;
-
-	/* Reduce size by any delta from the original address. */
-	region.size -= region.start - entry->addr;
-
-	/* Reduce maximum size to fit end of image within maximum limit. */
-	if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET)
-		region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start;
-
-	/* Walk each aligned slot and check for avoided areas. */
-	for (img.start = region.start, img.size = image_size ;
-	     mem_contains(&region, &img) ;
-	     img.start += CONFIG_PHYSICAL_ALIGN) {
-		if (mem_avoid_overlap(&img))
-			continue;
-		slots_append(img.start);
-	}
-}
-
-static unsigned long find_random_addr(unsigned long minimum,
-				      unsigned long size)
-{
-	int i;
-	unsigned long addr;
-
-	/* Make sure minimum is aligned. */
-	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
-
-	/* Verify potential e820 positions, appending to slots list. */
-	for (i = 0; i < real_mode->e820_entries; i++) {
-		process_e820_entry(&real_mode->e820_map[i], minimum, size);
-	}
-
-	return slots_fetch_random();
-}
-
-unsigned char *choose_kernel_location(struct boot_params *boot_params,
-				      unsigned char *input,
-				      unsigned long input_size,
-				      unsigned char *output,
-				      unsigned long output_size)
-{
-	unsigned long choice = (unsigned long)output;
-	unsigned long random;
-
-#ifdef CONFIG_HIBERNATION
-	if (!cmdline_find_option_bool("kaslr")) {
-		debug_putstr("KASLR disabled by default...\n");
-		goto out;
-	}
-#else
-	if (cmdline_find_option_bool("nokaslr")) {
-		debug_putstr("KASLR disabled by cmdline...\n");
-		goto out;
-	}
-#endif
-
-	boot_params->hdr.loadflags |= KASLR_FLAG;
-
-	/* Record the various known unsafe memory ranges. */
-	mem_avoid_init((unsigned long)input, input_size,
-		       (unsigned long)output, output_size);
-
-	/* Walk e820 and find a random address. */
-	random = find_random_addr(choice, output_size);
-	if (!random) {
-		debug_putstr("KASLR could not find suitable E820 region...\n");
-		goto out;
-	}
-
-	/* Always enforce the minimum. */
-	if (random < choice)
-		goto out;
-
-	choice = random;
-out:
-	return (unsigned char *)choice;
-}
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index b68e3033e6b9..73ccf63b0f48 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -15,9 +15,9 @@ static inline char rdfs8(addr_t addr)
 #include "../cmdline.c"
 static unsigned long get_cmd_line_ptr(void)
 {
-	unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr;
+	unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr;
 
-	cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32;
+	cmd_line_ptr |= (u64)boot_params->ext_cmd_line_ptr << 32;
 
 	return cmd_line_ptr;
 }
diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c
new file mode 100644
index 000000000000..6248740b68b5
--- /dev/null
+++ b/arch/x86/boot/compressed/error.c
@@ -0,0 +1,22 @@
+/*
+ * Callers outside of misc.c need access to the error reporting routines,
+ * but the *_putstr() functions need to stay in misc.c because of how
+ * memcpy() and memmove() are defined for the compressed boot environment.
+ */
+#include "misc.h"
+
+void warn(char *m)
+{
+	error_putstr("\n\n");
+	error_putstr(m);
+	error_putstr("\n\n");
+}
+
+void error(char *m)
+{
+	warn(m);
+	error_putstr(" -- System halted");
+
+	while (1)
+		asm("hlt");
+}
diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h
new file mode 100644
index 000000000000..2e59dac07f9e
--- /dev/null
+++ b/arch/x86/boot/compressed/error.h
@@ -0,0 +1,7 @@
+#ifndef BOOT_COMPRESSED_ERROR_H
+#define BOOT_COMPRESSED_ERROR_H
+
+void warn(char *m);
+void error(char *m);
+
+#endif /* BOOT_COMPRESSED_ERROR_H */
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 0256064da8da..1038524270e7 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -176,7 +176,9 @@ preferred_addr:
 1:
 
 	/* Target address to relocate to for decompression */
-	addl	$z_extract_offset, %ebx
+	movl	BP_init_size(%esi), %eax
+	subl	$_end, %eax
+	addl	%eax, %ebx
 
 	/* Set up the stack */
 	leal	boot_stack_end(%ebx), %esp
@@ -233,24 +235,28 @@ relocated:
 2:
 
 /*
- * Do the decompression, and jump to the new kernel..
+ * Do the extraction, and jump to the new kernel..
  */
-				/* push arguments for decompress_kernel: */
-	pushl	$z_run_size	/* size of kernel with .bss and .brk */
+				/* push arguments for extract_kernel: */
 	pushl	$z_output_len	/* decompressed length, end of relocs */
-	leal	z_extract_offset_negative(%ebx), %ebp
+
+	movl	BP_init_size(%esi), %eax
+	subl	$_end, %eax
+	movl	%ebx, %ebp
+	subl	%eax, %ebp
 	pushl	%ebp		/* output address */
+
 	pushl	$z_input_len	/* input_len */
 	leal	input_data(%ebx), %eax
 	pushl	%eax		/* input_data */
 	leal	boot_heap(%ebx), %eax
 	pushl	%eax		/* heap area */
 	pushl	%esi		/* real mode pointer */
-	call	decompress_kernel /* returns kernel location in %eax */
-	addl	$28, %esp
+	call	extract_kernel /* returns kernel location in %eax */
+	addl	$24, %esp
 
 /*
- * Jump to the decompressed kernel.
+ * Jump to the extracted kernel.
  */
 	xorl	%ebx, %ebx
 	jmp	*%eax
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 86558a199139..0d80a7ad65cd 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -110,7 +110,9 @@ ENTRY(startup_32)
 1:
 
 	/* Target address to relocate to for decompression */
-	addl	$z_extract_offset, %ebx
+	movl	BP_init_size(%esi), %eax
+	subl	$_end, %eax
+	addl	%eax, %ebx
 
 /*
  * Prepare for entering 64 bit mode
@@ -132,7 +134,7 @@ ENTRY(startup_32)
 	/* Initialize Page tables to 0 */
 	leal	pgtable(%ebx), %edi
 	xorl	%eax, %eax
-	movl	$((4096*6)/4), %ecx
+	movl	$(BOOT_INIT_PGT_SIZE/4), %ecx
 	rep	stosl
 
 	/* Build Level 4 */
@@ -338,7 +340,9 @@ preferred_addr:
 1:
 
 	/* Target address to relocate to for decompression */
-	leaq	z_extract_offset(%rbp), %rbx
+	movl	BP_init_size(%rsi), %ebx
+	subl	$_end, %ebx
+	addq	%rbp, %rbx
 
 	/* Set up the stack */
 	leaq	boot_stack_end(%rbx), %rsp
@@ -408,19 +412,16 @@ relocated:
 2:
 
 /*
- * Do the decompression, and jump to the new kernel..
+ * Do the extraction, and jump to the new kernel..
  */
 	pushq	%rsi			/* Save the real mode argument */
-	movq	$z_run_size, %r9	/* size of kernel with .bss and .brk */
-	pushq	%r9
 	movq	%rsi, %rdi		/* real mode address */
 	leaq	boot_heap(%rip), %rsi	/* malloc area for uncompression */
 	leaq	input_data(%rip), %rdx	/* input_data */
 	movl	$z_input_len, %ecx	/* input_len */
 	movq	%rbp, %r8		/* output target address */
 	movq	$z_output_len, %r9	/* decompressed length, end of relocs */
-	call	decompress_kernel	/* returns kernel location in %rax */
-	popq	%r9
+	call	extract_kernel		/* returns kernel location in %rax */
 	popq	%rsi
 
426/* 427/*
@@ -485,4 +486,4 @@ boot_stack_end:
 	.section ".pgtable","a",@nobits
 	.balign 4096
 pgtable:
-	.fill 6*4096, 1, 0
+	.fill BOOT_PGT_SIZE, 1, 0
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
new file mode 100644
index 000000000000..cfeb0259ed81
--- /dev/null
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -0,0 +1,510 @@
+/*
+ * kaslr.c
+ *
+ * This contains the routines needed to generate a reasonable level of
+ * entropy to choose a randomized kernel base address offset in support
+ * of Kernel Address Space Layout Randomization (KASLR). Additionally
+ * handles walking the physical memory maps (and tracking memory regions
+ * to avoid) in order to select a physical memory location that can
+ * contain the entire properly aligned running kernel image.
+ *
+ */
+#include "misc.h"
+#include "error.h"
+
+#include <asm/msr.h>
+#include <asm/archrandom.h>
+#include <asm/e820.h>
+
+#include <generated/compile.h>
+#include <linux/module.h>
+#include <linux/uts.h>
+#include <linux/utsname.h>
+#include <generated/utsrelease.h>
+
+/* Simplified build-specific string for starting entropy. */
+static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
+		LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
+
+#define I8254_PORT_CONTROL	0x43
+#define I8254_PORT_COUNTER0	0x40
+#define I8254_CMD_READBACK	0xC0
+#define I8254_SELECT_COUNTER0	0x02
+#define I8254_STATUS_NOTREADY	0x40
+static inline u16 i8254(void)
+{
+	u16 status, timer;
+
+	do {
+		outb(I8254_PORT_CONTROL,
+		     I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
+		status = inb(I8254_PORT_COUNTER0);
+		timer = inb(I8254_PORT_COUNTER0);
+		timer |= inb(I8254_PORT_COUNTER0) << 8;
+	} while (status & I8254_STATUS_NOTREADY);
+
+	return timer;
+}
+
+static unsigned long rotate_xor(unsigned long hash, const void *area,
+				size_t size)
+{
+	size_t i;
+	unsigned long *ptr = (unsigned long *)area;
+
+	for (i = 0; i < size / sizeof(hash); i++) {
+		/* Rotate by odd number of bits and XOR. */
+		hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
+		hash ^= ptr[i];
+	}
+
+	return hash;
+}
+
+/* Attempt to create a simple but unpredictable starting entropy. */
+static unsigned long get_random_boot(void)
+{
+	unsigned long hash = 0;
+
+	hash = rotate_xor(hash, build_str, sizeof(build_str));
+	hash = rotate_xor(hash, boot_params, sizeof(*boot_params));
+
+	return hash;
+}
+
+static unsigned long get_random_long(const char *purpose)
+{
+#ifdef CONFIG_X86_64
+	const unsigned long mix_const = 0x5d6008cbf3848dd3UL;
+#else
+	const unsigned long mix_const = 0x3f39e593UL;
+#endif
+	unsigned long raw, random = get_random_boot();
+	bool use_i8254 = true;
+
+	debug_putstr(purpose);
+	debug_putstr(" KASLR using");
+
+	if (has_cpuflag(X86_FEATURE_RDRAND)) {
+		debug_putstr(" RDRAND");
+		if (rdrand_long(&raw)) {
+			random ^= raw;
+			use_i8254 = false;
+		}
+	}
+
+	if (has_cpuflag(X86_FEATURE_TSC)) {
+		debug_putstr(" RDTSC");
+		raw = rdtsc();
+
+		random ^= raw;
+		use_i8254 = false;
+	}
+
+	if (use_i8254) {
+		debug_putstr(" i8254");
+		random ^= i8254();
+	}
+
+	/* Circular multiply for better bit diffusion */
+	asm("mul %3"
+	    : "=a" (random), "=d" (raw)
+	    : "a" (random), "rm" (mix_const));
+	random += raw;
+
+	debug_putstr("...\n");
+
+	return random;
+}
+
+struct mem_vector {
+	unsigned long start;
+	unsigned long size;
+};
+
+enum mem_avoid_index {
+	MEM_AVOID_ZO_RANGE = 0,
+	MEM_AVOID_INITRD,
+	MEM_AVOID_CMDLINE,
+	MEM_AVOID_BOOTPARAMS,
+	MEM_AVOID_MAX,
+};
+
+static struct mem_vector mem_avoid[MEM_AVOID_MAX];
+
+static bool mem_contains(struct mem_vector *region, struct mem_vector *item)
+{
+	/* Item at least partially before region. */
+	if (item->start < region->start)
+		return false;
+	/* Item at least partially after region. */
+	if (item->start + item->size > region->start + region->size)
+		return false;
+	return true;
+}
+
+static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
+{
+	/* Item one is entirely before item two. */
+	if (one->start + one->size <= two->start)
+		return false;
+	/* Item one is entirely after item two. */
+	if (one->start >= two->start + two->size)
+		return false;
+	return true;
+}
+
+/*
+ * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
+ * The mem_avoid array is used to store the ranges that need to be avoided
+ * when KASLR searches for an appropriate random address. We must avoid any
+ * regions that are unsafe to overlap with during decompression, and other
+ * things like the initrd, cmdline and boot_params. This comment seeks to
+ * explain mem_avoid as clearly as possible since incorrect mem_avoid
+ * memory ranges lead to really hard to debug boot failures.
+ *
+ * The initrd, cmdline, and boot_params are trivial to identify for
+ * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and
+ * MEM_AVOID_BOOTPARAMS respectively below.
+ *
+ * What is not obvious how to avoid is the range of memory that is used
+ * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover
+ * the compressed kernel (ZO) and its run space, which is used to extract
+ * the uncompressed kernel (VO) and relocs.
+ *
+ * ZO's full run size sits against the end of the decompression buffer, so
+ * we can calculate where text, data, bss, etc of ZO are positioned more
+ * easily.
+ *
+ * For additional background, the decompression calculations can be found
+ * in header.S, and the memory diagram is based on the one found in misc.c.
+ *
+ * The following conditions are already enforced by the image layouts and
+ * associated code:
+ *  - input + input_size >= output + output_size
+ *  - kernel_total_size <= init_size
+ *  - kernel_total_size <= output_size (see Note below)
+ *  - output + init_size >= output + output_size
+ *
+ * (Note that kernel_total_size and output_size have no fundamental
+ * relationship, but output_size is passed to choose_random_location
+ * as a maximum of the two. The diagram is showing a case where
+ * kernel_total_size is larger than output_size, but this case is
+ * handled by bumping output_size.)
+ *
+ * The above conditions can be illustrated by a diagram:
+ *
+ * 0   output            input    input+input_size    output+init_size
+ * |     |                 |              |                       |
+ * |     |                 |              |                       |
+ * |-----|--------|--------|--------------|-----------|--|-------------|
+ *                |                       |           |
+ *                |                       |           |
+ * output+init_size-ZO_INIT_SIZE  output+output_size  output+kernel_total_size
+ *
+ * [output, output+init_size) is the entire memory range used for
+ * extracting the compressed image.
+ *
+ * [output, output+kernel_total_size) is the range needed for the
+ * uncompressed kernel (VO) and its run size (bss, brk, etc).
+ *
+ * [output, output+output_size) is VO plus relocs (i.e. the entire
+ * uncompressed payload contained by ZO). This is the area of the buffer
+ * written to during decompression.
+ *
+ * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case
+ * range of the copied ZO and decompression code. (i.e. the range
+ * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.)
+ *
+ * [input, input+input_size) is the original copied compressed image (ZO)
+ * (i.e. it does not include its run size). This range must be avoided
+ * because it contains the data used for decompression.
+ *
+ * [input+input_size, output+init_size) is [_text, _end) for ZO. This
+ * range includes ZO's heap and stack, and must be avoided since it
+ * performs the decompression.
+ *
+ * Since the above two ranges need to be avoided and they are adjacent,
+ * they can be merged, resulting in: [input, output+init_size) which
+ * becomes the MEM_AVOID_ZO_RANGE below.
+ */
+static void mem_avoid_init(unsigned long input, unsigned long input_size,
+			   unsigned long output)
+{
+	unsigned long init_size = boot_params->hdr.init_size;
+	u64 initrd_start, initrd_size;
+	u64 cmd_line, cmd_line_size;
+	char *ptr;
+
+	/*
+	 * Avoid the region that is unsafe to overlap during
+	 * decompression.
+	 */
+	mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
+	mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
+	add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start,
+			 mem_avoid[MEM_AVOID_ZO_RANGE].size);
+
+	/* Avoid initrd. */
+	initrd_start = (u64)boot_params->ext_ramdisk_image << 32;
+	initrd_start |= boot_params->hdr.ramdisk_image;
+	initrd_size = (u64)boot_params->ext_ramdisk_size << 32;
+	initrd_size |= boot_params->hdr.ramdisk_size;
+	mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
+	mem_avoid[MEM_AVOID_INITRD].size = initrd_size;
+	/* No need to set mapping for initrd, it will be handled in VO. */
+
+	/* Avoid kernel command line. */
+	cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32;
+	cmd_line |= boot_params->hdr.cmd_line_ptr;
+	/* Calculate size of cmd_line. */
+	ptr = (char *)(unsigned long)cmd_line;
+	for (cmd_line_size = 0; ptr[cmd_line_size++]; )
+		;
+	mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
+	mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
+	add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
+			 mem_avoid[MEM_AVOID_CMDLINE].size);
+
+	/* Avoid boot parameters. */
+	mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
+	mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params);
+	add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start,
+			 mem_avoid[MEM_AVOID_BOOTPARAMS].size);
+
+	/* We don't need to set a mapping for setup_data. */
+
+#ifdef CONFIG_X86_VERBOSE_BOOTUP
+	/* Make sure video RAM can be used. */
+	add_identity_map(0, PMD_SIZE);
+#endif
+}
+
+/*
+ * Does this memory vector overlap a known avoided area? If so, record the
+ * overlap region with the lowest address.
+ */
+static bool mem_avoid_overlap(struct mem_vector *img,
+			      struct mem_vector *overlap)
+{
+	int i;
+	struct setup_data *ptr;
+	unsigned long earliest = img->start + img->size;
+	bool is_overlapping = false;
+
+	for (i = 0; i < MEM_AVOID_MAX; i++) {
+		if (mem_overlaps(img, &mem_avoid[i]) &&
+		    mem_avoid[i].start < earliest) {
+			*overlap = mem_avoid[i];
+			is_overlapping = true;
+		}
+	}
+
+	/* Avoid all entries in the setup_data linked list. */
+	ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data;
+	while (ptr) {
+		struct mem_vector avoid;
+
+		avoid.start = (unsigned long)ptr;
+		avoid.size = sizeof(*ptr) + ptr->len;
+
+		if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
+			*overlap = avoid;
+			is_overlapping = true;
+		}
+
+		ptr = (struct setup_data *)(unsigned long)ptr->next;
+	}
+
+	return is_overlapping;
+}
+
+static unsigned long slots[KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN];
+
+struct slot_area {
+	unsigned long addr;
+	int num;
+};
+
+#define MAX_SLOT_AREA 100
+
+static struct slot_area slot_areas[MAX_SLOT_AREA];
+
+static unsigned long slot_max;
+
+static unsigned long slot_area_index;
+
+static void store_slot_info(struct mem_vector *region, unsigned long image_size)
+{
+	struct slot_area slot_area;
+
+	if (slot_area_index == MAX_SLOT_AREA)
+		return;
+
+	slot_area.addr = region->start;
+	slot_area.num = (region->size - image_size) /
+			CONFIG_PHYSICAL_ALIGN + 1;
+
+	if (slot_area.num > 0) {
+		slot_areas[slot_area_index++] = slot_area;
+		slot_max += slot_area.num;
+	}
+}
+
+static void slots_append(unsigned long addr)
+{
+	/* Overflowing the slots list should be impossible. */
+	if (slot_max >= KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN)
+		return;
+
+	slots[slot_max++] = addr;
+}
+
+static unsigned long slots_fetch_random(void)
+{
+	/* Handle case of no slots stored. */
+	if (slot_max == 0)
+		return 0;
+
+	return slots[get_random_long("Physical") % slot_max];
+}
+
+static void process_e820_entry(struct e820entry *entry,
+			       unsigned long minimum,
+			       unsigned long image_size)
+{
+	struct mem_vector region, img, overlap;
+
+	/* Skip non-RAM entries. */
+	if (entry->type != E820_RAM)
+		return;
+
+	/* Ignore entries entirely above our maximum. */
+	if (entry->addr >= KERNEL_IMAGE_SIZE)
+		return;
+
+	/* Ignore entries entirely below our minimum. */
+	if (entry->addr + entry->size < minimum)
+		return;
+
+	region.start = entry->addr;
+	region.size = entry->size;
+
+	/* Potentially raise address to minimum location. */
+	if (region.start < minimum)
+		region.start = minimum;
+
+	/* Potentially raise address to meet alignment requirements. */
+	region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
+
+	/* Did we raise the address above the bounds of this e820 region? */
+	if (region.start > entry->addr + entry->size)
+		return;
+
+	/* Reduce size by any delta from the original address. */
+	region.size -= region.start - entry->addr;
+
+	/* Reduce maximum size to fit end of image within maximum limit. */
+	if (region.start + region.size > KERNEL_IMAGE_SIZE)
+		region.size = KERNEL_IMAGE_SIZE - region.start;
+
+	/* Walk each aligned slot and check for avoided areas. */
+	for (img.start = region.start, img.size = image_size ;
+	     mem_contains(&region, &img) ;
+	     img.start += CONFIG_PHYSICAL_ALIGN) {
+		if (mem_avoid_overlap(&img, &overlap))
+			continue;
+		slots_append(img.start);
+	}
+}
+
+static unsigned long find_random_phys_addr(unsigned long minimum,
+					   unsigned long image_size)
+{
+	int i;
+	unsigned long addr;
+
+	/* Make sure minimum is aligned. */
+	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+
+	/* Verify potential e820 positions, appending to slots list. */
+	for (i = 0; i < boot_params->e820_entries; i++) {
+		process_e820_entry(&boot_params->e820_map[i], minimum,
+				   image_size);
+	}
+
+	return slots_fetch_random();
+}
+
+static unsigned long find_random_virt_addr(unsigned long minimum,
+					   unsigned long image_size)
+{
+	unsigned long slots, random_addr;
+
+	/* Make sure minimum is aligned. */
+	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+	/* Align image_size for easy slot calculations. */
+	image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN);
+
+	/*
+	 * There are how many CONFIG_PHYSICAL_ALIGN-sized slots
+	 * that can hold image_size within the range of minimum to
+	 * KERNEL_IMAGE_SIZE?
+	 */
+	slots = (KERNEL_IMAGE_SIZE - minimum - image_size) /
+		 CONFIG_PHYSICAL_ALIGN + 1;
+
+	random_addr = get_random_long("Virtual") % slots;
+
+	return random_addr * CONFIG_PHYSICAL_ALIGN + minimum;
+}
+
+/*
+ * Since this function examines addresses much more numerically,
+ * it takes the input and output pointers as 'unsigned long'.
+ */
+unsigned char *choose_random_location(unsigned long input,
+				      unsigned long input_size,
+				      unsigned long output,
+				      unsigned long output_size)
+{
+	unsigned long choice = output;
+	unsigned long random_addr;
+
+#ifdef CONFIG_HIBERNATION
+	if (!cmdline_find_option_bool("kaslr")) {
+		warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected).");
+		goto out;
+	}
+#else
+	if (cmdline_find_option_bool("nokaslr")) {
+		warn("KASLR disabled: 'nokaslr' on cmdline.");
+		goto out;
+	}
+#endif
+
+	boot_params->hdr.loadflags |= KASLR_FLAG;
+
+	/* Record the various known unsafe memory ranges. */
+	mem_avoid_init(input, input_size, output);
+
+	/* Walk e820 and find a random address. */
+	random_addr = find_random_phys_addr(output, output_size);
+	if (!random_addr) {
+		warn("KASLR disabled: could not find suitable E820 region!");
+		goto out;
+	}
+
+	/* Always enforce the minimum. */
+	if (random_addr < choice)
+		goto out;
+
+	choice = random_addr;
+
+	add_identity_map(choice, output_size);
+
+	/* This actually loads the identity pagetable on x86_64. */
+	finalize_identity_maps();
+out:
+	return (unsigned char *)choice;
+}
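[Note: the virtual slot computation in find_random_virt_addr() above is plain integer arithmetic; a standalone restatement with assumed example values (1 GiB KERNEL_IMAGE_SIZE, 2 MiB alignment; not code from this series):

#include <stdio.h>

int main(void)
{
	unsigned long kernel_image_size = 0x40000000UL; /* assumed: 1 GiB  */
	unsigned long align             = 0x200000UL;   /* assumed: 2 MiB  */
	unsigned long minimum           = 0x1000000UL;  /* assumed: 16 MiB */
	unsigned long image_size        = 0x1e00000UL;  /* assumed: 30 MiB, aligned */

	/* Mirrors find_random_virt_addr(): aligned slots that can hold
	 * image_size in [minimum, kernel_image_size). */
	unsigned long slots = (kernel_image_size - minimum - image_size)
			      / align + 1;

	printf("virtual slots: %lu\n", slots); /* prints 490 */
	return 0;
}

store_slot_info() applies the same formula per usable physical region: (region->size - image_size) / CONFIG_PHYSICAL_ALIGN + 1 candidate starts.]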
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 79dac1758e7c..f14db4e21654 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -1,8 +1,10 @@
 /*
  * misc.c
  *
- * This is a collection of several routines from gzip-1.0.3
- * adapted for Linux.
+ * This is a collection of several routines used to extract the kernel
+ * which includes KASLR relocation, decompression, ELF parsing, and
+ * relocation processing. Additionally included are the screen and serial
+ * output functions and related debugging support functions.
  *
  * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
  * puts by Nick Holloway 1993, better puts by Martin Mares 1995
@@ -10,111 +12,37 @@
  */
 
 #include "misc.h"
+#include "error.h"
 #include "../string.h"
-
-/* WARNING!!
- * This code is compiled with -fPIC and it is relocated dynamically
- * at run time, but no relocation processing is performed.
- * This means that it is not safe to place pointers in static structures.
- */
+#include "../voffset.h"
 
 /*
- * Getting to provable safe in place decompression is hard.
- * Worst case behaviours need to be analyzed.
- * Background information:
- *
- * The file layout is:
- *    magic[2]
- *    method[1]
- *    flags[1]
- *    timestamp[4]
- *    extraflags[1]
- *    os[1]
- *    compressed data blocks[N]
- *    crc[4] orig_len[4]
- *
- * resulting in 18 bytes of non compressed data overhead.
- *
- * Files divided into blocks
- * 1 bit (last block flag)
- * 2 bits (block type)
- *
- * 1 block occurs every 32K -1 bytes or when there 50% compression
- * has been achieved. The smallest block type encoding is always used.
- *
- * stored:
- *    32 bits length in bytes.
- *
- * fixed:
- *    magic fixed tree.
- *    symbols.
- *
- * dynamic:
- *    dynamic tree encoding.
- *    symbols.
- *
- *
- * The buffer for decompression in place is the length of the
- * uncompressed data, plus a small amount extra to keep the algorithm safe.
- * The compressed data is placed at the end of the buffer. The output
- * pointer is placed at the start of the buffer and the input pointer
- * is placed where the compressed data starts. Problems will occur
- * when the output pointer overruns the input pointer.
- *
- * The output pointer can only overrun the input pointer if the input
- * pointer is moving faster than the output pointer. A condition only
- * triggered by data whose compressed form is larger than the uncompressed
- * form.
- *
- * The worst case at the block level is a growth of the compressed data
- * of 5 bytes per 32767 bytes.
- *
- * The worst case internal to a compressed block is very hard to figure.
- * The worst case can at least be boundined by having one bit that represents
- * 32764 bytes and then all of the rest of the bytes representing the very
- * very last byte.
- *
- * All of which is enough to compute an amount of extra data that is required
- * to be safe. To avoid problems at the block level allocating 5 extra bytes
- * per 32767 bytes of data is sufficient. To avoind problems internal to a
- * block adding an extra 32767 bytes (the worst case uncompressed block size)
- * is sufficient, to ensure that in the worst case the decompressed data for
- * block will stop the byte before the compressed data for a block begins.
- * To avoid problems with the compressed data's meta information an extra 18
- * bytes are needed. Leading to the formula:
- *
- * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
- *
- * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
- * Adding 32768 instead of 32767 just makes for round numbers.
- * Adding the decompressor_size is necessary as it musht live after all
- * of the data as well. Last I measured the decompressor is about 14K.
- * 10K of actual data and 4K of bss.
- *
+ * WARNING!!
+ * This code is compiled with -fPIC and it is relocated dynamically at
+ * run time, but no relocation processing is performed. This means that
+ * it is not safe to place pointers in static structures.
  */
 
-/*
- * gzip declarations
- */
+/* Macros used by the included decompressor code below. */
 #define STATIC static
 
-#undef memcpy
-
 /*
- * Use a normal definition of memset() from string.c. There are already
+ * Use normal definitions of mem*() from string.c. There are already
  * included header files which expect a definition of memset() and by
  * the time we define memset macro, it is too late.
  */
+#undef memcpy
 #undef memset
 #define memzero(s, n)	memset((s), 0, (n))
+#define memmove		memmove
 
-
-static void error(char *m);
+/* Functions used by the included decompressor code below. */
+void *memmove(void *dest, const void *src, size_t n);
 
 /*
  * This is set up by the setup-routine at boot-time
  */
-struct boot_params *real_mode;		/* Pointer to real-mode data */
+struct boot_params *boot_params;
 
 memptr free_mem_ptr;
 memptr free_mem_end_ptr;
@@ -146,12 +74,16 @@ static int lines, cols;
 #ifdef CONFIG_KERNEL_LZ4
 #include "../../../../lib/decompress_unlz4.c"
 #endif
+/*
+ * NOTE: When adding a new decompressor, please update the analysis in
+ * ../header.S.
+ */
 
 static void scroll(void)
 {
 	int i;
 
-	memcpy(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2);
+	memmove(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2);
 	for (i = (lines - 1) * cols * 2; i < lines * cols * 2; i += 2)
 		vidmem[i] = ' ';
 }
@@ -184,12 +116,12 @@ void __putstr(const char *s)
 		}
 	}
 
-	if (real_mode->screen_info.orig_video_mode == 0 &&
+	if (boot_params->screen_info.orig_video_mode == 0 &&
 	    lines == 0 && cols == 0)
 		return;
 
-	x = real_mode->screen_info.orig_x;
-	y = real_mode->screen_info.orig_y;
+	x = boot_params->screen_info.orig_x;
+	y = boot_params->screen_info.orig_y;
 
 	while ((c = *s++) != '\0') {
 		if (c == '\n') {
@@ -210,8 +142,8 @@ void __putstr(const char *s)
 		}
 	}
 
-	real_mode->screen_info.orig_x = x;
-	real_mode->screen_info.orig_y = y;
+	boot_params->screen_info.orig_x = x;
+	boot_params->screen_info.orig_y = y;
 
 	pos = (x + cols * y) * 2;	/* Update cursor position */
 	outb(14, vidport);
@@ -237,23 +169,13 @@ void __puthex(unsigned long value)
 	}
 }
 
-static void error(char *x)
-{
-	error_putstr("\n\n");
-	error_putstr(x);
-	error_putstr("\n\n -- System halted");
-
-	while (1)
-		asm("hlt");
-}
-
 #if CONFIG_X86_NEED_RELOCS
 static void handle_relocations(void *output, unsigned long output_len)
 {
 	int *reloc;
 	unsigned long delta, map, ptr;
 	unsigned long min_addr = (unsigned long)output;
-	unsigned long max_addr = min_addr + output_len;
+	unsigned long max_addr = min_addr + (VO___bss_start - VO__text);
 
 	/*
 	 * Calculate the delta between where vmlinux was linked to load
@@ -295,7 +217,7 @@ static void handle_relocations(void *output, unsigned long output_len)
 	 * So we work backwards from the end of the decompressed image.
 	 */
 	for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) {
-		int extended = *reloc;
+		long extended = *reloc;
 		extended += map;
 
 		ptr = (unsigned long)extended;
@@ -372,9 +294,7 @@ static void parse_elf(void *output)
 #else
 			dest = (void *)(phdr->p_paddr);
 #endif
-			memcpy(dest,
-			       output + phdr->p_offset,
-			       phdr->p_filesz);
+			memmove(dest, output + phdr->p_offset, phdr->p_filesz);
 			break;
 		default: /* Ignore other PT_* */ break;
 		}
@@ -383,23 +303,41 @@ static void parse_elf(void *output)
 	free(phdrs);
 }
 
-asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
+/*
+ * The compressed kernel image (ZO), has been moved so that its position
+ * is against the end of the buffer used to hold the uncompressed kernel
+ * image (VO) and the execution environment (.bss, .brk), which makes sure
+ * there is room to do the in-place decompression. (See header.S for the
+ * calculations.)
+ *
+ *                             |-----compressed kernel image------|
+ *                             V                                  V
+ * 0                       extract_offset                    +INIT_SIZE
+ * |-----------|---------------|-------------------------|--------|
+ *             |               |                         |        |
+ *           VO__text      startup_32 of ZO          VO__end    ZO__end
+ *             ^                                         ^
+ *             |-------uncompressed kernel image---------|
+ *
+ */
+asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
 				  unsigned char *input_data,
 				  unsigned long input_len,
 				  unsigned char *output,
-				  unsigned long output_len,
-				  unsigned long run_size)
+				  unsigned long output_len)
 {
+	const unsigned long kernel_total_size = VO__end - VO__text;
 	unsigned char *output_orig = output;
 
-	real_mode = rmode;
+	/* Retain x86 boot parameters pointer passed from startup_32/64. */
+	boot_params = rmode;
 
-	/* Clear it for solely in-kernel use */
-	real_mode->hdr.loadflags &= ~KASLR_FLAG;
+	/* Clear flags intended for solely in-kernel use. */
+	boot_params->hdr.loadflags &= ~KASLR_FLAG;
 
-	sanitize_boot_params(real_mode);
+	sanitize_boot_params(boot_params);
 
-	if (real_mode->screen_info.orig_video_mode == 7) {
+	if (boot_params->screen_info.orig_video_mode == 7) {
 		vidmem = (char *) 0xb0000;
 		vidport = 0x3b4;
 	} else {
@@ -407,11 +345,11 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
 		vidport = 0x3d4;
 	}
 
-	lines = real_mode->screen_info.orig_video_lines;
-	cols = real_mode->screen_info.orig_video_cols;
+	lines = boot_params->screen_info.orig_video_lines;
+	cols = boot_params->screen_info.orig_video_cols;
 
 	console_init();
-	debug_putstr("early console in decompress_kernel\n");
+	debug_putstr("early console in extract_kernel\n");
 
 	free_mem_ptr     = heap;	/* Heap */
 	free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
@@ -421,16 +359,16 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
421 debug_putaddr(input_len); 359 debug_putaddr(input_len);
422 debug_putaddr(output); 360 debug_putaddr(output);
423 debug_putaddr(output_len); 361 debug_putaddr(output_len);
424 debug_putaddr(run_size); 362 debug_putaddr(kernel_total_size);
425 363
426 /* 364 /*
427 * The memory hole needed for the kernel is the larger of either 365 * The memory hole needed for the kernel is the larger of either
428 * the entire decompressed kernel plus relocation table, or the 366 * the entire decompressed kernel plus relocation table, or the
429 * entire decompressed kernel plus .bss and .brk sections. 367 * entire decompressed kernel plus .bss and .brk sections.
430 */ 368 */
431 output = choose_kernel_location(real_mode, input_data, input_len, output, 369 output = choose_random_location((unsigned long)input_data, input_len,
432 output_len > run_size ? output_len 370 (unsigned long)output,
433 : run_size); 371 max(output_len, kernel_total_size));
434 372
435 /* Validate memory location choices. */ 373 /* Validate memory location choices. */
436 if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) 374 if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
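The sizing rule passed to choose_random_location() above is easy to check in isolation. A minimal userspace sketch of the max(output_len, kernel_total_size) choice, with hypothetical sizes that are not taken from any real build:

#include <stdio.h>

/* Sketch of the memory-hole rule above: the hole must cover the larger
 * of the decompressed image plus relocations (output_len) and the
 * decompressed image plus .bss/.brk (kernel_total_size). */
static unsigned long memory_hole_size(unsigned long output_len,
                                      unsigned long kernel_total_size)
{
        return output_len > kernel_total_size ? output_len
                                              : kernel_total_size;
}

int main(void)
{
        /* Hypothetical sizes, for illustration only. */
        unsigned long output_len   = 25UL << 20; /* VO + relocs        */
        unsigned long kernel_total = 28UL << 20; /* VO__end - VO__text */

        printf("memory hole: %lu bytes\n",
               memory_hole_size(output_len, kernel_total));
        return 0;
}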
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 3783dc3e10b3..b6fec1ff10e4 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -32,7 +32,7 @@
32/* misc.c */ 32/* misc.c */
33extern memptr free_mem_ptr; 33extern memptr free_mem_ptr;
34extern memptr free_mem_end_ptr; 34extern memptr free_mem_end_ptr;
35extern struct boot_params *real_mode; /* Pointer to real-mode data */ 35extern struct boot_params *boot_params;
36void __putstr(const char *s); 36void __putstr(const char *s);
37void __puthex(unsigned long value); 37void __puthex(unsigned long value);
38#define error_putstr(__x) __putstr(__x) 38#define error_putstr(__x) __putstr(__x)
@@ -66,26 +66,35 @@ int cmdline_find_option_bool(const char *option);
66 66
67 67
68#if CONFIG_RANDOMIZE_BASE 68#if CONFIG_RANDOMIZE_BASE
69/* aslr.c */ 69/* kaslr.c */
70unsigned char *choose_kernel_location(struct boot_params *boot_params, 70unsigned char *choose_random_location(unsigned long input_ptr,
71 unsigned char *input,
72 unsigned long input_size, 71 unsigned long input_size,
73 unsigned char *output, 72 unsigned long output_ptr,
74 unsigned long output_size); 73 unsigned long output_size);
75/* cpuflags.c */ 74/* cpuflags.c */
76bool has_cpuflag(int flag); 75bool has_cpuflag(int flag);
77#else 76#else
78static inline 77static inline
79unsigned char *choose_kernel_location(struct boot_params *boot_params, 78unsigned char *choose_random_location(unsigned long input_ptr,
80 unsigned char *input,
81 unsigned long input_size, 79 unsigned long input_size,
82 unsigned char *output, 80 unsigned long output_ptr,
83 unsigned long output_size) 81 unsigned long output_size)
84{ 82{
85 return output; 83 return (unsigned char *)output_ptr;
86} 84}
87#endif 85#endif
88 86
87#ifdef CONFIG_X86_64
88void add_identity_map(unsigned long start, unsigned long size);
89void finalize_identity_maps(void);
90extern unsigned char _pgtable[];
91#else
92static inline void add_identity_map(unsigned long start, unsigned long size)
93{ }
94static inline void finalize_identity_maps(void)
95{ }
96#endif
97
89#ifdef CONFIG_EARLY_PRINTK 98#ifdef CONFIG_EARLY_PRINTK
90/* early_serial_console.c */ 99/* early_serial_console.c */
91extern int early_serial_base; 100extern int early_serial_base;
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index d8222f213182..72bad2c8debe 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -18,11 +18,10 @@
18 * 18 *
19 * H. Peter Anvin <hpa@linux.intel.com> 19 * H. Peter Anvin <hpa@linux.intel.com>
20 * 20 *
21 * ----------------------------------------------------------------------- */ 21 * -----------------------------------------------------------------------
22 22 *
23/* 23 * Outputs a small assembly wrapper with the appropriate symbols defined.
24 * Compute the desired load offset from a compressed program; outputs 24 *
25 * a small assembly wrapper with the appropriate symbols defined.
26 */ 25 */
27 26
28#include <stdlib.h> 27#include <stdlib.h>
@@ -35,14 +34,11 @@ int main(int argc, char *argv[])
35{ 34{
36 uint32_t olen; 35 uint32_t olen;
37 long ilen; 36 long ilen;
38 unsigned long offs;
39 unsigned long run_size;
40 FILE *f = NULL; 37 FILE *f = NULL;
41 int retval = 1; 38 int retval = 1;
42 39
43 if (argc < 3) { 40 if (argc < 2) {
44 fprintf(stderr, "Usage: %s compressed_file run_size\n", 41 fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
45 argv[0]);
46 goto bail; 42 goto bail;
47 } 43 }
48 44
@@ -67,29 +63,11 @@ int main(int argc, char *argv[])
67 ilen = ftell(f); 63 ilen = ftell(f);
68 olen = get_unaligned_le32(&olen); 64 olen = get_unaligned_le32(&olen);
69 65
70 /*
71 * Now we have the input (compressed) and output (uncompressed)
72 * sizes, compute the necessary decompression offset...
73 */
74
75 offs = (olen > ilen) ? olen - ilen : 0;
76 offs += olen >> 12; /* Add 8 bytes for each 32K block */
77 offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */
78 offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
79 run_size = atoi(argv[2]);
80
81 printf(".section \".rodata..compressed\",\"a\",@progbits\n"); 66 printf(".section \".rodata..compressed\",\"a\",@progbits\n");
82 printf(".globl z_input_len\n"); 67 printf(".globl z_input_len\n");
83 printf("z_input_len = %lu\n", ilen); 68 printf("z_input_len = %lu\n", ilen);
84 printf(".globl z_output_len\n"); 69 printf(".globl z_output_len\n");
85 printf("z_output_len = %lu\n", (unsigned long)olen); 70 printf("z_output_len = %lu\n", (unsigned long)olen);
86 printf(".globl z_extract_offset\n");
87 printf("z_extract_offset = 0x%lx\n", offs);
88 /* z_extract_offset_negative allows simplification of head_32.S */
89 printf(".globl z_extract_offset_negative\n");
90 printf("z_extract_offset_negative = -0x%lx\n", offs);
91 printf(".globl z_run_size\n");
92 printf("z_run_size = %lu\n", run_size);
93 71
94 printf(".globl input_data, input_data_end\n"); 72 printf(".globl input_data, input_data_end\n");
95 printf("input_data:\n"); 73 printf("input_data:\n");
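The offset arithmetic deleted from mkpiggy here now lives as link-time symbol math in header.S (see that hunk below). For reference, the retired build-time formula restated as a standalone C helper:

/* The retired mkpiggy formula: slack for gzip worst-case growth
 * (8 bytes per 32K plus 64K+128 of decompressor slack), rounded to 4K. */
static unsigned long extract_offset(unsigned long ilen, unsigned long olen)
{
        unsigned long offs;

        offs = (olen > ilen) ? olen - ilen : 0;
        offs += olen >> 12;             /* 8 bytes for each 32K block */
        offs += 64 * 1024 + 128;        /* decompressor slack         */
        return (offs + 4095) & ~4095UL; /* round to a 4K boundary     */
}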
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
new file mode 100644
index 000000000000..34b95df14e69
--- /dev/null
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -0,0 +1,129 @@
1/*
2 * This code is used on x86_64 to create page table identity mappings on
3 * demand by building up a new set of page tables (or appending to the
4 * existing ones), and then switching over to them when ready.
5 */
6
7/*
8 * Since we're dealing with identity mappings, physical and virtual
9 * addresses are the same, so override these defines which are ultimately
10 * used by the headers in misc.h.
11 */
12#define __pa(x) ((unsigned long)(x))
13#define __va(x) ((void *)((unsigned long)(x)))
14
15#include "misc.h"
16
17/* These actually do the work of building the kernel identity maps. */
18#include <asm/init.h>
19#include <asm/pgtable.h>
20#include "../../mm/ident_map.c"
21
22/* Used by pgtable.h asm code to force instruction serialization. */
23unsigned long __force_order;
24
25/* Used to track our page table allocation area. */
26struct alloc_pgt_data {
27 unsigned char *pgt_buf;
28 unsigned long pgt_buf_size;
29 unsigned long pgt_buf_offset;
30};
31
32/*
33 * Allocates space for a page table entry, using struct alloc_pgt_data
34 * above. Besides the local callers, this is used as the allocation
35 * callback in mapping_info below.
36 */
37static void *alloc_pgt_page(void *context)
38{
39 struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context;
40 unsigned char *entry;
41
42 /* Validate there is space available for a new page. */
43 if (pages->pgt_buf_offset >= pages->pgt_buf_size) {
44 debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
45 debug_putaddr(pages->pgt_buf_offset);
46 debug_putaddr(pages->pgt_buf_size);
47 return NULL;
48 }
49
50 entry = pages->pgt_buf + pages->pgt_buf_offset;
51 pages->pgt_buf_offset += PAGE_SIZE;
52
53 return entry;
54}
55
56/* Used to track our allocated page tables. */
57static struct alloc_pgt_data pgt_data;
58
59/* The top level page table entry pointer. */
60static unsigned long level4p;
61
62/* Locates and clears a region for a new top level page table. */
63static void prepare_level4(void)
64{
65 /*
66 * It should be impossible for this not to already be true,
67 * but since calling this a second time would rewind the other
68 * counters, let's just make sure this is reset too.
69 */
70 pgt_data.pgt_buf_offset = 0;
71
72 /*
73 * If we came here via startup_32(), cr3 will be _pgtable already
74 * and we must append to the existing area instead of entirely
75 * overwriting it.
76 */
77 level4p = read_cr3();
78 if (level4p == (unsigned long)_pgtable) {
79 debug_putstr("booted via startup_32()\n");
80 pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
81 pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
82 memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
83 } else {
84 debug_putstr("booted via startup_64()\n");
85 pgt_data.pgt_buf = _pgtable;
86 pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
87 memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
88 level4p = (unsigned long)alloc_pgt_page(&pgt_data);
89 }
90}
91
92/*
93 * Adds the specified range to what will become the new identity mappings.
94 * Once all ranges have been added, the new mapping is activated by calling
95 * finalize_identity_maps() below.
96 */
97void add_identity_map(unsigned long start, unsigned long size)
98{
99 struct x86_mapping_info mapping_info = {
100 .alloc_pgt_page = alloc_pgt_page,
101 .context = &pgt_data,
102 .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
103 };
104 unsigned long end = start + size;
105
106 /* Make sure we have a top level page table ready to use. */
107 if (!level4p)
108 prepare_level4();
109
110 /* Align boundary to 2M. */
111 start = round_down(start, PMD_SIZE);
112 end = round_up(end, PMD_SIZE);
113 if (start >= end)
114 return;
115
116 /* Build the mapping. */
117 kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p,
118 start, end);
119}
120
121/*
122 * This switches the page tables to the new level4 that has been built
123 * via calls to add_identity_map() above. If booted via startup_32(),
124 * this is effectively a no-op.
125 */
126void finalize_identity_maps(void)
127{
128 write_cr3(level4p);
129}
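A sketch of how the decompressor is expected to drive these two helpers; the region argument here is a placeholder, and the real call sites are in kaslr.c:

/* Hypothetical caller: map every region the stub will touch, then
 * switch to the new tables in one step. */
static void map_chosen_output(unsigned long output, unsigned long output_len)
{
        add_identity_map(output, output_len);   /* chosen kernel slot */
        finalize_identity_maps();               /* write_cr3(level4p) */
}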
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index 00e788be1db9..cea140ce6b42 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -1,7 +1,16 @@
1/*
2 * This provides an optimized implementation of memcpy, and a simplified
3 * implementation of memset and memmove. These are used here because the
4 * standard kernel runtime versions are not yet available and we don't
5 * trust the gcc built-in implementations as they may do unexpected things
6 * (e.g. FPU ops) in the minimal decompression stub execution environment.
7 */
8#include "error.h"
9
1#include "../string.c" 10#include "../string.c"
2 11
3#ifdef CONFIG_X86_32 12#ifdef CONFIG_X86_32
4void *memcpy(void *dest, const void *src, size_t n) 13static void *__memcpy(void *dest, const void *src, size_t n)
5{ 14{
6 int d0, d1, d2; 15 int d0, d1, d2;
7 asm volatile( 16 asm volatile(
@@ -15,7 +24,7 @@ void *memcpy(void *dest, const void *src, size_t n)
15 return dest; 24 return dest;
16} 25}
17#else 26#else
18void *memcpy(void *dest, const void *src, size_t n) 27static void *__memcpy(void *dest, const void *src, size_t n)
19{ 28{
20 long d0, d1, d2; 29 long d0, d1, d2;
21 asm volatile( 30 asm volatile(
@@ -39,3 +48,27 @@ void *memset(void *s, int c, size_t n)
39 ss[i] = c; 48 ss[i] = c;
40 return s; 49 return s;
41} 50}
51
52void *memmove(void *dest, const void *src, size_t n)
53{
54 unsigned char *d = dest;
55 const unsigned char *s = src;
56
57 if (d <= s || d - s >= n)
58 return __memcpy(dest, src, n);
59
60 while (n-- > 0)
61 d[n] = s[n];
62
63 return dest;
64}
65
66/* Detect and warn about potential overlaps, but handle them with memmove. */
67void *memcpy(void *dest, const void *src, size_t n)
68{
69 if (dest > src && dest - src < n) {
70 warn("Avoiding potentially unsafe overlapping memcpy()!");
71 return memmove(dest, src, n);
72 }
73 return __memcpy(dest, src, n);
74}
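The wrapper's overlap test is worth spelling out. A self-contained sketch of the predicate, with a hypothetical buffer showing when the warning fires:

#include <stdbool.h>
#include <stddef.h>

/* A forward copy is unsafe iff dest starts inside the source window;
 * this is the (dest > src && dest - src < n) test used above. */
static bool copy_overlaps(const char *dest, const char *src, size_t n)
{
        return dest > src && (size_t)(dest - src) < n;
}

/* Example: copy_overlaps(buf + 2, buf, 8) is true, so the wrapper
 * would warn and fall back to the backward-copying memmove(). */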
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 34d047c98284..e24e0a0c90c9 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -70,5 +70,6 @@ SECTIONS
70 _epgtable = . ; 70 _epgtable = . ;
71 } 71 }
72#endif 72#endif
73 . = ALIGN(PAGE_SIZE); /* keep ZO size page aligned */
73 _end = .; 74 _end = .;
74} 75}
diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c
index 45a07684bbab..f0b8d6d93164 100644
--- a/arch/x86/boot/early_serial_console.c
+++ b/arch/x86/boot/early_serial_console.c
@@ -1,3 +1,7 @@
1/*
2 * Serial port routines for use during early boot reporting. This code is
3 * included from both the compressed kernel and the regular kernel.
4 */
1#include "boot.h" 5#include "boot.h"
2 6
3#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ 7#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 6236b9ec4b76..3dd5be33aaa7 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -440,13 +440,116 @@ setup_data: .quad 0 # 64-bit physical pointer to
440 440
441pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr 441pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
442 442
443#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) 443#
444# Getting to provably safe in-place decompression is hard. Worst case
445# behaviours need to be analyzed. Here let's take the decompression of
446# a gzip-compressed kernel as example, to illustrate it:
447#
448# The file layout of gzip compressed kernel is:
449#
450# magic[2]
451# method[1]
452# flags[1]
453# timestamp[4]
454# extraflags[1]
455# os[1]
456# compressed data blocks[N]
457# crc[4] orig_len[4]
458#
459# ... resulting in +18 bytes overhead of uncompressed data.
460#
461# (For more information, please refer to RFC 1951 and RFC 1952.)
462#
 463# Files are divided into blocks:
464# 1 bit (last block flag)
465# 2 bits (block type)
466#
 467# 1 block occurs every 32K-1 bytes, or when 50% compression
 468# has been achieved. The smallest block type encoding is always used.
469#
470# stored:
471# 32 bits length in bytes.
472#
473# fixed:
474# magic fixed tree.
475# symbols.
476#
477# dynamic:
478# dynamic tree encoding.
479# symbols.
480#
481#
482# The buffer for decompression in place is the length of the uncompressed
483# data, plus a small amount extra to keep the algorithm safe. The
484# compressed data is placed at the end of the buffer. The output pointer
485# is placed at the start of the buffer and the input pointer is placed
486# where the compressed data starts. Problems will occur when the output
487# pointer overruns the input pointer.
488#
489# The output pointer can only overrun the input pointer if the input
 490# pointer is moving faster than the output pointer, a condition only
491# triggered by data whose compressed form is larger than the uncompressed
492# form.
493#
494# The worst case at the block level is a growth of the compressed data
495# of 5 bytes per 32767 bytes.
496#
497# The worst case internal to a compressed block is very hard to figure.
498# The worst case can at least be bounded by having one bit that represents
 499# 32764 bytes and then all of the rest of the bytes representing the
 500# very last byte.
501#
502# All of which is enough to compute an amount of extra data that is required
503# to be safe. To avoid problems at the block level allocating 5 extra bytes
504# per 32767 bytes of data is sufficient. To avoid problems internal to a
505# block adding an extra 32767 bytes (the worst case uncompressed block size)
 506# is sufficient to ensure that in the worst case the decompressed data for a
 507# block will stop one byte before the compressed data for that block begins.
508# To avoid problems with the compressed data's meta information an extra 18
509# bytes are needed. Leading to the formula:
510#
511# extra_bytes = (uncompressed_size >> 12) + 32768 + 18
512#
513# Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
514# Adding 32768 instead of 32767 just makes for round numbers.
515#
 516# The above analysis is for decompressing a gzip-compressed kernel only.
 517# Up to now, 6 different decompressors are supported altogether. Among
 518# them, xz stores data in chunks and has a maximum chunk size of 64K.
 519# Hence the safety margin should be updated to cover all decompressors,
 520# so that we don't need to deal with each of them separately. Please check
521# the description in lib/decompressor_xxx.c for specific information.
522#
523# extra_bytes = (uncompressed_size >> 12) + 65536 + 128
524
525#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 65536 + 128)
526#if ZO_z_output_len > ZO_z_input_len
527# define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \
528 ZO_z_input_len)
529#else
530# define ZO_z_extract_offset ZO_z_extra_bytes
531#endif
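Plugging hypothetical sizes into the formula makes the margin concrete (a 30 MiB uncompressed, 7 MiB compressed kernel; the numbers are illustrative only):

#include <stdio.h>

int main(void)
{
        /* Hypothetical sizes: 30 MiB uncompressed, 7 MiB compressed. */
        unsigned long output_len = 30UL << 20; /* ZO_z_output_len */
        unsigned long input_len  = 7UL << 20;  /* ZO_z_input_len  */
        unsigned long extra = (output_len >> 12) + 65536 + 128;

        /* output_len > input_len, so the first branch above applies. */
        printf("extra_bytes:    %lu\n", extra);                          /* 73344    */
        printf("extract_offset: %lu\n", output_len + extra - input_len); /* 24190592 */
        return 0;
}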
532
533/*
534 * The extract_offset has to be bigger than ZO head section. Otherwise when
535 * the head code is running to move ZO to the end of the buffer, it will
536 * overwrite the head code itself.
537 */
538#if (ZO__ehead - ZO_startup_32) > ZO_z_extract_offset
539# define ZO_z_min_extract_offset ((ZO__ehead - ZO_startup_32 + 4095) & ~4095)
540#else
541# define ZO_z_min_extract_offset ((ZO_z_extract_offset + 4095) & ~4095)
542#endif
543
544#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_min_extract_offset)
545
444#define VO_INIT_SIZE (VO__end - VO__text) 546#define VO_INIT_SIZE (VO__end - VO__text)
445#if ZO_INIT_SIZE > VO_INIT_SIZE 547#if ZO_INIT_SIZE > VO_INIT_SIZE
446#define INIT_SIZE ZO_INIT_SIZE 548# define INIT_SIZE ZO_INIT_SIZE
447#else 549#else
448#define INIT_SIZE VO_INIT_SIZE 550# define INIT_SIZE VO_INIT_SIZE
449#endif 551#endif
552
450init_size: .long INIT_SIZE # kernel initialization size 553init_size: .long INIT_SIZE # kernel initialization size
451handover_offset: .long 0 # Filled in by build.c 554handover_offset: .long 0 # Filled in by build.c
452 555
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 6b8d6e8cd449..abd06b19ddd2 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -12,29 +12,46 @@
12 12
13/* Minimum kernel alignment, as a power of two */ 13/* Minimum kernel alignment, as a power of two */
14#ifdef CONFIG_X86_64 14#ifdef CONFIG_X86_64
15#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT 15# define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
16#else 16#else
17#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER) 17# define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER)
18#endif 18#endif
19#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) 19#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
20 20
21#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \ 21#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
22 (CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN) 22 (CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN)
23#error "Invalid value for CONFIG_PHYSICAL_ALIGN" 23# error "Invalid value for CONFIG_PHYSICAL_ALIGN"
24#endif 24#endif
25 25
26#ifdef CONFIG_KERNEL_BZIP2 26#ifdef CONFIG_KERNEL_BZIP2
27#define BOOT_HEAP_SIZE 0x400000 27# define BOOT_HEAP_SIZE 0x400000
28#else /* !CONFIG_KERNEL_BZIP2 */ 28#else /* !CONFIG_KERNEL_BZIP2 */
29 29# define BOOT_HEAP_SIZE 0x10000
30#define BOOT_HEAP_SIZE 0x10000 30#endif
31
32#endif /* !CONFIG_KERNEL_BZIP2 */
33 31
34#ifdef CONFIG_X86_64 32#ifdef CONFIG_X86_64
35#define BOOT_STACK_SIZE 0x4000 33# define BOOT_STACK_SIZE 0x4000
36#else 34
37#define BOOT_STACK_SIZE 0x1000 35# define BOOT_INIT_PGT_SIZE (6*4096)
36# ifdef CONFIG_RANDOMIZE_BASE
37/*
 38 * Assuming all mappings cross the 512GB boundary:
39 * 1 page for level4
40 * (2+2)*4 pages for kernel, param, cmd_line, and randomized kernel
41 * 2 pages for first 2M (video RAM: CONFIG_X86_VERBOSE_BOOTUP).
42 * Total is 19 pages.
43 */
44# ifdef CONFIG_X86_VERBOSE_BOOTUP
45# define BOOT_PGT_SIZE (19*4096)
46# else /* !CONFIG_X86_VERBOSE_BOOTUP */
47# define BOOT_PGT_SIZE (17*4096)
48# endif
49# else /* !CONFIG_RANDOMIZE_BASE */
50# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE
51# endif
52
53#else /* !CONFIG_X86_64 */
54# define BOOT_STACK_SIZE 0x1000
38#endif 55#endif
39 56
40#endif /* _ASM_X86_BOOT_H */ 57#endif /* _ASM_X86_BOOT_H */
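The 19-page figure in the comment above is plain addition; a sketch that spells the budget out:

/* Sketch of the BOOT_PGT_SIZE budget: each mapped region may need a
 * PUD page and a PMD page on both sides of a 512GB boundary. */
#define PGS_LEVEL4      1               /* top-level table           */
#define PGS_REGIONS     ((2 + 2) * 4)   /* kernel, param, cmd_line,
                                           randomized kernel         */
#define PGS_VIDEO       2               /* first 2M, for video RAM   */

/* 1 + 16 + 2 == 19 pages, matching (19*4096) above. */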
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 802dde30c928..cf8f619b305f 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -37,7 +37,10 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
37 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) 37 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
38#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE 38#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
39 39
40#ifndef __pa
40#define __pa(x) __phys_addr((unsigned long)(x)) 41#define __pa(x) __phys_addr((unsigned long)(x))
42#endif
43
41#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) 44#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x))
42/* __pa_symbol should be used for C visible symbols. 45/* __pa_symbol should be used for C visible symbols.
43 This seems to be the official gcc blessed way to do such arithmetic. */ 46 This seems to be the official gcc blessed way to do such arithmetic. */
@@ -51,7 +54,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
51#define __pa_symbol(x) \ 54#define __pa_symbol(x) \
52 __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) 55 __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))
53 56
57#ifndef __va
54#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) 58#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
59#endif
55 60
56#define __boot_va(x) __va(x) 61#define __boot_va(x) __va(x)
57#define __boot_pa(x) __pa(x) 62#define __boot_pa(x) __pa(x)
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 4928cf0d5af0..d5c2f8b40faa 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -47,12 +47,10 @@
47 * are fully set up. If kernel ASLR is configured, it can extend the 47 * are fully set up. If kernel ASLR is configured, it can extend the
48 * kernel page table mapping, reducing the size of the modules area. 48 * kernel page table mapping, reducing the size of the modules area.
49 */ 49 */
50#define KERNEL_IMAGE_SIZE_DEFAULT (512 * 1024 * 1024) 50#if defined(CONFIG_RANDOMIZE_BASE)
51#if defined(CONFIG_RANDOMIZE_BASE) && \ 51#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
52 CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE_DEFAULT
53#define KERNEL_IMAGE_SIZE CONFIG_RANDOMIZE_BASE_MAX_OFFSET
54#else 52#else
55#define KERNEL_IMAGE_SIZE KERNEL_IMAGE_SIZE_DEFAULT 53#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
56#endif 54#endif
57 55
58#endif /* _ASM_X86_PAGE_64_DEFS_H */ 56#endif /* _ASM_X86_PAGE_64_DEFS_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 3c731413f1de..2970d22d7766 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -15,17 +15,6 @@
15#include <linux/cpumask.h> 15#include <linux/cpumask.h>
16#include <asm/frame.h> 16#include <asm/frame.h>
17 17
18static inline int paravirt_enabled(void)
19{
20 return pv_info.paravirt_enabled;
21}
22
23static inline int paravirt_has_feature(unsigned int feature)
24{
25 WARN_ON_ONCE(!pv_info.paravirt_enabled);
26 return (pv_info.features & feature);
27}
28
29static inline void load_sp0(struct tss_struct *tss, 18static inline void load_sp0(struct tss_struct *tss,
30 struct thread_struct *thread) 19 struct thread_struct *thread)
31{ 20{
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b4a23eafa1b9..7fa9e7740ba3 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -69,15 +69,9 @@ struct pv_info {
69 u16 extra_user_64bit_cs; /* __USER_CS if none */ 69 u16 extra_user_64bit_cs; /* __USER_CS if none */
70#endif 70#endif
71 71
72 int paravirt_enabled;
73 unsigned int features; /* valid only if paravirt_enabled is set */
74 const char *name; 72 const char *name;
75}; 73};
76 74
77#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x)
78/* Supported features */
79#define PV_SUPPORTED_RTC (1<<0)
80
81struct pv_init_ops { 75struct pv_init_ops {
82 /* 76 /*
83 * Patch may replace one of the defined code sequences with 77 * Patch may replace one of the defined code sequences with
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9251aa962721..62c6cc3cc5d3 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -480,8 +480,6 @@ static inline unsigned long current_top_of_stack(void)
480#include <asm/paravirt.h> 480#include <asm/paravirt.h>
481#else 481#else
482#define __cpuid native_cpuid 482#define __cpuid native_cpuid
483#define paravirt_enabled() 0
484#define paravirt_has(x) 0
485 483
486static inline void load_sp0(struct tss_struct *tss, 484static inline void load_sp0(struct tss_struct *tss,
487 struct thread_struct *thread) 485 struct thread_struct *thread)
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 1ae89a2721d6..4dcdf74dfed8 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -142,6 +142,44 @@ struct x86_cpuinit_ops {
142struct timespec; 142struct timespec;
143 143
144/** 144/**
145 * struct x86_legacy_devices - legacy x86 devices
146 *
147 * @pnpbios: this platform can have a PNPBIOS. If this is disabled the platform
148 * is known to never have a PNPBIOS.
149 *
150 * These are devices known to require LPC or ISA bus. The definition of legacy
151 * devices adheres to the ACPI 5.2.9.3 IA-PC Boot Architecture flag
152 * ACPI_FADT_LEGACY_DEVICES. These devices consist of user visible devices on
153 * the LPC or ISA bus. User visible devices are devices that have end-user
154 * accessible connectors (for example, LPT parallel port). Legacy devices on
 155 * the LPC bus include, for example, serial and parallel ports, the PS/2
 156 * keyboard/mouse, and the floppy disk controller. A system that lacks all known
157 * legacy devices can assume all devices can be detected exclusively via
158 * standard device enumeration mechanisms including the ACPI namespace.
159 *
 160 * A system which does not have ACPI_FADT_LEGACY_DEVICES enabled must not
161 * have any of the legacy devices enumerated below present.
162 */
163struct x86_legacy_devices {
164 int pnpbios;
165};
166
167/**
168 * struct x86_legacy_features - legacy x86 features
169 *
170 * @rtc: this device has a CMOS real-time clock present
171 * @ebda_search: it's safe to search for the EBDA signature in the hardware's
172 * low RAM
173 * @devices: legacy x86 devices, refer to struct x86_legacy_devices
174 * documentation for further details.
175 */
176struct x86_legacy_features {
177 int rtc;
178 int ebda_search;
179 struct x86_legacy_devices devices;
180};
181
182/**
145 * struct x86_platform_ops - platform specific runtime functions 183 * struct x86_platform_ops - platform specific runtime functions
146 * @calibrate_tsc: calibrate TSC 184 * @calibrate_tsc: calibrate TSC
147 * @get_wallclock: get time from HW clock like RTC etc. 185 * @get_wallclock: get time from HW clock like RTC etc.
@@ -152,6 +190,14 @@ struct timespec;
152 * @save_sched_clock_state: save state for sched_clock() on suspend 190 * @save_sched_clock_state: save state for sched_clock() on suspend
153 * @restore_sched_clock_state: restore state for sched_clock() on resume 191 * @restore_sched_clock_state: restore state for sched_clock() on resume
 154 * @apic_post_init: adjust apic if needed 192 * @apic_post_init: adjust apic if needed
193 * @legacy: legacy features
194 * @set_legacy_features: override legacy features. Use of this callback
195 * is highly discouraged. You should only need
196 * this if your hardware platform requires further
 197 * custom fine tuning far beyond what may be
198 * possible in x86_early_init_platform_quirks() by
199 * only using the current x86_hardware_subarch
200 * semantics.
155 */ 201 */
156struct x86_platform_ops { 202struct x86_platform_ops {
157 unsigned long (*calibrate_tsc)(void); 203 unsigned long (*calibrate_tsc)(void);
@@ -165,6 +211,8 @@ struct x86_platform_ops {
165 void (*save_sched_clock_state)(void); 211 void (*save_sched_clock_state)(void);
166 void (*restore_sched_clock_state)(void); 212 void (*restore_sched_clock_state)(void);
167 void (*apic_post_init)(void); 213 void (*apic_post_init)(void);
214 struct x86_legacy_features legacy;
215 void (*set_legacy_features)(void);
168}; 216};
169 217
170struct pci_dev; 218struct pci_dev;
@@ -186,6 +234,8 @@ extern struct x86_cpuinit_ops x86_cpuinit;
186extern struct x86_platform_ops x86_platform; 234extern struct x86_platform_ops x86_platform;
187extern struct x86_msi_ops x86_msi; 235extern struct x86_msi_ops x86_msi;
188extern struct x86_io_apic_ops x86_io_apic_ops; 236extern struct x86_io_apic_ops x86_io_apic_ops;
237
238extern void x86_early_init_platform_quirks(void);
189extern void x86_init_noop(void); 239extern void x86_init_noop(void);
190extern void x86_init_uint_noop(unsigned int unused); 240extern void x86_init_uint_noop(unsigned int unused);
191 241
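Callers are expected to test these flags directly rather than infer from paravirt state. A sketch of the consumer pattern, mirroring the rtc.c change below (the probe function name is hypothetical):

/* Sketch: gate a legacy driver on the platform flag instead of
 * paravirt_enabled(); compare the add_rtc_cmos() change further below. */
static int legacy_rtc_probe(void)
{
        if (!x86_platform.legacy.rtc)
                return -ENODEV; /* platform declared no CMOS RTC */
        /* ... register the RTC platform device ... */
        return 0;
}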
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 329254373479..c18ce67495fa 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -157,7 +157,46 @@ struct boot_params {
157 __u8 _pad9[276]; /* 0xeec */ 157 __u8 _pad9[276]; /* 0xeec */
158} __attribute__((packed)); 158} __attribute__((packed));
159 159
160enum { 160/**
161 * enum x86_hardware_subarch - x86 hardware subarchitecture
162 *
163 * The x86 hardware_subarch and hardware_subarch_data were added as of the x86
164 * boot protocol 2.07 to help distinguish and support custom x86 boot
165 * sequences. This enum represents accepted values for the x86
166 * hardware_subarch. Custom x86 boot sequences (not X86_SUBARCH_PC) do not
 167 * have or simply *cannot* make use of natural stubs like BIOS or EFI; the
168 * hardware_subarch can be used on the Linux entry path to revector to a
169 * subarchitecture stub when needed. This subarchitecture stub can be used to
170 * set up Linux boot parameters or for special care to account for nonstandard
171 * handling of page tables.
172 *
173 * These enums should only ever be used by x86 code, and the code that uses
 174 * them should be well contained and compartmentalized.
175 *
176 * KVM and Xen HVM do not have a subarch as these are expected to follow
 177 * standard x86 boot entries. If there is a genuine need for a "hypervisor" type,
178 * that should be considered separately in the future. Future guest types
179 * should seriously consider working with standard x86 boot stubs such as
180 * the BIOS or EFI boot stubs.
181 *
182 * WARNING: this enum is only used for legacy hacks, for platform features that
183 * are not easily enumerated or discoverable. You should not ever use
184 * this for new features.
185 *
186 * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
187 * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
188 * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest
189 * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
190 * which start at asm startup_xen() entry point and later jump to the C
 191 * xen_start_kernel() entry point. Both domU and dom0 types of guests are
 192 * currently supported through this PV boot path.
193 * @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform
194 * systems which do not have the PCI legacy interfaces.
 195 * @X86_SUBARCH_CE4100: Used for the Intel CE media processor (CE4100) SoC
 196 * for set-top boxes and media devices; the use of a subarch for CE4100
197 * is more of a hack...
198 */
199enum x86_hardware_subarch {
161 X86_SUBARCH_PC = 0, 200 X86_SUBARCH_PC = 0,
162 X86_SUBARCH_LGUEST, 201 X86_SUBARCH_LGUEST,
163 X86_SUBARCH_XEN, 202 X86_SUBARCH_XEN,
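On the loader side, a custom boot stub advertises its entry semantics by filling in the header field. A hypothetical loader-side sketch (prepare_boot_params() is a stand-in, not a real API):

/* Hypothetical loader code: declare a Xen PV-style boot to the
 * kernel entry path (compare xen_start_kernel() below). */
struct boot_params *bp = prepare_boot_params();

bp->hdr.hardware_subarch = X86_SUBARCH_XEN;     /* PV-style entry   */
bp->hdr.hardware_subarch_data = 0;              /* no extra payload */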
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 616ebd22ef9a..9abf8551c7e4 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,7 +2,11 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds 5extra-y := head_$(BITS).o
6extra-y += head$(BITS).o
7extra-y += ebda.o
8extra-y += platform-quirks.o
9extra-y += vmlinux.lds
6 10
7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) 11CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
8 12
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2522e564269e..f115a58f7c84 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -913,6 +913,15 @@ late_initcall(hpet_insert_resource);
913 913
914static int __init acpi_parse_fadt(struct acpi_table_header *table) 914static int __init acpi_parse_fadt(struct acpi_table_header *table)
915{ 915{
916 if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) {
917 pr_debug("ACPI: no legacy devices present\n");
918 x86_platform.legacy.devices.pnpbios = 0;
919 }
920
921 if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
922 pr_debug("ACPI: not registering RTC platform device\n");
923 x86_platform.legacy.rtc = 0;
924 }
916 925
917#ifdef CONFIG_X86_PM_TIMER 926#ifdef CONFIG_X86_PM_TIMER
918 /* detect the location of the ACPI PM Timer */ 927 /* detect the location of the ACPI PM Timer */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 9307f182fe30..c7364bd633e1 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -2267,7 +2267,7 @@ static int __init apm_init(void)
2267 2267
2268 dmi_check_system(apm_dmi_table); 2268 dmi_check_system(apm_dmi_table);
2269 2269
2270 if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) { 2270 if (apm_info.bios.version == 0 || machine_is_olpc()) {
2271 printk(KERN_INFO "apm: BIOS not found.\n"); 2271 printk(KERN_INFO "apm: BIOS not found.\n");
2272 return -ENODEV; 2272 return -ENODEV;
2273 } 2273 }
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 5c042466f274..674134e9f5e5 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -80,6 +80,7 @@ void common(void) {
80 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 80 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
81 OFFSET(BP_version, boot_params, hdr.version); 81 OFFSET(BP_version, boot_params, hdr.version);
82 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); 82 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
83 OFFSET(BP_init_size, boot_params, hdr.init_size);
83 OFFSET(BP_pref_address, boot_params, hdr.pref_address); 84 OFFSET(BP_pref_address, boot_params, hdr.pref_address);
84 OFFSET(BP_code32_start, boot_params, hdr.code32_start); 85 OFFSET(BP_code32_start, boot_params, hdr.code32_start);
85 86
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b18f4706e607..8dae51fd3db1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -233,7 +233,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
233 * The Quark is also family 5, but does not have the same bug. 233 * The Quark is also family 5, but does not have the same bug.
234 */ 234 */
235 clear_cpu_bug(c, X86_BUG_F00F); 235 clear_cpu_bug(c, X86_BUG_F00F);
236 if (!paravirt_enabled() && c->x86 == 5 && c->x86_model < 9) { 236 if (c->x86 == 5 && c->x86_model < 9) {
237 static int f00f_workaround_enabled; 237 static int f00f_workaround_enabled;
238 238
239 set_cpu_bug(c, X86_BUG_F00F); 239 set_cpu_bug(c, X86_BUG_F00F);
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/ebda.c
index 992f442ca155..afe65dffee80 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/ebda.c
@@ -38,7 +38,7 @@ void __init reserve_ebda_region(void)
38 * that the paravirt case can handle memory setup 38 * that the paravirt case can handle memory setup
39 * correctly, without our help. 39 * correctly, without our help.
40 */ 40 */
41 if (paravirt_enabled()) 41 if (!x86_platform.legacy.ebda_search)
42 return; 42 return;
43 43
44 /* end of low (conventional) memory */ 44 /* end of low (conventional) memory */
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 2911ef3a9f1c..d784bb547a9d 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -34,6 +34,8 @@ asmlinkage __visible void __init i386_start_kernel(void)
34 cr4_init_shadow(); 34 cr4_init_shadow();
35 sanitize_boot_params(&boot_params); 35 sanitize_boot_params(&boot_params);
36 36
37 x86_early_init_platform_quirks();
38
37 /* Call the subarch specific early setup function */ 39 /* Call the subarch specific early setup function */
38 switch (boot_params.hdr.hardware_subarch) { 40 switch (boot_params.hdr.hardware_subarch) {
39 case X86_SUBARCH_INTEL_MID: 41 case X86_SUBARCH_INTEL_MID:
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 1f4422d5c8d0..b72fb0b71dd1 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -182,6 +182,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
182 if (!boot_params.hdr.version) 182 if (!boot_params.hdr.version)
183 copy_bootdata(__va(real_mode_data)); 183 copy_bootdata(__va(real_mode_data));
184 184
185 x86_early_init_platform_quirks();
185 reserve_ebda_region(); 186 reserve_ebda_region();
186 187
187 switch (boot_params.hdr.hardware_subarch) { 188 switch (boot_params.hdr.hardware_subarch) {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index dc1207e2f193..eea2a6f72b31 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -285,14 +285,6 @@ static void __init paravirt_ops_setup(void)
285{ 285{
286 pv_info.name = "KVM"; 286 pv_info.name = "KVM";
287 287
288 /*
289 * KVM isn't paravirt in the sense of paravirt_enabled. A KVM
290 * guest kernel works like a bare metal kernel with additional
291 * features, and paravirt_enabled is about features that are
292 * missing.
293 */
294 pv_info.paravirt_enabled = 0;
295
296 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) 288 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
297 pv_cpu_ops.io_delay = kvm_io_delay; 289 pv_cpu_ops.io_delay = kvm_io_delay;
298 290
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index f9583917c7c4..7b3b3f24c3ea 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -294,7 +294,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
294 294
295struct pv_info pv_info = { 295struct pv_info pv_info = {
296 .name = "bare hardware", 296 .name = "bare hardware",
297 .paravirt_enabled = 0,
298 .kernel_rpl = 0, 297 .kernel_rpl = 0,
299 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ 298 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
300 299
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
new file mode 100644
index 000000000000..b2f8a33b36ff
--- /dev/null
+++ b/arch/x86/kernel/platform-quirks.c
@@ -0,0 +1,35 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3
4#include <asm/setup.h>
5#include <asm/bios_ebda.h>
6
7void __init x86_early_init_platform_quirks(void)
8{
9 x86_platform.legacy.rtc = 1;
10 x86_platform.legacy.ebda_search = 0;
11 x86_platform.legacy.devices.pnpbios = 1;
12
13 switch (boot_params.hdr.hardware_subarch) {
14 case X86_SUBARCH_PC:
15 x86_platform.legacy.ebda_search = 1;
16 break;
17 case X86_SUBARCH_XEN:
18 case X86_SUBARCH_LGUEST:
19 case X86_SUBARCH_INTEL_MID:
20 case X86_SUBARCH_CE4100:
21 x86_platform.legacy.devices.pnpbios = 0;
22 x86_platform.legacy.rtc = 0;
23 break;
24 }
25
26 if (x86_platform.set_legacy_features)
27 x86_platform.set_legacy_features();
28}
29
30#if defined(CONFIG_PNPBIOS)
31bool __init arch_pnpbios_disabled(void)
32{
33 return x86_platform.legacy.devices.pnpbios == 0;
34}
35#endif
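The ordering this file relies on is that the quirk pass runs before any consumer reads the flags; a condensed sketch of the early 64-bit path from head64.c above (the function name is altered to mark it as a sketch):

/* Sketch, not the real function: the quirk pass must populate
 * x86_platform.legacy before reserve_ebda_region() consults it. */
static void __init start_reservations_sketch(void)
{
        x86_early_init_platform_quirks();       /* sets legacy.ebda_search */
        reserve_ebda_region();                  /* now safe to test it     */
}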
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 4af8d063fb36..eceaa082ec3f 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -14,6 +14,7 @@
14#include <asm/time.h> 14#include <asm/time.h>
15#include <asm/intel-mid.h> 15#include <asm/intel-mid.h>
16#include <asm/rtc.h> 16#include <asm/rtc.h>
17#include <asm/setup.h>
17 18
18#ifdef CONFIG_X86_32 19#ifdef CONFIG_X86_32
19/* 20/*
@@ -185,22 +186,7 @@ static __init int add_rtc_cmos(void)
185 } 186 }
186 } 187 }
187#endif 188#endif
188 if (of_have_populated_dt()) 189 if (!x86_platform.legacy.rtc)
189 return 0;
190
191 /* Intel MID platforms don't have ioport rtc */
192 if (intel_mid_identify_cpu())
193 return -ENODEV;
194
195#ifdef CONFIG_ACPI
196 if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
197 /* This warning can likely go away again in a year or two. */
198 pr_info("ACPI: not registering RTC platform device\n");
199 return -ENODEV;
200 }
201#endif
202
203 if (paravirt_enabled() && !paravirt_has(RTC))
204 return -ENODEV; 190 return -ENODEV;
205 191
206 platform_device_register(&rtc_device); 192 platform_device_register(&rtc_device);
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index e72a07f20b05..9b0185fbe3eb 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -74,12 +74,6 @@ void __init tboot_probe(void)
74 return; 74 return;
75 } 75 }
76 76
77 /* only a natively booted kernel should be using TXT */
78 if (paravirt_enabled()) {
79 pr_warning("non-0 tboot_addr but pv_ops is enabled\n");
80 return;
81 }
82
83 /* Map and check for tboot UUID. */ 77 /* Map and check for tboot UUID. */
84 set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); 78 set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
85 tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); 79 tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 4c941f88d405..9297a002d8e5 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -334,7 +334,7 @@ SECTIONS
334 __brk_limit = .; 334 __brk_limit = .;
335 } 335 }
336 336
337 . = ALIGN(PAGE_SIZE); 337 . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */
338 _end = .; 338 _end = .;
339 339
340 STABS_DEBUG 340 STABS_DEBUG
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index fd57d3ae7e16..3847e736702e 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1408,13 +1408,10 @@ __init void lguest_init(void)
1408{ 1408{
1409 /* We're under lguest. */ 1409 /* We're under lguest. */
1410 pv_info.name = "lguest"; 1410 pv_info.name = "lguest";
1411 /* Paravirt is enabled. */
1412 pv_info.paravirt_enabled = 1;
1413 /* We're running at privilege level 1, not 0 as normal. */ 1411 /* We're running at privilege level 1, not 0 as normal. */
1414 pv_info.kernel_rpl = 1; 1412 pv_info.kernel_rpl = 1;
1415 /* Everyone except Xen runs with this set. */ 1413 /* Everyone except Xen runs with this set. */
1416 pv_info.shared_kernel_pmd = 1; 1414 pv_info.shared_kernel_pmd = 1;
1417 pv_info.features = 0;
1418 1415
1419 /* 1416 /*
1420 * We set up all the lguest overrides for sensitive operations. These 1417 * We set up all the lguest overrides for sensitive operations. These
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
new file mode 100644
index 000000000000..ec21796ac5fd
--- /dev/null
+++ b/arch/x86/mm/ident_map.c
@@ -0,0 +1,79 @@
1/*
2 * Helper routines for building identity mapping page tables. This is
3 * included by both the compressed kernel and the regular kernel.
4 */
5
6static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
7 unsigned long addr, unsigned long end)
8{
9 addr &= PMD_MASK;
10 for (; addr < end; addr += PMD_SIZE) {
11 pmd_t *pmd = pmd_page + pmd_index(addr);
12
13 if (!pmd_present(*pmd))
14 set_pmd(pmd, __pmd(addr | pmd_flag));
15 }
16}
17
18static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
19 unsigned long addr, unsigned long end)
20{
21 unsigned long next;
22
23 for (; addr < end; addr = next) {
24 pud_t *pud = pud_page + pud_index(addr);
25 pmd_t *pmd;
26
27 next = (addr & PUD_MASK) + PUD_SIZE;
28 if (next > end)
29 next = end;
30
31 if (pud_present(*pud)) {
32 pmd = pmd_offset(pud, 0);
33 ident_pmd_init(info->pmd_flag, pmd, addr, next);
34 continue;
35 }
36 pmd = (pmd_t *)info->alloc_pgt_page(info->context);
37 if (!pmd)
38 return -ENOMEM;
39 ident_pmd_init(info->pmd_flag, pmd, addr, next);
40 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
41 }
42
43 return 0;
44}
45
46int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
47 unsigned long addr, unsigned long end)
48{
49 unsigned long next;
50 int result;
51 int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
52
53 for (; addr < end; addr = next) {
54 pgd_t *pgd = pgd_page + pgd_index(addr) + off;
55 pud_t *pud;
56
57 next = (addr & PGDIR_MASK) + PGDIR_SIZE;
58 if (next > end)
59 next = end;
60
61 if (pgd_present(*pgd)) {
62 pud = pud_offset(pgd, 0);
63 result = ident_pud_init(info, pud, addr, next);
64 if (result)
65 return result;
66 continue;
67 }
68
69 pud = (pud_t *)info->alloc_pgt_page(info->context);
70 if (!pud)
71 return -ENOMEM;
72 result = ident_pud_init(info, pud, addr, next);
73 if (result)
74 return result;
75 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
76 }
77
78 return 0;
79}
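A minimal caller sketch for kernel_ident_mapping_init(), reusing the allocator, context, and flags shown in pagetable.c above:

/* Sketch: identity-map [start, end) under the given top-level table. */
static int ident_map_range(pgd_t *pgd, unsigned long start, unsigned long end)
{
        struct x86_mapping_info info = {
                .alloc_pgt_page = alloc_pgt_page,       /* caller's allocator */
                .context        = &pgt_data,
                .pmd_flag       = __PAGE_KERNEL_LARGE_EXEC,
        };

        return kernel_ident_mapping_init(&info, pgd, start, end);
}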
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 85af914e3d27..84df150ee77e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -804,9 +804,6 @@ void __init mem_init(void)
804 BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); 804 BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
805#undef high_memory 805#undef high_memory
806#undef __FIXADDR_TOP 806#undef __FIXADDR_TOP
807#ifdef CONFIG_RANDOMIZE_BASE
808 BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE);
809#endif
810 807
811#ifdef CONFIG_HIGHMEM 808#ifdef CONFIG_HIGHMEM
812 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); 809 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 89d97477c1d9..bce2e5d9edd4 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -58,79 +58,7 @@
58 58
59#include "mm_internal.h" 59#include "mm_internal.h"
60 60
61static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, 61#include "ident_map.c"
62 unsigned long addr, unsigned long end)
63{
64 addr &= PMD_MASK;
65 for (; addr < end; addr += PMD_SIZE) {
66 pmd_t *pmd = pmd_page + pmd_index(addr);
67
68 if (!pmd_present(*pmd))
69 set_pmd(pmd, __pmd(addr | pmd_flag));
70 }
71}
72static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
73 unsigned long addr, unsigned long end)
74{
75 unsigned long next;
76
77 for (; addr < end; addr = next) {
78 pud_t *pud = pud_page + pud_index(addr);
79 pmd_t *pmd;
80
81 next = (addr & PUD_MASK) + PUD_SIZE;
82 if (next > end)
83 next = end;
84
85 if (pud_present(*pud)) {
86 pmd = pmd_offset(pud, 0);
87 ident_pmd_init(info->pmd_flag, pmd, addr, next);
88 continue;
89 }
90 pmd = (pmd_t *)info->alloc_pgt_page(info->context);
91 if (!pmd)
92 return -ENOMEM;
93 ident_pmd_init(info->pmd_flag, pmd, addr, next);
94 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
95 }
96
97 return 0;
98}
99
100int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
101 unsigned long addr, unsigned long end)
102{
103 unsigned long next;
104 int result;
105 int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
106
107 for (; addr < end; addr = next) {
108 pgd_t *pgd = pgd_page + pgd_index(addr) + off;
109 pud_t *pud;
110
111 next = (addr & PGDIR_MASK) + PGDIR_SIZE;
112 if (next > end)
113 next = end;
114
115 if (pgd_present(*pgd)) {
116 pud = pud_offset(pgd, 0);
117 result = ident_pud_init(info, pud, addr, next);
118 if (result)
119 return result;
120 continue;
121 }
122
123 pud = (pud_t *)info->alloc_pgt_page(info->context);
124 if (!pud)
125 return -ENOMEM;
126 result = ident_pud_init(info, pud, addr, next);
127 if (result)
128 return result;
129 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
130 }
131
132 return 0;
133}
134 62
135/* 63/*
136 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the 64 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
diff --git a/arch/x86/tools/calc_run_size.sh b/arch/x86/tools/calc_run_size.sh
deleted file mode 100644
index 1a4c17bb3910..000000000000
--- a/arch/x86/tools/calc_run_size.sh
+++ /dev/null
@@ -1,42 +0,0 @@
1#!/bin/sh
2#
3# Calculate the amount of space needed to run the kernel, including room for
4# the .bss and .brk sections.
5#
6# Usage:
7# objdump -h a.out | sh calc_run_size.sh
8
9NUM='\([0-9a-fA-F]*[ \t]*\)'
10OUT=$(sed -n 's/^[ \t0-9]*.b[sr][sk][ \t]*'"$NUM$NUM$NUM$NUM"'.*/\1\4/p')
11if [ -z "$OUT" ] ; then
12 echo "Never found .bss or .brk file offset" >&2
13 exit 1
14fi
15
16OUT=$(echo ${OUT# })
17sizeA=$(printf "%d" 0x${OUT%% *})
18OUT=${OUT#* }
19offsetA=$(printf "%d" 0x${OUT%% *})
20OUT=${OUT#* }
21sizeB=$(printf "%d" 0x${OUT%% *})
22OUT=${OUT#* }
23offsetB=$(printf "%d" 0x${OUT%% *})
24
25run_size=$(( $offsetA + $sizeA + $sizeB ))
26
27# BFD linker shows the same file offset in ELF.
28if [ "$offsetA" -ne "$offsetB" ] ; then
29 # Gold linker shows them as consecutive.
30 endB=$(( $offsetB + $sizeB ))
31 if [ "$endB" != "$run_size" ] ; then
32 printf "sizeA: 0x%x\n" $sizeA >&2
33 printf "offsetA: 0x%x\n" $offsetA >&2
34 printf "sizeB: 0x%x\n" $sizeB >&2
35 printf "offsetB: 0x%x\n" $offsetB >&2
36 echo ".bss and .brk are non-contiguous" >&2
37 exit 1
38 fi
39fi
40
41printf "%d\n" $run_size
42exit 0
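What the deleted script computed was simple file-offset arithmetic over .bss and .brk. Restated in C with hypothetical section values:

/* Hypothetical section layout (BFD case, where .bss and .brk report the
 * same file offset): run_size = offset(.bss) + size(.bss) + size(.brk). */
unsigned long bss_off  = 0x1400000;
unsigned long bss_size = 0x150000;
unsigned long brk_size = 0x6000;
unsigned long run_size = bss_off + bss_size + brk_size; /* 0x1556000 */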
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 6ab672233ac9..760789ae8562 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1206,13 +1206,11 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
1206} 1206}
1207 1207
1208static const struct pv_info xen_info __initconst = { 1208static const struct pv_info xen_info __initconst = {
1209 .paravirt_enabled = 1,
1210 .shared_kernel_pmd = 0, 1209 .shared_kernel_pmd = 0,
1211 1210
1212#ifdef CONFIG_X86_64 1211#ifdef CONFIG_X86_64
1213 .extra_user_64bit_cs = FLAT_USER_CS64, 1212 .extra_user_64bit_cs = FLAT_USER_CS64,
1214#endif 1213#endif
1215 .features = 0,
1216 .name = "Xen", 1214 .name = "Xen",
1217}; 1215};
1218 1216
@@ -1528,6 +1526,11 @@ static void __init xen_pvh_early_guest_init(void)
1528} 1526}
1529#endif /* CONFIG_XEN_PVH */ 1527#endif /* CONFIG_XEN_PVH */
1530 1528
1529static void __init xen_dom0_set_legacy_features(void)
1530{
1531 x86_platform.legacy.rtc = 1;
1532}
1533
1531/* First C function to be called on Xen boot */ 1534/* First C function to be called on Xen boot */
1532asmlinkage __visible void __init xen_start_kernel(void) 1535asmlinkage __visible void __init xen_start_kernel(void)
1533{ 1536{
@@ -1548,8 +1551,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
1548 1551
1549 /* Install Xen paravirt ops */ 1552 /* Install Xen paravirt ops */
1550 pv_info = xen_info; 1553 pv_info = xen_info;
1551 if (xen_initial_domain())
1552 pv_info.features |= PV_SUPPORTED_RTC;
1553 pv_init_ops = xen_init_ops; 1554 pv_init_ops = xen_init_ops;
1554 if (!xen_pvh_domain()) { 1555 if (!xen_pvh_domain()) {
1555 pv_cpu_ops = xen_cpu_ops; 1556 pv_cpu_ops = xen_cpu_ops;
@@ -1684,6 +1685,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
1684 boot_params.hdr.ramdisk_image = initrd_start; 1685 boot_params.hdr.ramdisk_image = initrd_start;
1685 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1686 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1686 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); 1687 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1688 boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;
1687 1689
1688 if (!xen_initial_domain()) { 1690 if (!xen_initial_domain()) {
1689 add_preferred_console("xenboot", 0, NULL); 1691 add_preferred_console("xenboot", 0, NULL);
@@ -1701,6 +1703,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
1701 .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, 1703 .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
1702 }; 1704 };
1703 1705
1706 x86_platform.set_legacy_features =
1707 xen_dom0_set_legacy_features;
1704 xen_init_vga(info, xen_start_info->console.dom0.info_size); 1708 xen_init_vga(info, xen_start_info->console.dom0.info_size);
1705 xen_start_info->console.domU.mfn = 0; 1709 xen_start_info->console.domU.mfn = 0;
1706 xen_start_info->console.domU.evtchn = 0; 1710 xen_start_info->console.domU.evtchn = 0;