aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-08-12 17:31:10 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-08-12 17:31:10 -0400
commit01ea443982203fcdee1250ab630ab6516f0a16e6 (patch)
tree4bb26d3417faebe6ccdad7021184b0c1d0ccb3fb
parent3bc6d8c155fbbbe789b6caa44b9e658a5b2995d3 (diff)
parentd52c0569bab4edc888832df44dc7ac28517134f6 (diff)
Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 fixes from Ingo Molnar: "This is bigger than usual - the reason is partly a pent-up stream of fixes after the merge window and partly accidental. The fixes are: - five patches to fix a boot failure on Andy Lutomirsky's laptop - four SGI UV platform fixes - KASAN fix - warning fix - documentation update - swap entry definition fix - pkeys fix - irq stats fix" * 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/apic/x2apic, smp/hotplug: Don't use before alloc in x2apic_cluster_probe() x86/efi: Allocate a trampoline if needed in efi_free_boot_services() x86/boot: Rework reserve_real_mode() to allow multiple tries x86/boot: Defer setup_real_mode() to early_initcall time x86/boot: Synchronize trampoline_cr4_features and mmu_cr4_features directly x86/boot: Run reserve_bios_regions() after we initialize the memory map x86/irq: Do not substract irq_tlb_count from irq_call_count x86/mm: Fix swap entry comment and macro x86/mm/kaslr: Fix -Wformat-security warning x86/mm/pkeys: Fix compact mode by removing protection keys' XSAVE buffer manipulation x86/build: Reduce the W=1 warnings noise when compiling x86 syscall tables x86/platform/UV: Fix kernel panic running RHEL kdump kernel on UV systems x86/platform/UV: Fix problem with UV4 BIOS providing incorrect PXM values x86/platform/UV: Fix bug with iounmap() of the UV4 EFI System Table causing a crash x86/platform/UV: Fix problem with UV4 Socket IDs not being contiguous x86/entry: Clarify the RF saving/restoring situation with SYSCALL/SYSRET x86/mm: Disable preemption during CR3 read+write x86/mm/KASLR: Increase BRK pages for KASLR memory randomization x86/mm/KASLR: Fix physical memory calculation on KASLR memory randomization x86, kasan, ftrace: Put APIC interrupt handlers into .irqentry.text
-rw-r--r--arch/x86/entry/Makefile2
-rw-r--r--arch/x86/entry/entry_64.S25
-rw-r--r--arch/x86/include/asm/hardirq.h4
-rw-r--r--arch/x86/include/asm/pgtable_64.h4
-rw-r--r--arch/x86/include/asm/realmode.h10
-rw-r--r--arch/x86/include/asm/tlbflush.h7
-rw-r--r--arch/x86/include/asm/uv/bios.h5
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c13
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c42
-rw-r--r--arch/x86/kernel/fpu/xstate.c138
-rw-r--r--arch/x86/kernel/head32.c2
-rw-r--r--arch/x86/kernel/head64.c1
-rw-r--r--arch/x86/kernel/irq.c3
-rw-r--r--arch/x86/kernel/setup.c27
-rw-r--r--arch/x86/lib/kaslr.c2
-rw-r--r--arch/x86/mm/init.c14
-rw-r--r--arch/x86/mm/kaslr.c2
-rw-r--r--arch/x86/platform/efi/quirks.c21
-rw-r--r--arch/x86/platform/uv/bios_uv.c8
-rw-r--r--arch/x86/realmode/init.c47
20 files changed, 182 insertions, 195 deletions
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index fe91c25092da..77f28ce9c646 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -5,6 +5,8 @@
5OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y 5OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y
6OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y 6OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
7 7
8CFLAGS_syscall_64.o += -Wno-override-init
9CFLAGS_syscall_32.o += -Wno-override-init
8obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o 10obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
9obj-y += common.o 11obj-y += common.o
10 12
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index b846875aeea6..d172c619c449 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -288,11 +288,15 @@ return_from_SYSCALL_64:
288 jne opportunistic_sysret_failed 288 jne opportunistic_sysret_failed
289 289
290 /* 290 /*
291 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, 291 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
292 * restoring TF results in a trap from userspace immediately after 292 * restore RF properly. If the slowpath sets it for whatever reason, we
293 * SYSRET. This would cause an infinite loop whenever #DB happens 293 * need to restore it correctly.
294 * with register state that satisfies the opportunistic SYSRET 294 *
295 * conditions. For example, single-stepping this user code: 295 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
296 * trap from userspace immediately after SYSRET. This would cause an
297 * infinite loop whenever #DB happens with register state that satisfies
298 * the opportunistic SYSRET conditions. For example, single-stepping
299 * this user code:
296 * 300 *
297 * movq $stuck_here, %rcx 301 * movq $stuck_here, %rcx
298 * pushfq 302 * pushfq
@@ -601,9 +605,20 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym)
601.endm 605.endm
602#endif 606#endif
603 607
608/* Make sure APIC interrupt handlers end up in the irqentry section: */
609#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
610# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
611# define POP_SECTION_IRQENTRY .popsection
612#else
613# define PUSH_SECTION_IRQENTRY
614# define POP_SECTION_IRQENTRY
615#endif
616
604.macro apicinterrupt num sym do_sym 617.macro apicinterrupt num sym do_sym
618PUSH_SECTION_IRQENTRY
605apicinterrupt3 \num \sym \do_sym 619apicinterrupt3 \num \sym \do_sym
606trace_apicinterrupt \num \sym 620trace_apicinterrupt \num \sym
621POP_SECTION_IRQENTRY
607.endm 622.endm
608 623
609#ifdef CONFIG_SMP 624#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 7178043b0e1d..59405a248fc2 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,10 +22,6 @@ typedef struct {
22#ifdef CONFIG_SMP 22#ifdef CONFIG_SMP
23 unsigned int irq_resched_count; 23 unsigned int irq_resched_count;
24 unsigned int irq_call_count; 24 unsigned int irq_call_count;
25 /*
26 * irq_tlb_count is double-counted in irq_call_count, so it must be
27 * subtracted from irq_call_count when displaying irq_call_count
28 */
29 unsigned int irq_tlb_count; 25 unsigned int irq_tlb_count;
30#endif 26#endif
31#ifdef CONFIG_X86_THERMAL_VECTOR 27#ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 7e8ec7ae10fa..1cc82ece9ac1 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -145,7 +145,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
145 * 145 *
146 * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number 146 * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number
147 * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names 147 * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names
148 * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry 148 * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry
149 * 149 *
150 * G (8) is aliased and used as a PROT_NONE indicator for 150 * G (8) is aliased and used as a PROT_NONE indicator for
151 * !present ptes. We need to start storing swap entries above 151 * !present ptes. We need to start storing swap entries above
@@ -156,7 +156,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
156#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) 156#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
157#define SWP_TYPE_BITS 5 157#define SWP_TYPE_BITS 5
158/* Place the offset above the type: */ 158/* Place the offset above the type: */
159#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1) 159#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS)
160 160
161#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) 161#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
162 162
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 9c6b890d5e7a..b2988c0ed829 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -58,7 +58,15 @@ extern unsigned char boot_gdt[];
58extern unsigned char secondary_startup_64[]; 58extern unsigned char secondary_startup_64[];
59#endif 59#endif
60 60
61static inline size_t real_mode_size_needed(void)
62{
63 if (real_mode_header)
64 return 0; /* already allocated. */
65
66 return ALIGN(real_mode_blob_end - real_mode_blob, PAGE_SIZE);
67}
68
69void set_real_mode_mem(phys_addr_t mem, size_t size);
61void reserve_real_mode(void); 70void reserve_real_mode(void);
62void setup_real_mode(void);
63 71
64#endif /* _ARCH_X86_REALMODE_H */ 72#endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4e5be94e079a..6fa85944af83 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -135,7 +135,14 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
135 135
136static inline void __native_flush_tlb(void) 136static inline void __native_flush_tlb(void)
137{ 137{
138 /*
139 * If current->mm == NULL then we borrow a mm which may change during a
140 * task switch and therefore we must not be preempted while we write CR3
141 * back:
142 */
143 preempt_disable();
138 native_write_cr3(native_read_cr3()); 144 native_write_cr3(native_read_cr3());
145 preempt_enable();
139} 146}
140 147
141static inline void __native_flush_tlb_global_irq_disabled(void) 148static inline void __native_flush_tlb_global_irq_disabled(void)
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index c852590254d5..e652a7cc6186 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -79,7 +79,7 @@ struct uv_gam_range_entry {
79 u16 nasid; /* HNasid */ 79 u16 nasid; /* HNasid */
80 u16 sockid; /* Socket ID, high bits of APIC ID */ 80 u16 sockid; /* Socket ID, high bits of APIC ID */
81 u16 pnode; /* Index to MMR and GRU spaces */ 81 u16 pnode; /* Index to MMR and GRU spaces */
82 u32 pxm; /* ACPI proximity domain number */ 82 u32 unused2;
83 u32 limit; /* PA bits 56:26 (UV_GAM_RANGE_SHFT) */ 83 u32 limit; /* PA bits 56:26 (UV_GAM_RANGE_SHFT) */
84}; 84};
85 85
@@ -88,7 +88,8 @@ struct uv_gam_range_entry {
88#define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */ 88#define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */
89#define UV_SYSTAB_VERSION_UV4_1 0x401 /* + gpa_shift */ 89#define UV_SYSTAB_VERSION_UV4_1 0x401 /* + gpa_shift */
90#define UV_SYSTAB_VERSION_UV4_2 0x402 /* + TYPE_NVRAM/WINDOW/MBOX */ 90#define UV_SYSTAB_VERSION_UV4_2 0x402 /* + TYPE_NVRAM/WINDOW/MBOX */
91#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_2 91#define UV_SYSTAB_VERSION_UV4_3 0x403 /* - GAM Range PXM Value */
92#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_3
92 93
93#define UV_SYSTAB_TYPE_UNUSED 0 /* End of table (offset == 0) */ 94#define UV_SYSTAB_TYPE_UNUSED 0 /* End of table (offset == 0) */
94#define UV_SYSTAB_TYPE_GAM_PARAMS 1 /* GAM PARAM conversions */ 95#define UV_SYSTAB_TYPE_GAM_PARAMS 1 /* GAM PARAM conversions */
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 6368fa69d2af..54f35d988025 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -155,7 +155,7 @@ static void init_x2apic_ldr(void)
155/* 155/*
156 * At CPU state changes, update the x2apic cluster sibling info. 156 * At CPU state changes, update the x2apic cluster sibling info.
157 */ 157 */
158int x2apic_prepare_cpu(unsigned int cpu) 158static int x2apic_prepare_cpu(unsigned int cpu)
159{ 159{
160 if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL)) 160 if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL))
161 return -ENOMEM; 161 return -ENOMEM;
@@ -168,7 +168,7 @@ int x2apic_prepare_cpu(unsigned int cpu)
168 return 0; 168 return 0;
169} 169}
170 170
171int x2apic_dead_cpu(unsigned int this_cpu) 171static int x2apic_dead_cpu(unsigned int this_cpu)
172{ 172{
173 int cpu; 173 int cpu;
174 174
@@ -186,13 +186,18 @@ int x2apic_dead_cpu(unsigned int this_cpu)
186static int x2apic_cluster_probe(void) 186static int x2apic_cluster_probe(void)
187{ 187{
188 int cpu = smp_processor_id(); 188 int cpu = smp_processor_id();
189 int ret;
189 190
190 if (!x2apic_mode) 191 if (!x2apic_mode)
191 return 0; 192 return 0;
192 193
194 ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE",
195 x2apic_prepare_cpu, x2apic_dead_cpu);
196 if (ret < 0) {
197 pr_err("Failed to register X2APIC_PREPARE\n");
198 return 0;
199 }
193 cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); 200 cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu));
194 cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE",
195 x2apic_prepare_cpu, x2apic_dead_cpu);
196 return 1; 201 return 1;
197} 202}
198 203
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 09b59adaea3f..cb0673c1e940 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -223,6 +223,11 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
223 if (strncmp(oem_id, "SGI", 3) != 0) 223 if (strncmp(oem_id, "SGI", 3) != 0)
224 return 0; 224 return 0;
225 225
226 if (numa_off) {
227 pr_err("UV: NUMA is off, disabling UV support\n");
228 return 0;
229 }
230
226 /* Setup early hub type field in uv_hub_info for Node 0 */ 231 /* Setup early hub type field in uv_hub_info for Node 0 */
227 uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; 232 uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
228 233
@@ -325,7 +330,7 @@ static __init void build_uv_gr_table(void)
325 struct uv_gam_range_entry *gre = uv_gre_table; 330 struct uv_gam_range_entry *gre = uv_gre_table;
326 struct uv_gam_range_s *grt; 331 struct uv_gam_range_s *grt;
327 unsigned long last_limit = 0, ram_limit = 0; 332 unsigned long last_limit = 0, ram_limit = 0;
328 int bytes, i, sid, lsid = -1; 333 int bytes, i, sid, lsid = -1, indx = 0, lindx = -1;
329 334
330 if (!gre) 335 if (!gre)
331 return; 336 return;
@@ -356,11 +361,12 @@ static __init void build_uv_gr_table(void)
356 } 361 }
357 sid = gre->sockid - _min_socket; 362 sid = gre->sockid - _min_socket;
358 if (lsid < sid) { /* new range */ 363 if (lsid < sid) { /* new range */
359 grt = &_gr_table[sid]; 364 grt = &_gr_table[indx];
360 grt->base = lsid; 365 grt->base = lindx;
361 grt->nasid = gre->nasid; 366 grt->nasid = gre->nasid;
362 grt->limit = last_limit = gre->limit; 367 grt->limit = last_limit = gre->limit;
363 lsid = sid; 368 lsid = sid;
369 lindx = indx++;
364 continue; 370 continue;
365 } 371 }
366 if (lsid == sid && !ram_limit) { /* update range */ 372 if (lsid == sid && !ram_limit) { /* update range */
@@ -371,7 +377,7 @@ static __init void build_uv_gr_table(void)
371 } 377 }
372 if (!ram_limit) { /* non-contiguous ram range */ 378 if (!ram_limit) { /* non-contiguous ram range */
373 grt++; 379 grt++;
374 grt->base = sid - 1; 380 grt->base = lindx;
375 grt->nasid = gre->nasid; 381 grt->nasid = gre->nasid;
376 grt->limit = last_limit = gre->limit; 382 grt->limit = last_limit = gre->limit;
377 continue; 383 continue;
@@ -1155,19 +1161,18 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
1155 for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { 1161 for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
1156 if (!index) { 1162 if (!index) {
1157 pr_info("UV: GAM Range Table...\n"); 1163 pr_info("UV: GAM Range Table...\n");
1158 pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s %3s\n", 1164 pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n",
1159 "Range", "", "Size", "Type", "NASID", 1165 "Range", "", "Size", "Type", "NASID",
1160 "SID", "PN", "PXM"); 1166 "SID", "PN");
1161 } 1167 }
1162 pr_info( 1168 pr_info(
1163 "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x %3d\n", 1169 "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n",
1164 index++, 1170 index++,
1165 (unsigned long)lgre << UV_GAM_RANGE_SHFT, 1171 (unsigned long)lgre << UV_GAM_RANGE_SHFT,
1166 (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, 1172 (unsigned long)gre->limit << UV_GAM_RANGE_SHFT,
1167 ((unsigned long)(gre->limit - lgre)) >> 1173 ((unsigned long)(gre->limit - lgre)) >>
1168 (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */ 1174 (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */
1169 gre->type, gre->nasid, gre->sockid, 1175 gre->type, gre->nasid, gre->sockid, gre->pnode);
1170 gre->pnode, gre->pxm);
1171 1176
1172 lgre = gre->limit; 1177 lgre = gre->limit;
1173 if (sock_min > gre->sockid) 1178 if (sock_min > gre->sockid)
@@ -1286,7 +1291,7 @@ static void __init build_socket_tables(void)
1286 _pnode_to_socket[i] = SOCK_EMPTY; 1291 _pnode_to_socket[i] = SOCK_EMPTY;
1287 1292
1288 /* fill in pnode/node/addr conversion list values */ 1293 /* fill in pnode/node/addr conversion list values */
1289 pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n"); 1294 pr_info("UV: GAM Building socket/pnode conversion tables\n");
1290 for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { 1295 for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
1291 if (gre->type == UV_GAM_RANGE_TYPE_HOLE) 1296 if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
1292 continue; 1297 continue;
@@ -1294,20 +1299,18 @@ static void __init build_socket_tables(void)
1294 if (_socket_to_pnode[i] != SOCK_EMPTY) 1299 if (_socket_to_pnode[i] != SOCK_EMPTY)
1295 continue; /* duplicate */ 1300 continue; /* duplicate */
1296 _socket_to_pnode[i] = gre->pnode; 1301 _socket_to_pnode[i] = gre->pnode;
1297 _socket_to_node[i] = gre->pxm;
1298 1302
1299 i = gre->pnode - minpnode; 1303 i = gre->pnode - minpnode;
1300 _pnode_to_socket[i] = gre->sockid; 1304 _pnode_to_socket[i] = gre->sockid;
1301 1305
1302 pr_info( 1306 pr_info(
1303 "UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n", 1307 "UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
1304 gre->sockid, gre->type, gre->nasid, 1308 gre->sockid, gre->type, gre->nasid,
1305 _socket_to_pnode[gre->sockid - minsock], 1309 _socket_to_pnode[gre->sockid - minsock],
1306 _socket_to_node[gre->sockid - minsock],
1307 _pnode_to_socket[gre->pnode - minpnode]); 1310 _pnode_to_socket[gre->pnode - minpnode]);
1308 } 1311 }
1309 1312
1310 /* check socket -> node values */ 1313 /* Set socket -> node values */
1311 lnid = -1; 1314 lnid = -1;
1312 for_each_present_cpu(cpu) { 1315 for_each_present_cpu(cpu) {
1313 int nid = cpu_to_node(cpu); 1316 int nid = cpu_to_node(cpu);
@@ -1318,14 +1321,9 @@ static void __init build_socket_tables(void)
1318 lnid = nid; 1321 lnid = nid;
1319 apicid = per_cpu(x86_cpu_to_apicid, cpu); 1322 apicid = per_cpu(x86_cpu_to_apicid, cpu);
1320 sockid = apicid >> uv_cpuid.socketid_shift; 1323 sockid = apicid >> uv_cpuid.socketid_shift;
1321 i = sockid - minsock; 1324 _socket_to_node[sockid - minsock] = nid;
1322 1325 pr_info("UV: sid:%02x: apicid:%04x node:%2d\n",
1323 if (nid != _socket_to_node[i]) { 1326 sockid, apicid, nid);
1324 pr_warn(
1325 "UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n",
1326 i, sockid, gre->type, _socket_to_node[i], nid);
1327 _socket_to_node[i] = nid;
1328 }
1329 } 1327 }
1330 1328
1331 /* Setup physical blade to pnode translation from GAM Range Table */ 1329 /* Setup physical blade to pnode translation from GAM Range Table */
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 680049aa4593..01567aa87503 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -866,105 +866,17 @@ const void *get_xsave_field_ptr(int xsave_state)
866 return get_xsave_addr(&fpu->state.xsave, xsave_state); 866 return get_xsave_addr(&fpu->state.xsave, xsave_state);
867} 867}
868 868
869
870/*
871 * Set xfeatures (aka XSTATE_BV) bit for a feature that we want
872 * to take out of its "init state". This will ensure that an
873 * XRSTOR actually restores the state.
874 */
875static void fpu__xfeature_set_non_init(struct xregs_state *xsave,
876 int xstate_feature_mask)
877{
878 xsave->header.xfeatures |= xstate_feature_mask;
879}
880
881/*
882 * This function is safe to call whether the FPU is in use or not.
883 *
884 * Note that this only works on the current task.
885 *
886 * Inputs:
887 * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
888 * XFEATURE_MASK_SSE, etc...)
889 * @xsave_state_ptr: a pointer to a copy of the state that you would
890 * like written in to the current task's FPU xsave state. This pointer
891 * must not be located in the current tasks's xsave area.
892 * Output:
893 * address of the state in the xsave area or NULL if the state
894 * is not present or is in its 'init state'.
895 */
896static void fpu__xfeature_set_state(int xstate_feature_mask,
897 void *xstate_feature_src, size_t len)
898{
899 struct xregs_state *xsave = &current->thread.fpu.state.xsave;
900 struct fpu *fpu = &current->thread.fpu;
901 void *dst;
902
903 if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
904 WARN_ONCE(1, "%s() attempted with no xsave support", __func__);
905 return;
906 }
907
908 /*
909 * Tell the FPU code that we need the FPU state to be in
910 * 'fpu' (not in the registers), and that we need it to
911 * be stable while we write to it.
912 */
913 fpu__current_fpstate_write_begin();
914
915 /*
916 * This method *WILL* *NOT* work for compact-format
917 * buffers. If the 'xstate_feature_mask' is unset in
918 * xcomp_bv then we may need to move other feature state
919 * "up" in the buffer.
920 */
921 if (xsave->header.xcomp_bv & xstate_feature_mask) {
922 WARN_ON_ONCE(1);
923 goto out;
924 }
925
926 /* find the location in the xsave buffer of the desired state */
927 dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask);
928
929 /*
930 * Make sure that the pointer being passed in did not
931 * come from the xsave buffer itself.
932 */
933 WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself");
934
935 /* put the caller-provided data in the location */
936 memcpy(dst, xstate_feature_src, len);
937
938 /*
939 * Mark the xfeature so that the CPU knows there is state
940 * in the buffer now.
941 */
942 fpu__xfeature_set_non_init(xsave, xstate_feature_mask);
943out:
944 /*
945 * We are done writing to the 'fpu'. Reenable preeption
946 * and (possibly) move the fpstate back in to the fpregs.
947 */
948 fpu__current_fpstate_write_end();
949}
950
951#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) 869#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
952#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) 870#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
953 871
954/* 872/*
955 * This will go out and modify the XSAVE buffer so that PKRU is 873 * This will go out and modify PKRU register to set the access
956 * set to a particular state for access to 'pkey'. 874 * rights for @pkey to @init_val.
957 *
958 * PKRU state does affect kernel access to user memory. We do
959 * not modfiy PKRU *itself* here, only the XSAVE state that will
960 * be restored in to PKRU when we return back to userspace.
961 */ 875 */
962int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, 876int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
963 unsigned long init_val) 877 unsigned long init_val)
964{ 878{
965 struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; 879 u32 old_pkru;
966 struct pkru_state *old_pkru_state;
967 struct pkru_state new_pkru_state;
968 int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); 880 int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
969 u32 new_pkru_bits = 0; 881 u32 new_pkru_bits = 0;
970 882
@@ -974,6 +886,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
974 */ 886 */
975 if (!boot_cpu_has(X86_FEATURE_OSPKE)) 887 if (!boot_cpu_has(X86_FEATURE_OSPKE))
976 return -EINVAL; 888 return -EINVAL;
889 /*
890 * For most XSAVE components, this would be an arduous task:
891 * brining fpstate up to date with fpregs, updating fpstate,
892 * then re-populating fpregs. But, for components that are
893 * never lazily managed, we can just access the fpregs
894 * directly. PKRU is never managed lazily, so we can just
895 * manipulate it directly. Make sure it stays that way.
896 */
897 WARN_ON_ONCE(!use_eager_fpu());
977 898
978 /* Set the bits we need in PKRU: */ 899 /* Set the bits we need in PKRU: */
979 if (init_val & PKEY_DISABLE_ACCESS) 900 if (init_val & PKEY_DISABLE_ACCESS)
@@ -984,37 +905,12 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
984 /* Shift the bits in to the correct place in PKRU for pkey: */ 905 /* Shift the bits in to the correct place in PKRU for pkey: */
985 new_pkru_bits <<= pkey_shift; 906 new_pkru_bits <<= pkey_shift;
986 907
987 /* Locate old copy of the state in the xsave buffer: */ 908 /* Get old PKRU and mask off any old bits in place: */
988 old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU); 909 old_pkru = read_pkru();
989 910 old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
990 /*
991 * When state is not in the buffer, it is in the init
992 * state, set it manually. Otherwise, copy out the old
993 * state.
994 */
995 if (!old_pkru_state)
996 new_pkru_state.pkru = 0;
997 else
998 new_pkru_state.pkru = old_pkru_state->pkru;
999
1000 /* Mask off any old bits in place: */
1001 new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
1002
1003 /* Set the newly-requested bits: */
1004 new_pkru_state.pkru |= new_pkru_bits;
1005
1006 /*
1007 * We could theoretically live without zeroing pkru.pad.
1008 * The current XSAVE feature state definition says that
1009 * only bytes 0->3 are used. But we do not want to
1010 * chance leaking kernel stack out to userspace in case a
1011 * memcpy() of the whole xsave buffer was done.
1012 *
1013 * They're in the same cacheline anyway.
1014 */
1015 new_pkru_state.pad = 0;
1016 911
1017 fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, sizeof(new_pkru_state)); 912 /* Write old part along with new part: */
913 write_pkru(old_pkru | new_pkru_bits);
1018 914
1019 return 0; 915 return 0;
1020} 916}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 2dda0bc4576e..f16c55bfc090 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -25,8 +25,6 @@ static void __init i386_default_early_setup(void)
25 /* Initialize 32bit specific setup functions */ 25 /* Initialize 32bit specific setup functions */
26 x86_init.resources.reserve_resources = i386_reserve_resources; 26 x86_init.resources.reserve_resources = i386_reserve_resources;
27 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; 27 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
28
29 reserve_bios_regions();
30} 28}
31 29
32asmlinkage __visible void __init i386_start_kernel(void) 30asmlinkage __visible void __init i386_start_kernel(void)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 99d48e7d2974..54a2372f5dbb 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -183,7 +183,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
183 copy_bootdata(__va(real_mode_data)); 183 copy_bootdata(__va(real_mode_data));
184 184
185 x86_early_init_platform_quirks(); 185 x86_early_init_platform_quirks();
186 reserve_bios_regions();
187 186
188 switch (boot_params.hdr.hardware_subarch) { 187 switch (boot_params.hdr.hardware_subarch) {
189 case X86_SUBARCH_INTEL_MID: 188 case X86_SUBARCH_INTEL_MID:
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 61521dc19c10..9f669fdd2010 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -102,8 +102,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
102 seq_puts(p, " Rescheduling interrupts\n"); 102 seq_puts(p, " Rescheduling interrupts\n");
103 seq_printf(p, "%*s: ", prec, "CAL"); 103 seq_printf(p, "%*s: ", prec, "CAL");
104 for_each_online_cpu(j) 104 for_each_online_cpu(j)
105 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - 105 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
106 irq_stats(j)->irq_tlb_count);
107 seq_puts(p, " Function call interrupts\n"); 106 seq_puts(p, " Function call interrupts\n");
108 seq_printf(p, "%*s: ", prec, "TLB"); 107 seq_printf(p, "%*s: ", prec, "TLB");
109 for_each_online_cpu(j) 108 for_each_online_cpu(j)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 991b77986d57..0fa60f5f5a16 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -936,8 +936,6 @@ void __init setup_arch(char **cmdline_p)
936 936
937 x86_init.oem.arch_setup(); 937 x86_init.oem.arch_setup();
938 938
939 kernel_randomize_memory();
940
941 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; 939 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
942 setup_memory_map(); 940 setup_memory_map();
943 parse_setup_data(); 941 parse_setup_data();
@@ -1055,6 +1053,12 @@ void __init setup_arch(char **cmdline_p)
1055 1053
1056 max_possible_pfn = max_pfn; 1054 max_possible_pfn = max_pfn;
1057 1055
1056 /*
1057 * Define random base addresses for memory sections after max_pfn is
1058 * defined and before each memory section base is used.
1059 */
1060 kernel_randomize_memory();
1061
1058#ifdef CONFIG_X86_32 1062#ifdef CONFIG_X86_32
1059 /* max_low_pfn get updated here */ 1063 /* max_low_pfn get updated here */
1060 find_low_pfn_range(); 1064 find_low_pfn_range();
@@ -1097,6 +1101,8 @@ void __init setup_arch(char **cmdline_p)
1097 efi_find_mirror(); 1101 efi_find_mirror();
1098 } 1102 }
1099 1103
1104 reserve_bios_regions();
1105
1100 /* 1106 /*
1101 * The EFI specification says that boot service code won't be called 1107 * The EFI specification says that boot service code won't be called
1102 * after ExitBootServices(). This is, in fact, a lie. 1108 * after ExitBootServices(). This is, in fact, a lie.
@@ -1125,7 +1131,15 @@ void __init setup_arch(char **cmdline_p)
1125 1131
1126 early_trap_pf_init(); 1132 early_trap_pf_init();
1127 1133
1128 setup_real_mode(); 1134 /*
1135 * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
1136 * with the current CR4 value. This may not be necessary, but
1137 * auditing all the early-boot CR4 manipulation would be needed to
1138 * rule it out.
1139 */
1140 if (boot_cpu_data.cpuid_level >= 0)
1141 /* A CPU has %cr4 if and only if it has CPUID. */
1142 mmu_cr4_features = __read_cr4();
1129 1143
1130 memblock_set_current_limit(get_max_mapped()); 1144 memblock_set_current_limit(get_max_mapped());
1131 1145
@@ -1174,13 +1188,6 @@ void __init setup_arch(char **cmdline_p)
1174 1188
1175 kasan_init(); 1189 kasan_init();
1176 1190
1177 if (boot_cpu_data.cpuid_level >= 0) {
1178 /* A CPU has %cr4 if and only if it has CPUID */
1179 mmu_cr4_features = __read_cr4();
1180 if (trampoline_cr4_features)
1181 *trampoline_cr4_features = mmu_cr4_features;
1182 }
1183
1184#ifdef CONFIG_X86_32 1191#ifdef CONFIG_X86_32
1185 /* sync back kernel address range */ 1192 /* sync back kernel address range */
1186 clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, 1193 clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
index f7dfeda83e5c..121f59c6ee54 100644
--- a/arch/x86/lib/kaslr.c
+++ b/arch/x86/lib/kaslr.c
@@ -19,7 +19,7 @@
19#include <asm/cpufeature.h> 19#include <asm/cpufeature.h>
20#include <asm/setup.h> 20#include <asm/setup.h>
21 21
22#define debug_putstr(v) early_printk(v) 22#define debug_putstr(v) early_printk("%s", v)
23#define has_cpuflag(f) boot_cpu_has(f) 23#define has_cpuflag(f) boot_cpu_has(f)
24#define get_boot_seed() kaslr_offset() 24#define get_boot_seed() kaslr_offset()
25#endif 25#endif
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 620928903be3..d28a2d741f9e 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -122,8 +122,18 @@ __ref void *alloc_low_pages(unsigned int num)
122 return __va(pfn << PAGE_SHIFT); 122 return __va(pfn << PAGE_SHIFT);
123} 123}
124 124
125/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */ 125/*
126#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE) 126 * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS.
127 * With KASLR memory randomization, depending on the machine e820 memory
128 * and the PUD alignment. We may need twice more pages when KASLR memory
129 * randomization is enabled.
130 */
131#ifndef CONFIG_RANDOMIZE_MEMORY
132#define INIT_PGD_PAGE_COUNT 6
133#else
134#define INIT_PGD_PAGE_COUNT 12
135#endif
136#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE)
127RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); 137RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
128void __init early_alloc_pgt_buf(void) 138void __init early_alloc_pgt_buf(void)
129{ 139{
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 26dccd6c0df1..ec8654f117d8 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -97,7 +97,7 @@ void __init kernel_randomize_memory(void)
97 * add padding if needed (especially for memory hotplug support). 97 * add padding if needed (especially for memory hotplug support).
98 */ 98 */
99 BUG_ON(kaslr_regions[0].base != &page_offset_base); 99 BUG_ON(kaslr_regions[0].base != &page_offset_base);
100 memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT) + 100 memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
101 CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; 101 CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
102 102
103 /* Adapt phyiscal memory region size based on available memory */ 103 /* Adapt phyiscal memory region size based on available memory */
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 4480c06cade7..89d1146f5a6f 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -254,6 +254,7 @@ void __init efi_free_boot_services(void)
254 for_each_efi_memory_desc(md) { 254 for_each_efi_memory_desc(md) {
255 unsigned long long start = md->phys_addr; 255 unsigned long long start = md->phys_addr;
256 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; 256 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
257 size_t rm_size;
257 258
258 if (md->type != EFI_BOOT_SERVICES_CODE && 259 if (md->type != EFI_BOOT_SERVICES_CODE &&
259 md->type != EFI_BOOT_SERVICES_DATA) 260 md->type != EFI_BOOT_SERVICES_DATA)
@@ -263,6 +264,26 @@ void __init efi_free_boot_services(void)
263 if (md->attribute & EFI_MEMORY_RUNTIME) 264 if (md->attribute & EFI_MEMORY_RUNTIME)
264 continue; 265 continue;
265 266
267 /*
268 * Nasty quirk: if all sub-1MB memory is used for boot
269 * services, we can get here without having allocated the
270 * real mode trampoline. It's too late to hand boot services
271 * memory back to the memblock allocator, so instead
272 * try to manually allocate the trampoline if needed.
273 *
274 * I've seen this on a Dell XPS 13 9350 with firmware
275 * 1.4.4 with SGX enabled booting Linux via Fedora 24's
276 * grub2-efi on a hard disk. (And no, I don't know why
277 * this happened, but Linux should still try to boot rather
278 * panicing early.)
279 */
280 rm_size = real_mode_size_needed();
281 if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) {
282 set_real_mode_mem(start, rm_size);
283 start += rm_size;
284 size -= rm_size;
285 }
286
266 free_bootmem_late(start, size); 287 free_bootmem_late(start, size);
267 } 288 }
268 289
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 0df8a0370d32..23f2f3e41c7f 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -200,12 +200,14 @@ void uv_bios_init(void)
200 return; 200 return;
201 } 201 }
202 202
203 /* Starting with UV4 the UV systab size is variable */
203 if (uv_systab->revision >= UV_SYSTAB_VERSION_UV4) { 204 if (uv_systab->revision >= UV_SYSTAB_VERSION_UV4) {
205 int size = uv_systab->size;
206
204 iounmap(uv_systab); 207 iounmap(uv_systab);
205 uv_systab = ioremap(efi.uv_systab, uv_systab->size); 208 uv_systab = ioremap(efi.uv_systab, size);
206 if (!uv_systab) { 209 if (!uv_systab) {
207 pr_err("UV: UVsystab: ioremap(%d) failed!\n", 210 pr_err("UV: UVsystab: ioremap(%d) failed!\n", size);
208 uv_systab->size);
209 return; 211 return;
210 } 212 }
211 } 213 }
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 705e3fffb4a1..5db706f14111 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -1,9 +1,11 @@
1#include <linux/io.h> 1#include <linux/io.h>
2#include <linux/slab.h>
2#include <linux/memblock.h> 3#include <linux/memblock.h>
3 4
4#include <asm/cacheflush.h> 5#include <asm/cacheflush.h>
5#include <asm/pgtable.h> 6#include <asm/pgtable.h>
6#include <asm/realmode.h> 7#include <asm/realmode.h>
8#include <asm/tlbflush.h>
7 9
8struct real_mode_header *real_mode_header; 10struct real_mode_header *real_mode_header;
9u32 *trampoline_cr4_features; 11u32 *trampoline_cr4_features;
@@ -11,25 +13,37 @@ u32 *trampoline_cr4_features;
11/* Hold the pgd entry used on booting additional CPUs */ 13/* Hold the pgd entry used on booting additional CPUs */
12pgd_t trampoline_pgd_entry; 14pgd_t trampoline_pgd_entry;
13 15
16void __init set_real_mode_mem(phys_addr_t mem, size_t size)
17{
18 void *base = __va(mem);
19
20 real_mode_header = (struct real_mode_header *) base;
21 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
22 base, (unsigned long long)mem, size);
23}
24
14void __init reserve_real_mode(void) 25void __init reserve_real_mode(void)
15{ 26{
16 phys_addr_t mem; 27 phys_addr_t mem;
17 unsigned char *base; 28 size_t size = real_mode_size_needed();
18 size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); 29
30 if (!size)
31 return;
32
33 WARN_ON(slab_is_available());
19 34
20 /* Has to be under 1M so we can execute real-mode AP code. */ 35 /* Has to be under 1M so we can execute real-mode AP code. */
21 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); 36 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
22 if (!mem) 37 if (!mem) {
23 panic("Cannot allocate trampoline\n"); 38 pr_info("No sub-1M memory is available for the trampoline\n");
39 return;
40 }
24 41
25 base = __va(mem);
26 memblock_reserve(mem, size); 42 memblock_reserve(mem, size);
27 real_mode_header = (struct real_mode_header *) base; 43 set_real_mode_mem(mem, size);
28 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
29 base, (unsigned long long)mem, size);
30} 44}
31 45
32void __init setup_real_mode(void) 46static void __init setup_real_mode(void)
33{ 47{
34 u16 real_mode_seg; 48 u16 real_mode_seg;
35 const u32 *rel; 49 const u32 *rel;
@@ -84,7 +98,7 @@ void __init setup_real_mode(void)
84 98
85 trampoline_header->start = (u64) secondary_startup_64; 99 trampoline_header->start = (u64) secondary_startup_64;
86 trampoline_cr4_features = &trampoline_header->cr4; 100 trampoline_cr4_features = &trampoline_header->cr4;
87 *trampoline_cr4_features = __read_cr4(); 101 *trampoline_cr4_features = mmu_cr4_features;
88 102
89 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); 103 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
90 trampoline_pgd[0] = trampoline_pgd_entry.pgd; 104 trampoline_pgd[0] = trampoline_pgd_entry.pgd;
@@ -100,7 +114,7 @@ void __init setup_real_mode(void)
100 * need to mark it executable at do_pre_smp_initcalls() at least, 114 * need to mark it executable at do_pre_smp_initcalls() at least,
101 * thus run it as a early_initcall(). 115 * thus run it as a early_initcall().
102 */ 116 */
103static int __init set_real_mode_permissions(void) 117static void __init set_real_mode_permissions(void)
104{ 118{
105 unsigned char *base = (unsigned char *) real_mode_header; 119 unsigned char *base = (unsigned char *) real_mode_header;
106 size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); 120 size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
@@ -119,7 +133,16 @@ static int __init set_real_mode_permissions(void)
119 set_memory_nx((unsigned long) base, size >> PAGE_SHIFT); 133 set_memory_nx((unsigned long) base, size >> PAGE_SHIFT);
120 set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT); 134 set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT);
121 set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); 135 set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT);
136}
137
138static int __init init_real_mode(void)
139{
140 if (!real_mode_header)
141 panic("Real mode trampoline was not allocated");
142
143 setup_real_mode();
144 set_real_mode_permissions();
122 145
123 return 0; 146 return 0;
124} 147}
125early_initcall(set_real_mode_permissions); 148early_initcall(init_real_mode);