 arch/x86/entry/vdso/vclock_gettime.c        |  12
 arch/x86/include/asm/boot.h                 |   2
 arch/x86/include/asm/fpu/internal.h         |   1
 arch/x86/include/asm/fpu/xstate.h           |  11
 arch/x86/include/asm/lguest.h               |   4
 arch/x86/include/asm/mmu_context.h          |  34
 arch/x86/kernel/cpu/amd.c                   |   3
 arch/x86/kernel/fpu/init.c                  | 161
 arch/x86/kernel/fpu/xstate.c                |   4
 arch/x86/kernel/reboot.c                    |   8
 arch/x86/kernel/verify_cpu.S                |  50
 arch/x86/mm/init_64.c                       |   3
 arch/x86/mm/pageattr.c                      |   3
 arch/x86/mm/tlb.c                           |  29
 drivers/lguest/core.c                       |  74
 tools/testing/selftests/x86/Makefile        |   6
 tools/testing/selftests/x86/vdso_restorer.c |  88
 17 files changed, 342 insertions(+), 151 deletions(-)
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 8602f06c759f..1a50e09c945b 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -126,23 +126,23 @@ static notrace cycle_t vread_pvclock(int *mode)
          *
          * On Xen, we don't appear to have that guarantee, but Xen still
          * supplies a valid seqlock using the version field.
-
+         *
          * We only do pvclock vdso timing at all if
          * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
          * mean that all vCPUs have matching pvti and that the TSC is
          * synced, so we can just look at vCPU 0's pvti.
          */
 
-        if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
-                *mode = VCLOCK_NONE;
-                return 0;
-        }
-
         do {
                 version = pvti->version;
 
                 smp_rmb();
 
+                if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+                        *mode = VCLOCK_NONE;
+                        return 0;
+                }
+
                 tsc = rdtsc_ordered();
                 pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
                 pvti_tsc_shift = pvti->tsc_shift;
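
The hunk above moves the PVCLOCK_TSC_STABLE_BIT check inside the version loop, so the flags read is validated by the same seqcount retry that protects the other pvti fields. A minimal userspace sketch of that retry pattern, using a hypothetical fake_pvti structure and C11 atomics rather than the kernel's pvclock types (illustrative only):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the pvclock time info page. */
struct fake_pvti {
        atomic_uint version;    /* odd while the writer is mid-update */
        unsigned int flags;     /* e.g. a "TSC stable" bit            */
        uint64_t system_time;   /* payload protected by the version   */
};

#define FAKE_TSC_STABLE_BIT 0x1u

/* Returns 0 and fills *out, or -1 if the stable bit is not set. */
static int read_fake_pvti(struct fake_pvti *p, uint64_t *out)
{
        unsigned int ver;
        uint64_t t;

        do {
                ver = atomic_load_explicit(&p->version, memory_order_acquire);

                /*
                 * As in the patched vDSO: sample the flags inside the
                 * retry loop, after the version read, so a concurrent
                 * update is either seen here or caught by the re-check.
                 */
                if (!(p->flags & FAKE_TSC_STABLE_BIT))
                        return -1;

                t = p->system_time;

                atomic_thread_fence(memory_order_acquire);
        } while ((ver & 1) ||
                 atomic_load_explicit(&p->version, memory_order_relaxed) != ver);

        *out = t;
        return 0;
}

int main(void)
{
        struct fake_pvti pvti = { .flags = FAKE_TSC_STABLE_BIT, .system_time = 42 };
        uint64_t now;

        atomic_init(&pvti.version, 2);  /* even: no update in flight */
        if (read_fake_pvti(&pvti, &now) == 0)
                printf("time: %llu\n", (unsigned long long)now);
        return 0;
}
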
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 4fa687a47a62..6b8d6e8cd449 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -27,7 +27,7 @@
 #define BOOT_HEAP_SIZE           0x400000
 #else /* !CONFIG_KERNEL_BZIP2 */
 
-#define BOOT_HEAP_SIZE            0x8000
+#define BOOT_HEAP_SIZE           0x10000
 
 #endif /* !CONFIG_KERNEL_BZIP2 */
 
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index eadcdd5bb946..0fd440df63f1 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -42,6 +42,7 @@ extern void fpu__init_cpu_xstate(void);
 extern void fpu__init_system(struct cpuinfo_x86 *c);
 extern void fpu__init_check_bugs(void);
 extern void fpu__resume_cpu(void);
+extern u64 fpu__get_supported_xfeatures_mask(void);
 
 /*
  * Debugging facility:
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 3a6c89b70307..af30fdeb140d 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -20,15 +20,16 @@
 
 /* Supported features which support lazy state saving */
 #define XFEATURE_MASK_LAZY      (XFEATURE_MASK_FP | \
-                                 XFEATURE_MASK_SSE | \
+                                 XFEATURE_MASK_SSE)
+
+/* Supported features which require eager state saving */
+#define XFEATURE_MASK_EAGER     (XFEATURE_MASK_BNDREGS | \
+                                 XFEATURE_MASK_BNDCSR | \
                                  XFEATURE_MASK_YMM | \
                                  XFEATURE_MASK_OPMASK | \
                                  XFEATURE_MASK_ZMM_Hi256 | \
                                  XFEATURE_MASK_Hi16_ZMM)
 
-/* Supported features which require eager state saving */
-#define XFEATURE_MASK_EAGER     (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
-
 /* All currently supported features */
 #define XCNTXT_MASK     (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
 
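
With the masks split this way, turning eagerfpu off only has to strip XFEATURE_MASK_EAGER from the enumerated features, which is exactly what the new fpu__get_supported_xfeatures_mask() helper does. A small standalone illustration of that arithmetic, using invented bit values instead of the kernel's XFEATURE_MASK_* constants:

#include <stdio.h>
#include <stdint.h>

/* Made-up stand-ins for the kernel's XFEATURE_MASK_* bits. */
#define FAKE_FP        (1ull << 0)
#define FAKE_SSE       (1ull << 1)
#define FAKE_YMM       (1ull << 2)
#define FAKE_BNDREGS   (1ull << 3)
#define FAKE_BNDCSR    (1ull << 4)

#define FAKE_MASK_LAZY   (FAKE_FP | FAKE_SSE)
#define FAKE_MASK_EAGER  (FAKE_BNDREGS | FAKE_BNDCSR | FAKE_YMM)
#define FAKE_XCNTXT_MASK (FAKE_MASK_LAZY | FAKE_MASK_EAGER)

/* Mirrors the shape of fpu__get_supported_xfeatures_mask(). */
static uint64_t supported_mask(int eagerfpu_disabled)
{
        if (!eagerfpu_disabled)
                return FAKE_XCNTXT_MASK;
        /* eagerfpu=off: everything except the eager-only features */
        return ~FAKE_MASK_EAGER;
}

int main(void)
{
        uint64_t enumerated = FAKE_XCNTXT_MASK; /* pretend CPUID reported all */

        printf("eagerfpu on : %#llx\n",
               (unsigned long long)(enumerated & supported_mask(0)));
        printf("eagerfpu off: %#llx\n",
               (unsigned long long)(enumerated & supported_mask(1)));
        return 0;
}
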
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 3bbc07a57a31..73d0c9b92087 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -12,7 +12,9 @@
 #define GUEST_PL 1
 
 /* Page for Switcher text itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids)
+#define SWITCHER_TEXT_PAGES  (1)
+#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
+#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
 
 /* Where we map the Switcher, in both Host and Guest. */
 extern unsigned long switcher_addr;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 379cd3658799..bfd9b2a35a0b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -116,8 +116,36 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 #endif
         cpumask_set_cpu(cpu, mm_cpumask(next));
 
-        /* Re-load page tables */
+        /*
+         * Re-load page tables.
+         *
+         * This logic has an ordering constraint:
+         *
+         *  CPU 0: Write to a PTE for 'next'
+         *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
+         *  CPU 1: set bit 1 in next's mm_cpumask
+         *  CPU 1: load from the PTE that CPU 0 writes (implicit)
+         *
+         * We need to prevent an outcome in which CPU 1 observes
+         * the new PTE value and CPU 0 observes bit 1 clear in
+         * mm_cpumask.  (If that occurs, then the IPI will never
+         * be sent, and CPU 0's TLB will contain a stale entry.)
+         *
+         * The bad outcome can occur if either CPU's load is
+         * reordered before that CPU's store, so both CPUs must
+         * execute full barriers to prevent this from happening.
+         *
+         * Thus, switch_mm needs a full barrier between the
+         * store to mm_cpumask and any operation that could load
+         * from next->pgd.  TLB fills are special and can happen
+         * due to instruction fetches or for no reason at all,
+         * and neither LOCK nor MFENCE orders them.
+         * Fortunately, load_cr3() is serializing and gives the
+         * ordering guarantee we need.
+         *
+         */
         load_cr3(next->pgd);
+
         trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 
         /* Stop flush ipis for the previous mm */
@@ -156,10 +184,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                  * schedule, protecting us from simultaneous changes.
                  */
                 cpumask_set_cpu(cpu, mm_cpumask(next));
+
                 /*
                  * We were in lazy tlb mode and leave_mm disabled
                  * tlb flush IPI delivery. We must reload CR3
                  * to make sure to use no freed page tables.
+                 *
+                 * As above, load_cr3() is serializing and orders TLB
+                 * fills with respect to the mm_cpumask write.
                  */
                 load_cr3(next->pgd);
                 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
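
The new comment describes a store-buffering race: each CPU stores to one location and then loads from the other, and the bad outcome is both loads observing the old values. A litmus-style C11 sketch of that pattern, with plain variables standing in for the PTE and the mm_cpumask bit, and seq_cst fences standing in for the full barriers the comment requires (load_cr3() on the switch_mm side); this is an illustration, not kernel code:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Stand-ins for "a PTE for next" and "bit 1 of next's mm_cpumask". */
static atomic_int pte;
static atomic_int cpumask_bit;

static int cpu0_saw_bit;        /* what CPU 0 read back */
static int cpu1_saw_pte;        /* what CPU 1 read back */

static void *cpu0(void *arg)
{
        /* CPU 0: write the PTE, then look at the cpumask to decide on an IPI. */
        atomic_store_explicit(&pte, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* the required full barrier */
        cpu0_saw_bit = atomic_load_explicit(&cpumask_bit, memory_order_relaxed);
        return NULL;
}

static void *cpu1(void *arg)
{
        /* CPU 1: set its cpumask bit, then (implicitly) load through the PTE. */
        atomic_store_explicit(&cpumask_bit, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* load_cr3() plays this role */
        cpu1_saw_pte = atomic_load_explicit(&pte, memory_order_relaxed);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, cpu0, NULL);
        pthread_create(&b, NULL, cpu1, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);

        /*
         * With both fences in place, the forbidden outcome
         * (cpu0_saw_bit == 0 && cpu1_saw_pte == 0) cannot happen;
         * drop either fence and it becomes reachable on x86.
         */
        printf("cpu0 saw bit=%d, cpu1 saw pte=%d\n", cpu0_saw_bit, cpu1_saw_pte);
        return 0;
}
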
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e678ddeed030..a07956a08936 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -434,8 +434,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
                  */
                 int ht_nodeid = c->initial_apicid;
 
-                if (ht_nodeid >= 0 &&
-                    __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+                if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
                         node = __apicid_to_node[ht_nodeid];
                 /* Pick a nearby node */
                 if (!node_online(node))
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 7b2978ab30df..6d9f0a7ef4c8 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -3,8 +3,11 @@
  */
 #include <asm/fpu/internal.h>
 #include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/cmdline.h>
 
 #include <linux/sched.h>
+#include <linux/init.h>
 
 /*
  * Initialize the TS bit in CR0 according to the style of context-switches
@@ -270,20 +273,52 @@ static void __init fpu__init_system_xstate_size_legacy(void)
  */
 static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
 
-static int __init eager_fpu_setup(char *s)
+/*
+ * Find supported xfeatures based on cpu features and command-line input.
+ * This must be called after fpu__init_parse_early_param() is called and
+ * xfeatures_mask is enumerated.
+ */
+u64 __init fpu__get_supported_xfeatures_mask(void)
 {
-        if (!strcmp(s, "on"))
-                eagerfpu = ENABLE;
-        else if (!strcmp(s, "off"))
-                eagerfpu = DISABLE;
-        else if (!strcmp(s, "auto"))
-                eagerfpu = AUTO;
-        return 1;
+        /* Support all xfeatures known to us */
+        if (eagerfpu != DISABLE)
+                return XCNTXT_MASK;
+
+        /* Warning of xfeatures being disabled for no eagerfpu mode */
+        if (xfeatures_mask & XFEATURE_MASK_EAGER) {
+                pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
+                        xfeatures_mask & XFEATURE_MASK_EAGER);
+        }
+
+        /* Return a mask that masks out all features requiring eagerfpu mode */
+        return ~XFEATURE_MASK_EAGER;
+}
+
+/*
+ * Disable features dependent on eagerfpu.
+ */
+static void __init fpu__clear_eager_fpu_features(void)
+{
+        setup_clear_cpu_cap(X86_FEATURE_MPX);
+        setup_clear_cpu_cap(X86_FEATURE_AVX);
+        setup_clear_cpu_cap(X86_FEATURE_AVX2);
+        setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+        setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
+        setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
+        setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
 }
-__setup("eagerfpu=", eager_fpu_setup);
 
 /*
  * Pick the FPU context switching strategy:
+ *
+ * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of
+ * the following is true:
+ *
+ * (1) the cpu has xsaveopt, as it has the optimization and doing eager
+ *     FPU switching has a relatively low cost compared to a plain xsave;
+ * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU
+ *     switching. Should the kernel boot with noxsaveopt, we support MPX
+ *     with eager FPU switching at a higher cost.
  */
 static void __init fpu__init_system_ctx_switch(void)
 {
@@ -295,19 +330,11 @@ static void __init fpu__init_system_ctx_switch(void)
         WARN_ON_FPU(current->thread.fpu.fpstate_active);
         current_thread_info()->status = 0;
 
-        /* Auto enable eagerfpu for xsaveopt */
         if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
                 eagerfpu = ENABLE;
 
-        if (xfeatures_mask & XFEATURE_MASK_EAGER) {
-                if (eagerfpu == DISABLE) {
-                        pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
-                               xfeatures_mask & XFEATURE_MASK_EAGER);
-                        xfeatures_mask &= ~XFEATURE_MASK_EAGER;
-                } else {
-                        eagerfpu = ENABLE;
-                }
-        }
+        if (xfeatures_mask & XFEATURE_MASK_EAGER)
+                eagerfpu = ENABLE;
 
         if (eagerfpu == ENABLE)
                 setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
@@ -316,11 +343,48 @@ static void __init fpu__init_system_ctx_switch(void)
 }
 
 /*
+ * We parse fpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init fpu__init_parse_early_param(void)
+{
+        /*
+         * No need to check "eagerfpu=auto" again, since it is the
+         * initial default.
+         */
+        if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
+                eagerfpu = DISABLE;
+                fpu__clear_eager_fpu_features();
+        } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) {
+                eagerfpu = ENABLE;
+        }
+
+        if (cmdline_find_option_bool(boot_command_line, "no387"))
+                setup_clear_cpu_cap(X86_FEATURE_FPU);
+
+        if (cmdline_find_option_bool(boot_command_line, "nofxsr")) {
+                setup_clear_cpu_cap(X86_FEATURE_FXSR);
+                setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
+                setup_clear_cpu_cap(X86_FEATURE_XMM);
+        }
+
+        if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+                fpu__xstate_clear_all_cpu_caps();
+
+        if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+                setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+        if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+                setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+}
+
+/*
  * Called on the boot CPU once per system bootup, to set up the initial
  * FPU state that is later cloned into all processes:
  */
 void __init fpu__init_system(struct cpuinfo_x86 *c)
 {
+        fpu__init_parse_early_param();
         fpu__init_system_early_generic(c);
 
         /*
@@ -344,62 +408,3 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
 
         fpu__init_system_ctx_switch();
 }
-
-/*
- * Boot parameter to turn off FPU support and fall back to math-emu:
- */
-static int __init no_387(char *s)
-{
-        setup_clear_cpu_cap(X86_FEATURE_FPU);
-        return 1;
-}
-__setup("no387", no_387);
-
-/*
- * Disable all xstate CPU features:
- */
-static int __init x86_noxsave_setup(char *s)
-{
-        if (strlen(s))
-                return 0;
-
-        fpu__xstate_clear_all_cpu_caps();
-
-        return 1;
-}
-__setup("noxsave", x86_noxsave_setup);
-
-/*
- * Disable the XSAVEOPT instruction specifically:
- */
-static int __init x86_noxsaveopt_setup(char *s)
-{
-        setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-
-        return 1;
-}
-__setup("noxsaveopt", x86_noxsaveopt_setup);
-
-/*
- * Disable the XSAVES instruction:
- */
-static int __init x86_noxsaves_setup(char *s)
-{
-        setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-
-        return 1;
-}
-__setup("noxsaves", x86_noxsaves_setup);
-
-/*
- * Disable FX save/restore and SSE support:
- */
-static int __init x86_nofxsr_setup(char *s)
-{
-        setup_clear_cpu_cap(X86_FEATURE_FXSR);
-        setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
-        setup_clear_cpu_cap(X86_FEATURE_XMM);
-
-        return 1;
-}
-__setup("nofxsr", x86_nofxsr_setup);
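
The command-line handling above replaces the per-option __setup() handlers with a single early scan of boot_command_line via cmdline_find_option_bool(). A rough userspace approximation of the whole-word boolean matching such a scan performs (the real kernel helper is more careful about delimiters and edge cases than this toy version):

#include <stdio.h>
#include <string.h>

/*
 * Toy version of a boot-command-line boolean lookup: returns 1 if
 * "option" appears as a whole whitespace-delimited word in cmdline.
 * Purely illustrative -- not the kernel's implementation.
 */
static int find_option_bool(const char *cmdline, const char *option)
{
        size_t optlen = strlen(option);
        const char *p = cmdline;

        while ((p = strstr(p, option)) != NULL) {
                int starts_word = (p == cmdline) || (p[-1] == ' ');
                int ends_word = (p[optlen] == '\0' || p[optlen] == ' ');

                if (starts_word && ends_word)
                        return 1;
                p += optlen;
        }
        return 0;
}

int main(void)
{
        const char *cmdline = "root=/dev/sda1 eagerfpu=off noxsaveopt quiet";

        printf("eagerfpu=off: %d\n", find_option_bool(cmdline, "eagerfpu=off"));
        printf("eagerfpu=on:  %d\n", find_option_bool(cmdline, "eagerfpu=on"));
        printf("noxsave:      %d\n", find_option_bool(cmdline, "noxsave"));
        return 0;
}
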
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 40f100285984..d425cda5ae6d 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -52,6 +52,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
         setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
         setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
         setup_clear_cpu_cap(X86_FEATURE_MPX);
+        setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
 }
 
 /*
@@ -632,8 +633,7 @@ void __init fpu__init_system_xstate(void)
                 BUG();
         }
 
-        /* Support only the state known to the OS: */
-        xfeatures_mask = xfeatures_mask & XCNTXT_MASK;
+        xfeatures_mask &= fpu__get_supported_xfeatures_mask();
 
         /* Enable xstate instructions to be able to continue with initialization: */
         fpu__init_cpu_xstate();
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d64889aa2d46..ab0adc0fa5db 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
                         DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
                 },
         },
+        {       /* Handle problems with rebooting on the iMac10,1. */
+                .callback = set_pci_reboot,
+                .ident = "Apple iMac10,1",
+                .matches = {
+                        DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+                        DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"),
+                },
+        },
 
         /* ASRock */
         {       /* Handle problems with rebooting on ASRock Q1900DC-ITX */
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 4cf401f581e7..07efb35ee4bc 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -48,31 +48,31 @@ verify_cpu:
         pushfl
         popl    %eax
         cmpl    %eax,%ebx
-        jz      verify_cpu_no_longmode          # cpu has no cpuid
+        jz      .Lverify_cpu_no_longmode        # cpu has no cpuid
 #endif
 
         movl    $0x0,%eax               # See if cpuid 1 is implemented
         cpuid
         cmpl    $0x1,%eax
-        jb      verify_cpu_no_longmode          # no cpuid 1
+        jb      .Lverify_cpu_no_longmode        # no cpuid 1
 
         xor     %di,%di
         cmpl    $0x68747541,%ebx        # AuthenticAMD
-        jnz     verify_cpu_noamd
+        jnz     .Lverify_cpu_noamd
         cmpl    $0x69746e65,%edx
-        jnz     verify_cpu_noamd
+        jnz     .Lverify_cpu_noamd
         cmpl    $0x444d4163,%ecx
-        jnz     verify_cpu_noamd
+        jnz     .Lverify_cpu_noamd
         mov     $1,%di                  # cpu is from AMD
-        jmp     verify_cpu_check
+        jmp     .Lverify_cpu_check
 
-verify_cpu_noamd:
+.Lverify_cpu_noamd:
         cmpl    $0x756e6547,%ebx        # GenuineIntel?
-        jnz     verify_cpu_check
+        jnz     .Lverify_cpu_check
         cmpl    $0x49656e69,%edx
-        jnz     verify_cpu_check
+        jnz     .Lverify_cpu_check
         cmpl    $0x6c65746e,%ecx
-        jnz     verify_cpu_check
+        jnz     .Lverify_cpu_check
 
         # only call IA32_MISC_ENABLE when:
         # family > 6 || (family == 6 && model >= 0xd)
@@ -83,59 +83,59 @@ verify_cpu_noamd:
         andl    $0x0ff00f00, %eax       # mask family and extended family
         shrl    $8, %eax
         cmpl    $6, %eax
-        ja      verify_cpu_clear_xd     # family > 6, ok
-        jb      verify_cpu_check        # family < 6, skip
+        ja      .Lverify_cpu_clear_xd   # family > 6, ok
+        jb      .Lverify_cpu_check      # family < 6, skip
 
         andl    $0x000f00f0, %ecx       # mask model and extended model
         shrl    $4, %ecx
         cmpl    $0xd, %ecx
-        jb      verify_cpu_check        # family == 6, model < 0xd, skip
+        jb      .Lverify_cpu_check      # family == 6, model < 0xd, skip
 
-verify_cpu_clear_xd:
+.Lverify_cpu_clear_xd:
         movl    $MSR_IA32_MISC_ENABLE, %ecx
         rdmsr
         btrl    $2, %edx                # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
-        jnc     verify_cpu_check        # only write MSR if bit was changed
+        jnc     .Lverify_cpu_check      # only write MSR if bit was changed
         wrmsr
 
-verify_cpu_check:
+.Lverify_cpu_check:
         movl    $0x1,%eax               # Does the cpu have what it takes
         cpuid
         andl    $REQUIRED_MASK0,%edx
         xorl    $REQUIRED_MASK0,%edx
-        jnz     verify_cpu_no_longmode
+        jnz     .Lverify_cpu_no_longmode
 
         movl    $0x80000000,%eax        # See if extended cpuid is implemented
         cpuid
         cmpl    $0x80000001,%eax
-        jb      verify_cpu_no_longmode  # no extended cpuid
+        jb      .Lverify_cpu_no_longmode        # no extended cpuid
 
         movl    $0x80000001,%eax        # Does the cpu have what it takes
         cpuid
         andl    $REQUIRED_MASK1,%edx
         xorl    $REQUIRED_MASK1,%edx
-        jnz     verify_cpu_no_longmode
+        jnz     .Lverify_cpu_no_longmode
 
-verify_cpu_sse_test:
+.Lverify_cpu_sse_test:
         movl    $1,%eax
         cpuid
         andl    $SSE_MASK,%edx
         cmpl    $SSE_MASK,%edx
-        je      verify_cpu_sse_ok
+        je      .Lverify_cpu_sse_ok
         test    %di,%di
-        jz      verify_cpu_no_longmode  # only try to force SSE on AMD
+        jz      .Lverify_cpu_no_longmode        # only try to force SSE on AMD
         movl    $MSR_K7_HWCR,%ecx
         rdmsr
         btr     $15,%eax                # enable SSE
         wrmsr
         xor     %di,%di                 # don't loop
-        jmp     verify_cpu_sse_test     # try again
+        jmp     .Lverify_cpu_sse_test   # try again
 
-verify_cpu_no_longmode:
+.Lverify_cpu_no_longmode:
         popf                            # Restore caller passed flags
         movl $1,%eax
         ret
-verify_cpu_sse_ok:
+.Lverify_cpu_sse_ok:
         popf                            # Restore caller passed flags
         xorl %eax, %eax
         ret
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ec081fe0ce2c..8829482d69ec 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -814,8 +814,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
                 if (phys_addr < (phys_addr_t)0x40000000)
                         return;
 
-                if (IS_ALIGNED(addr, PAGE_SIZE) &&
-                    IS_ALIGNED(next, PAGE_SIZE)) {
+                if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
                         /*
                          * Do not free direct mapping pages since they were
                          * freed when offlining, or simplely not in use.
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6000ad7f560c..fc6a4c8f6e2a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -66,6 +66,9 @@ void update_page_count(int level, unsigned long pages)
 
 static void split_page_count(int level)
 {
+        if (direct_pages_count[level] == 0)
+                return;
+
         direct_pages_count[level]--;
         direct_pages_count[level - 1] += PTRS_PER_PTE;
 }
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8ddb5d0d66fb..8f4cc3dfac32 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -161,7 +161,10 @@ void flush_tlb_current_task(void)
         preempt_disable();
 
         count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+
+        /* This is an implicit full barrier that synchronizes with switch_mm. */
         local_flush_tlb();
+
         trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
         if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
                 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
         unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
 
         preempt_disable();
-        if (current->active_mm != mm)
+        if (current->active_mm != mm) {
+                /* Synchronize with switch_mm. */
+                smp_mb();
+
                 goto out;
+        }
 
         if (!current->mm) {
                 leave_mm(smp_processor_id());
+
+                /* Synchronize with switch_mm. */
+                smp_mb();
+
                 goto out;
         }
 
         if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
                 base_pages_to_flush = (end - start) >> PAGE_SHIFT;
 
+        /*
+         * Both branches below are implicit full barriers (MOV to CR or
+         * INVLPG) that synchronize with switch_mm.
+         */
         if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
                 base_pages_to_flush = TLB_FLUSH_ALL;
                 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
@@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
         preempt_disable();
 
         if (current->active_mm == mm) {
-                if (current->mm)
+                if (current->mm) {
+                        /*
+                         * Implicit full barrier (INVLPG) that synchronizes
+                         * with switch_mm.
+                         */
                         __flush_tlb_one(start);
-                else
+                } else {
                         leave_mm(smp_processor_id());
+
+                        /* Synchronize with switch_mm. */
+                        smp_mb();
+                }
         }
 
         if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 312ffd3d0017..9e385b38debf 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -22,7 +22,8 @@
 
 unsigned long switcher_addr;
 struct page **lg_switcher_pages;
-static struct vm_struct *switcher_vma;
+static struct vm_struct *switcher_text_vma;
+static struct vm_struct *switcher_stacks_vma;
 
 /* This One Big lock protects all inter-guest data structures. */
 DEFINE_MUTEX(lguest_lock);
@@ -83,54 +84,80 @@ static __init int map_switcher(void)
         }
 
         /*
+         * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
+         * It goes in the first page, which we map in momentarily.
+         */
+        memcpy(kmap(lg_switcher_pages[0]), start_switcher_text,
+               end_switcher_text - start_switcher_text);
+        kunmap(lg_switcher_pages[0]);
+
+        /*
          * We place the Switcher underneath the fixmap area, which is the
          * highest virtual address we can get.  This is important, since we
          * tell the Guest it can't access this memory, so we want its ceiling
          * as high as possible.
          */
-        switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
+        switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE;
 
         /*
-         * Now we reserve the "virtual memory area" we want.  We might
-         * not get it in theory, but in practice it's worked so far.
-         * The end address needs +1 because __get_vm_area allocates an
-         * extra guard page, so we need space for that.
+         * Now we reserve the "virtual memory area"s we want.  We might
+         * not get them in theory, but in practice it's worked so far.
+         *
+         * We want the switcher text to be read-only and executable, and
+         * the stacks to be read-write and non-executable.
          */
-        switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
-                                     VM_ALLOC, switcher_addr, switcher_addr
-                                     + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
-        if (!switcher_vma) {
+        switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD,
+                                          switcher_addr,
+                                          switcher_addr + PAGE_SIZE);
+
+        if (!switcher_text_vma) {
                 err = -ENOMEM;
                 printk("lguest: could not map switcher pages high\n");
                 goto free_pages;
         }
 
+        switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE,
+                                            VM_ALLOC|VM_NO_GUARD,
+                                            switcher_addr + PAGE_SIZE,
+                                            switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE);
+        if (!switcher_stacks_vma) {
+                err = -ENOMEM;
+                printk("lguest: could not map switcher pages high\n");
+                goto free_text_vma;
+        }
+
         /*
          * This code actually sets up the pages we've allocated to appear at
          * switcher_addr. map_vm_area() takes the vma we allocated above, the
-         * kind of pages we're mapping (kernel pages), and a pointer to our
-         * array of struct pages.
+         * kind of pages we're mapping (kernel text pages and kernel writable
+         * pages respectively), and a pointer to our array of struct pages.
          */
-        err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, lg_switcher_pages);
+        err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages);
+        if (err) {
+                printk("lguest: text map_vm_area failed: %i\n", err);
+                goto free_vmas;
+        }
+
+        err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL,
+                          lg_switcher_pages + SWITCHER_TEXT_PAGES);
         if (err) {
-                printk("lguest: map_vm_area failed: %i\n", err);
-                goto free_vma;
+                printk("lguest: stacks map_vm_area failed: %i\n", err);
+                goto free_vmas;
         }
 
         /*
          * Now the Switcher is mapped at the right address, we can't fail!
-         * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
          */
-        memcpy(switcher_vma->addr, start_switcher_text,
-               end_switcher_text - start_switcher_text);
-
         printk(KERN_INFO "lguest: mapped switcher at %p\n",
-               switcher_vma->addr);
+               switcher_text_vma->addr);
         /* And we succeeded... */
         return 0;
 
-free_vma:
-        vunmap(switcher_vma->addr);
+free_vmas:
+        /* Undoes map_vm_area and __get_vm_area */
+        vunmap(switcher_stacks_vma->addr);
+free_text_vma:
+        vunmap(switcher_text_vma->addr);
 free_pages:
         i = TOTAL_SWITCHER_PAGES;
 free_some_pages:
@@ -148,7 +175,8 @@ static void unmap_switcher(void)
         unsigned int i;
 
         /* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
-        vunmap(switcher_vma->addr);
+        vunmap(switcher_text_vma->addr);
+        vunmap(switcher_stacks_vma->addr);
         /* Now we just need to free the pages we copied the switcher into */
         for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
                 __free_pages(lg_switcher_pages[i], 0);
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index eabcff411984..d0c473f65850 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -4,9 +4,11 @@ include ../lib.mk
 
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
-TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall
 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \
-                        test_FCMOV test_FCOMI test_FISTTP
+                        test_FCMOV test_FCOMI test_FISTTP \
+                        ldt_gdt \
+                        vdso_restorer
 
 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
 BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
diff --git a/tools/testing/selftests/x86/vdso_restorer.c b/tools/testing/selftests/x86/vdso_restorer.c
new file mode 100644
index 000000000000..cb038424a403
--- /dev/null
+++ b/tools/testing/selftests/x86/vdso_restorer.c
@@ -0,0 +1,88 @@
+/*
+ * vdso_restorer.c - tests vDSO-based signal restore
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * This makes sure that sa_restorer == NULL keeps working on 32-bit
+ * configurations.  Modern glibc doesn't use it under any circumstances,
+ * so it's easy to overlook breakage.
+ *
+ * 64-bit userspace has never supported sa_restorer == NULL, so this is
+ * 32-bit only.
+ */
+
+#define _GNU_SOURCE
+
+#include <err.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/syscall.h>
+
+/* Open-code this -- the headers are too messy to easily use them. */
+struct real_sigaction {
+        void *handler;
+        unsigned long flags;
+        void *restorer;
+        unsigned int mask[2];
+};
+
+static volatile sig_atomic_t handler_called;
+
+static void handler_with_siginfo(int sig, siginfo_t *info, void *ctx_void)
+{
+        handler_called = 1;
+}
+
+static void handler_without_siginfo(int sig)
+{
+        handler_called = 1;
+}
+
+int main()
+{
+        int nerrs = 0;
+        struct real_sigaction sa;
+
+        memset(&sa, 0, sizeof(sa));
+        sa.handler = handler_with_siginfo;
+        sa.flags = SA_SIGINFO;
+        sa.restorer = NULL;     /* request kernel-provided restorer */
+
+        if (syscall(SYS_rt_sigaction, SIGUSR1, &sa, NULL, 8) != 0)
+                err(1, "raw rt_sigaction syscall");
+
+        raise(SIGUSR1);
+
+        if (handler_called) {
+                printf("[OK]\tSA_SIGINFO handler returned successfully\n");
+        } else {
+                printf("[FAIL]\tSA_SIGINFO handler was not called\n");
+                nerrs++;
+        }
+
+        sa.flags = 0;
+        sa.handler = handler_without_siginfo;
+        if (syscall(SYS_sigaction, SIGUSR1, &sa, 0) != 0)
+                err(1, "raw sigaction syscall");
+        handler_called = 0;
+
+        raise(SIGUSR1);
+
+        if (handler_called) {
+                printf("[OK]\t!SA_SIGINFO handler returned successfully\n");
+        } else {
+                printf("[FAIL]\t!SA_SIGINFO handler was not called\n");
+                nerrs++;
+        }
+}