author    Linus Torvalds <torvalds@linux-foundation.org>  2019-07-12 18:35:14 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2019-07-12 18:35:14 -0400
commit    39d7530d7494b4e47ba1856e741f513dafd17e3d (patch)
tree      6b16a744047cff9ff77f26bc5811fe9d953a9b91 /arch/x86
parent    16c97650a56abdd067f7da079007b7e00b307083 (diff)
parent    a45ff5994c9cde41af627c46abb9f32beae68943 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:
 "ARM:
   - support for chained PMU counters in guests
   - improved SError handling
   - handle Neoverse N1 erratum #1349291
   - allow side-channel mitigation status to be migrated
   - standardise most AArch64 system register accesses to msr_s/mrs_s
   - fix host MPIDR corruption on 32bit
   - selftests cleanups

  x86:
   - PMU event {white,black}listing
   - ability for the guest to disable host-side interrupt polling
   - fixes for enlightened VMCS (Hyper-V pv nested virtualization)
   - new hypercall to yield to IPI target
   - support for passing cstate MSRs through to the guest
   - lots of cleanups and optimizations

  Generic:
   - Some txt->rST conversions for the documentation"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (128 commits)
  Documentation: virtual: Add toctree hooks
  Documentation: kvm: Convert cpuid.txt to .rst
  Documentation: virtual: Convert paravirt_ops.txt to .rst
  KVM: x86: Unconditionally enable irqs in guest context
  KVM: x86: PMU Event Filter
  kvm: x86: Fix -Wmissing-prototypes warnings
  KVM: Properly check if "page" is valid in kvm_vcpu_unmap
  KVM: arm/arm64: Initialise host's MPIDRs by reading the actual register
  KVM: LAPIC: Retry tune per-vCPU timer_advance_ns if adaptive tuning goes insane
  kvm: LAPIC: write down valid APIC registers
  KVM: arm64: Migrate _elx sysreg accessors to msr_s/mrs_s
  KVM: doc: Add API documentation on the KVM_REG_ARM_WORKAROUNDS register
  KVM: arm/arm64: Add save/restore support for firmware workaround state
  arm64: KVM: Propagate full Spectre v2 workaround state to KVM guests
  KVM: arm/arm64: Support chained PMU counters
  KVM: arm/arm64: Remove pmc->bitmask
  KVM: arm/arm64: Re-create event when setting counter value
  KVM: arm/arm64: Extract duplicated code to own function
  KVM: arm/arm64: Rename kvm_pmu_{enable/disable}_counter functions
  KVM: LAPIC: ARBPRI is a reserved register for x2APIC
  ...
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/kvm_host.h          11
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h          19
-rw-r--r--  arch/x86/include/uapi/asm/kvm_para.h      3
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h           1
-rw-r--r--  arch/x86/kernel/kvm.c                    21
-rw-r--r--  arch/x86/kvm/Kconfig                      1
-rw-r--r--  arch/x86/kvm/cpuid.c                    247
-rw-r--r--  arch/x86/kvm/emulate.c                    2
-rw-r--r--  arch/x86/kvm/irq.h                        1
-rw-r--r--  arch/x86/kvm/irq_comm.c                   2
-rw-r--r--  arch/x86/kvm/lapic.c                    123
-rw-r--r--  arch/x86/kvm/lapic.h                      8
-rw-r--r--  arch/x86/kvm/mmu.c                      182
-rw-r--r--  arch/x86/kvm/mmutrace.h                  59
-rw-r--r--  arch/x86/kvm/paging_tmpl.h               42
-rw-r--r--  arch/x86/kvm/pmu.c                       63
-rw-r--r--  arch/x86/kvm/pmu.h                        1
-rw-r--r--  arch/x86/kvm/svm.c                       51
-rw-r--r--  arch/x86/kvm/trace.h                      2
-rw-r--r--  arch/x86/kvm/vmx/evmcs.c                 18
-rw-r--r--  arch/x86/kvm/vmx/evmcs.h                  1
-rw-r--r--  arch/x86/kvm/vmx/nested.c               763
-rw-r--r--  arch/x86/kvm/vmx/nested.h                 4
-rw-r--r--  arch/x86/kvm/vmx/ops.h                    1
-rw-r--r--  arch/x86/kvm/vmx/vmcs.h                  17
-rw-r--r--  arch/x86/kvm/vmx/vmcs12.h                57
-rw-r--r--  arch/x86/kvm/vmx/vmcs_shadow_fields.h    79
-rw-r--r--  arch/x86/kvm/vmx/vmx.c                  449
-rw-r--r--  arch/x86/kvm/vmx/vmx.h                  124
-rw-r--r--  arch/x86/kvm/x86.c                      229
-rw-r--r--  arch/x86/kvm/x86.h                       10
31 files changed, 1611 insertions, 980 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 26d1eb83f72a..0cc5b611a113 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -686,6 +686,7 @@ struct kvm_vcpu_arch {
 	u32 virtual_tsc_mult;
 	u32 virtual_tsc_khz;
 	s64 ia32_tsc_adjust_msr;
+	u64 msr_ia32_power_ctl;
 	u64 tsc_scaling_ratio;

 	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
@@ -752,6 +753,8 @@ struct kvm_vcpu_arch {
 		struct gfn_to_hva_cache data;
 	} pv_eoi;

+	u64 msr_kvm_poll_control;
+
 	/*
 	 * Indicate whether the access faults on its page table in guest
 	 * which is set when fix page fault and used to detect unhandeable
@@ -879,6 +882,7 @@ struct kvm_arch {
 	bool mwait_in_guest;
 	bool hlt_in_guest;
 	bool pause_in_guest;
+	bool cstate_in_guest;

 	unsigned long irq_sources_bitmap;
 	s64 kvmclock_offset;
@@ -926,6 +930,8 @@ struct kvm_arch {

 	bool guest_can_read_msr_platform_info;
 	bool exception_payload_enabled;
+
+	struct kvm_pmu_event_filter *pmu_event_filter;
 };

 struct kvm_vm_stat {
@@ -996,7 +1002,7 @@ struct kvm_x86_ops {
 	int (*disabled_by_bios)(void);             /* __init */
 	int (*hardware_enable)(void);
 	void (*hardware_disable)(void);
-	void (*check_processor_compatibility)(void *rtn);
+	int (*check_processor_compatibility)(void);/* __init */
 	int (*hardware_setup)(void);               /* __init */
 	void (*hardware_unsetup)(void);            /* __exit */
 	bool (*cpu_has_accelerated_tpr)(void);
@@ -1110,7 +1116,7 @@ struct kvm_x86_ops {
 	int (*check_intercept)(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage);
-	void (*handle_external_intr)(struct kvm_vcpu *vcpu);
+	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
 	bool (*mpx_supported)(void);
 	bool (*xsaves_supported)(void);
 	bool (*umip_emulated)(void);
@@ -1529,7 +1535,6 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
 		    unsigned long ipi_bitmap_high, u32 min,
 		    unsigned long icr, int op_64_bit);

-u64 kvm_get_arch_capabilities(void);
 void kvm_define_shared_msr(unsigned index, u32 msr);
 int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);

diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index d6ab5b4d15e5..e901b0ab116f 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -378,10 +378,11 @@ struct kvm_sync_regs {
 	struct kvm_vcpu_events events;
 };

 #define KVM_X86_QUIRK_LINT0_REENABLED      (1 << 0)
 #define KVM_X86_QUIRK_CD_NW_CLEARED        (1 << 1)
 #define KVM_X86_QUIRK_LAPIC_MMIO_HOLE      (1 << 2)
 #define KVM_X86_QUIRK_OUT_7E_INC_RIP       (1 << 3)
+#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)

 #define KVM_STATE_NESTED_FORMAT_VMX	0
 #define KVM_STATE_NESTED_FORMAT_SVM	1	/* unused */
@@ -432,4 +433,14 @@ struct kvm_nested_state {
 	} data;
 };

+/* for KVM_CAP_PMU_EVENT_FILTER */
+struct kvm_pmu_event_filter {
+	__u32 action;
+	__u32 nevents;
+	__u64 events[0];
+};
+
+#define KVM_PMU_EVENT_ALLOW 0
+#define KVM_PMU_EVENT_DENY 1
+
 #endif /* _ASM_X86_KVM_H */
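Editor's note: the kvm_pmu_event_filter ABI above is consumed from userspace by a VMM that wants to restrict which PMU events a guest may program. The sketch below is an illustration only; the ioctl name (KVM_SET_PMU_EVENT_FILTER, paired with KVM_CAP_PMU_EVENT_FILTER) and the raw event encoding used for events[] are assumptions not shown in this hunk.

	/* Hypothetical userspace sketch: allow only two PMU events for a VM.
	 * Assumes the filter is installed with a VM ioctl named
	 * KVM_SET_PMU_EVENT_FILTER and that each entry is a raw
	 * event-select/umask value as programmed into the PMU.
	 */
	#include <linux/kvm.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>

	static int install_pmu_allow_list(int vm_fd)
	{
		struct kvm_pmu_event_filter *f;
		int ret;

		f = calloc(1, sizeof(*f) + 2 * sizeof(__u64));
		if (!f)
			return -1;

		f->action  = KVM_PMU_EVENT_ALLOW;	/* deny everything not listed */
		f->nevents = 2;
		f->events[0] = 0x003c;	/* unhalted core cycles (assumed encoding) */
		f->events[1] = 0x00c0;	/* instructions retired (assumed encoding) */

		ret = ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, f);
		free(f);
		return ret;
	}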
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 19980ec1a316..2a8e0b6b9805 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -29,6 +29,8 @@
 #define KVM_FEATURE_PV_TLB_FLUSH	9
 #define KVM_FEATURE_ASYNC_PF_VMEXIT	10
 #define KVM_FEATURE_PV_SEND_IPI	11
+#define KVM_FEATURE_POLL_CONTROL	12
+#define KVM_FEATURE_PV_SCHED_YIELD	13

 #define KVM_HINTS_REALTIME	0

@@ -47,6 +49,7 @@
 #define MSR_KVM_ASYNC_PF_EN	0x4b564d02
 #define MSR_KVM_STEAL_TIME	0x4b564d03
 #define MSR_KVM_PV_EOI_EN	0x4b564d04
+#define MSR_KVM_POLL_CONTROL	0x4b564d05

 struct kvm_steal_time {
 	__u64 steal;
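Editor's note: MSR_KVM_POLL_CONTROL is the knob that lets a guest ask the host to stop polling on its behalf when it halts (it pairs with KVM_FEATURE_POLL_CONTROL and the HAVE_KVM_NO_POLL select further down). A minimal guest-side sketch follows; the bit-0 semantics ("host may poll", cleared to opt out) are an inference from the rest of the series, not something shown in this hunk.

	/* Guest kernel-side sketch: opt out of host-side halt polling.
	 * Assumes bit 0 of MSR_KVM_POLL_CONTROL enables host polling, so
	 * writing 0 disables it; only done when the feature is advertised.
	 */
	static void kvm_disable_host_haltpoll(void)
	{
		if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
			wrmsrl(MSR_KVM_POLL_CONTROL, 0);	/* 0 = please do not poll */
	}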
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d213ec5c3766..f0b0c90dd398 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -146,7 +146,6 @@

 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL       2
-#define VMX_ABORT_VMCS_CORRUPTED             3
 #define VMX_ABORT_LOAD_HOST_MSR_FAIL         4

 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5169b8cc35bb..82caf01b63dd 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -527,6 +527,21 @@ static void kvm_setup_pv_ipi(void)
 	pr_info("KVM setup pv IPIs\n");
 }

+static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
+{
+	int cpu;
+
+	native_send_call_func_ipi(mask);
+
+	/* Make sure other vCPUs get a chance to run if they need to. */
+	for_each_cpu(cpu, mask) {
+		if (vcpu_is_preempted(cpu)) {
+			kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
+			break;
+		}
+	}
+}
+
 static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
 {
 	native_smp_prepare_cpus(max_cpus);
@@ -638,6 +653,12 @@ static void __init kvm_guest_init(void)
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+	if (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
+	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+		smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
+		pr_info("KVM setup pv sched yield\n");
+	}
 	if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
 				      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
 		pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index fc042419e670..840e12583b85 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -41,6 +41,7 @@ config KVM
	select PERF_EVENTS
	select HAVE_KVM_MSI
	select HAVE_KVM_CPU_RELAX_INTERCEPT
+	select HAVE_KVM_NO_POLL
	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
	select KVM_VFIO
	select SRCU
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 4992e7c99588..ead681210306 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -134,6 +134,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 	    (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
 		best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);

+	if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
+		best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+		if (best) {
+			if (vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT)
+				best->ecx |= F(MWAIT);
+			else
+				best->ecx &= ~F(MWAIT);
+		}
+	}
+
 	/* Update physical-address width */
 	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
 	kvm_mmu_reset_context(vcpu);
@@ -276,19 +286,38 @@ static void cpuid_mask(u32 *word, int wordnum)
 	*word &= boot_cpu_data.x86_capability[wordnum];
 }

-static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-			   u32 index)
+static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function,
+			  u32 index)
 {
 	entry->function = function;
 	entry->index = index;
+	entry->flags = 0;
+
 	cpuid_count(entry->function, entry->index,
 		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
-	entry->flags = 0;
+
+	switch (function) {
+	case 2:
+		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+		break;
+	case 4:
+	case 7:
+	case 0xb:
+	case 0xd:
+	case 0x14:
+	case 0x8000001d:
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		break;
+	}
 }

-static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
-				   u32 func, u32 index, int *nent, int maxnent)
+static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry,
+				    u32 func, int *nent, int maxnent)
 {
+	entry->function = func;
+	entry->index = 0;
+	entry->flags = 0;
+
 	switch (func) {
 	case 0:
 		entry->eax = 7;
@@ -300,21 +329,83 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
 		break;
 	case 7:
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		if (index == 0)
+		entry->eax = 0;
 		entry->ecx = F(RDPID);
 		++*nent;
 	default:
 		break;
 	}

-	entry->function = func;
-	entry->index = index;
-
 	return 0;
 }

-static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-				 u32 index, int *nent, int maxnent)
+static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
+{
+	unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
+	unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
+	unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
+	unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
+	unsigned f_la57;
+
+	/* cpuid 7.0.ebx */
+	const u32 kvm_cpuid_7_0_ebx_x86_features =
+		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+		F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
+		F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
+		F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
+		F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt;
+
+	/* cpuid 7.0.ecx*/
+	const u32 kvm_cpuid_7_0_ecx_x86_features =
+		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
+		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
+		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
+		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
+
+	/* cpuid 7.0.edx*/
+	const u32 kvm_cpuid_7_0_edx_x86_features =
+		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
+		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
+		F(MD_CLEAR);
+
+	switch (index) {
+	case 0:
+		entry->eax = 0;
+		entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
+		cpuid_mask(&entry->ebx, CPUID_7_0_EBX);
+		/* TSC_ADJUST is emulated */
+		entry->ebx |= F(TSC_ADJUST);
+
+		entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
+		f_la57 = entry->ecx & F(LA57);
+		cpuid_mask(&entry->ecx, CPUID_7_ECX);
+		/* Set LA57 based on hardware capability. */
+		entry->ecx |= f_la57;
+		entry->ecx |= f_umip;
+		/* PKU is not yet implemented for shadow paging. */
+		if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
+			entry->ecx &= ~F(PKU);
+
+		entry->edx &= kvm_cpuid_7_0_edx_x86_features;
+		cpuid_mask(&entry->edx, CPUID_7_EDX);
+		/*
+		 * We emulate ARCH_CAPABILITIES in software even
+		 * if the host doesn't support it.
+		 */
+		entry->edx |= F(ARCH_CAPABILITIES);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		entry->eax = 0;
+		entry->ebx = 0;
+		entry->ecx = 0;
+		entry->edx = 0;
+		break;
+	}
+}
+
+static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
+				  int *nent, int maxnent)
 {
 	int r;
 	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
@@ -327,12 +418,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	unsigned f_lm = 0;
 #endif
 	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
-	unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
-	unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
 	unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
-	unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
 	unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
-	unsigned f_la57 = 0;

 	/* cpuid 1.edx */
 	const u32 kvm_cpuid_1_edx_x86_features =
@@ -377,7 +464,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	/* cpuid 0x80000008.ebx */
 	const u32 kvm_cpuid_8000_0008_ebx_x86_features =
 		F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
-		F(AMD_SSB_NO) | F(AMD_STIBP);
+		F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON);

 	/* cpuid 0xC0000001.edx */
 	const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -385,31 +472,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
 		F(PMM) | F(PMM_EN);

-	/* cpuid 7.0.ebx */
-	const u32 kvm_cpuid_7_0_ebx_x86_features =
-		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
-		F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
-		F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
-		F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
-		F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt;
-
 	/* cpuid 0xD.1.eax */
 	const u32 kvm_cpuid_D_1_eax_x86_features =
 		F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;

-	/* cpuid 7.0.ecx*/
-	const u32 kvm_cpuid_7_0_ecx_x86_features =
-		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
-		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
-		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
-		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
-
-	/* cpuid 7.0.edx*/
-	const u32 kvm_cpuid_7_0_edx_x86_features =
-		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
-		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
-		F(MD_CLEAR);
-
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();

@@ -418,12 +484,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	if (*nent >= maxnent)
 		goto out;

-	do_cpuid_1_ent(entry, function, index);
+	do_host_cpuid(entry, function, 0);
 	++*nent;

 	switch (function) {
 	case 0:
-		entry->eax = min(entry->eax, (u32)(f_intel_pt ? 0x14 : 0xd));
+		/* Limited to the highest leaf implemented in KVM. */
+		entry->eax = min(entry->eax, 0x1fU);
 		break;
 	case 1:
 		entry->edx &= kvm_cpuid_1_edx_x86_features;
@@ -441,14 +508,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	case 2: {
 		int t, times = entry->eax & 0xff;

-		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
 		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
 		for (t = 1; t < times; ++t) {
 			if (*nent >= maxnent)
 				goto out;

-			do_cpuid_1_ent(&entry[t], function, 0);
-			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+			do_host_cpuid(&entry[t], function, 0);
 			++*nent;
 		}
 		break;
@@ -458,7 +523,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	case 0x8000001d: {
 		int i, cache_type;

-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		/* read more entries until cache_type is zero */
 		for (i = 1; ; ++i) {
 			if (*nent >= maxnent)
@@ -467,9 +531,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			cache_type = entry[i - 1].eax & 0x1f;
 			if (!cache_type)
 				break;
-			do_cpuid_1_ent(&entry[i], function, i);
-			entry[i].flags |=
-			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			do_host_cpuid(&entry[i], function, i);
 			++*nent;
 		}
 		break;
@@ -480,36 +542,21 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->ecx = 0;
 		entry->edx = 0;
 		break;
+	/* function 7 has additional index. */
 	case 7: {
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		/* Mask ebx against host capability word 9 */
-		if (index == 0) {
-			entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
-			cpuid_mask(&entry->ebx, CPUID_7_0_EBX);
-			// TSC_ADJUST is emulated
-			entry->ebx |= F(TSC_ADJUST);
-			entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
-			f_la57 = entry->ecx & F(LA57);
-			cpuid_mask(&entry->ecx, CPUID_7_ECX);
-			/* Set LA57 based on hardware capability. */
-			entry->ecx |= f_la57;
-			entry->ecx |= f_umip;
-			/* PKU is not yet implemented for shadow paging. */
-			if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
-				entry->ecx &= ~F(PKU);
-			entry->edx &= kvm_cpuid_7_0_edx_x86_features;
-			cpuid_mask(&entry->edx, CPUID_7_EDX);
-			/*
-			 * We emulate ARCH_CAPABILITIES in software even
-			 * if the host doesn't support it.
-			 */
-			entry->edx |= F(ARCH_CAPABILITIES);
-		} else {
-			entry->ebx = 0;
-			entry->ecx = 0;
-			entry->edx = 0;
+		int i;
+
+		for (i = 0; ; ) {
+			do_cpuid_7_mask(&entry[i], i);
+			if (i == entry->eax)
+				break;
+			if (*nent >= maxnent)
+				goto out;
+
+			++i;
+			do_host_cpuid(&entry[i], function, i);
+			++*nent;
 		}
-		entry->eax = 0;
 		break;
 	}
 	case 9:
@@ -543,11 +590,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->edx = edx.full;
 		break;
 	}
-	/* function 0xb has additional index. */
+	/*
+	 * Per Intel's SDM, the 0x1f is a superset of 0xb,
+	 * thus they can be handled by common code.
+	 */
+	case 0x1f:
 	case 0xb: {
 		int i, level_type;

-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		/* read more entries until level_type is zero */
 		for (i = 1; ; ++i) {
 			if (*nent >= maxnent)
@@ -556,9 +606,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			level_type = entry[i - 1].ecx & 0xff00;
 			if (!level_type)
 				break;
-			do_cpuid_1_ent(&entry[i], function, i);
-			entry[i].flags |=
-			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			do_host_cpuid(&entry[i], function, i);
 			++*nent;
 		}
 		break;
@@ -571,7 +619,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->ebx = xstate_required_size(supported, false);
 		entry->ecx = entry->ebx;
 		entry->edx &= supported >> 32;
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		if (!supported)
 			break;

@@ -580,7 +627,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			if (*nent >= maxnent)
 				goto out;

-			do_cpuid_1_ent(&entry[i], function, idx);
+			do_host_cpuid(&entry[i], function, idx);
 			if (idx == 1) {
 				entry[i].eax &= kvm_cpuid_D_1_eax_x86_features;
 				cpuid_mask(&entry[i].eax, CPUID_D_1_EAX);
@@ -597,8 +644,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			}
 			entry[i].ecx = 0;
 			entry[i].edx = 0;
-			entry[i].flags |=
-			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 			++*nent;
 			++i;
 		}
@@ -611,12 +656,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		if (!f_intel_pt)
 			break;

-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		for (t = 1; t <= times; ++t) {
 			if (*nent >= maxnent)
 				goto out;
-			do_cpuid_1_ent(&entry[t], function, t);
-			entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			do_host_cpuid(&entry[t], function, t);
 			++*nent;
 		}
 		break;
@@ -640,7 +683,9 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			     (1 << KVM_FEATURE_PV_UNHALT) |
 			     (1 << KVM_FEATURE_PV_TLB_FLUSH) |
 			     (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
-			     (1 << KVM_FEATURE_PV_SEND_IPI);
+			     (1 << KVM_FEATURE_PV_SEND_IPI) |
+			     (1 << KVM_FEATURE_POLL_CONTROL) |
+			     (1 << KVM_FEATURE_PV_SCHED_YIELD);

 		if (sched_info_on())
 			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
@@ -730,21 +775,19 @@ out:
 	return r;
 }

-static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func,
-			u32 idx, int *nent, int maxnent, unsigned int type)
+static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func,
+			 int *nent, int maxnent, unsigned int type)
 {
 	if (type == KVM_GET_EMULATED_CPUID)
-		return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent);
+		return __do_cpuid_func_emulated(entry, func, nent, maxnent);

-	return __do_cpuid_ent(entry, func, idx, nent, maxnent);
+	return __do_cpuid_func(entry, func, nent, maxnent);
 }

 #undef F

 struct kvm_cpuid_param {
 	u32 func;
-	u32 idx;
-	bool has_leaf_count;
 	bool (*qualifier)(const struct kvm_cpuid_param *param);
 };

@@ -788,11 +831,10 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
 	int limit, nent = 0, r = -E2BIG, i;
 	u32 func;
 	static const struct kvm_cpuid_param param[] = {
-		{ .func = 0, .has_leaf_count = true },
-		{ .func = 0x80000000, .has_leaf_count = true },
-		{ .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
+		{ .func = 0 },
+		{ .func = 0x80000000 },
+		{ .func = 0xC0000000, .qualifier = is_centaur_cpu },
 		{ .func = KVM_CPUID_SIGNATURE },
-		{ .func = KVM_CPUID_FEATURES },
 	};

 	if (cpuid->nent < 1)
@@ -816,19 +858,16 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
 		if (ent->qualifier && !ent->qualifier(ent))
 			continue;

-		r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
-				 &nent, cpuid->nent, type);
+		r = do_cpuid_func(&cpuid_entries[nent], ent->func,
+				  &nent, cpuid->nent, type);

 		if (r)
 			goto out_free;

-		if (!ent->has_leaf_count)
-			continue;
-
 		limit = cpuid_entries[nent - 1].eax;
 		for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
-			r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
-					 &nent, cpuid->nent, type);
+			r = do_cpuid_func(&cpuid_entries[nent], func,
+					  &nent, cpuid->nent, type);

 		if (r)
 			goto out_free;
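Editor's note: the reworked leaf-7 handling above walks subleaves 0..CPUID.7.0:EAX and masks each through do_cpuid_7_mask(). For comparison, a stand-alone userspace walk over the same subleaf range is sketched below; it is an illustration of the enumeration pattern, not KVM code, and relies on GCC's <cpuid.h> helper.

	/* Userspace illustration of the CPUID.7 subleaf enumeration that the
	 * new "for (i = 0; ; )" loop mirrors: subleaf 0's EAX bounds the walk.
	 */
	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx, max_subleaf, i;

		__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
		max_subleaf = eax;	/* CPUID.7.0:EAX = highest valid subleaf */

		for (i = 0; i <= max_subleaf; i++) {
			__get_cpuid_count(7, i, &eax, &ebx, &ecx, &edx);
			printf("CPUID.7.%u: ebx=%08x ecx=%08x edx=%08x\n",
			       i, ebx, ecx, edx);
		}
		return 0;
	}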
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4a387a235424..8e409ad448f9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4258,7 +4258,7 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
 		ulong dr6;

 		ctxt->ops->get_dr(ctxt, 6, &dr6);
-		dr6 &= ~15;
+		dr6 &= ~DR_TRAP_BITS;
 		dr6 |= DR6_BD | DR6_RTM;
 		ctxt->ops->set_dr(ctxt, 6, dr6);
 		return emulate_db(ctxt);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index d6519a3aa959..7c6233d37c64 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -102,7 +102,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
 	return mode != KVM_IRQCHIP_NONE;
 }

-bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 924b3bd5a7b7..8ecd48d31800 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -75,7 +75,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 			if (r < 0)
 				r = 0;
 			r += kvm_apic_set_irq(vcpu, irq, dest_map);
-		} else if (kvm_lapic_enabled(vcpu)) {
+		} else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
 			if (!kvm_vector_hashing_enabled()) {
 				if (!lowest)
 					lowest = vcpu;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4dabc318adb8..a232e76d8f23 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -69,6 +69,7 @@
 #define X2APIC_BROADCAST		0xFFFFFFFFul

 #define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
+#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
 /* step-by-step approximation to mitigate fluctuation */
 #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8

@@ -85,11 +86,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
 		apic_test_vector(vector, apic->regs + APIC_IRR);
 }

-static inline void apic_clear_vector(int vec, void *bitmap)
-{
-	clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
 static inline int __apic_test_and_set_vector(int vec, void *bitmap)
 {
 	return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -443,12 +439,12 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)

 	if (unlikely(vcpu->arch.apicv_active)) {
 		/* need to update RVI */
-		apic_clear_vector(vec, apic->regs + APIC_IRR);
+		kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
 		kvm_x86_ops->hwapic_irr_update(vcpu,
 				apic_find_highest_irr(apic));
 	} else {
 		apic->irr_pending = false;
-		apic_clear_vector(vec, apic->regs + APIC_IRR);
+		kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
 		if (apic_search_irr(apic) != -1)
 			apic->irr_pending = true;
 	}
@@ -1053,9 +1049,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,

 		if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
 			if (trig_mode)
-				kvm_lapic_set_vector(vector, apic->regs + APIC_TMR);
+				kvm_lapic_set_vector(vector,
+						     apic->regs + APIC_TMR);
 			else
-				apic_clear_vector(vector, apic->regs + APIC_TMR);
+				kvm_lapic_clear_vector(vector,
+						       apic->regs + APIC_TMR);
 		}

 		if (vcpu->arch.apicv_active)
@@ -1313,21 +1311,45 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
 	return container_of(dev, struct kvm_lapic, dev);
 }

+#define APIC_REG_MASK(reg)	(1ull << ((reg) >> 4))
+#define APIC_REGS_MASK(first, count) \
+	(APIC_REG_MASK(first) * ((1ull << (count)) - 1))
+
 int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
 		       void *data)
 {
 	unsigned char alignment = offset & 0xf;
 	u32 result;
 	/* this bitmask has a bit cleared for each reserved register */
-	static const u64 rmask = 0x43ff01ffffffe70cULL;
-
-	if ((alignment + len) > 4) {
-		apic_debug("KVM_APIC_READ: alignment error %x %d\n",
-			   offset, len);
-		return 1;
-	}
+	u64 valid_reg_mask =
+		APIC_REG_MASK(APIC_ID) |
+		APIC_REG_MASK(APIC_LVR) |
+		APIC_REG_MASK(APIC_TASKPRI) |
+		APIC_REG_MASK(APIC_PROCPRI) |
+		APIC_REG_MASK(APIC_LDR) |
+		APIC_REG_MASK(APIC_DFR) |
+		APIC_REG_MASK(APIC_SPIV) |
+		APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) |
+		APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) |
+		APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) |
+		APIC_REG_MASK(APIC_ESR) |
+		APIC_REG_MASK(APIC_ICR) |
+		APIC_REG_MASK(APIC_ICR2) |
+		APIC_REG_MASK(APIC_LVTT) |
+		APIC_REG_MASK(APIC_LVTTHMR) |
+		APIC_REG_MASK(APIC_LVTPC) |
+		APIC_REG_MASK(APIC_LVT0) |
+		APIC_REG_MASK(APIC_LVT1) |
+		APIC_REG_MASK(APIC_LVTERR) |
+		APIC_REG_MASK(APIC_TMICT) |
+		APIC_REG_MASK(APIC_TMCCT) |
+		APIC_REG_MASK(APIC_TDCR);
+
+	/* ARBPRI is not valid on x2APIC */
+	if (!apic_x2apic_mode(apic))
+		valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);

-	if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) {
+	if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) {
 		apic_debug("KVM_APIC_READ: read reserved register %x\n",
 			   offset);
 		return 1;
@@ -1499,11 +1521,40 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
 	}
 }

-void wait_lapic_expire(struct kvm_vcpu *vcpu)
+static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
+					      s64 advance_expire_delta)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
-	u64 guest_tsc, tsc_deadline, ns;
+	u64 ns;
+
+	/* too early */
+	if (advance_expire_delta < 0) {
+		ns = -advance_expire_delta * 1000000ULL;
+		do_div(ns, vcpu->arch.virtual_tsc_khz);
+		timer_advance_ns -= min((u32)ns,
+			timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+	} else {
+		/* too late */
+		ns = advance_expire_delta * 1000000ULL;
+		do_div(ns, vcpu->arch.virtual_tsc_khz);
+		timer_advance_ns += min((u32)ns,
+			timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+	}
+
+	if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
+		apic->lapic_timer.timer_advance_adjust_done = true;
+	if (unlikely(timer_advance_ns > 5000)) {
+		timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
+		apic->lapic_timer.timer_advance_adjust_done = false;
+	}
+	apic->lapic_timer.timer_advance_ns = timer_advance_ns;
+}
+
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u64 guest_tsc, tsc_deadline;

 	if (apic->lapic_timer.expired_tscdeadline == 0)
 		return;
@@ -1514,34 +1565,15 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
 	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
 	apic->lapic_timer.expired_tscdeadline = 0;
 	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
-	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+	apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline;

 	if (guest_tsc < tsc_deadline)
 		__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);

-	if (!apic->lapic_timer.timer_advance_adjust_done) {
-		/* too early */
-		if (guest_tsc < tsc_deadline) {
-			ns = (tsc_deadline - guest_tsc) * 1000000ULL;
-			do_div(ns, vcpu->arch.virtual_tsc_khz);
-			timer_advance_ns -= min((u32)ns,
-				timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
-		} else {
-			/* too late */
-			ns = (guest_tsc - tsc_deadline) * 1000000ULL;
-			do_div(ns, vcpu->arch.virtual_tsc_khz);
-			timer_advance_ns += min((u32)ns,
-				timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
-		}
-		if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
-			apic->lapic_timer.timer_advance_adjust_done = true;
-		if (unlikely(timer_advance_ns > 5000)) {
-			timer_advance_ns = 0;
-			apic->lapic_timer.timer_advance_adjust_done = true;
-		}
-		apic->lapic_timer.timer_advance_ns = timer_advance_ns;
-	}
+	if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
+		adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
 }
+EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);

 static void start_sw_tscdeadline(struct kvm_lapic *apic)
 {
@@ -2014,7 +2046,7 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
 	apic_debug("%s: offset 0x%x with length 0x%x, and value is "
 		   "0x%x\n", __func__, offset, len, val);

-	kvm_lapic_reg_write(apic, offset & 0xff0, val);
+	kvm_lapic_reg_write(apic, offset, val);

 	return 0;
 }
@@ -2311,7 +2343,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
 		     HRTIMER_MODE_ABS_PINNED);
 	apic->lapic_timer.timer.function = apic_timer_fn;
 	if (timer_advance_ns == -1) {
-		apic->lapic_timer.timer_advance_ns = 1000;
+		apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
 		apic->lapic_timer.timer_advance_adjust_done = false;
 	} else {
 		apic->lapic_timer.timer_advance_ns = timer_advance_ns;
@@ -2321,7 +2353,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)

 	/*
 	 * APIC is created enabled. This will prevent kvm_lapic_set_base from
-	 * thinking that APIC satet has changed.
+	 * thinking that APIC state has changed.
 	 */
 	vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
 	static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
@@ -2330,6 +2362,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
 	return 0;
 nomem_free_apic:
 	kfree(apic);
+	vcpu->arch.apic = NULL;
 nomem:
 	return -ENOMEM;
 }
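Editor's note: the APIC_REG_MASK/APIC_REGS_MASK macros introduced in lapic.c map a 16-byte-aligned register offset to a single bit of the validity mask by shifting the offset right by 4. A small worked illustration of that arithmetic follows; the offsets used (TPR at 0x80, the ISR block at 0x100 with 8 registers) come from the standard xAPIC layout and are given here only as an example.

	/* Illustration of the mask arithmetic used by kvm_lapic_reg_read(). */
	#define APIC_REG_MASK(reg)	(1ull << ((reg) >> 4))
	#define APIC_REGS_MASK(first, count) \
		(APIC_REG_MASK(first) * ((1ull << (count)) - 1))

	/* TPR lives at offset 0x80, so its bit is 0x80 >> 4 == 8. */
	_Static_assert(APIC_REG_MASK(0x80) == (1ull << 8), "TPR maps to bit 8");

	/* Eight consecutive 16-byte ISR registers starting at 0x100 occupy
	 * bits 16..23: (1ull << 16) * ((1ull << 8) - 1) == 0x00ff0000. */
	_Static_assert(APIC_REGS_MASK(0x100, 8) == 0x00ff0000ull, "ISR block");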
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index d6d049ba3045..36747174e4a8 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -32,6 +32,7 @@ struct kvm_timer {
 	u64 tscdeadline;
 	u64 expired_tscdeadline;
 	u32 timer_advance_ns;
+	s64 advance_expire_delta;
 	atomic_t pending;			/* accumulated triggered timers */
 	bool hv_timer_in_use;
 	bool timer_advance_adjust_done;
@@ -129,6 +130,11 @@ void kvm_lapic_exit(void);
 #define VEC_POS(v) ((v) & (32 - 1))
 #define REG_POS(v) (((v) >> 5) << 4)

+static inline void kvm_lapic_clear_vector(int vec, void *bitmap)
+{
+	clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
 static inline void kvm_lapic_set_vector(int vec, void *bitmap)
 {
 	set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -219,7 +225,7 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)

 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);

-void wait_lapic_expire(struct kvm_vcpu *vcpu);
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);

 bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
 			struct kvm_vcpu **dest_vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4a9c63d1c20a..9a5814d8d194 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -140,9 +140,6 @@ module_param(dbg, bool, 0644);

 #include <trace/events/kvm.h>

-#define CREATE_TRACE_POINTS
-#include "mmutrace.h"
-
 #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 #define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

@@ -259,11 +256,20 @@ static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
  */
 static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;

+/*
+ * The number of non-reserved physical address bits irrespective of features
+ * that repurpose legal bits, e.g. MKTME.
+ */
+static u8 __read_mostly shadow_phys_bits;

 static void mmu_spte_set(u64 *sptep, u64 spte);
+static bool is_executable_pte(u64 spte);
 static union kvm_mmu_page_role
 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);

+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+

 static inline bool kvm_available_flush_tlb_with_range(void)
 {
@@ -468,6 +474,21 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

+static u8 kvm_get_shadow_phys_bits(void)
+{
+	/*
+	 * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
+	 * in CPU detection code, but MKTME treats those reduced bits as
+	 * 'keyID' thus they are not reserved bits. Therefore for MKTME
+	 * we should still return physical address bits reported by CPUID.
+	 */
+	if (!boot_cpu_has(X86_FEATURE_TME) ||
+	    WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
+		return boot_cpu_data.x86_phys_bits;
+
+	return cpuid_eax(0x80000008) & 0xff;
+}
+
 static void kvm_mmu_reset_all_pte_masks(void)
 {
 	u8 low_phys_bits;
@@ -481,6 +502,8 @@ static void kvm_mmu_reset_all_pte_masks(void)
 	shadow_present_mask = 0;
 	shadow_acc_track_mask = 0;

+	shadow_phys_bits = kvm_get_shadow_phys_bits();
+
 	/*
 	 * If the CPU has 46 or less physical address bits, then set an
 	 * appropriate mask to guard against L1TF attacks. Otherwise, it is
@@ -1073,10 +1096,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)

 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
 {
-	if (sp->role.direct)
-		BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
-	else
+	if (!sp->role.direct) {
 		sp->gfns[index] = gfn;
+		return;
+	}
+
+	if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
+		pr_err_ratelimited("gfn mismatch under direct page %llx "
+				   "(expected %llx, got %llx)\n",
+				   sp->gfn,
+				   kvm_mmu_page_get_gfn(sp, index), gfn);
 }

 /*
@@ -3055,10 +3084,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
 		ret = RET_PF_EMULATE;

 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
-	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
-		 is_large_pte(*sptep)? "2MB" : "4kB",
-		 *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
-		 *sptep, sptep);
+	trace_kvm_mmu_set_spte(level, gfn, sptep);
 	if (!was_rmapped && is_large_pte(*sptep))
 		++vcpu->kvm->stat.lpages;

@@ -3070,8 +3096,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
 		}
 	}

-	kvm_release_pfn_clean(pfn);
-
 	return ret;
 }
3077 3101
@@ -3106,9 +3130,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
3106 if (ret <= 0) 3130 if (ret <= 0)
3107 return -1; 3131 return -1;
3108 3132
3109 for (i = 0; i < ret; i++, gfn++, start++) 3133 for (i = 0; i < ret; i++, gfn++, start++) {
3110 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, 3134 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
3111 page_to_pfn(pages[i]), true, true); 3135 page_to_pfn(pages[i]), true, true);
3136 put_page(pages[i]);
3137 }
3112 3138
3113 return 0; 3139 return 0;
3114} 3140}
@@ -3156,40 +3182,40 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3156 __direct_pte_prefetch(vcpu, sp, sptep); 3182 __direct_pte_prefetch(vcpu, sp, sptep);
3157} 3183}
3158 3184
3159static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, 3185static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
3160 int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) 3186 int map_writable, int level, kvm_pfn_t pfn,
3187 bool prefault)
3161{ 3188{
3162 struct kvm_shadow_walk_iterator iterator; 3189 struct kvm_shadow_walk_iterator it;
3163 struct kvm_mmu_page *sp; 3190 struct kvm_mmu_page *sp;
3164 int emulate = 0; 3191 int ret;
3165 gfn_t pseudo_gfn; 3192 gfn_t gfn = gpa >> PAGE_SHIFT;
3193 gfn_t base_gfn = gfn;
3166 3194
3167 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) 3195 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3168 return 0; 3196 return RET_PF_RETRY;
3169 3197
3170 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 3198 trace_kvm_mmu_spte_requested(gpa, level, pfn);
3171 if (iterator.level == level) { 3199 for_each_shadow_entry(vcpu, gpa, it) {
3172 emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, 3200 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
3173 write, level, gfn, pfn, prefault, 3201 if (it.level == level)
3174 map_writable);
3175 direct_pte_prefetch(vcpu, iterator.sptep);
3176 ++vcpu->stat.pf_fixed;
3177 break; 3202 break;
3178 }
3179 3203
3180 drop_large_spte(vcpu, iterator.sptep); 3204 drop_large_spte(vcpu, it.sptep);
3181 if (!is_shadow_present_pte(*iterator.sptep)) { 3205 if (!is_shadow_present_pte(*it.sptep)) {
3182 u64 base_addr = iterator.addr; 3206 sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
3207 it.level - 1, true, ACC_ALL);
3183 3208
3184 base_addr &= PT64_LVL_ADDR_MASK(iterator.level); 3209 link_shadow_page(vcpu, it.sptep, sp);
3185 pseudo_gfn = base_addr >> PAGE_SHIFT;
3186 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
3187 iterator.level - 1, 1, ACC_ALL);
3188
3189 link_shadow_page(vcpu, iterator.sptep, sp);
3190 } 3210 }
3191 } 3211 }
3192 return emulate; 3212
3213 ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
3214 write, level, base_gfn, pfn, prefault,
3215 map_writable);
3216 direct_pte_prefetch(vcpu, it.sptep);
3217 ++vcpu->stat.pf_fixed;
3218 return ret;
3193} 3219}
3194 3220
3195static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) 3221static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
@@ -3216,11 +3242,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
 }

 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
-					gfn_t *gfnp, kvm_pfn_t *pfnp,
+					gfn_t gfn, kvm_pfn_t *pfnp,
 					int *levelp)
 {
 	kvm_pfn_t pfn = *pfnp;
-	gfn_t gfn = *gfnp;
 	int level = *levelp;

 	/*
@@ -3247,8 +3272,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 		mask = KVM_PAGES_PER_HPAGE(level) - 1;
 		VM_BUG_ON((gfn & mask) != (pfn & mask));
 		if (pfn & mask) {
-			gfn &= ~mask;
-			*gfnp = gfn;
 			kvm_release_pfn_clean(pfn);
 			pfn &= ~mask;
 			kvm_get_pfn(pfn);
@@ -3505,22 +3528,19 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
 		return r;

+	r = RET_PF_RETRY;
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 	if (make_mmu_pages_available(vcpu) < 0)
 		goto out_unlock;
 	if (likely(!force_pt_level))
-		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
-	r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
-	spin_unlock(&vcpu->kvm->mmu_lock);
-
-	return r;
-
+		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+	r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
-	return RET_PF_RETRY;
+	return r;
 }

 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
@@ -4015,19 +4035,6 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 	return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }

-bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
-{
-	if (unlikely(!lapic_in_kernel(vcpu) ||
-		     kvm_event_needs_reinjection(vcpu) ||
-		     vcpu->arch.exception.pending))
-		return false;
-
-	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
-		return false;
-
-	return kvm_x86_ops->interrupt_allowed(vcpu);
-}
-
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
 {
@@ -4147,22 +4154,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
4147 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) 4154 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
4148 return r; 4155 return r;
4149 4156
4157 r = RET_PF_RETRY;
4150 spin_lock(&vcpu->kvm->mmu_lock); 4158 spin_lock(&vcpu->kvm->mmu_lock);
4151 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 4159 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
4152 goto out_unlock; 4160 goto out_unlock;
4153 if (make_mmu_pages_available(vcpu) < 0) 4161 if (make_mmu_pages_available(vcpu) < 0)
4154 goto out_unlock; 4162 goto out_unlock;
4155 if (likely(!force_pt_level)) 4163 if (likely(!force_pt_level))
4156 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 4164 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
4157 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); 4165 r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
4158 spin_unlock(&vcpu->kvm->mmu_lock);
4159
4160 return r;
4161
4162out_unlock: 4166out_unlock:
4163 spin_unlock(&vcpu->kvm->mmu_lock); 4167 spin_unlock(&vcpu->kvm->mmu_lock);
4164 kvm_release_pfn_clean(pfn); 4168 kvm_release_pfn_clean(pfn);
4165 return RET_PF_RETRY; 4169 return r;
4166} 4170}
4167 4171
4168static void nonpaging_init_context(struct kvm_vcpu *vcpu, 4172static void nonpaging_init_context(struct kvm_vcpu *vcpu,
@@ -4494,7 +4498,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4494 */ 4498 */
4495 shadow_zero_check = &context->shadow_zero_check; 4499 shadow_zero_check = &context->shadow_zero_check;
4496 __reset_rsvds_bits_mask(vcpu, shadow_zero_check, 4500 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4497 boot_cpu_data.x86_phys_bits, 4501 shadow_phys_bits,
4498 context->shadow_root_level, uses_nx, 4502 context->shadow_root_level, uses_nx,
4499 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), 4503 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4500 is_pse(vcpu), true); 4504 is_pse(vcpu), true);
@@ -4531,13 +4535,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4531 4535
4532 if (boot_cpu_is_amd()) 4536 if (boot_cpu_is_amd())
4533 __reset_rsvds_bits_mask(vcpu, shadow_zero_check, 4537 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4534 boot_cpu_data.x86_phys_bits, 4538 shadow_phys_bits,
4535 context->shadow_root_level, false, 4539 context->shadow_root_level, false,
4536 boot_cpu_has(X86_FEATURE_GBPAGES), 4540 boot_cpu_has(X86_FEATURE_GBPAGES),
4537 true, true); 4541 true, true);
4538 else 4542 else
4539 __reset_rsvds_bits_mask_ept(shadow_zero_check, 4543 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4540 boot_cpu_data.x86_phys_bits, 4544 shadow_phys_bits,
4541 false); 4545 false);
4542 4546
4543 if (!shadow_me_mask) 4547 if (!shadow_me_mask)
@@ -4558,7 +4562,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4558 struct kvm_mmu *context, bool execonly) 4562 struct kvm_mmu *context, bool execonly)
4559{ 4563{
4560 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, 4564 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4561 boot_cpu_data.x86_phys_bits, execonly); 4565 shadow_phys_bits, execonly);
4562} 4566}
4563 4567
4564#define BYTE_MASK(access) \ 4568#define BYTE_MASK(access) \
@@ -5935,7 +5939,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
5935 int nr_to_scan = sc->nr_to_scan; 5939 int nr_to_scan = sc->nr_to_scan;
5936 unsigned long freed = 0; 5940 unsigned long freed = 0;
5937 5941
5938 spin_lock(&kvm_lock); 5942 mutex_lock(&kvm_lock);
5939 5943
5940 list_for_each_entry(kvm, &vm_list, vm_list) { 5944 list_for_each_entry(kvm, &vm_list, vm_list) {
5941 int idx; 5945 int idx;
@@ -5977,7 +5981,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
5977 break; 5981 break;
5978 } 5982 }
5979 5983
5980 spin_unlock(&kvm_lock); 5984 mutex_unlock(&kvm_lock);
5981 return freed; 5985 return freed;
5982} 5986}
5983 5987
@@ -5999,6 +6003,34 @@ static void mmu_destroy_caches(void)
5999 kmem_cache_destroy(mmu_page_header_cache); 6003 kmem_cache_destroy(mmu_page_header_cache);
6000} 6004}
6001 6005
6006static void kvm_set_mmio_spte_mask(void)
6007{
6008 u64 mask;
6009
6010 /*
6011	 * Set the reserved bits and the present bit of a paging-structure
6012	 * entry to generate a page fault with PFER.RSV = 1.
6013 */
6014
6015 /*
6016 * Mask the uppermost physical address bit, which would be reserved as
6017 * long as the supported physical address width is less than 52.
6018 */
6019 mask = 1ull << 51;
6020
6021 /* Set the present bit. */
6022 mask |= 1ull;
6023
6024 /*
6025	 * If the reserved bit is not supported, clear the present bit to disable
6026	 * MMIO page faults.
6027 */
6028 if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52)
6029 mask &= ~1ull;
6030
6031 kvm_mmu_set_mmio_spte_mask(mask, mask);
6032}
6033
6002int kvm_mmu_module_init(void) 6034int kvm_mmu_module_init(void)
6003{ 6035{
6004 int ret = -ENOMEM; 6036 int ret = -ENOMEM;
@@ -6015,6 +6047,8 @@ int kvm_mmu_module_init(void)
6015 6047
6016 kvm_mmu_reset_all_pte_masks(); 6048 kvm_mmu_reset_all_pte_masks();
6017 6049
6050 kvm_set_mmio_spte_mask();
6051
6018 pte_list_desc_cache = kmem_cache_create("pte_list_desc", 6052 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6019 sizeof(struct pte_list_desc), 6053 sizeof(struct pte_list_desc),
6020 0, SLAB_ACCOUNT, NULL); 6054 0, SLAB_ACCOUNT, NULL);
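A minimal user-space sketch of the MMIO SPTE mask computation that the new kvm_set_mmio_spte_mask() above performs: set bit 51 (reserved as long as the supported physical address width is below 52) plus the present bit, and drop the present bit when shadow_phys_bits reaches 52 and no reserved bit is left. shadow_phys_bits is passed in as a plain parameter and the IS_ENABLED(CONFIG_X86_64) check is omitted, so this only illustrates the arithmetic, it is not the kernel code.

#include <stdint.h>
#include <stdio.h>

static uint64_t mmio_spte_mask(unsigned int shadow_phys_bits)
{
	uint64_t mask = 1ull << 51;	/* uppermost physical address bit */

	mask |= 1ull;			/* present bit */

	if (shadow_phys_bits == 52)	/* no reserved bit available */
		mask &= ~1ull;		/* disable MMIO caching instead */

	return mask;
}

int main(void)
{
	printf("mask(46) = %#llx\n", (unsigned long long)mmio_spte_mask(46));
	printf("mask(52) = %#llx\n", (unsigned long long)mmio_spte_mask(52));
	return 0;
}

With 46-bit physical addressing this prints 0x8000000000001 (reserved bit plus present bit); at 52 bits the present bit is cleared, matching the "disable MMIO page faults" case the comment above describes.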
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index dd30dccd2ad5..d8001b4bca05 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -301,6 +301,65 @@ TRACE_EVENT(
301 __entry->kvm_gen == __entry->spte_gen 301 __entry->kvm_gen == __entry->spte_gen
302 ) 302 )
303); 303);
304
305TRACE_EVENT(
306 kvm_mmu_set_spte,
307 TP_PROTO(int level, gfn_t gfn, u64 *sptep),
308 TP_ARGS(level, gfn, sptep),
309
310 TP_STRUCT__entry(
311 __field(u64, gfn)
312 __field(u64, spte)
313 __field(u64, sptep)
314 __field(u8, level)
315 /* These depend on page entry type, so compute them now. */
316 __field(bool, r)
317 __field(bool, x)
318 __field(u8, u)
319 ),
320
321 TP_fast_assign(
322 __entry->gfn = gfn;
323 __entry->spte = *sptep;
324 __entry->sptep = virt_to_phys(sptep);
325 __entry->level = level;
326 __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
327 __entry->x = is_executable_pte(__entry->spte);
328 __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
329 ),
330
331 TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
332 __entry->gfn, __entry->spte,
333 __entry->r ? "r" : "-",
334 __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
335 __entry->x ? "x" : "-",
336 __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
337 __entry->level, __entry->sptep
338 )
339);
340
341TRACE_EVENT(
342 kvm_mmu_spte_requested,
343 TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn),
344 TP_ARGS(addr, level, pfn),
345
346 TP_STRUCT__entry(
347 __field(u64, gfn)
348 __field(u64, pfn)
349 __field(u8, level)
350 ),
351
352 TP_fast_assign(
353 __entry->gfn = addr >> PAGE_SHIFT;
354 __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
355 __entry->level = level;
356 ),
357
358 TP_printk("gfn %llx pfn %llx level %d",
359 __entry->gfn, __entry->pfn, __entry->level
360 )
361);
362
304#endif /* _TRACE_KVMMMU_H */ 363#endif /* _TRACE_KVMMMU_H */
305 364
306#undef TRACE_INCLUDE_PATH 365#undef TRACE_INCLUDE_PATH
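The kvm_mmu_spte_requested tracepoint above reports the pfn that actually backs the faulting gfn: for a huge mapping, the low bits of the gfn select the 4KiB page inside the large frame and are OR'ed into the base pfn. A stand-alone sketch of that arithmetic; PAGE_SHIFT and KVM_PAGES_PER_HPAGE() are re-declared here as assumptions rather than taken from the kernel headers.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT		12
#define KVM_HPAGE_GFN_SHIFT(l)	(((l) - 1) * 9)		/* 4K/2M/1G levels */
#define KVM_PAGES_PER_HPAGE(l)	(1ULL << KVM_HPAGE_GFN_SHIFT(l))

static uint64_t requested_pfn(uint64_t addr, int level, uint64_t base_pfn)
{
	uint64_t gfn = addr >> PAGE_SHIFT;

	/* Fold the in-hugepage offset into the base pfn, as TP_fast_assign does. */
	return base_pfn | (gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
}

int main(void)
{
	/* Level 2 (2MiB) mapping: gfn 0x1234 is 0x34 small pages into the frame. */
	printf("pfn = %#llx\n",
	       (unsigned long long)requested_pfn(0x1234000ULL, 2, 0x80000ULL));
	return 0;
}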
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d583bcd119fc..7d5cdb3af594 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -540,6 +540,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
540 mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, 540 mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
541 true, true); 541 true, true);
542 542
543 kvm_release_pfn_clean(pfn);
543 return true; 544 return true;
544} 545}
545 546
@@ -619,6 +620,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
619 struct kvm_shadow_walk_iterator it; 620 struct kvm_shadow_walk_iterator it;
620 unsigned direct_access, access = gw->pt_access; 621 unsigned direct_access, access = gw->pt_access;
621 int top_level, ret; 622 int top_level, ret;
623 gfn_t base_gfn;
622 624
623 direct_access = gw->pte_access; 625 direct_access = gw->pte_access;
624 626
@@ -663,35 +665,34 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
663 link_shadow_page(vcpu, it.sptep, sp); 665 link_shadow_page(vcpu, it.sptep, sp);
664 } 666 }
665 667
666 for (; 668 base_gfn = gw->gfn;
667 shadow_walk_okay(&it) && it.level > hlevel; 669
668 shadow_walk_next(&it)) { 670 trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
669 gfn_t direct_gfn;
670 671
672 for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
671 clear_sp_write_flooding_count(it.sptep); 673 clear_sp_write_flooding_count(it.sptep);
674 base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
675 if (it.level == hlevel)
676 break;
677
672 validate_direct_spte(vcpu, it.sptep, direct_access); 678 validate_direct_spte(vcpu, it.sptep, direct_access);
673 679
674 drop_large_spte(vcpu, it.sptep); 680 drop_large_spte(vcpu, it.sptep);
675 681
676 if (is_shadow_present_pte(*it.sptep)) 682 if (!is_shadow_present_pte(*it.sptep)) {
677 continue; 683 sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
678 684 it.level - 1, true, direct_access);
679 direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); 685 link_shadow_page(vcpu, it.sptep, sp);
680 686 }
681 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
682 true, direct_access);
683 link_shadow_page(vcpu, it.sptep, sp);
684 } 687 }
685 688
686 clear_sp_write_flooding_count(it.sptep);
687 ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, 689 ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
688 it.level, gw->gfn, pfn, prefault, map_writable); 690 it.level, base_gfn, pfn, prefault, map_writable);
689 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 691 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
690 692 ++vcpu->stat.pf_fixed;
691 return ret; 693 return ret;
692 694
693out_gpte_changed: 695out_gpte_changed:
694 kvm_release_pfn_clean(pfn);
695 return RET_PF_RETRY; 696 return RET_PF_RETRY;
696} 697}
697 698
@@ -839,6 +840,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
839 walker.pte_access &= ~ACC_EXEC_MASK; 840 walker.pte_access &= ~ACC_EXEC_MASK;
840 } 841 }
841 842
843 r = RET_PF_RETRY;
842 spin_lock(&vcpu->kvm->mmu_lock); 844 spin_lock(&vcpu->kvm->mmu_lock);
843 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 845 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
844 goto out_unlock; 846 goto out_unlock;
@@ -847,19 +849,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
847 if (make_mmu_pages_available(vcpu) < 0) 849 if (make_mmu_pages_available(vcpu) < 0)
848 goto out_unlock; 850 goto out_unlock;
849 if (!force_pt_level) 851 if (!force_pt_level)
850 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); 852 transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
851 r = FNAME(fetch)(vcpu, addr, &walker, write_fault, 853 r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
852 level, pfn, map_writable, prefault); 854 level, pfn, map_writable, prefault);
853 ++vcpu->stat.pf_fixed;
854 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); 855 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
855 spin_unlock(&vcpu->kvm->mmu_lock);
856
857 return r;
858 856
859out_unlock: 857out_unlock:
860 spin_unlock(&vcpu->kvm->mmu_lock); 858 spin_unlock(&vcpu->kvm->mmu_lock);
861 kvm_release_pfn_clean(pfn); 859 kvm_release_pfn_clean(pfn);
862 return RET_PF_RETRY; 860 return r;
863} 861}
864 862
865static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) 863static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index ab73a9a639ae..aa5a2597305a 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -19,6 +19,9 @@
19#include "lapic.h" 19#include "lapic.h"
20#include "pmu.h" 20#include "pmu.h"
21 21
22/* This keeps the total size of the filter under 4k. */
23#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 63
24
22/* NOTE: 25/* NOTE:
23 * - Each perf counter is defined as "struct kvm_pmc"; 26 * - Each perf counter is defined as "struct kvm_pmc";
24 * - There are two types of perf counters: general purpose (gp) and fixed. 27 * - There are two types of perf counters: general purpose (gp) and fixed.
@@ -141,6 +144,10 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
141{ 144{
142 unsigned config, type = PERF_TYPE_RAW; 145 unsigned config, type = PERF_TYPE_RAW;
143 u8 event_select, unit_mask; 146 u8 event_select, unit_mask;
147 struct kvm *kvm = pmc->vcpu->kvm;
148 struct kvm_pmu_event_filter *filter;
149 int i;
150 bool allow_event = true;
144 151
145 if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) 152 if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
146 printk_once("kvm pmu: pin control bit is ignored\n"); 153 printk_once("kvm pmu: pin control bit is ignored\n");
@@ -152,6 +159,22 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
152 if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) 159 if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
153 return; 160 return;
154 161
162 filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
163 if (filter) {
164 for (i = 0; i < filter->nevents; i++)
165 if (filter->events[i] ==
166 (eventsel & AMD64_RAW_EVENT_MASK_NB))
167 break;
168 if (filter->action == KVM_PMU_EVENT_ALLOW &&
169 i == filter->nevents)
170 allow_event = false;
171 if (filter->action == KVM_PMU_EVENT_DENY &&
172 i < filter->nevents)
173 allow_event = false;
174 }
175 if (!allow_event)
176 return;
177
155 event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; 178 event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
156 unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; 179 unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
157 180
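The filter check added to reprogram_gp_counter() above scans the event list for the masked event select and then applies the action: an ALLOW filter requires a hit, a DENY filter requires a miss, and no filter means everything is allowed. A stand-alone restatement of that decision, assuming the usual KVM_PMU_EVENT_ALLOW=0/KVM_PMU_EVENT_DENY=1 values and a simplified stand-in layout rather than the uapi struct kvm_pmu_event_filter; the AMD64_RAW_EVENT_MASK_NB masking of eventsel is assumed to have been done by the caller.

#include <stdbool.h>
#include <stdint.h>

enum { KVM_PMU_EVENT_ALLOW = 0, KVM_PMU_EVENT_DENY = 1 };	/* assumed values */

struct pmu_event_filter {
	uint32_t action;	/* ALLOW or DENY */
	uint32_t nevents;	/* number of entries in events[] */
	uint64_t events[];	/* masked event selects */
};

bool pmu_event_is_allowed(const struct pmu_event_filter *filter,
			  uint64_t masked_eventsel)
{
	uint32_t i;

	if (!filter)
		return true;	/* no filter installed */

	for (i = 0; i < filter->nevents; i++)
		if (filter->events[i] == masked_eventsel)
			break;

	if (filter->action == KVM_PMU_EVENT_ALLOW)
		return i < filter->nevents;	/* must be listed */

	return i == filter->nevents;		/* DENY: must not be listed */
}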
@@ -348,3 +371,43 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
348{ 371{
349 kvm_pmu_reset(vcpu); 372 kvm_pmu_reset(vcpu);
350} 373}
374
375int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
376{
377 struct kvm_pmu_event_filter tmp, *filter;
378 size_t size;
379 int r;
380
381 if (copy_from_user(&tmp, argp, sizeof(tmp)))
382 return -EFAULT;
383
384 if (tmp.action != KVM_PMU_EVENT_ALLOW &&
385 tmp.action != KVM_PMU_EVENT_DENY)
386 return -EINVAL;
387
388 if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
389 return -E2BIG;
390
391 size = struct_size(filter, events, tmp.nevents);
392 filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
393 if (!filter)
394 return -ENOMEM;
395
396 r = -EFAULT;
397 if (copy_from_user(filter, argp, size))
398 goto cleanup;
399
400 /* Ensure nevents can't be changed between the user copies. */
401 *filter = tmp;
402
403 mutex_lock(&kvm->lock);
404 rcu_swap_protected(kvm->arch.pmu_event_filter, filter,
405 mutex_is_locked(&kvm->lock));
406 mutex_unlock(&kvm->lock);
407
408 synchronize_srcu_expedited(&kvm->srcu);
409 r = 0;
410cleanup:
411 kfree(filter);
412 return r;
413}
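kvm_vm_ioctl_set_pmu_event_filter() above sizes the allocation from a first copy of the header, copies the full filter, then re-installs the first snapshot so a racing writer cannot grow nevents between the two copy_from_user() calls. A user-space sketch of that double-copy pattern, with memcpy() standing in for copy_from_user(), struct_size() open-coded, and a simplified struct layout.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct filter_hdr {
	uint32_t action;
	uint32_t nevents;
	uint64_t events[];
};

struct filter_hdr *copy_filter(const void *user, uint32_t max_events)
{
	struct filter_hdr tmp, *filter;
	size_t size;

	memcpy(&tmp, user, sizeof(tmp));	/* first copy: header only */
	if (tmp.nevents > max_events)
		return NULL;

	size = sizeof(*filter) + tmp.nevents * sizeof(filter->events[0]);
	filter = malloc(size);
	if (!filter)
		return NULL;

	memcpy(filter, user, size);		/* second copy: whole filter */
	*filter = tmp;	/* nevents can never exceed what was allocated */

	return filter;
}

The final header re-install is the important step: whatever nevents the second copy happened to read, the value consumed later is the one the allocation was sized for.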
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 22dff661145a..58265f761c3b 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -118,6 +118,7 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
118void kvm_pmu_reset(struct kvm_vcpu *vcpu); 118void kvm_pmu_reset(struct kvm_vcpu *vcpu);
119void kvm_pmu_init(struct kvm_vcpu *vcpu); 119void kvm_pmu_init(struct kvm_vcpu *vcpu);
120void kvm_pmu_destroy(struct kvm_vcpu *vcpu); 120void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
121int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
121 122
122bool is_vmware_backdoor_pmc(u32 pmc_idx); 123bool is_vmware_backdoor_pmc(u32 pmc_idx);
123 124
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 48c865a4e5dd..583b9fa656f3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -364,6 +364,10 @@ static int avic;
364module_param(avic, int, S_IRUGO); 364module_param(avic, int, S_IRUGO);
365#endif 365#endif
366 366
367/* enable/disable Next RIP Save */
368static int nrips = true;
369module_param(nrips, int, 0444);
370
367/* enable/disable Virtual VMLOAD VMSAVE */ 371/* enable/disable Virtual VMLOAD VMSAVE */
368static int vls = true; 372static int vls = true;
369module_param(vls, int, 0444); 373module_param(vls, int, 0444);
@@ -770,7 +774,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
770{ 774{
771 struct vcpu_svm *svm = to_svm(vcpu); 775 struct vcpu_svm *svm = to_svm(vcpu);
772 776
773 if (svm->vmcb->control.next_rip != 0) { 777 if (nrips && svm->vmcb->control.next_rip != 0) {
774 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS)); 778 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
775 svm->next_rip = svm->vmcb->control.next_rip; 779 svm->next_rip = svm->vmcb->control.next_rip;
776 } 780 }
@@ -807,7 +811,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
807 811
808 kvm_deliver_exception_payload(&svm->vcpu); 812 kvm_deliver_exception_payload(&svm->vcpu);
809 813
810 if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) { 814 if (nr == BP_VECTOR && !nrips) {
811 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); 815 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
812 816
813 /* 817 /*
@@ -1364,6 +1368,11 @@ static __init int svm_hardware_setup(void)
1364 } else 1368 } else
1365 kvm_disable_tdp(); 1369 kvm_disable_tdp();
1366 1370
1371 if (nrips) {
1372 if (!boot_cpu_has(X86_FEATURE_NRIPS))
1373 nrips = false;
1374 }
1375
1367 if (avic) { 1376 if (avic) {
1368 if (!npt_enabled || 1377 if (!npt_enabled ||
1369 !boot_cpu_has(X86_FEATURE_AVIC) || 1378 !boot_cpu_has(X86_FEATURE_AVIC) ||
@@ -3290,7 +3299,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
3290 vmcb->control.exit_int_info_err, 3299 vmcb->control.exit_int_info_err,
3291 KVM_ISA_SVM); 3300 KVM_ISA_SVM);
3292 3301
3293 rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(svm->nested.vmcb), &map); 3302 rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
3294 if (rc) { 3303 if (rc) {
3295 if (rc == -EINVAL) 3304 if (rc == -EINVAL)
3296 kvm_inject_gp(&svm->vcpu, 0); 3305 kvm_inject_gp(&svm->vcpu, 0);
@@ -3580,7 +3589,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
3580 3589
3581 vmcb_gpa = svm->vmcb->save.rax; 3590 vmcb_gpa = svm->vmcb->save.rax;
3582 3591
3583 rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(vmcb_gpa), &map); 3592 rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
3584 if (rc) { 3593 if (rc) {
3585 if (rc == -EINVAL) 3594 if (rc == -EINVAL)
3586 kvm_inject_gp(&svm->vcpu, 0); 3595 kvm_inject_gp(&svm->vcpu, 0);
@@ -3935,7 +3944,7 @@ static int rdpmc_interception(struct vcpu_svm *svm)
3935{ 3944{
3936 int err; 3945 int err;
3937 3946
3938 if (!static_cpu_has(X86_FEATURE_NRIPS)) 3947 if (!nrips)
3939 return emulate_on_interception(svm); 3948 return emulate_on_interception(svm);
3940 3949
3941 err = kvm_rdpmc(&svm->vcpu); 3950 err = kvm_rdpmc(&svm->vcpu);
@@ -5160,10 +5169,13 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
5160 kvm_lapic_set_irr(vec, vcpu->arch.apic); 5169 kvm_lapic_set_irr(vec, vcpu->arch.apic);
5161 smp_mb__after_atomic(); 5170 smp_mb__after_atomic();
5162 5171
5163 if (avic_vcpu_is_running(vcpu)) 5172 if (avic_vcpu_is_running(vcpu)) {
5164 wrmsrl(SVM_AVIC_DOORBELL, 5173 int cpuid = vcpu->cpu;
5165 kvm_cpu_get_apicid(vcpu->cpu)); 5174
5166 else 5175 if (cpuid != get_cpu())
5176 wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid));
5177 put_cpu();
5178 } else
5167 kvm_vcpu_wake_up(vcpu); 5179 kvm_vcpu_wake_up(vcpu);
5168} 5180}
5169 5181
@@ -5640,6 +5652,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
5640 clgi(); 5652 clgi();
5641 kvm_load_guest_xcr0(vcpu); 5653 kvm_load_guest_xcr0(vcpu);
5642 5654
5655 if (lapic_in_kernel(vcpu) &&
5656 vcpu->arch.apic->lapic_timer.timer_advance_ns)
5657 kvm_wait_lapic_expire(vcpu);
5658
5643 /* 5659 /*
5644 * If this vCPU has touched SPEC_CTRL, restore the guest's value if 5660 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
5645 * it's non-zero. Since vmentry is serialising on affected CPUs, there 5661 * it's non-zero. Since vmentry is serialising on affected CPUs, there
@@ -5861,9 +5877,9 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5861 hypercall[2] = 0xd9; 5877 hypercall[2] = 0xd9;
5862} 5878}
5863 5879
5864static void svm_check_processor_compat(void *rtn) 5880static int __init svm_check_processor_compat(void)
5865{ 5881{
5866 *(int *)rtn = 0; 5882 return 0;
5867} 5883}
5868 5884
5869static bool svm_cpu_has_accelerated_tpr(void) 5885static bool svm_cpu_has_accelerated_tpr(void)
@@ -5875,6 +5891,7 @@ static bool svm_has_emulated_msr(int index)
5875{ 5891{
5876 switch (index) { 5892 switch (index) {
5877 case MSR_IA32_MCG_EXT_CTL: 5893 case MSR_IA32_MCG_EXT_CTL:
5894 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
5878 return false; 5895 return false;
5879 default: 5896 default:
5880 break; 5897 break;
@@ -6162,15 +6179,9 @@ out:
6162 return ret; 6179 return ret;
6163} 6180}
6164 6181
6165static void svm_handle_external_intr(struct kvm_vcpu *vcpu) 6182static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
6166{ 6183{
6167 local_irq_enable(); 6184
6168 /*
6169 * We must have an instruction with interrupts enabled, so
6170 * the timer interrupt isn't delayed by the interrupt shadow.
6171 */
6172 asm("nop");
6173 local_irq_disable();
6174} 6185}
6175 6186
6176static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) 6187static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
@@ -7256,7 +7267,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
7256 .set_tdp_cr3 = set_tdp_cr3, 7267 .set_tdp_cr3 = set_tdp_cr3,
7257 7268
7258 .check_intercept = svm_check_intercept, 7269 .check_intercept = svm_check_intercept,
7259 .handle_external_intr = svm_handle_external_intr, 7270 .handle_exit_irqoff = svm_handle_exit_irqoff,
7260 7271
7261 .request_immediate_exit = __kvm_request_immediate_exit, 7272 .request_immediate_exit = __kvm_request_immediate_exit,
7262 7273
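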
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 4d47a2631d1f..b5c831e79094 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1365,7 +1365,7 @@ TRACE_EVENT(kvm_hv_timer_state,
1365 __entry->vcpu_id = vcpu_id; 1365 __entry->vcpu_id = vcpu_id;
1366 __entry->hv_timer_in_use = hv_timer_in_use; 1366 __entry->hv_timer_in_use = hv_timer_in_use;
1367 ), 1367 ),
1368 TP_printk("vcpu_id %x hv_timer %x\n", 1368 TP_printk("vcpu_id %x hv_timer %x",
1369 __entry->vcpu_id, 1369 __entry->vcpu_id,
1370 __entry->hv_timer_in_use) 1370 __entry->hv_timer_in_use)
1371); 1371);
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 5466c6d85cf3..72359709cdc1 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -3,6 +3,7 @@
3#include <linux/errno.h> 3#include <linux/errno.h>
4#include <linux/smp.h> 4#include <linux/smp.h>
5 5
6#include "../hyperv.h"
6#include "evmcs.h" 7#include "evmcs.h"
7#include "vmcs.h" 8#include "vmcs.h"
8#include "vmx.h" 9#include "vmx.h"
@@ -313,6 +314,23 @@ void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
313} 314}
314#endif 315#endif
315 316
317bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa)
318{
319 struct hv_vp_assist_page assist_page;
320
321 *evmcs_gpa = -1ull;
322
323 if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
324 return false;
325
326 if (unlikely(!assist_page.enlighten_vmentry))
327 return false;
328
329 *evmcs_gpa = assist_page.current_nested_vmcs;
330
331 return true;
332}
333
316uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) 334uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
317{ 335{
318 struct vcpu_vmx *vmx = to_vmx(vcpu); 336 struct vcpu_vmx *vmx = to_vmx(vcpu);
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index e0fcef85b332..39a24eec8884 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -195,6 +195,7 @@ static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
195static inline void evmcs_touch_msr_bitmap(void) {} 195static inline void evmcs_touch_msr_bitmap(void) {}
196#endif /* IS_ENABLED(CONFIG_HYPERV) */ 196#endif /* IS_ENABLED(CONFIG_HYPERV) */
197 197
198bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa);
198uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); 199uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
199int nested_enable_evmcs(struct kvm_vcpu *vcpu, 200int nested_enable_evmcs(struct kvm_vcpu *vcpu,
200 uint16_t *vmcs_version); 201 uint16_t *vmcs_version);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 46af3a5e9209..bb509c254939 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -41,15 +41,19 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
41#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 41#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
42#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 42#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
43 43
44static u16 shadow_read_only_fields[] = { 44struct shadow_vmcs_field {
45#define SHADOW_FIELD_RO(x) x, 45 u16 encoding;
46 u16 offset;
47};
48static struct shadow_vmcs_field shadow_read_only_fields[] = {
49#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
46#include "vmcs_shadow_fields.h" 50#include "vmcs_shadow_fields.h"
47}; 51};
48static int max_shadow_read_only_fields = 52static int max_shadow_read_only_fields =
49 ARRAY_SIZE(shadow_read_only_fields); 53 ARRAY_SIZE(shadow_read_only_fields);
50 54
51static u16 shadow_read_write_fields[] = { 55static struct shadow_vmcs_field shadow_read_write_fields[] = {
52#define SHADOW_FIELD_RW(x) x, 56#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
53#include "vmcs_shadow_fields.h" 57#include "vmcs_shadow_fields.h"
54}; 58};
55static int max_shadow_read_write_fields = 59static int max_shadow_read_write_fields =
@@ -63,34 +67,40 @@ static void init_vmcs_shadow_fields(void)
63 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 67 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
64 68
65 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 69 for (i = j = 0; i < max_shadow_read_only_fields; i++) {
66 u16 field = shadow_read_only_fields[i]; 70 struct shadow_vmcs_field entry = shadow_read_only_fields[i];
71 u16 field = entry.encoding;
67 72
68 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 73 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
69 (i + 1 == max_shadow_read_only_fields || 74 (i + 1 == max_shadow_read_only_fields ||
70 shadow_read_only_fields[i + 1] != field + 1)) 75 shadow_read_only_fields[i + 1].encoding != field + 1))
71 pr_err("Missing field from shadow_read_only_field %x\n", 76 pr_err("Missing field from shadow_read_only_field %x\n",
72 field + 1); 77 field + 1);
73 78
74 clear_bit(field, vmx_vmread_bitmap); 79 clear_bit(field, vmx_vmread_bitmap);
75#ifdef CONFIG_X86_64
76 if (field & 1) 80 if (field & 1)
81#ifdef CONFIG_X86_64
77 continue; 82 continue;
83#else
84 entry.offset += sizeof(u32);
78#endif 85#endif
79 if (j < i) 86 shadow_read_only_fields[j++] = entry;
80 shadow_read_only_fields[j] = field;
81 j++;
82 } 87 }
83 max_shadow_read_only_fields = j; 88 max_shadow_read_only_fields = j;
84 89
85 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 90 for (i = j = 0; i < max_shadow_read_write_fields; i++) {
86 u16 field = shadow_read_write_fields[i]; 91 struct shadow_vmcs_field entry = shadow_read_write_fields[i];
92 u16 field = entry.encoding;
87 93
88 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 94 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
89 (i + 1 == max_shadow_read_write_fields || 95 (i + 1 == max_shadow_read_write_fields ||
90 shadow_read_write_fields[i + 1] != field + 1)) 96 shadow_read_write_fields[i + 1].encoding != field + 1))
91 pr_err("Missing field from shadow_read_write_field %x\n", 97 pr_err("Missing field from shadow_read_write_field %x\n",
92 field + 1); 98 field + 1);
93 99
100 WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
101 field <= GUEST_TR_AR_BYTES,
102 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
103
94 /* 104 /*
95 * PML and the preemption timer can be emulated, but the 105 * PML and the preemption timer can be emulated, but the
96 * processor cannot vmwrite to fields that don't exist 106 * processor cannot vmwrite to fields that don't exist
@@ -115,13 +125,13 @@ static void init_vmcs_shadow_fields(void)
115 125
116 clear_bit(field, vmx_vmwrite_bitmap); 126 clear_bit(field, vmx_vmwrite_bitmap);
117 clear_bit(field, vmx_vmread_bitmap); 127 clear_bit(field, vmx_vmread_bitmap);
118#ifdef CONFIG_X86_64
119 if (field & 1) 128 if (field & 1)
129#ifdef CONFIG_X86_64
120 continue; 130 continue;
131#else
132 entry.offset += sizeof(u32);
121#endif 133#endif
122 if (j < i) 134 shadow_read_write_fields[j++] = entry;
123 shadow_read_write_fields[j] = field;
124 j++;
125 } 135 }
126 max_shadow_read_write_fields = j; 136 max_shadow_read_write_fields = j;
127} 137}
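With the two-argument SHADOW_FIELD_RO()/SHADOW_FIELD_RW() macros above, vmcs_shadow_fields.h now supplies both the VMCS encoding and the name of the matching vmcs12 member, and offsetof() turns the member name into a table offset. A compact sketch of that X-macro expansion; the field list, encodings and struct below are hypothetical stand-ins, not the contents of vmcs_shadow_fields.h or vmcs12.h.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct vmcs12_like {
	uint32_t guest_interruptibility_info;
	uint32_t vm_exit_reason;
};

struct shadow_vmcs_field {
	uint16_t encoding;
	uint16_t offset;
};

/* Hypothetical stand-in for the vmcs_shadow_fields.h entries. */
#define FOR_EACH_SHADOW_FIELD(F)			\
	F(0x4824, guest_interruptibility_info)		\
	F(0x4402, vm_exit_reason)

#define SHADOW_FIELD(x, y) { x, offsetof(struct vmcs12_like, y) },

static const struct shadow_vmcs_field shadow_fields[] = {
	FOR_EACH_SHADOW_FIELD(SHADOW_FIELD)
};

int main(void)
{
	for (size_t i = 0; i < sizeof(shadow_fields) / sizeof(shadow_fields[0]); i++)
		printf("encoding %#x -> offset %u\n",
		       (unsigned)shadow_fields[i].encoding,
		       (unsigned)shadow_fields[i].offset);
	return 0;
}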
@@ -182,7 +192,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
182 192
183static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) 193static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
184{ 194{
185 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); 195 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
186 vmcs_write64(VMCS_LINK_POINTER, -1ull); 196 vmcs_write64(VMCS_LINK_POINTER, -1ull);
187} 197}
188 198
@@ -238,22 +248,41 @@ static void free_nested(struct kvm_vcpu *vcpu)
238 free_loaded_vmcs(&vmx->nested.vmcs02); 248 free_loaded_vmcs(&vmx->nested.vmcs02);
239} 249}
240 250
251static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
252 struct loaded_vmcs *prev)
253{
254 struct vmcs_host_state *dest, *src;
255
256 if (unlikely(!vmx->guest_state_loaded))
257 return;
258
259 src = &prev->host_state;
260 dest = &vmx->loaded_vmcs->host_state;
261
262 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
263 dest->ldt_sel = src->ldt_sel;
264#ifdef CONFIG_X86_64
265 dest->ds_sel = src->ds_sel;
266 dest->es_sel = src->es_sel;
267#endif
268}
269
241static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 270static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
242{ 271{
243 struct vcpu_vmx *vmx = to_vmx(vcpu); 272 struct vcpu_vmx *vmx = to_vmx(vcpu);
273 struct loaded_vmcs *prev;
244 int cpu; 274 int cpu;
245 275
246 if (vmx->loaded_vmcs == vmcs) 276 if (vmx->loaded_vmcs == vmcs)
247 return; 277 return;
248 278
249 cpu = get_cpu(); 279 cpu = get_cpu();
250 vmx_vcpu_put(vcpu); 280 prev = vmx->loaded_vmcs;
251 vmx->loaded_vmcs = vmcs; 281 vmx->loaded_vmcs = vmcs;
252 vmx_vcpu_load(vcpu, cpu); 282 vmx_vcpu_load_vmcs(vcpu, cpu);
283 vmx_sync_vmcs_host_state(vmx, prev);
253 put_cpu(); 284 put_cpu();
254 285
255 vm_entry_controls_reset_shadow(vmx);
256 vm_exit_controls_reset_shadow(vmx);
257 vmx_segment_cache_clear(vmx); 286 vmx_segment_cache_clear(vmx);
258} 287}
259 288
@@ -930,8 +959,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
930 * If PAE paging and EPT are both on, CR3 is not used by the CPU and 959 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
931 * must not be dereferenced. 960 * must not be dereferenced.
932 */ 961 */
933 if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && 962 if (is_pae_paging(vcpu) && !nested_ept) {
934 !nested_ept) {
935 if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { 963 if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
936 *entry_failure_code = ENTRY_FAIL_PDPTE; 964 *entry_failure_code = ENTRY_FAIL_PDPTE;
937 return -EINVAL; 965 return -EINVAL;
@@ -1105,14 +1133,6 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1105 vmx->nested.msrs.misc_low = data; 1133 vmx->nested.msrs.misc_low = data;
1106 vmx->nested.msrs.misc_high = data >> 32; 1134 vmx->nested.msrs.misc_high = data >> 32;
1107 1135
1108 /*
1109 * If L1 has read-only VM-exit information fields, use the
1110 * less permissive vmx_vmwrite_bitmap to specify write
1111 * permissions for the shadow VMCS.
1112 */
1113 if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
1114 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
1115
1116 return 0; 1136 return 0;
1117} 1137}
1118 1138
@@ -1214,6 +1234,11 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1214 case MSR_IA32_VMX_VMCS_ENUM: 1234 case MSR_IA32_VMX_VMCS_ENUM:
1215 vmx->nested.msrs.vmcs_enum = data; 1235 vmx->nested.msrs.vmcs_enum = data;
1216 return 0; 1236 return 0;
1237 case MSR_IA32_VMX_VMFUNC:
1238 if (data & ~vmx->nested.msrs.vmfunc_controls)
1239 return -EINVAL;
1240 vmx->nested.msrs.vmfunc_controls = data;
1241 return 0;
1217 default: 1242 default:
1218 /* 1243 /*
1219 * The rest of the VMX capability MSRs do not support restore. 1244 * The rest of the VMX capability MSRs do not support restore.
@@ -1301,41 +1326,29 @@ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1301} 1326}
1302 1327
1303/* 1328/*
1304 * Copy the writable VMCS shadow fields back to the VMCS12, in case 1329 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1305 * they have been modified by the L1 guest. Note that the "read-only" 1330 * been modified by the L1 guest. Note, "writable" in this context means
1306 * VM-exit information fields are actually writable if the vCPU is 1331 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1307 * configured to support "VMWRITE to any supported field in the VMCS." 1332 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1333 * VM-exit information fields (which are actually writable if the vCPU is
1334 * configured to support "VMWRITE to any supported field in the VMCS").
1308 */ 1335 */
1309static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1336static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1310{ 1337{
1311 const u16 *fields[] = {
1312 shadow_read_write_fields,
1313 shadow_read_only_fields
1314 };
1315 const int max_fields[] = {
1316 max_shadow_read_write_fields,
1317 max_shadow_read_only_fields
1318 };
1319 int i, q;
1320 unsigned long field;
1321 u64 field_value;
1322 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1338 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1339 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1340 struct shadow_vmcs_field field;
1341 unsigned long val;
1342 int i;
1323 1343
1324 preempt_disable(); 1344 preempt_disable();
1325 1345
1326 vmcs_load(shadow_vmcs); 1346 vmcs_load(shadow_vmcs);
1327 1347
1328 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1348 for (i = 0; i < max_shadow_read_write_fields; i++) {
1329 for (i = 0; i < max_fields[q]; i++) { 1349 field = shadow_read_write_fields[i];
1330 field = fields[q][i]; 1350 val = __vmcs_readl(field.encoding);
1331 field_value = __vmcs_readl(field); 1351 vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1332 vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
1333 }
1334 /*
1335 * Skip the VM-exit information fields if they are read-only.
1336 */
1337 if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
1338 break;
1339 } 1352 }
1340 1353
1341 vmcs_clear(shadow_vmcs); 1354 vmcs_clear(shadow_vmcs);
@@ -1346,7 +1359,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1346 1359
1347static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1360static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1348{ 1361{
1349 const u16 *fields[] = { 1362 const struct shadow_vmcs_field *fields[] = {
1350 shadow_read_write_fields, 1363 shadow_read_write_fields,
1351 shadow_read_only_fields 1364 shadow_read_only_fields
1352 }; 1365 };
@@ -1354,18 +1367,20 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1354 max_shadow_read_write_fields, 1367 max_shadow_read_write_fields,
1355 max_shadow_read_only_fields 1368 max_shadow_read_only_fields
1356 }; 1369 };
1357 int i, q;
1358 unsigned long field;
1359 u64 field_value = 0;
1360 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1370 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1371 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1372 struct shadow_vmcs_field field;
1373 unsigned long val;
1374 int i, q;
1361 1375
1362 vmcs_load(shadow_vmcs); 1376 vmcs_load(shadow_vmcs);
1363 1377
1364 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1378 for (q = 0; q < ARRAY_SIZE(fields); q++) {
1365 for (i = 0; i < max_fields[q]; i++) { 1379 for (i = 0; i < max_fields[q]; i++) {
1366 field = fields[q][i]; 1380 field = fields[q][i];
1367 vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); 1381 val = vmcs12_read_any(vmcs12, field.encoding,
1368 __vmcs_writel(field, field_value); 1382 field.offset);
1383 __vmcs_writel(field.encoding, val);
1369 } 1384 }
1370 } 1385 }
1371 1386
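copy_shadow_to_vmcs12() and copy_vmcs12_to_shadow() now pass vmcs12_read_any()/vmcs12_write_any() the offset precomputed in the shadow field table instead of re-deriving it per access. A rough sketch of what an offset-based accessor can look like; the real vmcs12.h helpers presumably also dispatch on the field width (they take the encoding as well), which is simplified to a flat 64-bit access here.

#include <stdint.h>
#include <string.h>

/* Simplified: every field is treated as 64 bits wide. */
uint64_t vmcs12_read_at(const void *vmcs12, uint16_t offset)
{
	uint64_t val;

	memcpy(&val, (const char *)vmcs12 + offset, sizeof(val));
	return val;
}

void vmcs12_write_at(void *vmcs12, uint16_t offset, uint64_t val)
{
	memcpy((char *)vmcs12 + offset, &val, sizeof(val));
}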
@@ -1623,7 +1638,7 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1623 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1638 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1624 * evmcs->host_idtr_base = vmcs12->host_idtr_base; 1639 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1625 * evmcs->host_rsp = vmcs12->host_rsp; 1640 * evmcs->host_rsp = vmcs12->host_rsp;
1626 * sync_vmcs12() doesn't read these: 1641 * sync_vmcs02_to_vmcs12() doesn't read these:
1627 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1642 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1628 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1643 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1629 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1644 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
@@ -1768,26 +1783,22 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1768 bool from_launch) 1783 bool from_launch)
1769{ 1784{
1770 struct vcpu_vmx *vmx = to_vmx(vcpu); 1785 struct vcpu_vmx *vmx = to_vmx(vcpu);
1771 struct hv_vp_assist_page assist_page; 1786 bool evmcs_gpa_changed = false;
1787 u64 evmcs_gpa;
1772 1788
1773 if (likely(!vmx->nested.enlightened_vmcs_enabled)) 1789 if (likely(!vmx->nested.enlightened_vmcs_enabled))
1774 return 1; 1790 return 1;
1775 1791
1776 if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) 1792 if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
1777 return 1;
1778
1779 if (unlikely(!assist_page.enlighten_vmentry))
1780 return 1; 1793 return 1;
1781 1794
1782 if (unlikely(assist_page.current_nested_vmcs != 1795 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1783 vmx->nested.hv_evmcs_vmptr)) {
1784
1785 if (!vmx->nested.hv_evmcs) 1796 if (!vmx->nested.hv_evmcs)
1786 vmx->nested.current_vmptr = -1ull; 1797 vmx->nested.current_vmptr = -1ull;
1787 1798
1788 nested_release_evmcs(vcpu); 1799 nested_release_evmcs(vcpu);
1789 1800
1790 if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs), 1801 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
1791 &vmx->nested.hv_evmcs_map)) 1802 &vmx->nested.hv_evmcs_map))
1792 return 0; 1803 return 0;
1793 1804
@@ -1822,15 +1833,9 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1822 } 1833 }
1823 1834
1824 vmx->nested.dirty_vmcs12 = true; 1835 vmx->nested.dirty_vmcs12 = true;
1825 /* 1836 vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
1826 * As we keep L2 state for one guest only 'hv_clean_fields' mask
1827 * can't be used when we switch between them. Reset it here for
1828 * simplicity.
1829 */
1830 vmx->nested.hv_evmcs->hv_clean_fields &=
1831 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1832 vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs;
1833 1837
1838 evmcs_gpa_changed = true;
1834 /* 1839 /*
1835 * Unlike normal vmcs12, enlightened vmcs12 is not fully 1840 * Unlike normal vmcs12, enlightened vmcs12 is not fully
1836 * reloaded from guest's memory (read only fields, fields not 1841 * reloaded from guest's memory (read only fields, fields not
@@ -1844,10 +1849,19 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1844 } 1849 }
1845 1850
1846 } 1851 }
1852
1853 /*
1854	 * Clean fields data can't be used on VMLAUNCH and when we switch
1855 * between different L2 guests as KVM keeps a single VMCS12 per L1.
1856 */
1857 if (from_launch || evmcs_gpa_changed)
1858 vmx->nested.hv_evmcs->hv_clean_fields &=
1859 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1860
1847 return 1; 1861 return 1;
1848} 1862}
1849 1863
1850void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu) 1864void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
1851{ 1865{
1852 struct vcpu_vmx *vmx = to_vmx(vcpu); 1866 struct vcpu_vmx *vmx = to_vmx(vcpu);
1853 1867
@@ -1868,7 +1882,7 @@ void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu)
1868 copy_vmcs12_to_shadow(vmx); 1882 copy_vmcs12_to_shadow(vmx);
1869 } 1883 }
1870 1884
1871 vmx->nested.need_vmcs12_sync = false; 1885 vmx->nested.need_vmcs12_to_shadow_sync = false;
1872} 1886}
1873 1887
1874static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 1888static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
@@ -1948,8 +1962,20 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
1948 if (cpu_has_vmx_msr_bitmap()) 1962 if (cpu_has_vmx_msr_bitmap())
1949 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 1963 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
1950 1964
1951 if (enable_pml) 1965 /*
1966 * The PML address never changes, so it is constant in vmcs02.
1967 * Conceptually we want to copy the PML index from vmcs01 here,
1968 * and then back to vmcs01 on nested vmexit. But since we flush
1969 * the log and reset GUEST_PML_INDEX on each vmexit, the PML
1970 * index is also effectively constant in vmcs02.
1971 */
1972 if (enable_pml) {
1952 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 1973 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
1974 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
1975 }
1976
1977 if (cpu_has_vmx_encls_vmexit())
1978 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
1953 1979
1954 /* 1980 /*
1955 * Set the MSR load/store lists to match L0's settings. Only the 1981 * Set the MSR load/store lists to match L0's settings. Only the
@@ -1963,7 +1989,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
1963 vmx_set_constant_host_state(vmx); 1989 vmx_set_constant_host_state(vmx);
1964} 1990}
1965 1991
1966static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx, 1992static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
1967 struct vmcs12 *vmcs12) 1993 struct vmcs12 *vmcs12)
1968{ 1994{
1969 prepare_vmcs02_constant_state(vmx); 1995 prepare_vmcs02_constant_state(vmx);
@@ -1984,17 +2010,14 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
1984 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2010 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
1985 2011
1986 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) 2012 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
1987 prepare_vmcs02_early_full(vmx, vmcs12); 2013 prepare_vmcs02_early_rare(vmx, vmcs12);
1988 2014
1989 /* 2015 /*
1990 * PIN CONTROLS 2016 * PIN CONTROLS
1991 */ 2017 */
1992 exec_control = vmcs12->pin_based_vm_exec_control; 2018 exec_control = vmx_pin_based_exec_ctrl(vmx);
1993 2019 exec_control |= (vmcs12->pin_based_vm_exec_control &
1994 /* Preemption timer setting is computed directly in vmx_vcpu_run. */ 2020 ~PIN_BASED_VMX_PREEMPTION_TIMER);
1995 exec_control |= vmcs_config.pin_based_exec_ctrl;
1996 exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
1997 vmx->loaded_vmcs->hv_timer_armed = false;
1998 2021
1999 /* Posted interrupts setting is only taken from vmcs12. */ 2022 /* Posted interrupts setting is only taken from vmcs12. */
2000 if (nested_cpu_has_posted_intr(vmcs12)) { 2023 if (nested_cpu_has_posted_intr(vmcs12)) {
@@ -2003,7 +2026,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2003 } else { 2026 } else {
2004 exec_control &= ~PIN_BASED_POSTED_INTR; 2027 exec_control &= ~PIN_BASED_POSTED_INTR;
2005 } 2028 }
2006 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); 2029 pin_controls_set(vmx, exec_control);
2007 2030
2008 /* 2031 /*
2009 * EXEC CONTROLS 2032 * EXEC CONTROLS
@@ -2014,28 +2037,31 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2014 exec_control &= ~CPU_BASED_TPR_SHADOW; 2037 exec_control &= ~CPU_BASED_TPR_SHADOW;
2015 exec_control |= vmcs12->cpu_based_vm_exec_control; 2038 exec_control |= vmcs12->cpu_based_vm_exec_control;
2016 2039
2017 /* 2040 if (exec_control & CPU_BASED_TPR_SHADOW)
2018 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
2019 * nested_get_vmcs12_pages can't fix it up, the illegal value
2020 * will result in a VM entry failure.
2021 */
2022 if (exec_control & CPU_BASED_TPR_SHADOW) {
2023 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
2024 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2041 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2025 } else {
2026#ifdef CONFIG_X86_64 2042#ifdef CONFIG_X86_64
2043 else
2027 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2044 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2028 CPU_BASED_CR8_STORE_EXITING; 2045 CPU_BASED_CR8_STORE_EXITING;
2029#endif 2046#endif
2030 }
2031 2047
2032 /* 2048 /*
2033 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2049 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2034 * for I/O port accesses. 2050 * for I/O port accesses.
2035 */ 2051 */
2036 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2037 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2052 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2038 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); 2053 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2054
2055 /*
2056 * This bit will be computed in nested_get_vmcs12_pages, because
2057 * we do not have access to L1's MSR bitmap yet. For now, keep
2058 * the same bit as before, hoping to avoid multiple VMWRITEs that
2059 * only set/clear this bit.
2060 */
2061 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2062 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2063
2064 exec_controls_set(vmx, exec_control);
2039 2065
2040 /* 2066 /*
2041 * SECONDARY EXEC CONTROLS 2067 * SECONDARY EXEC CONTROLS
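The direct vmcs_write32()/vmcs_set_bits()/vmcs_clear_bits() calls in this area give way to the vmx.h controls-shadow helpers (pin_controls_set(), exec_controls_get()/exec_controls_set(), secondary_exec_controls_clearbit(), vm_entry_controls_set(), vm_exit_controls_set()). The comment above about avoiding multiple VMWRITEs hints at the idea: keep a cached copy of each control field and only touch the VMCS when the value changes. A minimal sketch of that caching pattern, under the assumption that the helpers wrap such a cache; the names and the 0x4000 example encoding are illustrative, not the vmx.h macros.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for vmcs_write32(): just logs the write. */
static void vmcs_write32_stub(uint32_t field, uint32_t val)
{
	printf("VMWRITE %#x <- %#x\n", field, val);
}

struct controls_shadow {
	uint32_t field;		/* VMCS field encoding */
	uint32_t val;		/* last value written */
};

static void controls_set(struct controls_shadow *c, uint32_t val)
{
	if (c->val != val) {	/* skip redundant VMWRITEs */
		vmcs_write32_stub(c->field, val);
		c->val = val;
	}
}

static uint32_t controls_get(const struct controls_shadow *c)
{
	return c->val;
}

static void controls_clearbit(struct controls_shadow *c, uint32_t bit)
{
	controls_set(c, controls_get(c) & ~bit);
}

int main(void)
{
	struct controls_shadow pin = { .field = 0x4000, .val = 0 };

	controls_set(&pin, 0x16);	/* first write reaches the VMCS */
	controls_set(&pin, 0x16);	/* same value: no VMWRITE issued */
	controls_clearbit(&pin, 0x2);	/* value changes: one more VMWRITE */
	return 0;
}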
@@ -2061,22 +2087,19 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2061 /* VMCS shadowing for L2 is emulated for now */ 2087 /* VMCS shadowing for L2 is emulated for now */
2062 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2088 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2063 2089
2064 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2065 vmcs_write16(GUEST_INTR_STATUS,
2066 vmcs12->guest_intr_status);
2067
2068 /* 2090 /*
2069 * Write an illegal value to APIC_ACCESS_ADDR. Later, 2091 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2070 * nested_get_vmcs12_pages will either fix it up or 2092 * will not have to rewrite the controls just for this bit.
2071 * remove the VM execution control.
2072 */ 2093 */
2073 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) 2094 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
2074 vmcs_write64(APIC_ACCESS_ADDR, -1ull); 2095 (vmcs12->guest_cr4 & X86_CR4_UMIP))
2096 exec_control |= SECONDARY_EXEC_DESC;
2075 2097
2076 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2098 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2077 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); 2099 vmcs_write16(GUEST_INTR_STATUS,
2100 vmcs12->guest_intr_status);
2078 2101
2079 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 2102 secondary_exec_controls_set(vmx, exec_control);
2080 } 2103 }
2081 2104
2082 /* 2105 /*
@@ -2095,7 +2118,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2095 if (guest_efer != host_efer) 2118 if (guest_efer != host_efer)
2096 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2119 exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2097 } 2120 }
2098 vm_entry_controls_init(vmx, exec_control); 2121 vm_entry_controls_set(vmx, exec_control);
2099 2122
2100 /* 2123 /*
2101 * EXIT CONTROLS 2124 * EXIT CONTROLS
@@ -2107,17 +2130,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2107 exec_control = vmx_vmexit_ctrl(); 2130 exec_control = vmx_vmexit_ctrl();
2108 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2131 if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2109 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2132 exec_control |= VM_EXIT_LOAD_IA32_EFER;
2110 vm_exit_controls_init(vmx, exec_control); 2133 vm_exit_controls_set(vmx, exec_control);
2111
2112 /*
2113 * Conceptually we want to copy the PML address and index from
2114 * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
2115 * since we always flush the log on each vmexit and never change
2116 * the PML address (once set), this happens to be equivalent to
2117 * simply resetting the index in vmcs02.
2118 */
2119 if (enable_pml)
2120 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
2121 2134
2122 /* 2135 /*
2123 * Interrupt/Exception Fields 2136 * Interrupt/Exception Fields
@@ -2138,7 +2151,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2138 } 2151 }
2139} 2152}
2140 2153
2141static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2154static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2142{ 2155{
2143 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2156 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2144 2157
@@ -2162,6 +2175,8 @@ static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2162 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2175 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2163 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2176 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2164 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2177 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2178 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2179 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2165 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2180 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2166 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2181 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2167 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2182 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
@@ -2198,6 +2213,10 @@ static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2198 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2213 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2199 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2214 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2200 } 2215 }
2216
2217 if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2218 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2219 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2201 } 2220 }
2202 2221
2203 if (nested_cpu_has_xsaves(vmcs12)) 2222 if (nested_cpu_has_xsaves(vmcs12))
@@ -2233,14 +2252,6 @@ static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2233 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2252 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2234 2253
2235 set_cr4_guest_host_mask(vmx); 2254 set_cr4_guest_host_mask(vmx);
2236
2237 if (kvm_mpx_supported()) {
2238 if (vmx->nested.nested_run_pending &&
2239 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2240 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2241 else
2242 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2243 }
2244} 2255}
2245 2256
2246/* 2257/*
@@ -2259,20 +2270,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2259{ 2270{
2260 struct vcpu_vmx *vmx = to_vmx(vcpu); 2271 struct vcpu_vmx *vmx = to_vmx(vcpu);
2261 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2272 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2273 bool load_guest_pdptrs_vmcs12 = false;
2262 2274
2263 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { 2275 if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
2264 prepare_vmcs02_full(vmx, vmcs12); 2276 prepare_vmcs02_rare(vmx, vmcs12);
2265 vmx->nested.dirty_vmcs12 = false; 2277 vmx->nested.dirty_vmcs12 = false;
2266 }
2267 2278
2268 /* 2279 load_guest_pdptrs_vmcs12 = !hv_evmcs ||
2269 * First, the fields that are shadowed. This must be kept in sync 2280 !(hv_evmcs->hv_clean_fields &
2270 * with vmcs_shadow_fields.h. 2281 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2271 */
2272 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2273 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2274 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2275 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2276 } 2282 }
2277 2283
2278 if (vmx->nested.nested_run_pending && 2284 if (vmx->nested.nested_run_pending &&
@@ -2283,6 +2289,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2283 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2289 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2284 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); 2290 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2285 } 2291 }
2292 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2293 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2294 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2286 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2295 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2287 2296
2288 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2297 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
@@ -2372,6 +2381,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2372 entry_failure_code)) 2381 entry_failure_code))
2373 return -EINVAL; 2382 return -EINVAL;
2374 2383
2384 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2385 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2386 is_pae_paging(vcpu)) {
2387 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2388 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2389 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2390 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2391 }
2392
2375 if (!enable_ept) 2393 if (!enable_ept)
2376 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 2394 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2377 2395
@@ -2609,6 +2627,30 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2609 !kvm_pat_valid(vmcs12->host_ia32_pat)) 2627 !kvm_pat_valid(vmcs12->host_ia32_pat))
2610 return -EINVAL; 2628 return -EINVAL;
2611 2629
2630 ia32e = (vmcs12->vm_exit_controls &
2631 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
2632
2633 if (vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2634 vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2635 vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2636 vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2637 vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2638 vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2639 vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2640 vmcs12->host_cs_selector == 0 ||
2641 vmcs12->host_tr_selector == 0 ||
2642 (vmcs12->host_ss_selector == 0 && !ia32e))
2643 return -EINVAL;
2644
2645#ifdef CONFIG_X86_64
2646 if (is_noncanonical_address(vmcs12->host_fs_base, vcpu) ||
2647 is_noncanonical_address(vmcs12->host_gs_base, vcpu) ||
2648 is_noncanonical_address(vmcs12->host_gdtr_base, vcpu) ||
2649 is_noncanonical_address(vmcs12->host_idtr_base, vcpu) ||
2650 is_noncanonical_address(vmcs12->host_tr_base, vcpu))
2651 return -EINVAL;
2652#endif
2653
2612 /* 2654 /*
2613 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2655 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2614 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2656 * IA32_EFER MSR must be 0 in the field for that register. In addition,
@@ -2616,8 +2658,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2616 * the host address-space size VM-exit control. 2658 * the host address-space size VM-exit control.
2617 */ 2659 */
2618 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2660 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
2619 ia32e = (vmcs12->vm_exit_controls &
2620 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
2621 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || 2661 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
2622 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || 2662 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
2623 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) 2663 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
@@ -2781,7 +2821,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
2781 [launched]"i"(offsetof(struct loaded_vmcs, launched)), 2821 [launched]"i"(offsetof(struct loaded_vmcs, launched)),
2782 [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), 2822 [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
2783 [wordsize]"i"(sizeof(ulong)) 2823 [wordsize]"i"(sizeof(ulong))
2784 : "cc", "memory" 2824 : "memory"
2785 ); 2825 );
2786 2826
2787 if (vmx->msr_autoload.host.nr) 2827 if (vmx->msr_autoload.host.nr)
@@ -2851,18 +2891,14 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
2851 hpa = page_to_phys(vmx->nested.apic_access_page); 2891 hpa = page_to_phys(vmx->nested.apic_access_page);
2852 vmcs_write64(APIC_ACCESS_ADDR, hpa); 2892 vmcs_write64(APIC_ACCESS_ADDR, hpa);
2853 } else { 2893 } else {
2854 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
2855 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
2894 secondary_exec_controls_clearbit(vmx,
2895 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
2856 } 2896 }
2857 } 2897 }
2858 2898
2859 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 2899 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
2860 map = &vmx->nested.virtual_apic_map; 2900 map = &vmx->nested.virtual_apic_map;
2861 2901
2862 /*
2863 * If translation failed, VM entry will fail because
2864 * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
2865 */
2866 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 2902 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
2867 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 2903 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
2868 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 2904 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
@@ -2876,11 +2912,13 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
2876 * _not_ what the processor does but it's basically the 2912 * _not_ what the processor does but it's basically the
2877 * only possibility we have. 2913 * only possibility we have.
2878 */ 2914 */
2879 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
2880 CPU_BASED_TPR_SHADOW);
2915 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
2881 } else { 2916 } else {
2882 printk("bad virtual-APIC page address\n");
2883 dump_vmcs();
2917 /*
2918 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
2919 * force VM-Entry to fail.
2920 */
2921 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
2884 } 2922 }
2885 } 2923 }
2886 2924
@@ -2896,11 +2934,9 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
2896 } 2934 }
2897 } 2935 }
2898 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 2936 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
2899 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
2900 CPU_BASED_USE_MSR_BITMAPS);
2937 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
2901 else 2938 else
2902 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
2903 CPU_BASED_USE_MSR_BITMAPS);
2939 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
2904} 2940}
2905 2941
2906/* 2942/*
@@ -2953,7 +2989,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
2953 u32 exit_reason = EXIT_REASON_INVALID_STATE; 2989 u32 exit_reason = EXIT_REASON_INVALID_STATE;
2954 u32 exit_qual; 2990 u32 exit_qual;
2955 2991
2956 evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & 2992 evaluate_pending_interrupts = exec_controls_get(vmx) &
2957 (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); 2993 (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
2958 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 2994 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
2959 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 2995 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
@@ -2964,6 +3000,25 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
2964 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 3000 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2965 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3001 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
2966 3002
3003 /*
3004 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3005 * nested early checks are disabled. In the event of a "late" VM-Fail,
3006 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3007 * software model to the pre-VMEntry host state. When EPT is disabled,
3008 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3009 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
3010 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3011 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
3012 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3013 * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3014 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3015 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3016 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3017 * path would need to manually save/restore vmcs01.GUEST_CR3.
3018 */
3019 if (!enable_ept && !nested_early_check)
3020 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3021
2967 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3022 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
2968 3023
2969 prepare_vmcs02_early(vmx, vmcs12); 3024 prepare_vmcs02_early(vmx, vmcs12);
@@ -3059,7 +3114,7 @@ vmentry_fail_vmexit:
3059 vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; 3114 vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
3060 vmcs12->exit_qualification = exit_qual; 3115 vmcs12->exit_qualification = exit_qual;
3061 if (enable_shadow_vmcs || vmx->nested.hv_evmcs) 3116 if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
3062 vmx->nested.need_vmcs12_sync = true; 3117 vmx->nested.need_vmcs12_to_shadow_sync = true;
3063 return 1; 3118 return 1;
3064} 3119}
3065 3120
@@ -3077,7 +3132,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3077 if (!nested_vmx_check_permission(vcpu)) 3132 if (!nested_vmx_check_permission(vcpu))
3078 return 1; 3133 return 1;
3079 3134
3080 if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true)) 3135 if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
3081 return 1; 3136 return 1;
3082 3137
3083 if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) 3138 if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
@@ -3393,20 +3448,57 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3393 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 3448 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3394} 3449}
3395 3450
3396/* 3451static bool is_vmcs12_ext_field(unsigned long field)
3397 * Update the guest state fields of vmcs12 to reflect changes that 3452{
3398 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 3453 switch (field) {
3399 * VM-entry controls is also updated, since this is really a guest 3454 case GUEST_ES_SELECTOR:
3400 * state bit.) 3455 case GUEST_CS_SELECTOR:
3401 */ 3456 case GUEST_SS_SELECTOR:
3402static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3457 case GUEST_DS_SELECTOR:
3403{ 3458 case GUEST_FS_SELECTOR:
3404 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 3459 case GUEST_GS_SELECTOR:
3405 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 3460 case GUEST_LDTR_SELECTOR:
3461 case GUEST_TR_SELECTOR:
3462 case GUEST_ES_LIMIT:
3463 case GUEST_CS_LIMIT:
3464 case GUEST_SS_LIMIT:
3465 case GUEST_DS_LIMIT:
3466 case GUEST_FS_LIMIT:
3467 case GUEST_GS_LIMIT:
3468 case GUEST_LDTR_LIMIT:
3469 case GUEST_TR_LIMIT:
3470 case GUEST_GDTR_LIMIT:
3471 case GUEST_IDTR_LIMIT:
3472 case GUEST_ES_AR_BYTES:
3473 case GUEST_DS_AR_BYTES:
3474 case GUEST_FS_AR_BYTES:
3475 case GUEST_GS_AR_BYTES:
3476 case GUEST_LDTR_AR_BYTES:
3477 case GUEST_TR_AR_BYTES:
3478 case GUEST_ES_BASE:
3479 case GUEST_CS_BASE:
3480 case GUEST_SS_BASE:
3481 case GUEST_DS_BASE:
3482 case GUEST_FS_BASE:
3483 case GUEST_GS_BASE:
3484 case GUEST_LDTR_BASE:
3485 case GUEST_TR_BASE:
3486 case GUEST_GDTR_BASE:
3487 case GUEST_IDTR_BASE:
3488 case GUEST_PENDING_DBG_EXCEPTIONS:
3489 case GUEST_BNDCFGS:
3490 return true;
3491 default:
3492 break;
3493 }
3406 3494
3407 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 3495 return false;
3408 vmcs12->guest_rip = kvm_rip_read(vcpu); 3496}
3409 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 3497
3498static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3499 struct vmcs12 *vmcs12)
3500{
3501 struct vcpu_vmx *vmx = to_vmx(vcpu);
3410 3502
3411 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 3503 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
3412 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 3504 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
@@ -3427,8 +3519,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3427 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 3519 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
3428 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 3520 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
3429 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 3521 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
3430 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
3431 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
3432 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 3522 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
3433 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 3523 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
3434 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 3524 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
@@ -3444,11 +3534,69 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3444 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 3534 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
3445 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 3535 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
3446 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 3536 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
3537 vmcs12->guest_pending_dbg_exceptions =
3538 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
3539 if (kvm_mpx_supported())
3540 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3541
3542 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
3543}
3544
3545static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3546 struct vmcs12 *vmcs12)
3547{
3548 struct vcpu_vmx *vmx = to_vmx(vcpu);
3549 int cpu;
3550
3551 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
3552 return;
3553
3554
3555 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
3556
3557 cpu = get_cpu();
3558 vmx->loaded_vmcs = &vmx->nested.vmcs02;
3559 vmx_vcpu_load(&vmx->vcpu, cpu);
3560
3561 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3562
3563 vmx->loaded_vmcs = &vmx->vmcs01;
3564 vmx_vcpu_load(&vmx->vcpu, cpu);
3565 put_cpu();
3566}
3567
3568/*
3569 * Update the guest state fields of vmcs12 to reflect changes that
3570 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
3571 * VM-entry controls is also updated, since this is really a guest
3572 * state bit.)
3573 */
3574static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3575{
3576 struct vcpu_vmx *vmx = to_vmx(vcpu);
3577
3578 if (vmx->nested.hv_evmcs)
3579 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3580
3581 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
3582
3583 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
3584 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
3585
3586 vmcs12->guest_rsp = kvm_rsp_read(vcpu);
3587 vmcs12->guest_rip = kvm_rip_read(vcpu);
3588 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
3589
3590 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
3591 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
3592
3593 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
3594 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
3595 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
3447 3596
3448 vmcs12->guest_interruptibility_info = 3597 vmcs12->guest_interruptibility_info =
3449 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 3598 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3450 vmcs12->guest_pending_dbg_exceptions = 3599
3451 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
3452 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 3600 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
3453 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 3601 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
3454 else 3602 else
@@ -3469,10 +3617,12 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3469 */ 3617 */
3470 if (enable_ept) { 3618 if (enable_ept) {
3471 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 3619 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
3472 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 3620 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3473 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 3621 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
3474 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 3622 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
3475 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 3623 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
3624 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
3625 }
3476 } 3626 }
3477 3627
3478 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 3628 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
@@ -3484,22 +3634,11 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3484 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 3634 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
3485 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 3635 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
3486 3636
3487 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { 3637 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
3488 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 3638 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
3489 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3490 }
3491 3639
3492 /* TODO: These cannot have changed unless we have MSR bitmaps and
3493 * the relevant bit asks not to trap the change */
3494 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
3495 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
3496 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 3640 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
3497 vmcs12->guest_ia32_efer = vcpu->arch.efer; 3641 vmcs12->guest_ia32_efer = vcpu->arch.efer;
3498 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
3499 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
3500 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
3501 if (kvm_mpx_supported())
3502 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3503} 3642}
3504 3643
3505/* 3644/*
@@ -3517,11 +3656,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
3517 u32 exit_reason, u32 exit_intr_info, 3656 u32 exit_reason, u32 exit_intr_info,
3518 unsigned long exit_qualification) 3657 unsigned long exit_qualification)
3519{ 3658{
3520 /* update guest state fields: */
3521 sync_vmcs12(vcpu, vmcs12);
3522
3523 /* update exit information fields: */ 3659 /* update exit information fields: */
3524
3525 vmcs12->vm_exit_reason = exit_reason; 3660 vmcs12->vm_exit_reason = exit_reason;
3526 vmcs12->exit_qualification = exit_qualification; 3661 vmcs12->exit_qualification = exit_qualification;
3527 vmcs12->vm_exit_intr_info = exit_intr_info; 3662 vmcs12->vm_exit_intr_info = exit_intr_info;
@@ -3775,18 +3910,8 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
3775 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 3910 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
3776 3911
3777 nested_ept_uninit_mmu_context(vcpu); 3912 nested_ept_uninit_mmu_context(vcpu);
3778 3913 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3779 /* 3914 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
3780 * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3
3781 * points to shadow pages! Fortunately we only get here after a WARN_ON
3782 * if EPT is disabled, so a VMabort is perfectly fine.
3783 */
3784 if (enable_ept) {
3785 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3786 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
3787 } else {
3788 nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED);
3789 }
3790 3915
3791 /* 3916 /*
3792 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 3917 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
@@ -3794,7 +3919,8 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
3794 * VMFail, like everything else we just need to ensure our 3919 * VMFail, like everything else we just need to ensure our
3795 * software model is up-to-date. 3920 * software model is up-to-date.
3796 */ 3921 */
3797 ept_save_pdptrs(vcpu); 3922 if (enable_ept)
3923 ept_save_pdptrs(vcpu);
3798 3924
3799 kvm_mmu_reset_context(vcpu); 3925 kvm_mmu_reset_context(vcpu);
3800 3926
@@ -3882,14 +4008,14 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
3882 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 4008 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3883 4009
3884 if (likely(!vmx->fail)) { 4010 if (likely(!vmx->fail)) {
3885 if (exit_reason == -1) 4011 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
3886 sync_vmcs12(vcpu, vmcs12); 4012
3887 else 4013 if (exit_reason != -1)
3888 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 4014 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
3889 exit_qualification); 4015 exit_qualification);
3890 4016
3891 /* 4017 /*
3892 * Must happen outside of sync_vmcs12() as it will 4018 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
3893 * also be used to capture vmcs12 cache as part of 4019 * also be used to capture vmcs12 cache as part of
3894 * capturing nVMX state for snapshot (migration). 4020 * capturing nVMX state for snapshot (migration).
3895 * 4021 *
@@ -3945,7 +4071,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
3945 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4071 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
3946 4072
3947 if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) 4073 if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
3948 vmx->nested.need_vmcs12_sync = true; 4074 vmx->nested.need_vmcs12_to_shadow_sync = true;
3949 4075
3950 /* in case we halted in L2 */ 4076 /* in case we halted in L2 */
3951 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4077 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -4008,7 +4134,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
4008 * #UD or #GP. 4134 * #UD or #GP.
4009 */ 4135 */
4010int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4136int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4011 u32 vmx_instruction_info, bool wr, gva_t *ret) 4137 u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
4012{ 4138{
4013 gva_t off; 4139 gva_t off;
4014 bool exn; 4140 bool exn;
@@ -4115,7 +4241,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4115 */ 4241 */
4116 if (!(s.base == 0 && s.limit == 0xffffffff && 4242 if (!(s.base == 0 && s.limit == 0xffffffff &&
4117 ((s.type & 8) || !(s.type & 4)))) 4243 ((s.type & 8) || !(s.type & 4))))
4118 exn = exn || (off + sizeof(u64) > s.limit); 4244 exn = exn || ((u64)off + len - 1 > s.limit);
4119 } 4245 }
4120 if (exn) { 4246 if (exn) {
4121 kvm_queue_exception_e(vcpu, 4247 kvm_queue_exception_e(vcpu,
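
get_vmx_mem_address() now receives the real operand length instead of assuming sizeof(u64), so the segment-limit check above compares against the last byte actually accessed. A standalone sketch of that check (illustration only, not KVM code; the values in main() are made up):

/*
 * An access of 'len' bytes at offset 'off' is out of bounds if its last
 * byte, off + len - 1, lies beyond the segment limit.  Doing the math in
 * 64 bits avoids wrap-around for large offsets.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool exceeds_limit(uint64_t off, unsigned int len, uint32_t limit)
{
	return off + len - 1 > limit;
}

int main(void)
{
	/* 4-byte VMREAD destination ending exactly at the limit: allowed.  */
	printf("%d\n", exceeds_limit(0xfffc, 4, 0xffff));  /* 0 */
	/* The old fixed 8-byte check would have rejected the same access.  */
	printf("%d\n", exceeds_limit(0xfffc, 8, 0xffff));  /* 1 */
	return 0;
}
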
@@ -4134,7 +4260,8 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
4134 struct x86_exception e; 4260 struct x86_exception e;
4135 4261
4136 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 4262 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4137 vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) 4263 vmcs_read32(VMX_INSTRUCTION_INFO), false,
4264 sizeof(*vmpointer), &gva))
4138 return 1; 4265 return 1;
4139 4266
4140 if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { 4267 if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
@@ -4300,11 +4427,13 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4300 if (vmx->nested.current_vmptr == -1ull) 4427 if (vmx->nested.current_vmptr == -1ull)
4301 return; 4428 return;
4302 4429
4430 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
4431
4303 if (enable_shadow_vmcs) { 4432 if (enable_shadow_vmcs) {
4304 /* copy to memory all shadowed fields in case 4433 /* copy to memory all shadowed fields in case
4305 they were modified */ 4434 they were modified */
4306 copy_shadow_to_vmcs12(vmx); 4435 copy_shadow_to_vmcs12(vmx);
4307 vmx->nested.need_vmcs12_sync = false; 4436 vmx->nested.need_vmcs12_to_shadow_sync = false;
4308 vmx_disable_shadow_vmcs(vmx); 4437 vmx_disable_shadow_vmcs(vmx);
4309 } 4438 }
4310 vmx->nested.posted_intr_nv = -1; 4439 vmx->nested.posted_intr_nv = -1;
@@ -4334,6 +4463,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
4334 struct vcpu_vmx *vmx = to_vmx(vcpu); 4463 struct vcpu_vmx *vmx = to_vmx(vcpu);
4335 u32 zero = 0; 4464 u32 zero = 0;
4336 gpa_t vmptr; 4465 gpa_t vmptr;
4466 u64 evmcs_gpa;
4337 4467
4338 if (!nested_vmx_check_permission(vcpu)) 4468 if (!nested_vmx_check_permission(vcpu))
4339 return 1; 4469 return 1;
@@ -4349,10 +4479,18 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
4349 return nested_vmx_failValid(vcpu, 4479 return nested_vmx_failValid(vcpu,
4350 VMXERR_VMCLEAR_VMXON_POINTER); 4480 VMXERR_VMCLEAR_VMXON_POINTER);
4351 4481
4352 if (vmx->nested.hv_evmcs_map.hva) { 4482 /*
4353 if (vmptr == vmx->nested.hv_evmcs_vmptr) 4483 * When Enlightened VMEntry is enabled on the calling CPU we treat
4354 nested_release_evmcs(vcpu); 4484 * memory area pointer by vmptr as Enlightened VMCS (as there's no good
4355 } else { 4485 * way to distinguish it from VMCS12) and we must not corrupt it by
4486 * writing to the non-existent 'launch_state' field. The area doesn't
4487 * have to be the currently active EVMCS on the calling CPU and there's
4488 * nothing KVM has to do to transition it from 'active' to 'non-active'
4489 * state. It is possible that the area will stay mapped as
4490 * vmx->nested.hv_evmcs but this shouldn't be a problem.
4491 */
4492 if (likely(!vmx->nested.enlightened_vmcs_enabled ||
4493 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
4356 if (vmptr == vmx->nested.current_vmptr) 4494 if (vmptr == vmx->nested.current_vmptr)
4357 nested_release_vmcs12(vcpu); 4495 nested_release_vmcs12(vcpu);
4358 4496
@@ -4386,8 +4524,10 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
4386 u64 field_value; 4524 u64 field_value;
4387 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4525 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4388 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4526 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4527 int len;
4389 gva_t gva = 0; 4528 gva_t gva = 0;
4390 struct vmcs12 *vmcs12; 4529 struct vmcs12 *vmcs12;
4530 short offset;
4391 4531
4392 if (!nested_vmx_check_permission(vcpu)) 4532 if (!nested_vmx_check_permission(vcpu))
4393 return 1; 4533 return 1;
@@ -4409,11 +4549,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
4409 4549
4410 /* Decode instruction info and find the field to read */ 4550 /* Decode instruction info and find the field to read */
4411 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 4551 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4412 /* Read the field, zero-extended to a u64 field_value */ 4552
4413 if (vmcs12_read_any(vmcs12, field, &field_value) < 0) 4553 offset = vmcs_field_to_offset(field);
4554 if (offset < 0)
4414 return nested_vmx_failValid(vcpu, 4555 return nested_vmx_failValid(vcpu,
4415 VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4556 VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4416 4557
4558 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
4559 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4560
4561 /* Read the field, zero-extended to a u64 field_value */
4562 field_value = vmcs12_read_any(vmcs12, field, offset);
4563
4417 /* 4564 /*
4418 * Now copy part of this value to register or memory, as requested. 4565 * Now copy part of this value to register or memory, as requested.
4419 * Note that the number of bits actually copied is 32 or 64 depending 4566 * Note that the number of bits actually copied is 32 or 64 depending
@@ -4423,21 +4570,45 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
4423 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), 4570 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
4424 field_value); 4571 field_value);
4425 } else { 4572 } else {
4573 len = is_64_bit_mode(vcpu) ? 8 : 4;
4426 if (get_vmx_mem_address(vcpu, exit_qualification, 4574 if (get_vmx_mem_address(vcpu, exit_qualification,
4427 vmx_instruction_info, true, &gva)) 4575 vmx_instruction_info, true, len, &gva))
4428 return 1; 4576 return 1;
4429 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 4577 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
4430 kvm_write_guest_virt_system(vcpu, gva, &field_value, 4578 kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL);
4431 (is_long_mode(vcpu) ? 8 : 4), NULL);
4432 } 4579 }
4433 4580
4434 return nested_vmx_succeed(vcpu); 4581 return nested_vmx_succeed(vcpu);
4435} 4582}
4436 4583
4584static bool is_shadow_field_rw(unsigned long field)
4585{
4586 switch (field) {
4587#define SHADOW_FIELD_RW(x, y) case x:
4588#include "vmcs_shadow_fields.h"
4589 return true;
4590 default:
4591 break;
4592 }
4593 return false;
4594}
4595
4596static bool is_shadow_field_ro(unsigned long field)
4597{
4598 switch (field) {
4599#define SHADOW_FIELD_RO(x, y) case x:
4600#include "vmcs_shadow_fields.h"
4601 return true;
4602 default:
4603 break;
4604 }
4605 return false;
4606}
4437 4607
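
The two helpers above reuse the shadow-field table as an X-macro: the header is included with SHADOW_FIELD_RW()/SHADOW_FIELD_RO() redefined so that each entry expands to a case label. A self-contained mock of the same trick (the table, encodings and member names below are made up, not the real vmcs_shadow_fields.h):

#include <stdbool.h>
#include <stdio.h>

/* Mock field encodings and a mock two-column table; the real table lives
 * in its own header and is included with the macros redefined. */
enum { MOCK_GUEST_RIP = 0x681e, MOCK_VM_EXIT_REASON = 0x4402 };

#define MOCK_SHADOW_FIELDS					\
	SHADOW_FIELD_RW(MOCK_GUEST_RIP, guest_rip)		\
	SHADOW_FIELD_RO(MOCK_VM_EXIT_REASON, vm_exit_reason)

static bool is_shadow_field_rw(unsigned long field)
{
	switch (field) {
#define SHADOW_FIELD_RW(x, y) case x:
#define SHADOW_FIELD_RO(x, y)
	MOCK_SHADOW_FIELDS
		return true;
#undef SHADOW_FIELD_RW
#undef SHADOW_FIELD_RO
	default:
		return false;
	}
}

int main(void)
{
	printf("%d\n", is_shadow_field_rw(MOCK_GUEST_RIP));      /* 1 */
	printf("%d\n", is_shadow_field_rw(MOCK_VM_EXIT_REASON)); /* 0 */
	return 0;
}
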
4438static int handle_vmwrite(struct kvm_vcpu *vcpu) 4608static int handle_vmwrite(struct kvm_vcpu *vcpu)
4439{ 4609{
4440 unsigned long field; 4610 unsigned long field;
4611 int len;
4441 gva_t gva; 4612 gva_t gva;
4442 struct vcpu_vmx *vmx = to_vmx(vcpu); 4613 struct vcpu_vmx *vmx = to_vmx(vcpu);
4443 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4614 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4452,6 +4623,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
4452 u64 field_value = 0; 4623 u64 field_value = 0;
4453 struct x86_exception e; 4624 struct x86_exception e;
4454 struct vmcs12 *vmcs12; 4625 struct vmcs12 *vmcs12;
4626 short offset;
4455 4627
4456 if (!nested_vmx_check_permission(vcpu)) 4628 if (!nested_vmx_check_permission(vcpu))
4457 return 1; 4629 return 1;
@@ -4463,11 +4635,11 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
4463 field_value = kvm_register_readl(vcpu, 4635 field_value = kvm_register_readl(vcpu,
4464 (((vmx_instruction_info) >> 3) & 0xf)); 4636 (((vmx_instruction_info) >> 3) & 0xf));
4465 else { 4637 else {
4638 len = is_64_bit_mode(vcpu) ? 8 : 4;
4466 if (get_vmx_mem_address(vcpu, exit_qualification, 4639 if (get_vmx_mem_address(vcpu, exit_qualification,
4467 vmx_instruction_info, false, &gva)) 4640 vmx_instruction_info, false, len, &gva))
4468 return 1; 4641 return 1;
4469 if (kvm_read_guest_virt(vcpu, gva, &field_value, 4642 if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) {
4470 (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
4471 kvm_inject_page_fault(vcpu, &e); 4643 kvm_inject_page_fault(vcpu, &e);
4472 return 1; 4644 return 1;
4473 } 4645 }
@@ -4484,9 +4656,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
4484 return nested_vmx_failValid(vcpu, 4656 return nested_vmx_failValid(vcpu,
4485 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 4657 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
4486 4658
4487 if (!is_guest_mode(vcpu)) 4659 if (!is_guest_mode(vcpu)) {
4488 vmcs12 = get_vmcs12(vcpu); 4660 vmcs12 = get_vmcs12(vcpu);
4489 else { 4661
4662 /*
4663 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
4664 * vmcs12, else we may crush a field or consume a stale value.
4665 */
4666 if (!is_shadow_field_rw(field))
4667 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4668 } else {
4490 /* 4669 /*
4491 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE 4670 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
4492 * to shadowed-field sets the ALU flags for VMfailInvalid. 4671 * to shadowed-field sets the ALU flags for VMfailInvalid.
@@ -4496,28 +4675,46 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
4496 vmcs12 = get_shadow_vmcs12(vcpu); 4675 vmcs12 = get_shadow_vmcs12(vcpu);
4497 } 4676 }
4498 4677
4499 if (vmcs12_write_any(vmcs12, field, field_value) < 0) 4678 offset = vmcs_field_to_offset(field);
4679 if (offset < 0)
4500 return nested_vmx_failValid(vcpu, 4680 return nested_vmx_failValid(vcpu,
4501 VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4681 VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4502 4682
4503 /* 4683 /*
4504 * Do not track vmcs12 dirty-state if in guest-mode 4684 * Some Intel CPUs intentionally drop the reserved bits of the AR byte
4505 * as we actually dirty shadow vmcs12 instead of vmcs12. 4685 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM
4686 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
4687 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
4688 * from L1 will return a different value than VMREAD from L2 (L1 sees
4689 * the stripped down value, L2 sees the full value as stored by KVM).
4506 */ 4690 */
4507 if (!is_guest_mode(vcpu)) { 4691 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
4508 switch (field) { 4692 field_value &= 0x1f0ff;
4509#define SHADOW_FIELD_RW(x) case x: 4693
4510#include "vmcs_shadow_fields.h" 4694 vmcs12_write_any(vmcs12, field, offset, field_value);
4511 /* 4695
4512 * The fields that can be updated by L1 without a vmexit are 4696 /*
4513 * always updated in the vmcs02, the others go down the slow 4697 * Do not track vmcs12 dirty-state if in guest-mode as we actually
4514 * path of prepare_vmcs02. 4698 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
4515 */ 4699 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
4516 break; 4700 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
4517 default: 4701 */
4518 vmx->nested.dirty_vmcs12 = true; 4702 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
4519 break; 4703 /*
4704 * L1 can read these fields without exiting, ensure the
4705 * shadow VMCS is up-to-date.
4706 */
4707 if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
4708 preempt_disable();
4709 vmcs_load(vmx->vmcs01.shadow_vmcs);
4710
4711 __vmcs_writel(field, field_value);
4712
4713 vmcs_clear(vmx->vmcs01.shadow_vmcs);
4714 vmcs_load(vmx->loaded_vmcs->vmcs);
4715 preempt_enable();
4520 } 4716 }
4717 vmx->nested.dirty_vmcs12 = true;
4521 } 4718 }
4522 4719
4523 return nested_vmx_succeed(vcpu); 4720 return nested_vmx_succeed(vcpu);
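
The new masking above emulates CPUs that strip the reserved access-rights bits on VMWRITE: 0x1f0ff keeps the type/S/DPL/P byte (bits 7:0) and AVL/L/DB/G plus the VMX "unusable" bit (bits 16:12), and drops bits 11:8 and 31:17. A tiny standalone demonstration (illustration only; the macro name and the sample AR value are made up):

#include <stdint.h>
#include <stdio.h>

/* Local name for the mask used above: keep bits 7:0 and 16:12. */
#define AR_BYTES_KEEP_MASK 0x1f0ffu

int main(void)
{
	/* A 64-bit code segment AR value with junk in the reserved bits. */
	uint32_t raw = 0x00e0a09b;
	uint32_t stripped = raw & AR_BYTES_KEEP_MASK;

	printf("raw=0x%08x stripped=0x%08x\n", raw, stripped);
	return 0;
}
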
@@ -4527,11 +4724,10 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
4527{ 4724{
4528 vmx->nested.current_vmptr = vmptr; 4725 vmx->nested.current_vmptr = vmptr;
4529 if (enable_shadow_vmcs) { 4726 if (enable_shadow_vmcs) {
4530 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, 4727 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
4531 SECONDARY_EXEC_SHADOW_VMCS);
4532 vmcs_write64(VMCS_LINK_POINTER, 4728 vmcs_write64(VMCS_LINK_POINTER,
4533 __pa(vmx->vmcs01.shadow_vmcs)); 4729 __pa(vmx->vmcs01.shadow_vmcs));
4534 vmx->nested.need_vmcs12_sync = true; 4730 vmx->nested.need_vmcs12_to_shadow_sync = true;
4535 } 4731 }
4536 vmx->nested.dirty_vmcs12 = true; 4732 vmx->nested.dirty_vmcs12 = true;
4537} 4733}
@@ -4615,7 +4811,8 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
4615 if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) 4811 if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
4616 return 1; 4812 return 1;
4617 4813
4618 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva)) 4814 if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
4815 true, sizeof(gpa_t), &gva))
4619 return 1; 4816 return 1;
4620 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 4817 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
4621 if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, 4818 if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
@@ -4661,7 +4858,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
4661 * operand is read even if it isn't needed (e.g., for type==global) 4858 * operand is read even if it isn't needed (e.g., for type==global)
4662 */ 4859 */
4663 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 4860 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4664 vmx_instruction_info, false, &gva)) 4861 vmx_instruction_info, false, sizeof(operand), &gva))
4665 return 1; 4862 return 1;
4666 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { 4863 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
4667 kvm_inject_page_fault(vcpu, &e); 4864 kvm_inject_page_fault(vcpu, &e);
@@ -4670,13 +4867,11 @@ static int handle_invept(struct kvm_vcpu *vcpu)
4670 4867
4671 switch (type) { 4868 switch (type) {
4672 case VMX_EPT_EXTENT_GLOBAL: 4869 case VMX_EPT_EXTENT_GLOBAL:
4870 case VMX_EPT_EXTENT_CONTEXT:
4673 /* 4871 /*
4674 * TODO: track mappings and invalidate 4872 * TODO: Sync the necessary shadow EPT roots here, rather than
4675 * single context requests appropriately 4873 * at the next emulated VM-entry.
4676 */ 4874 */
4677 case VMX_EPT_EXTENT_CONTEXT:
4678 kvm_mmu_sync_roots(vcpu);
4679 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
4680 break; 4875 break;
4681 default: 4876 default:
4682 BUG_ON(1); 4877 BUG_ON(1);
@@ -4723,7 +4918,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
4723 * operand is read even if it isn't needed (e.g., for type==global) 4918 * operand is read even if it isn't needed (e.g., for type==global)
4724 */ 4919 */
4725 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 4920 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4726 vmx_instruction_info, false, &gva)) 4921 vmx_instruction_info, false, sizeof(operand), &gva))
4727 return 1; 4922 return 1;
4728 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { 4923 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
4729 kvm_inject_page_fault(vcpu, &e); 4924 kvm_inject_page_fault(vcpu, &e);
@@ -5284,12 +5479,13 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
5284 * When running L2, the authoritative vmcs12 state is in the 5479 * When running L2, the authoritative vmcs12 state is in the
5285 * vmcs02. When running L1, the authoritative vmcs12 state is 5480 * vmcs02. When running L1, the authoritative vmcs12 state is
5286 * in the shadow or enlightened vmcs linked to vmcs01, unless 5481 * in the shadow or enlightened vmcs linked to vmcs01, unless
5287 * need_vmcs12_sync is set, in which case, the authoritative 5482 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
5288 * vmcs12 state is in the vmcs12 already. 5483 * vmcs12 state is in the vmcs12 already.
5289 */ 5484 */
5290 if (is_guest_mode(vcpu)) { 5485 if (is_guest_mode(vcpu)) {
5291 sync_vmcs12(vcpu, vmcs12); 5486 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
5292 } else if (!vmx->nested.need_vmcs12_sync) { 5487 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5488 } else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
5293 if (vmx->nested.hv_evmcs) 5489 if (vmx->nested.hv_evmcs)
5294 copy_enlightened_to_vmcs12(vmx); 5490 copy_enlightened_to_vmcs12(vmx);
5295 else if (enable_shadow_vmcs) 5491 else if (enable_shadow_vmcs)
@@ -5421,7 +5617,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
5421 * Sync eVMCS upon entry as we may not have 5617 * Sync eVMCS upon entry as we may not have
5422 * HV_X64_MSR_VP_ASSIST_PAGE set up yet. 5618 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
5423 */ 5619 */
5424 vmx->nested.need_vmcs12_sync = true; 5620 vmx->nested.need_vmcs12_to_shadow_sync = true;
5425 } else { 5621 } else {
5426 return -EINVAL; 5622 return -EINVAL;
5427 } 5623 }
@@ -5489,14 +5685,8 @@ error_guest_mode:
5489void nested_vmx_vcpu_setup(void) 5685void nested_vmx_vcpu_setup(void)
5490{ 5686{
5491 if (enable_shadow_vmcs) { 5687 if (enable_shadow_vmcs) {
5492 /*
5493 * At vCPU creation, "VMWRITE to any supported field
5494 * in the VMCS" is supported, so use the more
5495 * permissive vmx_vmread_bitmap to specify both read
5496 * and write permissions for the shadow VMCS.
5497 */
5498 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 5688 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
5499 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap)); 5689 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
5500 } 5690 }
5501} 5691}
5502 5692
@@ -5626,10 +5816,15 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
5626 msrs->secondary_ctls_low = 0; 5816 msrs->secondary_ctls_low = 0;
5627 msrs->secondary_ctls_high &= 5817 msrs->secondary_ctls_high &=
5628 SECONDARY_EXEC_DESC | 5818 SECONDARY_EXEC_DESC |
5819 SECONDARY_EXEC_RDTSCP |
5629 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 5820 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
5821 SECONDARY_EXEC_WBINVD_EXITING |
5630 SECONDARY_EXEC_APIC_REGISTER_VIRT | 5822 SECONDARY_EXEC_APIC_REGISTER_VIRT |
5631 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 5823 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
5632 SECONDARY_EXEC_WBINVD_EXITING; 5824 SECONDARY_EXEC_RDRAND_EXITING |
5825 SECONDARY_EXEC_ENABLE_INVPCID |
5826 SECONDARY_EXEC_RDSEED_EXITING |
5827 SECONDARY_EXEC_XSAVES;
5633 5828
5634 /* 5829 /*
5635 * We can emulate "VMCS shadowing," even if the hardware 5830 * We can emulate "VMCS shadowing," even if the hardware
@@ -5749,14 +5944,6 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
5749{ 5944{
5750 int i; 5945 int i;
5751 5946
5752 /*
5753 * Without EPT it is not possible to restore L1's CR3 and PDPTR on
5754 * VMfail, because they are not available in vmcs01. Just always
5755 * use hardware checks.
5756 */
5757 if (!enable_ept)
5758 nested_early_check = 1;
5759
5760 if (!cpu_has_vmx_shadow_vmcs()) 5947 if (!cpu_has_vmx_shadow_vmcs())
5761 enable_shadow_vmcs = 0; 5948 enable_shadow_vmcs = 0;
5762 if (enable_shadow_vmcs) { 5949 if (enable_shadow_vmcs) {
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index e847ff1019a2..187d39bf0bf1 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -17,11 +17,11 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry);
17bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); 17bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason);
18void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, 18void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
19 u32 exit_intr_info, unsigned long exit_qualification); 19 u32 exit_intr_info, unsigned long exit_qualification);
20void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu); 20void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
21int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 21int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
22int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); 22int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
23int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 23int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
24 u32 vmx_instruction_info, bool wr, gva_t *ret); 24 u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
25 25
26static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) 26static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
27{ 27{
diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h
index b8e50f76fefc..2200fb698dd0 100644
--- a/arch/x86/kvm/vmx/ops.h
+++ b/arch/x86/kvm/vmx/ops.h
@@ -146,7 +146,6 @@ static __always_inline void vmcs_write64(unsigned long field, u64 value)
146 146
147 __vmcs_writel(field, value); 147 __vmcs_writel(field, value);
148#ifndef CONFIG_X86_64 148#ifndef CONFIG_X86_64
149 asm volatile ("");
150 __vmcs_writel(field+1, value >> 32); 149 __vmcs_writel(field+1, value >> 32);
151#endif 150#endif
152} 151}
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index cb6079f8a227..481ad879197b 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -42,6 +42,14 @@ struct vmcs_host_state {
42#endif 42#endif
43}; 43};
44 44
45struct vmcs_controls_shadow {
46 u32 vm_entry;
47 u32 vm_exit;
48 u32 pin;
49 u32 exec;
50 u32 secondary_exec;
51};
52
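
The controls_shadow struct added above backs the *_controls_setbit()/clearbit() accessors used earlier in this patch: the current value of each controls field is cached so a redundant VMWRITE can be skipped. A standalone sketch of that caching idea (the stub vmwrite32() and the field encoding are placeholders; the real accessors are macro-generated and are kept in sync with the active VMCS):

#include <stdint.h>
#include <stdio.h>

static unsigned int vmwrites;

static void vmwrite32(unsigned long field, uint32_t val)
{
	(void)field; (void)val;
	vmwrites++;            /* stand-in for the actual VMWRITE */
}

struct controls_shadow {
	uint32_t exec;
};

static void exec_controls_set(struct controls_shadow *s, uint32_t val)
{
	if (s->exec != val) {
		vmwrite32(0x4002 /* CPU_BASED_VM_EXEC_CONTROL */, val);
		s->exec = val;
	}
}

static void exec_controls_setbit(struct controls_shadow *s, uint32_t bit)
{
	exec_controls_set(s, s->exec | bit);
}

static void exec_controls_clearbit(struct controls_shadow *s, uint32_t bit)
{
	exec_controls_set(s, s->exec & ~bit);
}

int main(void)
{
	struct controls_shadow s = { .exec = 0 };

	exec_controls_setbit(&s, 1u << 21);   /* first set: one VMWRITE   */
	exec_controls_setbit(&s, 1u << 21);   /* already set: no VMWRITE  */
	exec_controls_clearbit(&s, 1u << 21); /* clear: one more VMWRITE  */
	printf("vmwrites=%u\n", vmwrites);    /* prints 2 */
	return 0;
}
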
45/* 53/*
46 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also 54 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
47 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs 55 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
@@ -53,7 +61,7 @@ struct loaded_vmcs {
53 int cpu; 61 int cpu;
54 bool launched; 62 bool launched;
55 bool nmi_known_unmasked; 63 bool nmi_known_unmasked;
56 bool hv_timer_armed; 64 bool hv_timer_soft_disabled;
57 /* Support for vnmi-less CPUs */ 65 /* Support for vnmi-less CPUs */
58 int soft_vnmi_blocked; 66 int soft_vnmi_blocked;
59 ktime_t entry_time; 67 ktime_t entry_time;
@@ -61,6 +69,7 @@ struct loaded_vmcs {
61 unsigned long *msr_bitmap; 69 unsigned long *msr_bitmap;
62 struct list_head loaded_vmcss_on_cpu_link; 70 struct list_head loaded_vmcss_on_cpu_link;
63 struct vmcs_host_state host_state; 71 struct vmcs_host_state host_state;
72 struct vmcs_controls_shadow controls_shadow;
64}; 73};
65 74
66static inline bool is_exception_n(u32 intr_info, u8 vector) 75static inline bool is_exception_n(u32 intr_info, u8 vector)
@@ -115,6 +124,12 @@ static inline bool is_nmi(u32 intr_info)
115 == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK); 124 == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
116} 125}
117 126
127static inline bool is_external_intr(u32 intr_info)
128{
129 return (intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
130 == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR);
131}
132
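
is_external_intr() above follows the usual pattern for decoding the VM-exit interrupt-information word: vector in bits 7:0, event type in bits 10:8, valid flag in bit 31. A standalone sketch of the decode (constants mirror the SDM layout; this is illustration, not kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INTR_INFO_VECTOR_MASK     0xffu
#define INTR_INFO_INTR_TYPE_MASK  0x700u
#define INTR_INFO_VALID_MASK      0x80000000u
#define INTR_TYPE_EXT_INTR        (0u << 8)
#define INTR_TYPE_NMI_INTR        (2u << 8)

static bool is_external_intr(uint32_t intr_info)
{
	return (intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
		== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR);
}

int main(void)
{
	uint32_t ext_irq = INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | 0x20;
	uint32_t nmi     = INTR_INFO_VALID_MASK | INTR_TYPE_NMI_INTR | 0x02;

	printf("vector 0x%02x external? %d\n",
	       ext_irq & INTR_INFO_VECTOR_MASK, is_external_intr(ext_irq)); /* 1 */
	printf("vector 0x%02x external? %d\n",
	       nmi & INTR_INFO_VECTOR_MASK, is_external_intr(nmi));         /* 0 */
	return 0;
}
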
118enum vmcs_field_width { 133enum vmcs_field_width {
119 VMCS_FIELD_WIDTH_U16 = 0, 134 VMCS_FIELD_WIDTH_U16 = 0,
120 VMCS_FIELD_WIDTH_U64 = 1, 135 VMCS_FIELD_WIDTH_U64 = 1,
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 337718fc8a36..d0c6df373f67 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -395,69 +395,48 @@ static inline short vmcs_field_to_offset(unsigned long field)
395 395
396#undef ROL16 396#undef ROL16
397 397
398/* 398static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field,
399 * Read a vmcs12 field. Since these can have varying lengths and we return 399 u16 offset)
400 * one type, we chose the biggest type (u64) and zero-extend the return value
401 * to that size. Note that the caller, handle_vmread, might need to use only
402 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
403 * 64-bit fields are to be returned).
404 */
405static inline int vmcs12_read_any(struct vmcs12 *vmcs12,
406 unsigned long field, u64 *ret)
407{ 400{
408 short offset = vmcs_field_to_offset(field); 401 char *p = (char *)vmcs12 + offset;
409 char *p;
410
411 if (offset < 0)
412 return offset;
413
414 p = (char *)vmcs12 + offset;
415 402
416 switch (vmcs_field_width(field)) { 403 switch (vmcs_field_width(field)) {
417 case VMCS_FIELD_WIDTH_NATURAL_WIDTH: 404 case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
418 *ret = *((natural_width *)p); 405 return *((natural_width *)p);
419 return 0;
420 case VMCS_FIELD_WIDTH_U16: 406 case VMCS_FIELD_WIDTH_U16:
421 *ret = *((u16 *)p); 407 return *((u16 *)p);
422 return 0;
423 case VMCS_FIELD_WIDTH_U32: 408 case VMCS_FIELD_WIDTH_U32:
424 *ret = *((u32 *)p); 409 return *((u32 *)p);
425 return 0;
426 case VMCS_FIELD_WIDTH_U64: 410 case VMCS_FIELD_WIDTH_U64:
427 *ret = *((u64 *)p); 411 return *((u64 *)p);
428 return 0;
429 default: 412 default:
430 WARN_ON(1); 413 WARN_ON_ONCE(1);
431 return -ENOENT; 414 return -1;
432 } 415 }
433} 416}
434 417
435static inline int vmcs12_write_any(struct vmcs12 *vmcs12, 418static inline void vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field,
436 unsigned long field, u64 field_value){ 419 u16 offset, u64 field_value)
437 short offset = vmcs_field_to_offset(field); 420{
438 char *p = (char *)vmcs12 + offset; 421 char *p = (char *)vmcs12 + offset;
439 422
440 if (offset < 0)
441 return offset;
442
443 switch (vmcs_field_width(field)) { 423 switch (vmcs_field_width(field)) {
444 case VMCS_FIELD_WIDTH_U16: 424 case VMCS_FIELD_WIDTH_U16:
445 *(u16 *)p = field_value; 425 *(u16 *)p = field_value;
446 return 0; 426 break;
447 case VMCS_FIELD_WIDTH_U32: 427 case VMCS_FIELD_WIDTH_U32:
448 *(u32 *)p = field_value; 428 *(u32 *)p = field_value;
449 return 0; 429 break;
450 case VMCS_FIELD_WIDTH_U64: 430 case VMCS_FIELD_WIDTH_U64:
451 *(u64 *)p = field_value; 431 *(u64 *)p = field_value;
452 return 0; 432 break;
453 case VMCS_FIELD_WIDTH_NATURAL_WIDTH: 433 case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
454 *(natural_width *)p = field_value; 434 *(natural_width *)p = field_value;
455 return 0; 435 break;
456 default: 436 default:
457 WARN_ON(1); 437 WARN_ON_ONCE(1);
458 return -ENOENT; 438 break;
459 } 439 }
460
461} 440}
462 441
463#endif /* __KVM_X86_VMX_VMCS12_H */ 442#endif /* __KVM_X86_VMX_VMCS12_H */
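
The rewritten vmcs12_read_any()/vmcs12_write_any() above split field lookup from access: the caller resolves the field encoding to a byte offset once via vmcs_field_to_offset(), and the accessors simply dereference that offset with the field's width. A standalone sketch of the pattern (the struct, table and width handling below are mocks, not the real vmcs12 layout):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct mock_vmcs12 {
	uint16_t guest_es_selector;
	uint64_t guest_rip;
};

enum mock_width { WIDTH_U16, WIDTH_U64 };

struct mock_field { unsigned long encoding; short offset; enum mock_width width; };

static const struct mock_field fields[] = {
	{ 0x0800, offsetof(struct mock_vmcs12, guest_es_selector), WIDTH_U16 },
	{ 0x681e, offsetof(struct mock_vmcs12, guest_rip),         WIDTH_U64 },
};

static short field_to_offset(unsigned long enc, enum mock_width *w)
{
	for (size_t i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
		if (fields[i].encoding == enc) {
			*w = fields[i].width;
			return fields[i].offset;
		}
	}
	return -1;   /* unknown field: caller reports VMfail, as above */
}

static uint64_t read_any(const struct mock_vmcs12 *v, enum mock_width w, short off)
{
	const char *p = (const char *)v + off;
	return w == WIDTH_U16 ? *(const uint16_t *)p : *(const uint64_t *)p;
}

int main(void)
{
	struct mock_vmcs12 v = { .guest_es_selector = 0x2b,
				 .guest_rip = 0xffffffff81000000ULL };
	enum mock_width w;
	short off = field_to_offset(0x681e, &w);

	if (off >= 0)
		printf("GUEST_RIP = 0x%llx\n",
		       (unsigned long long)read_any(&v, w, off));
	return 0;
}
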
diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
index 132432f375c2..eb1ecd16fd22 100644
--- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h
+++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
@@ -1,8 +1,12 @@
1#if !defined(SHADOW_FIELD_RO) && !defined(SHADOW_FIELD_RW)
2BUILD_BUG_ON(1)
3#endif
4
1#ifndef SHADOW_FIELD_RO 5#ifndef SHADOW_FIELD_RO
2#define SHADOW_FIELD_RO(x) 6#define SHADOW_FIELD_RO(x, y)
3#endif 7#endif
4#ifndef SHADOW_FIELD_RW 8#ifndef SHADOW_FIELD_RW
5#define SHADOW_FIELD_RW(x) 9#define SHADOW_FIELD_RW(x, y)
6#endif 10#endif
7 11
8/* 12/*
@@ -28,47 +32,48 @@
28 */ 32 */
29 33
30/* 16-bits */ 34/* 16-bits */
31SHADOW_FIELD_RW(GUEST_INTR_STATUS) 35SHADOW_FIELD_RW(GUEST_INTR_STATUS, guest_intr_status)
32SHADOW_FIELD_RW(GUEST_PML_INDEX) 36SHADOW_FIELD_RW(GUEST_PML_INDEX, guest_pml_index)
33SHADOW_FIELD_RW(HOST_FS_SELECTOR) 37SHADOW_FIELD_RW(HOST_FS_SELECTOR, host_fs_selector)
34SHADOW_FIELD_RW(HOST_GS_SELECTOR) 38SHADOW_FIELD_RW(HOST_GS_SELECTOR, host_gs_selector)
35 39
36/* 32-bits */ 40/* 32-bits */
37SHADOW_FIELD_RO(VM_EXIT_REASON) 41SHADOW_FIELD_RO(VM_EXIT_REASON, vm_exit_reason)
38SHADOW_FIELD_RO(VM_EXIT_INTR_INFO) 42SHADOW_FIELD_RO(VM_EXIT_INTR_INFO, vm_exit_intr_info)
39SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN) 43SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len)
40SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD) 44SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field)
41SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE) 45SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code)
42SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE) 46SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code)
43SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL) 47SHADOW_FIELD_RO(GUEST_CS_AR_BYTES, guest_cs_ar_bytes)
44SHADOW_FIELD_RW(EXCEPTION_BITMAP) 48SHADOW_FIELD_RO(GUEST_SS_AR_BYTES, guest_ss_ar_bytes)
45SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE) 49SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control)
46SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD) 50SHADOW_FIELD_RW(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control)
47SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN) 51SHADOW_FIELD_RW(EXCEPTION_BITMAP, exception_bitmap)
48SHADOW_FIELD_RW(TPR_THRESHOLD) 52SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code)
49SHADOW_FIELD_RW(GUEST_CS_AR_BYTES) 53SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field)
50SHADOW_FIELD_RW(GUEST_SS_AR_BYTES) 54SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len)
51SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO) 55SHADOW_FIELD_RW(TPR_THRESHOLD, tpr_threshold)
52SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE) 56SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info)
57SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value)
53 58
54/* Natural width */ 59/* Natural width */
55SHADOW_FIELD_RO(EXIT_QUALIFICATION) 60SHADOW_FIELD_RO(EXIT_QUALIFICATION, exit_qualification)
56SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS) 61SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS, guest_linear_address)
57SHADOW_FIELD_RW(GUEST_RIP) 62SHADOW_FIELD_RW(GUEST_RIP, guest_rip)
58SHADOW_FIELD_RW(GUEST_RSP) 63SHADOW_FIELD_RW(GUEST_RSP, guest_rsp)
59SHADOW_FIELD_RW(GUEST_CR0) 64SHADOW_FIELD_RW(GUEST_CR0, guest_cr0)
60SHADOW_FIELD_RW(GUEST_CR3) 65SHADOW_FIELD_RW(GUEST_CR3, guest_cr3)
61SHADOW_FIELD_RW(GUEST_CR4) 66SHADOW_FIELD_RW(GUEST_CR4, guest_cr4)
62SHADOW_FIELD_RW(GUEST_RFLAGS) 67SHADOW_FIELD_RW(GUEST_RFLAGS, guest_rflags)
63SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK) 68SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK, cr0_guest_host_mask)
64SHADOW_FIELD_RW(CR0_READ_SHADOW) 69SHADOW_FIELD_RW(CR0_READ_SHADOW, cr0_read_shadow)
65SHADOW_FIELD_RW(CR4_READ_SHADOW) 70SHADOW_FIELD_RW(CR4_READ_SHADOW, cr4_read_shadow)
66SHADOW_FIELD_RW(HOST_FS_BASE) 71SHADOW_FIELD_RW(HOST_FS_BASE, host_fs_base)
67SHADOW_FIELD_RW(HOST_GS_BASE) 72SHADOW_FIELD_RW(HOST_GS_BASE, host_gs_base)
68 73
69/* 64-bit */ 74/* 64-bit */
70SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS) 75SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS, guest_physical_address)
71SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH) 76SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH, guest_physical_address)
72 77
73#undef SHADOW_FIELD_RO 78#undef SHADOW_FIELD_RO
74#undef SHADOW_FIELD_RW 79#undef SHADOW_FIELD_RW
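
The second argument added to each SHADOW_FIELD_RW()/SHADOW_FIELD_RO() entry above is the name of the matching vmcs12 member, which lets the same table drive code that touches vmcs12 directly, for example an encoding-to-offset map built with offsetof(). A self-contained mock of that use (struct and entries are illustrative, not the real shadow-field list):

#include <stddef.h>
#include <stdio.h>

struct mock_vmcs12 {
	unsigned long guest_rip;
	unsigned int  vm_exit_reason;
};

#define MOCK_SHADOW_FIELDS				\
	SHADOW_FIELD_RW(0x681e, guest_rip)		\
	SHADOW_FIELD_RO(0x4402, vm_exit_reason)

struct field_off { unsigned long encoding; size_t offset; };

static const struct field_off shadow_offsets[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct mock_vmcs12, y) },
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct mock_vmcs12, y) },
	MOCK_SHADOW_FIELDS
#undef SHADOW_FIELD_RW
#undef SHADOW_FIELD_RO
};

int main(void)
{
	for (size_t i = 0; i < sizeof(shadow_offsets) / sizeof(shadow_offsets[0]); i++)
		printf("field 0x%lx -> offset %zu\n",
		       shadow_offsets[i].encoding, shadow_offsets[i].offset);
	return 0;
}
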
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d98eac371c0a..69536553446d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -389,6 +389,7 @@ static const struct kvm_vmx_segment_field {
389}; 389};
390 390
391u64 host_efer; 391u64 host_efer;
392static unsigned long host_idt_base;
392 393
393/* 394/*
394 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 395 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
@@ -1035,6 +1036,33 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
1035 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1036 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1036} 1037}
1037 1038
1039void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1040 unsigned long fs_base, unsigned long gs_base)
1041{
1042 if (unlikely(fs_sel != host->fs_sel)) {
1043 if (!(fs_sel & 7))
1044 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1045 else
1046 vmcs_write16(HOST_FS_SELECTOR, 0);
1047 host->fs_sel = fs_sel;
1048 }
1049 if (unlikely(gs_sel != host->gs_sel)) {
1050 if (!(gs_sel & 7))
1051 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1052 else
1053 vmcs_write16(HOST_GS_SELECTOR, 0);
1054 host->gs_sel = gs_sel;
1055 }
1056 if (unlikely(fs_base != host->fs_base)) {
1057 vmcs_writel(HOST_FS_BASE, fs_base);
1058 host->fs_base = fs_base;
1059 }
1060 if (unlikely(gs_base != host->gs_base)) {
1061 vmcs_writel(HOST_GS_BASE, gs_base);
1062 host->gs_base = gs_base;
1063 }
1064}
1065
1038void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 1066void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1039{ 1067{
1040 struct vcpu_vmx *vmx = to_vmx(vcpu); 1068 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1053,20 +1081,18 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1053 * when guest state is loaded. This happens when guest transitions 1081 * when guest state is loaded. This happens when guest transitions
1054 * to/from long-mode by setting MSR_EFER.LMA. 1082 * to/from long-mode by setting MSR_EFER.LMA.
1055 */ 1083 */
1056 if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) { 1084 if (!vmx->guest_msrs_ready) {
1057 vmx->guest_msrs_dirty = false; 1085 vmx->guest_msrs_ready = true;
1058 for (i = 0; i < vmx->save_nmsrs; ++i) 1086 for (i = 0; i < vmx->save_nmsrs; ++i)
1059 kvm_set_shared_msr(vmx->guest_msrs[i].index, 1087 kvm_set_shared_msr(vmx->guest_msrs[i].index,
1060 vmx->guest_msrs[i].data, 1088 vmx->guest_msrs[i].data,
1061 vmx->guest_msrs[i].mask); 1089 vmx->guest_msrs[i].mask);
1062 1090
1063 } 1091 }
1064 1092 if (vmx->guest_state_loaded)
1065 if (vmx->loaded_cpu_state)
1066 return; 1093 return;
1067 1094
1068 vmx->loaded_cpu_state = vmx->loaded_vmcs; 1095 host_state = &vmx->loaded_vmcs->host_state;
1069 host_state = &vmx->loaded_cpu_state->host_state;
1070 1096
1071 /* 1097 /*
1072 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 1098 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
@@ -1100,42 +1126,20 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1100 gs_base = segment_base(gs_sel); 1126 gs_base = segment_base(gs_sel);
1101#endif 1127#endif
1102 1128
1103 if (unlikely(fs_sel != host_state->fs_sel)) { 1129 vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
1104 if (!(fs_sel & 7)) 1130 vmx->guest_state_loaded = true;
1105 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1106 else
1107 vmcs_write16(HOST_FS_SELECTOR, 0);
1108 host_state->fs_sel = fs_sel;
1109 }
1110 if (unlikely(gs_sel != host_state->gs_sel)) {
1111 if (!(gs_sel & 7))
1112 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1113 else
1114 vmcs_write16(HOST_GS_SELECTOR, 0);
1115 host_state->gs_sel = gs_sel;
1116 }
1117 if (unlikely(fs_base != host_state->fs_base)) {
1118 vmcs_writel(HOST_FS_BASE, fs_base);
1119 host_state->fs_base = fs_base;
1120 }
1121 if (unlikely(gs_base != host_state->gs_base)) {
1122 vmcs_writel(HOST_GS_BASE, gs_base);
1123 host_state->gs_base = gs_base;
1124 }
1125} 1131}
1126 1132
1127static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) 1133static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
1128{ 1134{
1129 struct vmcs_host_state *host_state; 1135 struct vmcs_host_state *host_state;
1130 1136
1131 if (!vmx->loaded_cpu_state) 1137 if (!vmx->guest_state_loaded)
1132 return; 1138 return;
1133 1139
1134 WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); 1140 host_state = &vmx->loaded_vmcs->host_state;
1135 host_state = &vmx->loaded_cpu_state->host_state;
1136 1141
1137 ++vmx->vcpu.stat.host_state_reload; 1142 ++vmx->vcpu.stat.host_state_reload;
1138 vmx->loaded_cpu_state = NULL;
1139 1143
1140#ifdef CONFIG_X86_64 1144#ifdef CONFIG_X86_64
1141 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1145 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
@@ -1161,13 +1165,15 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
 	load_fixmap_gdt(raw_smp_processor_id());
+	vmx->guest_state_loaded = false;
+	vmx->guest_msrs_ready = false;
 }

 #ifdef CONFIG_X86_64
 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
 {
 	preempt_disable();
-	if (vmx->loaded_cpu_state)
+	if (vmx->guest_state_loaded)
 		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 	preempt_enable();
 	return vmx->msr_guest_kernel_gs_base;
@@ -1176,7 +1182,7 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
 {
 	preempt_disable();
-	if (vmx->loaded_cpu_state)
+	if (vmx->guest_state_loaded)
 		wrmsrl(MSR_KERNEL_GS_BASE, data);
 	preempt_enable();
 	vmx->msr_guest_kernel_gs_base = data;
@@ -1225,11 +1231,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	pi_set_on(pi_desc);
 }

-/*
- * Switches to specified vcpu, until a matching vcpu_put(), but assumes
- * vcpu mutex is already taken.
- */
-void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
@@ -1290,8 +1292,20 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (kvm_has_tsc_control &&
 	    vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
 		decache_tsc_multiplier(vmx);
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put(), but assumes
+ * vcpu mutex is already taken.
+ */
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	vmx_vcpu_load_vmcs(vcpu, cpu);

 	vmx_vcpu_pi_load(vcpu, cpu);
+
 	vmx->host_pkru = read_pkru();
 	vmx->host_debugctlmsr = get_debugctlmsr();
 }
@@ -1310,7 +1324,7 @@ static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
 	pi_set_sn(pi_desc);
 }

-void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	vmx_vcpu_pi_put(vcpu);

@@ -1579,7 +1593,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 		move_msr_up(vmx, index, save_nmsrs++);

 	vmx->save_nmsrs = save_nmsrs;
-	vmx->guest_msrs_dirty = true;
+	vmx->guest_msrs_ready = false;

 	if (cpu_has_vmx_msr_bitmap())
 		vmx_update_msr_bitmap(&vmx->vcpu);
@@ -1692,9 +1706,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_SYSENTER_ESP:
 		msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
 		break;
-	case MSR_IA32_POWER_CTL:
-		msr_info->data = vmx->msr_ia32_power_ctl;
-		break;
 	case MSR_IA32_BNDCFGS:
 		if (!kvm_mpx_supported() ||
 		    (!msr_info->host_initiated &&
@@ -1718,7 +1729,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
 				       &msr_info->data);
 	case MSR_IA32_XSS:
-		if (!vmx_xsaves_supported())
+		if (!vmx_xsaves_supported() ||
+		    (!msr_info->host_initiated &&
+		     !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+		       guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
 			return 1;
 		msr_info->data = vcpu->arch.ia32_xss;
 		break;
@@ -1817,17 +1831,28 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 #endif
 	case MSR_IA32_SYSENTER_CS:
+		if (is_guest_mode(vcpu))
+			get_vmcs12(vcpu)->guest_sysenter_cs = data;
 		vmcs_write32(GUEST_SYSENTER_CS, data);
 		break;
 	case MSR_IA32_SYSENTER_EIP:
+		if (is_guest_mode(vcpu))
+			get_vmcs12(vcpu)->guest_sysenter_eip = data;
 		vmcs_writel(GUEST_SYSENTER_EIP, data);
 		break;
 	case MSR_IA32_SYSENTER_ESP:
+		if (is_guest_mode(vcpu))
+			get_vmcs12(vcpu)->guest_sysenter_esp = data;
 		vmcs_writel(GUEST_SYSENTER_ESP, data);
 		break;
-	case MSR_IA32_POWER_CTL:
-		vmx->msr_ia32_power_ctl = data;
+	case MSR_IA32_DEBUGCTLMSR:
+		if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
+						VM_EXIT_SAVE_DEBUG_CONTROLS)
+			get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+
+		ret = kvm_set_msr_common(vcpu, msr_info);
 		break;
+
 	case MSR_IA32_BNDCFGS:
 		if (!kvm_mpx_supported() ||
 		    (!msr_info->host_initiated &&
@@ -1896,9 +1921,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 							MSR_TYPE_W);
 		break;
 	case MSR_IA32_CR_PAT:
+		if (!kvm_pat_valid(data))
+			return 1;
+
+		if (is_guest_mode(vcpu) &&
+		    get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
+			get_vmcs12(vcpu)->guest_ia32_pat = data;
+
 		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
-			if (!kvm_pat_valid(data))
-				return 1;
 			vmcs_write64(GUEST_IA32_PAT, data);
 			vcpu->arch.pat = data;
 			break;
@@ -1932,7 +1962,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		return vmx_set_vmx_msr(vcpu, msr_index, data);
 	case MSR_IA32_XSS:
-		if (!vmx_xsaves_supported())
+		if (!vmx_xsaves_supported() ||
+		    (!msr_info->host_initiated &&
+		     !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+		       guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
 			return 1;
 		/*
 		 * The only supported bit as of Skylake is bit 8, but
@@ -2435,6 +2468,7 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 		return -ENOMEM;

 	loaded_vmcs->shadow_vmcs = NULL;
+	loaded_vmcs->hv_timer_soft_disabled = false;
 	loaded_vmcs_init(loaded_vmcs);

 	if (cpu_has_vmx_msr_bitmap()) {
@@ -2455,6 +2489,8 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 	}

 	memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
+	memset(&loaded_vmcs->controls_shadow, 0,
+	       sizeof(struct vmcs_controls_shadow));

 	return 0;

@@ -2737,7 +2773,7 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
 		      (unsigned long *)&vcpu->arch.regs_dirty))
 		return;

-	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+	if (is_pae_paging(vcpu)) {
 		vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
 		vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
 		vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
@@ -2749,7 +2785,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

-	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+	if (is_pae_paging(vcpu)) {
 		mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
 		mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
 		mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
@@ -2766,22 +2802,20 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
 					unsigned long cr0,
 					struct kvm_vcpu *vcpu)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
 	if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
 		vmx_decache_cr3(vcpu);
 	if (!(cr0 & X86_CR0_PG)) {
 		/* From paging/starting to nonpaging */
-		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
-			     vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
-			     (CPU_BASED_CR3_LOAD_EXITING |
-			      CPU_BASED_CR3_STORE_EXITING));
+		exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
+					  CPU_BASED_CR3_STORE_EXITING);
 		vcpu->arch.cr0 = cr0;
 		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
 	} else if (!is_paging(vcpu)) {
 		/* From nonpaging to paging */
-		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
-			     vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
-			     ~(CPU_BASED_CR3_LOAD_EXITING |
-			       CPU_BASED_CR3_STORE_EXITING));
+		exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
+					    CPU_BASED_CR3_STORE_EXITING);
 		vcpu->arch.cr0 = cr0;
 		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
 	}
@@ -2881,6 +2915,7 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)

 int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	/*
 	 * Pass through host's Machine Check Enable value to hw_cr4, which
 	 * is in force while we are in guest mode.  Do not let guests control
@@ -2891,20 +2926,19 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
 	if (enable_unrestricted_guest)
 		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
-	else if (to_vmx(vcpu)->rmode.vm86_active)
+	else if (vmx->rmode.vm86_active)
 		hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
 	else
 		hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;

 	if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
 		if (cr4 & X86_CR4_UMIP) {
-			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
-				SECONDARY_EXEC_DESC);
+			secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
 			hw_cr4 &= ~X86_CR4_UMIP;
 		} else if (!is_guest_mode(vcpu) ||
-			!nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
-			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
-					SECONDARY_EXEC_DESC);
+			!nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
+			secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
+		}
 	}

 	if (cr4 & X86_CR4_VMXE) {
@@ -2919,7 +2953,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 			return 1;
 	}

-	if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
+	if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
 		return 1;

 	vcpu->arch.cr4 = cr4;
@@ -3537,7 +3571,7 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
 	u8 mode = 0;

 	if (cpu_has_secondary_exec_ctrls() &&
-	    (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+	    (secondary_exec_controls_get(to_vmx(vcpu)) &
 	     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
 		mode |= MSR_BITMAP_MODE_X2APIC;
 		if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
@@ -3731,7 +3765,6 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 {
 	u32 low32, high32;
 	unsigned long tmpl;
-	struct desc_ptr dt;
 	unsigned long cr0, cr3, cr4;

 	cr0 = read_cr0();
@@ -3767,9 +3800,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
 	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */

-	store_idt(&dt);
-	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
-	vmx->host_idt_base = dt.address;
+	vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */

 	vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */

@@ -3798,7 +3829,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
 	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 }

-static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 {
 	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;

@@ -3808,8 +3839,9 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 	if (!enable_vnmi)
 		pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;

-	/* Enable the preemption timer dynamically */
-	pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+	if (!enable_preemption_timer)
+		pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
 	return pin_based_exec_ctrl;
 }

@@ -3817,14 +3849,14 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);

-	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
 	if (cpu_has_secondary_exec_ctrls()) {
 		if (kvm_vcpu_apicv_active(vcpu))
-			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+			secondary_exec_controls_setbit(vmx,
 				      SECONDARY_EXEC_APIC_REGISTER_VIRT |
 				      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 		else
-			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+			secondary_exec_controls_clearbit(vmx,
 					SECONDARY_EXEC_APIC_REGISTER_VIRT |
 					SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 	}
@@ -4015,15 +4047,14 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */

 	/* Control */
-	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
 	vmx->hv_deadline_tsc = -1;

-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
+	exec_controls_set(vmx, vmx_exec_control(vmx));

 	if (cpu_has_secondary_exec_ctrls()) {
 		vmx_compute_secondary_exec_control(vmx);
-		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-			     vmx->secondary_exec_control);
+		secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
 	}

 	if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
@@ -4081,10 +4112,10 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		++vmx->nmsrs;
 	}

-	vm_exit_controls_init(vmx, vmx_vmexit_ctrl());
+	vm_exit_controls_set(vmx, vmx_vmexit_ctrl());

 	/* 22.2.1, 20.8.1 */
-	vm_entry_controls_init(vmx, vmx_vmentry_ctrl());
+	vm_entry_controls_set(vmx, vmx_vmentry_ctrl());

 	vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
@@ -4208,8 +4239,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)

 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
-	vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
-		      CPU_BASED_VIRTUAL_INTR_PENDING);
+	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING);
 }

 static void enable_nmi_window(struct kvm_vcpu *vcpu)
@@ -4220,8 +4250,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 		return;
 	}

-	vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
-		      CPU_BASED_VIRTUAL_NMI_PENDING);
+	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING);
 }

 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -4442,11 +4471,11 @@ static void kvm_machine_check(void)

 static int handle_machine_check(struct kvm_vcpu *vcpu)
 {
-	/* already handled by vcpu_run */
+	/* handled by vmx_vcpu_run() */
 	return 1;
 }

-static int handle_exception(struct kvm_vcpu *vcpu)
+static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct kvm_run *kvm_run = vcpu->run;
@@ -4458,11 +4487,8 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 	vect_info = vmx->idt_vectoring_info;
 	intr_info = vmx->exit_intr_info;

-	if (is_machine_check(intr_info))
-		return handle_machine_check(vcpu);
-
-	if (is_nmi(intr_info))
-		return 1;  /* already handled by vmx_vcpu_run() */
+	if (is_machine_check(intr_info) || is_nmi(intr_info))
+		return 1; /* handled by handle_exception_nmi_irqoff() */

 	if (is_invalid_opcode(intr_info))
 		return handle_ud(vcpu);
@@ -4518,7 +4544,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 		dr6 = vmcs_readl(EXIT_QUALIFICATION);
 		if (!(vcpu->guest_debug &
 		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
-			vcpu->arch.dr6 &= ~15;
+			vcpu->arch.dr6 &= ~DR_TRAP_BITS;
 			vcpu->arch.dr6 |= dr6 | DR6_RTM;
 			if (is_icebp(intr_info))
 				skip_emulated_instruction(vcpu);
@@ -4763,7 +4789,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
 			vcpu->run->exit_reason = KVM_EXIT_DEBUG;
 			return 0;
 		} else {
-			vcpu->arch.dr6 &= ~15;
+			vcpu->arch.dr6 &= ~DR_TRAP_BITS;
 			vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
 			kvm_queue_exception(vcpu, DB_VECTOR);
 			return 1;
@@ -4771,8 +4797,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
 	}

 	if (vcpu->guest_debug == 0) {
-		vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
-				CPU_BASED_MOV_DR_EXITING);
+		exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);

 		/*
 		 * No more DR vmexits; force a reload of the debug registers
@@ -4816,7 +4841,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
 	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);

 	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
-	vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
+	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
 }

 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@ -4876,8 +4901,7 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)

 static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 {
-	vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
-			CPU_BASED_VIRTUAL_INTR_PENDING);
+	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING);

 	kvm_make_request(KVM_REQ_EVENT, vcpu);

@@ -5131,8 +5155,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 static int handle_nmi_window(struct kvm_vcpu *vcpu)
 {
 	WARN_ON_ONCE(!enable_vnmi);
-	vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
-			CPU_BASED_VIRTUAL_NMI_PENDING);
+	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING);
 	++vcpu->stat.nmi_window_exits;
 	kvm_make_request(KVM_REQ_EVENT, vcpu);

@@ -5144,7 +5167,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	enum emulation_result err = EMULATE_DONE;
 	int ret = 1;
-	u32 cpu_exec_ctrl;
 	bool intr_window_requested;
 	unsigned count = 130;

@@ -5155,8 +5177,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 	 */
 	WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);

-	cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-	intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
+	intr_window_requested = exec_controls_get(vmx) &
+				CPU_BASED_VIRTUAL_INTR_PENDING;

 	while (vmx->emulation_required && count-- != 0) {
 		if (intr_window_requested && vmx_interrupt_allowed(vcpu))
@@ -5342,7 +5364,8 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
 	 * is read even if it isn't needed (e.g., for type==all)
 	 */
 	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
-				vmx_instruction_info, false, &gva))
+				vmx_instruction_info, false,
+				sizeof(operand), &gva))
 		return 1;

 	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
@@ -5437,8 +5460,12 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)

 static int handle_preemption_timer(struct kvm_vcpu *vcpu)
 {
-	if (!to_vmx(vcpu)->req_immediate_exit)
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->req_immediate_exit &&
+	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
 		kvm_lapic_expired_hv_timer(vcpu);
+
 	return 1;
 }

@@ -5469,7 +5496,7 @@ static int handle_encls(struct kvm_vcpu *vcpu)
  * to be done to userspace and return 0.
  */
 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
-	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
+	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
 	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
 	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
 	[EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
@@ -5952,6 +5979,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)

 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 sec_exec_control;

 	if (!lapic_in_kernel(vcpu))
@@ -5963,11 +5991,11 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)

 	/* Postpone execution until vmcs01 is the current VMCS. */
 	if (is_guest_mode(vcpu)) {
-		to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
+		vmx->nested.change_vmcs01_virtual_apic_mode = true;
 		return;
 	}

-	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+	sec_exec_control = secondary_exec_controls_get(vmx);
 	sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 			      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);

@@ -5989,7 +6017,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
 			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 		break;
 	}
-	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
+	secondary_exec_controls_set(vmx, sec_exec_control);

 	vmx_update_msr_bitmap(vcpu);
 }
@@ -6107,76 +6135,81 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
 	memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
 }

-static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
+static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
 {
-	u32 exit_intr_info = 0;
-	u16 basic_exit_reason = (u16)vmx->exit_reason;
-
-	if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
-	      || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI))
-		return;
-
-	if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
-		exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-	vmx->exit_intr_info = exit_intr_info;
+	vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);

 	/* if exit due to PF check for async PF */
-	if (is_page_fault(exit_intr_info))
+	if (is_page_fault(vmx->exit_intr_info))
 		vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();

 	/* Handle machine checks before interrupts are enabled */
-	if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY ||
-	    is_machine_check(exit_intr_info))
+	if (is_machine_check(vmx->exit_intr_info))
 		kvm_machine_check();

 	/* We need to handle NMIs before interrupts are enabled */
-	if (is_nmi(exit_intr_info)) {
+	if (is_nmi(vmx->exit_intr_info)) {
 		kvm_before_interrupt(&vmx->vcpu);
 		asm("int $2");
 		kvm_after_interrupt(&vmx->vcpu);
 	}
 }

-static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
+static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 {
-	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
-	if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
-	    == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
-		unsigned int vector;
-		unsigned long entry;
-		gate_desc *desc;
-		struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned int vector;
+	unsigned long entry;
 #ifdef CONFIG_X86_64
 	unsigned long tmp;
 #endif
+	gate_desc *desc;
+	u32 intr_info;
+
+	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	if (WARN_ONCE(!is_external_intr(intr_info),
+	    "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
+		return;

-	vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
-	desc = (gate_desc *)vmx->host_idt_base + vector;
+	vector = intr_info & INTR_INFO_VECTOR_MASK;
+	desc = (gate_desc *)host_idt_base + vector;
 	entry = gate_offset(desc);
-	asm volatile(
+
+	kvm_before_interrupt(vcpu);
+
+	asm volatile(
 #ifdef CONFIG_X86_64
 		"mov %%" _ASM_SP ", %[sp]\n\t"
 		"and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
 		"push $%c[ss]\n\t"
 		"push %[sp]\n\t"
 #endif
 		"pushf\n\t"
 		__ASM_SIZE(push) " $%c[cs]\n\t"
 		CALL_NOSPEC
 		:
 #ifdef CONFIG_X86_64
 		[sp]"=&r"(tmp),
 #endif
 		ASM_CALL_CONSTRAINT
 		:
 		THUNK_TARGET(entry),
 		[ss]"i"(__KERNEL_DS),
 		[cs]"i"(__KERNEL_CS)
 	);
-	}
+
+	kvm_after_interrupt(vcpu);
+}
+STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
+
+static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+		handle_external_interrupt_irqoff(vcpu);
+	else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
+		handle_exception_nmi_irqoff(vmx);
 }
-STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);

 static bool vmx_has_emulated_msr(int index)
 {
@@ -6187,6 +6220,8 @@ static bool vmx_has_emulated_msr(int index)
 		 * real mode.
 		 */
 		return enable_unrestricted_guest || emulate_invalid_guest_state;
+	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+		return nested;
 	case MSR_AMD64_VIRT_SPEC_CTRL:
 		/* This is AMD only.  */
 		return false;
@@ -6332,15 +6367,6 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 					msrs[i].host, false);
 }

-static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
-{
-	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
-	if (!vmx->loaded_vmcs->hv_timer_armed)
-		vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
-			      PIN_BASED_VMX_PREEMPTION_TIMER);
-	vmx->loaded_vmcs->hv_timer_armed = true;
-}
-
 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6348,11 +6374,9 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
 	u32 delta_tsc;

 	if (vmx->req_immediate_exit) {
-		vmx_arm_hv_timer(vmx, 0);
-		return;
-	}
-
-	if (vmx->hv_deadline_tsc != -1) {
+		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
+		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
+	} else if (vmx->hv_deadline_tsc != -1) {
 		tscl = rdtsc();
 		if (vmx->hv_deadline_tsc > tscl)
 			/* set_hv_timer ensures the delta fits in 32-bits */
@@ -6361,14 +6385,12 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
 		else
 			delta_tsc = 0;

-		vmx_arm_hv_timer(vmx, delta_tsc);
-		return;
+		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
+	} else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
+		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
+		vmx->loaded_vmcs->hv_timer_soft_disabled = true;
 	}
-
-	if (vmx->loaded_vmcs->hv_timer_armed)
-		vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
-				PIN_BASED_VMX_PREEMPTION_TIMER);
-	vmx->loaded_vmcs->hv_timer_armed = false;
 }

 void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
@@ -6401,8 +6423,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		vmcs_write32(PLE_WINDOW, vmx->ple_window);
 	}

-	if (vmx->nested.need_vmcs12_sync)
-		nested_sync_from_vmcs12(vcpu);
+	if (vmx->nested.need_vmcs12_to_shadow_sync)
+		nested_sync_vmcs12_to_shadow(vcpu);

 	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
 		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@ -6440,7 +6462,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)

 	atomic_switch_perf_msrs(vmx);

-	vmx_update_hv_timer(vcpu);
+	if (enable_preemption_timer)
+		vmx_update_hv_timer(vcpu);
+
+	if (lapic_in_kernel(vcpu) &&
+		vcpu->arch.apic->lapic_timer.timer_advance_ns)
+		kvm_wait_lapic_expire(vcpu);

 	/*
 	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -6533,13 +6560,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vmx->idt_vectoring_info = 0;

 	vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
+	if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
+		kvm_machine_check();
+
 	if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
 		return;

 	vmx->loaded_vmcs->launched = 1;
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

-	vmx_complete_atomic_exit(vmx);
 	vmx_recover_nmi_blocking(vmx);
 	vmx_complete_interrupts(vmx);
 }
@@ -6630,6 +6659,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
 	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
 	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+	if (kvm_cstate_in_guest(kvm)) {
+		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
+		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+	}
 	vmx->msr_bitmap_mode = 0;

 	vmx->loaded_vmcs = &vmx->vmcs01;
@@ -6726,22 +6761,22 @@ static int vmx_vm_init(struct kvm *kvm)
 	return 0;
 }

-static void __init vmx_check_processor_compat(void *rtn)
+static int __init vmx_check_processor_compat(void)
 {
 	struct vmcs_config vmcs_conf;
 	struct vmx_capability vmx_cap;

-	*(int *)rtn = 0;
 	if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
-		*(int *)rtn = -EIO;
+		return -EIO;
 	if (nested)
 		nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
 					   enable_apicv);
 	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
 		printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
 				smp_processor_id());
-		*(int *)rtn = -EIO;
+		return -EIO;
 	}
+	return 0;
 }

 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
@@ -6795,7 +6830,7 @@ static int vmx_get_lpage_level(void)
 		return PT_PDPE_LEVEL;
 }

-static void vmcs_set_secondary_exec_control(u32 new_ctl)
+static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
 {
 	/*
 	 * These bits in the secondary execution controls field
@@ -6809,10 +6844,10 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl)
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 		SECONDARY_EXEC_DESC;

-	u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+	u32 new_ctl = vmx->secondary_exec_control;
+	u32 cur_ctl = secondary_exec_controls_get(vmx);

-	vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-		     (new_ctl & ~mask) | (cur_ctl & mask));
+	secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
 }

 /*
@@ -6950,7 +6985,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)

 	if (cpu_has_secondary_exec_ctrls()) {
 		vmx_compute_secondary_exec_control(vmx);
-		vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
+		vmcs_set_secondary_exec_control(vmx);
 	}

 	if (nested_vmx_allowed(vcpu))
@@ -7424,10 +7459,14 @@ static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
 static __init int hardware_setup(void)
 {
 	unsigned long host_bndcfgs;
+	struct desc_ptr dt;
 	int r, i;

 	rdmsrl_safe(MSR_EFER, &host_efer);

+	store_idt(&dt);
+	host_idt_base = dt.address;
+
 	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
 		kvm_define_shared_msr(i, vmx_msr_index[i]);

@@ -7531,17 +7570,33 @@ static __init int hardware_setup(void)
 	}

 	if (!cpu_has_vmx_preemption_timer())
-		kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
+		enable_preemption_timer = false;

-	if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+	if (enable_preemption_timer) {
+		u64 use_timer_freq = 5000ULL * 1000 * 1000;
 		u64 vmx_msr;

 		rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
 		cpu_preemption_timer_multi =
 			vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
-	} else {
+
+		if (tsc_khz)
+			use_timer_freq = (u64)tsc_khz * 1000;
+		use_timer_freq >>= cpu_preemption_timer_multi;
+
+		/*
+		 * KVM "disables" the preemption timer by setting it to its max
+		 * value.  Don't use the timer if it might cause spurious exits
+		 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
+		 */
+		if (use_timer_freq > 0xffffffffu / 10)
+			enable_preemption_timer = false;
+	}
+
+	if (!enable_preemption_timer) {
 		kvm_x86_ops->set_hv_timer = NULL;
 		kvm_x86_ops->cancel_hv_timer = NULL;
+		kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
 	}

 	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
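To make the 0.1 Hz bound in the hunk above concrete, here is a small standalone sketch of the same arithmetic with made-up host values (a 2 GHz TSC and a preemption-timer rate shift of 5 are assumptions for illustration, not values from the patch):

#include <stdint.h>
#include <stdio.h>

#define TSC_KHZ			2000000ULL	/* hypothetical: 2 GHz TSC */
#define TIMER_RATE_SHIFT	5		/* hypothetical MSR_IA32_VMX_MISC[4:0] */

int main(void)
{
	/* Preemption timer ticks at the TSC rate divided by 2^shift. */
	uint64_t use_timer_freq = (uint64_t)TSC_KHZ * 1000 >> TIMER_RATE_SHIFT;

	/* A timer parked at its 32-bit max value fires after this many seconds. */
	double max_period = (double)0xffffffffu / use_timer_freq;

	printf("timer freq: %llu Hz, max period: %.1f s, %s\n",
	       (unsigned long long)use_timer_freq, max_period,
	       use_timer_freq > 0xffffffffu / 10 ? "disable timer" : "keep timer");
	return 0;
}

With these numbers the timer runs at 62.5 MHz, so a value of 0xffffffff only expires after roughly 68.7 seconds, which is slower than once per 10 seconds, and the preemption timer stays enabled.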
@@ -7683,7 +7738,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.set_tdp_cr3 = vmx_set_cr3,

 	.check_intercept = vmx_check_intercept,
-	.handle_external_intr = vmx_handle_external_intr,
+	.handle_exit_irqoff = vmx_handle_exit_irqoff,
 	.mpx_supported = vmx_mpx_supported,
 	.xsaves_supported = vmx_xsaves_supported,
 	.umip_emulated = vmx_umip_emulated,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 61128b48c503..82d0bc3a4d52 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -109,14 +109,21 @@ struct nested_vmx {
 	 * to guest memory during VM exit.
 	 */
 	struct vmcs12 *cached_shadow_vmcs12;
+
 	/*
 	 * Indicates if the shadow vmcs or enlightened vmcs must be updated
 	 * with the data held by struct vmcs12.
 	 */
-	bool need_vmcs12_sync;
+	bool need_vmcs12_to_shadow_sync;
 	bool dirty_vmcs12;

 	/*
+	 * Indicates lazily loaded guest state has not yet been decached from
+	 * vmcs02.
+	 */
+	bool need_sync_vmcs02_to_vmcs12_rare;
+
+	/*
 	 * vmcs02 has been initialized, i.e. state that is constant for
 	 * vmcs02 has been written to the backing VMCS.  Initialization
 	 * is delayed until L1 actually attempts to run a nested VM.
@@ -180,14 +187,24 @@ struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	u8                    fail;
 	u8		      msr_bitmap_mode;
+
+	/*
+	 * If true, host state has been stored in vmx->loaded_vmcs for
+	 * the CPU registers that only need to be switched when transitioning
+	 * to/from the kernel, and the registers have been loaded with guest
+	 * values.  If false, host state is loaded in the CPU registers
+	 * and vmx->loaded_vmcs->host_state is invalid.
+	 */
+	bool		      guest_state_loaded;
+
 	u32                   exit_intr_info;
 	u32                   idt_vectoring_info;
 	ulong                 rflags;
+
 	struct shared_msr_entry *guest_msrs;
 	int                   nmsrs;
 	int                   save_nmsrs;
-	bool                  guest_msrs_dirty;
-	unsigned long	      host_idt_base;
+	bool                  guest_msrs_ready;
 #ifdef CONFIG_X86_64
 	u64		      msr_host_kernel_gs_base;
 	u64		      msr_guest_kernel_gs_base;
@@ -195,21 +212,15 @@ struct vcpu_vmx {

 	u64		      spec_ctrl;

-	u32 vm_entry_controls_shadow;
-	u32 vm_exit_controls_shadow;
 	u32 secondary_exec_control;

 	/*
 	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
 	 * non-nested (L1) guest, it always points to vmcs01. For a nested
-	 * guest (L2), it points to a different VMCS. loaded_cpu_state points
-	 * to the VMCS whose state is loaded into the CPU registers that only
-	 * need to be switched when transitioning to/from the kernel; a NULL
-	 * value indicates that host state is loaded.
+	 * guest (L2), it points to a different VMCS.
 	 */
 	struct loaded_vmcs    vmcs01;
 	struct loaded_vmcs   *loaded_vmcs;
-	struct loaded_vmcs   *loaded_cpu_state;

 	struct msr_autoload {
 		struct vmx_msrs guest;
@@ -260,8 +271,6 @@ struct vcpu_vmx {

 	unsigned long host_debugctlmsr;

-	u64 msr_ia32_power_ctl;
-
 	/*
 	 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
 	 * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
@@ -292,12 +301,14 @@ struct kvm_vmx {
 };

 bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu);
 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
-void vmx_vcpu_put(struct kvm_vcpu *vcpu);
 int allocate_vpid(void);
 void free_vpid(int vpid);
 void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
+void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
+			unsigned long fs_base, unsigned long gs_base);
 int vmx_get_cpl(struct kvm_vcpu *vcpu);
 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -376,69 +387,31 @@ static inline u8 vmx_get_rvi(void)
 	return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
 }

-static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
-{
-	vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
-}
-
-static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
-{
-	vmcs_write32(VM_ENTRY_CONTROLS, val);
-	vmx->vm_entry_controls_shadow = val;
-}
-
-static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
-{
-	if (vmx->vm_entry_controls_shadow != val)
-		vm_entry_controls_init(vmx, val);
-}
-
-static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
-{
-	return vmx->vm_entry_controls_shadow;
-}
-
-static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
-{
-	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
-}
-
-static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
-{
-	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
-}
-
-static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
-{
-	vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
-}
-
-static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
-{
-	vmcs_write32(VM_EXIT_CONTROLS, val);
-	vmx->vm_exit_controls_shadow = val;
-}
-
-static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
-{
-	if (vmx->vm_exit_controls_shadow != val)
-		vm_exit_controls_init(vmx, val);
-}
-
-static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
-{
-	return vmx->vm_exit_controls_shadow;
-}
-
-static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
-{
-	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
-}
-
-static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
-{
-	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
+#define BUILD_CONTROLS_SHADOW(lname, uname) \
+static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \
+{ \
+	if (vmx->loaded_vmcs->controls_shadow.lname != val) { \
+		vmcs_write32(uname, val); \
+		vmx->loaded_vmcs->controls_shadow.lname = val; \
+	} \
+} \
+static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \
+{ \
+	return vmx->loaded_vmcs->controls_shadow.lname; \
+} \
+static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \
+{ \
+	lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \
+} \
+static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \
+{ \
+	lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \
 }
+BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS)
+BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS)
+BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL)
+BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL)
+BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL)

 static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
 {
@@ -468,6 +441,7 @@ static inline u32 vmx_vmexit_ctrl(void)
 }

 u32 vmx_exec_control(struct vcpu_vmx *vmx);
+u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx);

 static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
 {
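Each BUILD_CONTROLS_SHADOW() invocation above generates a family of accessors that cache the last value written to one VMCS control field in the loaded_vmcs, so redundant VMWRITEs are skipped and the current value can be read back without a VMREAD. As a hand-written expansion of the pin-based case, purely to show what the macro produces:

/* Hand expansion of BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL). */
static inline void pin_controls_set(struct vcpu_vmx *vmx, u32 val)
{
	if (vmx->loaded_vmcs->controls_shadow.pin != val) {
		vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, val);
		vmx->loaded_vmcs->controls_shadow.pin = val;
	}
}

static inline u32 pin_controls_get(struct vcpu_vmx *vmx)
{
	return vmx->loaded_vmcs->controls_shadow.pin;
}

static inline void pin_controls_setbit(struct vcpu_vmx *vmx, u32 val)
{
	pin_controls_set(vmx, pin_controls_get(vmx) | val);
}

static inline void pin_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
{
	pin_controls_set(vmx, pin_controls_get(vmx) & ~val);
}

Callers in the vmx.c hunks earlier in this diff (for example vmx_refresh_apicv_exec_ctrl() and the *_window handlers) now go through these helpers instead of paired vmcs_read32()/vmcs_write32() calls.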
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 63bb1ee8258e..4a0b74ecd1de 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -717,7 +717,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
 	gfn_t gfn;
 	int r;

-	if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu))
+	if (!is_pae_paging(vcpu))
 		return false;

 	if (!test_bit(VCPU_EXREG_PDPTR,
@@ -960,8 +960,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	if (is_long_mode(vcpu) &&
 	    (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
 		return 1;
-	else if (is_pae(vcpu) && is_paging(vcpu) &&
-		 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+	else if (is_pae_paging(vcpu) &&
+		 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
 		return 1;

 	kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
@@ -1174,7 +1174,28 @@ static u32 emulated_msrs[] = {
 	MSR_AMD64_VIRT_SPEC_CTRL,
 	MSR_IA32_POWER_CTL,

+	/*
+	 * The following list leaves out MSRs whose values are determined
+	 * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
+	 * We always support the "true" VMX control MSRs, even if the host
+	 * processor does not, so I am putting these registers here rather
+	 * than in msrs_to_save.
+	 */
+	MSR_IA32_VMX_BASIC,
+	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+	MSR_IA32_VMX_TRUE_EXIT_CTLS,
+	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+	MSR_IA32_VMX_MISC,
+	MSR_IA32_VMX_CR0_FIXED0,
+	MSR_IA32_VMX_CR4_FIXED0,
+	MSR_IA32_VMX_VMCS_ENUM,
+	MSR_IA32_VMX_PROCBASED_CTLS2,
+	MSR_IA32_VMX_EPT_VPID_CAP,
+	MSR_IA32_VMX_VMFUNC,
+
 	MSR_K7_HWCR,
+	MSR_KVM_POLL_CONTROL,
 };

 static unsigned num_emulated_msrs;
@@ -1210,11 +1231,12 @@ static u32 msr_based_features[] = {

 static unsigned int num_msr_based_features;

-u64 kvm_get_arch_capabilities(void)
+static u64 kvm_get_arch_capabilities(void)
 {
-	u64 data;
+	u64 data = 0;

-	rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
+	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);

 	/*
 	 * If we're doing cache flushes (either "always" or "cond")
@@ -1230,7 +1252,6 @@ u64 kvm_get_arch_capabilities(void)

 	return data;
 }
-EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);

 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
 {
@@ -2545,13 +2566,24 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		}
 		break;
 	case MSR_IA32_MISC_ENABLE:
-		vcpu->arch.ia32_misc_enable_msr = data;
+		if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
+		    ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
+			if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
+				return 1;
+			vcpu->arch.ia32_misc_enable_msr = data;
+			kvm_update_cpuid(vcpu);
+		} else {
+			vcpu->arch.ia32_misc_enable_msr = data;
+		}
 		break;
 	case MSR_IA32_SMBASE:
 		if (!msr_info->host_initiated)
 			return 1;
 		vcpu->arch.smbase = data;
 		break;
+	case MSR_IA32_POWER_CTL:
+		vcpu->arch.msr_ia32_power_ctl = data;
+		break;
 	case MSR_IA32_TSC:
 		kvm_write_tsc(vcpu, msr_info);
 		break;
@@ -2626,6 +2658,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		break;
 
+	case MSR_KVM_POLL_CONTROL:
+		/* only enable bit supported */
+		if (data & (-1ULL << 1))
+			return 1;
+
+		vcpu->arch.msr_kvm_poll_control = data;
+		break;
+
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
@@ -2803,6 +2843,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		msr_info->data = vcpu->arch.arch_capabilities;
 		break;
+	case MSR_IA32_POWER_CTL:
+		msr_info->data = vcpu->arch.msr_ia32_power_ctl;
+		break;
 	case MSR_IA32_TSC:
 		msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
 		break;
@@ -2875,6 +2918,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_KVM_PV_EOI_EN:
 		msr_info->data = vcpu->arch.pv_eoi.msr_val;
 		break;
+	case MSR_KVM_POLL_CONTROL:
+		msr_info->data = vcpu->arch.msr_kvm_poll_control;
+		break;
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
 	case MSR_IA32_MCG_CAP:
@@ -3084,6 +3130,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_SET_BOOT_CPU_ID:
 	case KVM_CAP_SPLIT_IRQCHIP:
 	case KVM_CAP_IMMEDIATE_EXIT:
+	case KVM_CAP_PMU_EVENT_FILTER:
 	case KVM_CAP_GET_MSR_FEATURES:
 	case KVM_CAP_MSR_PLATFORM_INFO:
 	case KVM_CAP_EXCEPTION_PAYLOAD:
@@ -3096,7 +3143,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 			r = KVM_CLOCK_TSC_STABLE;
 		break;
 	case KVM_CAP_X86_DISABLE_EXITS:
-		r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
+		r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
+		     KVM_X86_DISABLE_EXITS_CSTATE;
 		if(kvm_can_mwait_in_guest())
 			r |= KVM_X86_DISABLE_EXITS_MWAIT;
 		break;
@@ -4613,6 +4661,8 @@ split_irqchip_unlock:
 			kvm->arch.hlt_in_guest = true;
 		if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
 			kvm->arch.pause_in_guest = true;
+		if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
+			kvm->arch.cstate_in_guest = true;
 		r = 0;
 		break;
 	case KVM_CAP_MSR_PLATFORM_INFO:
@@ -4927,6 +4977,9 @@ set_identity_unlock:
 		r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
 		break;
 	}
+	case KVM_SET_PMU_EVENT_FILTER:
+		r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
+		break;
 	default:
 		r = -ENOTTY;
 	}
@@ -6379,7 +6432,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
 					   vcpu->arch.db);
 
 	if (dr6 != 0) {
-		vcpu->arch.dr6 &= ~15;
+		vcpu->arch.dr6 &= ~DR_TRAP_BITS;
 		vcpu->arch.dr6 |= dr6 | DR6_RTM;
 		kvm_queue_exception(vcpu, DB_VECTOR);
 		*r = EMULATE_DONE;
@@ -6706,7 +6759,7 @@ static void kvm_hyperv_tsc_notifier(void)
 	struct kvm_vcpu *vcpu;
 	int cpu;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_make_mclock_inprogress_request(kvm);
 
@@ -6732,7 +6785,7 @@ static void kvm_hyperv_tsc_notifier(void)
 
 		spin_unlock(&ka->pvclock_gtod_sync_lock);
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 }
 #endif
 
@@ -6783,17 +6836,17 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
 
 	smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			if (vcpu->cpu != cpu)
 				continue;
 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-			if (vcpu->cpu != smp_processor_id())
+			if (vcpu->cpu != raw_smp_processor_id())
 				send_ipi = 1;
 		}
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	if (freq->old < freq->new && send_ipi) {
 		/*
@@ -6908,35 +6961,6 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
 	.handle_intel_pt_intr = kvm_handle_intel_pt_intr,
 };
 
-static void kvm_set_mmio_spte_mask(void)
-{
-	u64 mask;
-	int maxphyaddr = boot_cpu_data.x86_phys_bits;
-
-	/*
-	 * Set the reserved bits and the present bit of an paging-structure
-	 * entry to generate page fault with PFER.RSV = 1.
-	 */
-
-	/*
-	 * Mask the uppermost physical address bit, which would be reserved as
-	 * long as the supported physical address width is less than 52.
-	 */
-	mask = 1ull << 51;
-
-	/* Set the present bit. */
-	mask |= 1ull;
-
-	/*
-	 * If reserved bit is not supported, clear the present bit to disable
-	 * mmio page fault.
-	 */
-	if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52)
-		mask &= ~1ull;
-
-	kvm_mmu_set_mmio_spte_mask(mask, mask);
-}
-
 #ifdef CONFIG_X86_64
 static void pvclock_gtod_update_fn(struct work_struct *work)
 {
@@ -6945,12 +6969,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
 	struct kvm_vcpu *vcpu;
 	int i;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_for_each_vcpu(i, vcpu, kvm)
 			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 	atomic_set(&kvm_guest_has_master_clock, 0);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 }
 
 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
@@ -7033,8 +7057,6 @@ int kvm_arch_init(void *opaque)
 	if (r)
 		goto out_free_percpu;
 
-	kvm_set_mmio_spte_mask();
-
 	kvm_x86_ops = ops;
 
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
@@ -7173,6 +7195,23 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
 }
 
+static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
+{
+	struct kvm_vcpu *target = NULL;
+	struct kvm_apic_map *map;
+
+	rcu_read_lock();
+	map = rcu_dereference(kvm->arch.apic_map);
+
+	if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
+		target = map->phys_map[dest_id]->vcpu;
+
+	rcu_read_unlock();
+
+	if (target)
+		kvm_vcpu_yield_to(target);
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
 	unsigned long nr, a0, a1, a2, a3, ret;
@@ -7219,6 +7258,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	case KVM_HC_SEND_IPI:
 		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
 		break;
+	case KVM_HC_SCHED_YIELD:
+		kvm_sched_yield(vcpu->kvm, a0);
+		ret = 0;
+		break;
 	default:
 		ret = -KVM_ENOSYS;
 		break;
@@ -7951,9 +7994,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	}
 
 	trace_kvm_entry(vcpu->vcpu_id);
-	if (lapic_in_kernel(vcpu) &&
-	    vcpu->arch.apic->lapic_timer.timer_advance_ns)
-		wait_lapic_expire(vcpu);
 	guest_enter_irqoff();
 
 	fpregs_assert_state_consistent();
@@ -8002,13 +8042,29 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
 
-	kvm_before_interrupt(vcpu);
-	kvm_x86_ops->handle_external_intr(vcpu);
-	kvm_after_interrupt(vcpu);
+	kvm_x86_ops->handle_exit_irqoff(vcpu);
 
+	/*
+	 * Consume any pending interrupts, including the possible source of
+	 * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
+	 * An instruction is required after local_irq_enable() to fully unblock
+	 * interrupts on processors that implement an interrupt shadow, the
+	 * stat.exits increment will do nicely.
+	 */
+	kvm_before_interrupt(vcpu);
+	local_irq_enable();
 	++vcpu->stat.exits;
+	local_irq_disable();
+	kvm_after_interrupt(vcpu);
 
 	guest_exit_irqoff();
+	if (lapic_in_kernel(vcpu)) {
+		s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
+		if (delta != S64_MIN) {
+			trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
+			vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
+		}
+	}
 
 	local_irq_enable();
 	preempt_enable();
@@ -8594,7 +8650,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	kvm_update_cpuid(vcpu);
 
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
-	if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) {
+	if (is_pae_paging(vcpu)) {
 		load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
 		mmu_reset_needed = 1;
 	}
@@ -8875,6 +8931,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 	msr.host_initiated = true;
 	kvm_write_tsc(vcpu, &msr);
 	vcpu_put(vcpu);
+
+	/* poll control enabled by default */
+	vcpu->arch.msr_kvm_poll_control = 1;
+
 	mutex_unlock(&vcpu->mutex);
 
 	if (!kvmclock_periodic_sync)
@@ -9107,9 +9167,9 @@ void kvm_arch_hardware_unsetup(void)
 	kvm_x86_ops->hardware_unsetup();
 }
 
-void kvm_arch_check_processor_compat(void *rtn)
+int kvm_arch_check_processor_compat(void)
 {
-	kvm_x86_ops->check_processor_compatibility(rtn);
+	return kvm_x86_ops->check_processor_compatibility();
 }
 
 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
@@ -9381,6 +9441,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_ioapic_destroy(kvm);
 	kvm_free_vcpus(kvm);
 	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+	kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
 	kvm_mmu_uninit_vm(kvm);
 	kvm_page_track_cleanup(kvm);
 	kvm_hv_destroy_vm(kvm);
@@ -9789,6 +9850,36 @@ static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
 				      sizeof(u32));
 }
 
+static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
+		return false;
+
+	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
+	    (vcpu->arch.apf.send_user_only &&
+	     kvm_x86_ops->get_cpl(vcpu) == 0))
+		return false;
+
+	return true;
+}
+
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+	if (unlikely(!lapic_in_kernel(vcpu) ||
+		     kvm_event_needs_reinjection(vcpu) ||
+		     vcpu->arch.exception.pending))
+		return false;
+
+	if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
+		return false;
+
+	/*
+	 * If interrupts are off we cannot even use an artificial
+	 * halt state.
+	 */
+	return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 				     struct kvm_async_pf *work)
 {
@@ -9797,11 +9888,8 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 	trace_kvm_async_pf_not_present(work->arch.token, work->gva);
 	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
 
-	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
-	    (vcpu->arch.apf.send_user_only &&
-	     kvm_x86_ops->get_cpl(vcpu) == 0))
-		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
-	else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+	if (kvm_can_deliver_async_pf(vcpu) &&
+	    !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
 		fault.vector = PF_VECTOR;
 		fault.error_code_valid = true;
 		fault.error_code = 0;
@@ -9809,6 +9897,16 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 		fault.address = work->arch.token;
 		fault.async_page_fault = true;
 		kvm_inject_page_fault(vcpu, &fault);
+	} else {
+		/*
+		 * It is not possible to deliver a paravirtualized asynchronous
+		 * page fault, but putting the guest in an artificial halt state
+		 * can be beneficial nevertheless: if an interrupt arrives, we
+		 * can deliver it timely and perhaps the guest will schedule
+		 * another process.  When the instruction that triggered a page
+		 * fault is retried, hopefully the page will be ready in the host.
+		 */
+		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
 	}
 }
 
@@ -9949,6 +10047,13 @@ bool kvm_vector_hashing_enabled(void)
 }
 EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
 
+bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
+}
+EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
+
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a470ff0868c5..e08a12892e8b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -139,6 +139,11 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG));
 }
 
+static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
+{
+	return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu);
+}
+
 static inline u32 bit(int bitno)
 {
 	return 1 << (bitno & 31);
@@ -333,6 +338,11 @@ static inline bool kvm_pause_in_guest(struct kvm *kvm)
 	return kvm->arch.pause_in_guest;
 }
 
+static inline bool kvm_cstate_in_guest(struct kvm *kvm)
+{
+	return kvm->arch.cstate_in_guest;
+}
+
 DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
 
 static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)