summary | refs | log | tree | commit | diff | stats
path: root/arch/x86
diff options: context / space / mode
author:    Sean Christopherson <sean.j.christopherson@intel.com>  2019-06-07 14:55:34 -0400
committer: Paolo Bonzini <pbonzini@redhat.com>  2019-07-05 07:57:06 -0400
commit:    f087a02941feacf7d6f097522bc67c602fda18e6 (patch)
tree:      ddc9dd0ae59db04d5b619269694261cbec9ed446 /arch/x86
parent:    335e192a3fa415e1202c8b9ecdaaecd643f823cc (diff)
KVM: nVMX: Stash L1's CR3 in vmcs01.GUEST_CR3 on nested entry w/o EPT
KVM does not have 100% coverage of VMX consistency checks, i.e. some checks that cause VM-Fail may only be detected by hardware during a nested VM-Entry. In such a case, KVM must restore L1's state to the pre-VM-Enter state as L2's state has already been loaded into KVM's software model. L1's CR3 and PDPTRs in particular are loaded from vmcs01.GUEST_*. But when EPT is disabled, the associated fields hold KVM's shadow values, not L1's "real" values. Fortunately, when EPT is disabled the PDPTRs come from memory, i.e. are not cached in the VMCS. Which leaves CR3 as the sole anomaly. A previously applied workaround to handle CR3 was to force nested early checks if EPT is disabled: commit 2b27924bb1d48 ("KVM: nVMX: always use early vmcs check when EPT is disabled") Forcing nested early checks is undesirable as doing so adds hundreds of cycles to every nested VM-Entry. Rather than take this performance hit, handle CR3 by overwriting vmcs01.GUEST_CR3 with L1's CR3 during nested VM-Entry when EPT is disabled *and* nested early checks are disabled. By stuffing vmcs01.GUEST_CR3, nested_vmx_restore_host_state() will naturally restore the correct vcpu->arch.cr3 from vmcs01.GUEST_CR3. These shenanigans work because nested_vmx_restore_host_state() does a full kvm_mmu_reset_context(), i.e. unloads the current MMU, which guarantees vmcs01.GUEST_CR3 will be rewritten with a new shadow CR3 prior to re-entering L1. vcpu->arch.root_mmu.root_hpa is set to INVALID_PAGE via: nested_vmx_restore_host_state() -> kvm_mmu_reset_context() -> kvm_mmu_unload() -> kvm_mmu_free_roots() kvm_mmu_unload() has WARN_ON(root_hpa != INVALID_PAGE), i.e. we can bank on 'root_hpa == INVALID_PAGE' unless the implementation of kvm_mmu_reset_context() is changed. 
On the way into L1, VMCS.GUEST_CR3 is guaranteed to be written (on a successful entry) via: vcpu_enter_guest() -> kvm_mmu_reload() -> kvm_mmu_load() -> kvm_mmu_load_cr3() -> vmx_set_cr3() Stuff vmcs01.GUEST_CR3 if and only if nested early checks are disabled as a "late" VM-Fail should never happen in that case (KVM WARNs), and the conditional write avoids the need to restore the correct GUEST_CR3 when nested_vmx_check_vmentry_hw() fails. Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> Message-Id: <20190607185534.24368-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h  1
-rw-r--r--  arch/x86/kvm/vmx/nested.c        44
2 files changed, 23 insertions, 22 deletions
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d213ec5c3766..f0b0c90dd398 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -146,7 +146,6 @@
 
 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL       2
-#define VMX_ABORT_VMCS_CORRUPTED             3
 #define VMX_ABORT_LOAD_HOST_MSR_FAIL         4
 
 #endif /* _UAPIVMX_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 118d185764ec..d125304ae2c9 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2978,6 +2978,25 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
 		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 
+	/*
+	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
+	 * nested early checks are disabled. In the event of a "late" VM-Fail,
+	 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
+	 * software model to the pre-VMEntry host state. When EPT is disabled,
+	 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
+	 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
+	 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
+	 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
+	 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
+	 * guaranteed to be overwritten with a shadow CR3 prior to re-entering
+	 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
+	 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
+	 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
+	 * path would need to manually save/restore vmcs01.GUEST_CR3.
+	 */
+	if (!enable_ept && !nested_early_check)
+		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
+
 	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
 
 	prepare_vmcs02_early(vmx, vmcs12);
@@ -3869,18 +3888,8 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
 	vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
 
 	nested_ept_uninit_mmu_context(vcpu);
-
-	/*
-	 * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3
-	 * points to shadow pages!  Fortunately we only get here after a WARN_ON
-	 * if EPT is disabled, so a VMabort is perfectly fine.
-	 */
-	if (enable_ept) {
-		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
-		__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
-	} else {
-		nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED);
-	}
+	vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
 
 	/*
 	 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
@@ -3888,7 +3897,8 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
 	 * VMFail, like everything else we just need to ensure our
 	 * software model is up-to-date.
 	 */
-	ept_save_pdptrs(vcpu);
+	if (enable_ept)
+		ept_save_pdptrs(vcpu);
 
 	kvm_mmu_reset_context(vcpu);
 
@@ -5889,14 +5899,6 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 {
 	int i;
 
-	/*
-	 * Without EPT it is not possible to restore L1's CR3 and PDPTR on
-	 * VMfail, because they are not available in vmcs01. Just always
-	 * use hardware checks.
-	 */
-	if (!enable_ept)
-		nested_early_check = 1;
-
 	if (!cpu_has_vmx_shadow_vmcs())
 		enable_shadow_vmcs = 0;
 	if (enable_shadow_vmcs) {