author		Nadav Har'El <nyh@il.ibm.com>	2013-08-05 04:07:17 -0400
committer	Paolo Bonzini <pbonzini@redhat.com>	2013-08-07 09:57:42 -0400
commit		bfd0a56b90005f8c8a004baf407ad90045c2b11e (patch)
tree		c1e3a6e26b119d1c818deb4ae5079fd2676855bc /arch/x86
parent		155a97a3d7c78b46cef6f1a973c831bc5a4f82bb (diff)
nEPT: Nested INVEPT
If we let L1 use EPT, we should probably also support the INVEPT instruction.

In our current nested EPT implementation, when L1 changes its EPT table
for L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in
the course of this modification already calls INVEPT. But if the last
level of the shadow page is unsync, not all of L1's changes to EPT12 are
intercepted, which means roots need to be synced when L1 calls INVEPT.
Global INVEPT should not be different, since roots are synced by
kvm_mmu_load() each time EPTP02 changes.

Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
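For reference, L1 issues INVEPT with the invalidation type in a register and a
16-byte descriptor in memory whose first quadword is the EPTP; per the Intel
manual the descriptor is referenced even when the type does not need it, which
is why the handler added in vmx.c below always reads it. A minimal guest-side
sketch (illustrative only, not part of this patch; the wrapper name and asm
constraint choices are assumptions):

/* Illustrative L1 (guest hypervisor) helper -- not part of this patch. */
#include <stdint.h>

struct invept_desc {
	uint64_t eptp;		/* EPT pointer to invalidate (single-context type) */
	uint64_t reserved;	/* must be zero */
};

/* type 1 = single-context invalidation, type 2 = global invalidation */
static inline void invept(unsigned long type, uint64_t eptp)
{
	struct invept_desc desc = { .eptp = eptp, .reserved = 0 };

	/* The memory operand is read even for the global type,
	 * matching the behavior the emulation below relies on. */
	asm volatile("invept %0, %1"
		     : : "m" (desc), "r" (type) : "cc", "memory");
}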
Diffstat (limited to 'arch/x86')
-rw-r--r--	arch/x86/include/asm/vmx.h	 2
-rw-r--r--	arch/x86/include/uapi/asm/vmx.h	 1
-rw-r--r--	arch/x86/kvm/mmu.c		 2
-rw-r--r--	arch/x86/kvm/vmx.c		72
4 files changed, 77 insertions, 0 deletions
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f3e01a2cbaa1..966502d4682e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -387,6 +387,7 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR		0
 #define VMX_EPT_EXTENT_CONTEXT			1
 #define VMX_EPT_EXTENT_GLOBAL			2
+#define VMX_EPT_EXTENT_SHIFT			24
 
 #define VMX_EPT_EXECUTE_ONLY_BIT		(1ull)
 #define VMX_EPT_PAGE_WALK_4_BIT			(1ull << 6)
@@ -394,6 +395,7 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT				(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
+#define VMX_EPT_INVEPT_BIT			(1ull << 20)
 #define VMX_EPT_AD_BIT				(1ull << 21)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d651082c7cf7..7a34e8fe54bd 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
 #define EXIT_REASON_EOI_INDUCED         45
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
+#define EXIT_REASON_INVEPT              50
 #define EXIT_REASON_PREEMPTION_TIMER    52
 #define EXIT_REASON_WBINVD              54
 #define EXIT_REASON_XSETBV              55
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 992fde984e25..9651c9937588 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3182,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 	mmu_sync_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
 
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
 				  u32 access, struct x86_exception *exception)
@@ -3451,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 	++vcpu->stat.tlb_flush;
 	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2ae0aa4461e8..5129ba3766c4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -712,6 +712,7 @@ static void nested_release_page_clean(struct page *page)
 	kvm_release_page_clean(page);
 }
 
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
@@ -2161,6 +2162,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
 static u32 nested_vmx_misc_low, nested_vmx_misc_high;
+static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
 	/*
@@ -6279,6 +6281,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+	u32 vmx_instruction_info, types;
+	unsigned long type;
+	gva_t gva;
+	struct x86_exception e;
+	struct {
+		u64 eptp, gpa;
+	} operand;
+	u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
+
+	if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+	    !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+	types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+
+	if (!(types & (1UL << type))) {
+		nested_vmx_failValid(vcpu,
+				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+		return 1;
+	}
+
+	/* According to the Intel VMX instruction reference, the memory
+	 * operand is read even if it isn't needed (e.g., for type==global)
+	 */
+	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+			vmx_instruction_info, &gva))
+		return 1;
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+				sizeof(operand), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+
+	switch (type) {
+	case VMX_EPT_EXTENT_CONTEXT:
+		if ((operand.eptp & eptp_mask) !=
+				(nested_ept_get_cr3(vcpu) & eptp_mask))
+			break;
+	case VMX_EPT_EXTENT_GLOBAL:
+		kvm_mmu_sync_roots(vcpu);
+		kvm_mmu_flush_tlb(vcpu);
+		nested_vmx_succeed(vcpu);
+		break;
+	default:
+		BUG_ON(1);
+		break;
+	}
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -6323,6 +6393,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
 	[EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
 	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+	[EXIT_REASON_INVEPT]                  = handle_invept,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -6549,6 +6620,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
 	case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
 	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+	case EXIT_REASON_INVEPT:
 		/*
 		 * VMX instructions trap unconditionally. This allows L1 to
 		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
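
A note on the type check in handle_invept() above: nested_vmx_ept_caps mirrors
the IA32_VMX_EPT_VPID_CAP MSR, where bit 20 (VMX_EPT_INVEPT_BIT) advertises
INVEPT itself and bits 25/26 advertise the single-context and global extents.
Shifting by VMX_EPT_EXTENT_SHIFT (24) and masking with 6 therefore yields a
bitmap indexed directly by the INVEPT type encoding (1 = single-context,
2 = global). A standalone sketch of that arithmetic (illustrative only, not
patch code):

#include <stdio.h>

#define VMX_EPT_EXTENT_CONTEXT		1
#define VMX_EPT_EXTENT_GLOBAL		2
#define VMX_EPT_EXTENT_SHIFT		24
#define VMX_EPT_EXTENT_CONTEXT_BIT	(1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT	(1ull << 26)

int main(void)
{
	/* Example capability word: both INVEPT extents supported. */
	unsigned long long ept_caps =
		VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_EXTENT_GLOBAL_BIT;

	/* Bits 25/26 shift down to bits 1/2, matching the type encodings. */
	unsigned int types = (ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;

	printf("single-context supported: %d\n",
	       !!(types & (1u << VMX_EPT_EXTENT_CONTEXT)));
	printf("global supported:         %d\n",
	       !!(types & (1u << VMX_EPT_EXTENT_GLOBAL)));
	return 0;
}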