author	Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>	2011-07-11 15:33:44 -0400
committer	Avi Kivity <avi@redhat.com>	2011-07-24 04:50:40 -0400
commit	ce88decffd17bf9f373cc233c961ad2054965667 (patch)
tree	65202d01a10c790eacb4b63bacc5fccfbe5bb050 /arch
parent	dd3bfd59dbc69fd970394ab354cfca5f959d5755 (diff)
KVM: MMU: mmio page fault support
The idea is from Avi:

| We could cache the result of a miss in an spte by using a reserved bit, and
| checking the page fault error code (or seeing if we get an ept violation or
| ept misconfiguration), so if we get repeated mmio on a page, we don't need to
| search the slot list/tree.
| (https://lkml.org/lkml/2011/2/22/221)

When a page fault is caused by mmio, we cache the info in the shadow page
table and also set the reserved bits in the shadow page table, so if the mmio
access happens again, we can quickly identify it and emulate it directly.

Searching for an mmio gfn in the memslots is heavy since we need to walk all
memslots; this feature reduces that cost and also avoids walking the guest
page table for soft mmu.

[jan: fix operator precedence issue]

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
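
To make the caching idea concrete, here is a minimal standalone C sketch (illustration only, not kernel code: the helper names and the 40-bit-maxphyaddr mask value are assumptions for the example) of how a gfn and access bits can be packed into an spte under a reserved-bit mmio mask and recovered on the next fault:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1ull << PAGE_SHIFT) - 1))

/* Reserved bits 40..62 plus the present bit - the kind of mask that
 * kvm_set_mmio_spte_mask() below would build for a 40-bit maxphyaddr. */
static const uint64_t mmio_mask = 0x7fffff0000000001ull;

static uint64_t mark_mmio(uint64_t gfn, unsigned access)
{
	/* Low pte bits carry the access, the middle bits carry the gfn. */
	return mmio_mask | access | (gfn << PAGE_SHIFT);
}

static int is_mmio(uint64_t spte)
{
	return (spte & mmio_mask) == mmio_mask;
}

static uint64_t mmio_gfn(uint64_t spte)
{
	return (spte & ~mmio_mask) >> PAGE_SHIFT;
}

static unsigned mmio_access(uint64_t spte)
{
	return (unsigned)((spte & ~mmio_mask) & ~PAGE_MASK);
}

int main(void)
{
	/* Pretend gfn 0xc0000 is mmio and was touched with write access (0x2). */
	uint64_t spte = mark_mmio(0xc0000ull, 0x2);

	printf("spte=%#llx mmio=%d gfn=%#llx access=%#x\n",
	       (unsigned long long)spte, is_mmio(spte),
	       (unsigned long long)mmio_gfn(spte), mmio_access(spte));
	return 0;
}

The round trip mirrors what mark_mmio_spte(), is_mmio_spte(), get_mmio_spte_gfn() and get_mmio_spte_access() do in the mmu.c hunks below.
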
Diffstat (limited to 'arch')
-rw-r--r--	arch/x86/kvm/mmu.c	192
-rw-r--r--	arch/x86/kvm/mmu.h	2
-rw-r--r--	arch/x86/kvm/paging_tmpl.h	21
-rw-r--r--	arch/x86/kvm/vmx.c	22
-rw-r--r--	arch/x86/kvm/x86.c	25
5 files changed, 248 insertions, 14 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4b1aa6772147..4e22df6f93ec 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,6 +197,47 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mmio_mask;
+
+static void mmu_spte_set(u64 *sptep, u64 spte);
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
+{
+	shadow_mmio_mask = mmio_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+
+static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+{
+	access &= ACC_WRITE_MASK | ACC_USER_MASK;
+
+	mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+}
+
+static bool is_mmio_spte(u64 spte)
+{
+	return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+	return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+	return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+}
+
+static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+{
+	if (unlikely(is_noslot_pfn(pfn))) {
+		mark_mmio_spte(sptep, gfn, access);
+		return true;
+	}
+
+	return false;
+}
 
 static inline u64 rsvd_bits(int s, int e)
 {
@@ -226,7 +267,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-	return pte & PT_PRESENT_MASK;
+	return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
 }
 
 static int is_large_pte(u64 pte)
@@ -285,6 +326,12 @@ static u64 __get_spte_lockless(u64 *sptep)
 {
 	return ACCESS_ONCE(*sptep);
 }
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+	/* It is valid if the spte is zapped. */
+	return spte == 0ull;
+}
 #else
 union split_spte {
 	struct {
@@ -388,6 +435,23 @@ retry:
 
 	return spte.spte;
 }
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+	union split_spte sspte = (union split_spte)spte;
+	u32 high_mmio_mask = shadow_mmio_mask >> 32;
+
+	/* It is valid if the spte is zapped. */
+	if (spte == 0ull)
+		return true;
+
+	/* It is valid if the spte is being zapped. */
+	if (sspte.spte_low == 0ull &&
+	    (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
+		return true;
+
+	return false;
+}
 #endif
 
 static bool spte_has_volatile_bits(u64 spte)
@@ -1745,7 +1809,8 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, spte);
 		}
-	}
+	} else if (is_mmio_spte(pte))
+		mmu_spte_clear_no_track(spte);
 
 	if (is_large_pte(pte))
 		--kvm->stat.lpages;
@@ -2120,6 +2185,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	u64 spte, entry = *sptep;
 	int ret = 0;
 
+	if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+		return 0;
+
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
 	 * whether the guest actually used the pte (in order to detect
@@ -2255,6 +2323,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		kvm_mmu_flush_tlb(vcpu);
 	}
 
+	if (unlikely(is_mmio_spte(*sptep) && emulate))
+		*emulate = 1;
+
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
 	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
 		 is_large_pte(*sptep)? "2MB" : "4kB",
@@ -2481,7 +2552,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 
 static bool mmu_invalid_pfn(pfn_t pfn)
 {
-	return unlikely(is_invalid_pfn(pfn) || is_noslot_pfn(pfn));
+	return unlikely(is_invalid_pfn(pfn));
 }
 
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
@@ -2495,11 +2566,8 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 		goto exit;
 	}
 
-	if (unlikely(is_noslot_pfn(pfn))) {
+	if (unlikely(is_noslot_pfn(pfn)))
 		vcpu_cache_mmio_info(vcpu, gva, gfn, access);
-		*ret_val = 1;
-		goto exit;
-	}
 
 	ret = false;
 exit:
@@ -2813,6 +2881,92 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
 	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
 }
 
+static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+	if (direct)
+		return vcpu_match_mmio_gpa(vcpu, addr);
+
+	return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+
+/*
+ * On direct hosts, the last spte only allows two states
+ * for mmio page fault:
+ * - It is the mmio spte
+ * - It is zapped or it is being zapped.
+ *
+ * This function completely checks the spte when the last spte
+ * is not the mmio spte.
+ */
+static bool check_direct_spte_mmio_pf(u64 spte)
+{
+	return __check_direct_spte_mmio_pf(spte);
+}
+
+static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
+{
+	struct kvm_shadow_walk_iterator iterator;
+	u64 spte = 0ull;
+
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
+		if (!is_shadow_present_pte(spte))
+			break;
+	walk_shadow_page_lockless_end(vcpu);
+
+	return spte;
+}
+
+/*
+ * If it is a real mmio page fault, return 1 and emulate the instruction
+ * directly, return 0 to let the CPU fault again on the address, -1 is
+ * returned if a bug is detected.
+ */
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+	u64 spte;
+
+	if (quickly_check_mmio_pf(vcpu, addr, direct))
+		return 1;
+
+	spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
+
+	if (is_mmio_spte(spte)) {
+		gfn_t gfn = get_mmio_spte_gfn(spte);
+		unsigned access = get_mmio_spte_access(spte);
+
+		if (direct)
+			addr = 0;
+		vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+		return 1;
+	}
+
+	/*
+	 * It's ok if the gva is remapped by other cpus on shadow guest,
+	 * it's a BUG if the gfn is not a mmio page.
+	 */
+	if (direct && !check_direct_spte_mmio_pf(spte))
+		return -1;
+
+	/*
+	 * If the page table is zapped by other cpus, let the CPU fault again
+	 * on the address.
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
+				  u32 error_code, bool direct)
+{
+	int ret;
+
+	ret = handle_mmio_page_fault_common(vcpu, addr, direct);
+	WARN_ON(ret < 0);
+	return ret;
+}
+
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 				u32 error_code, bool prefault)
 {
@@ -2820,6 +2974,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	int r;
 
 	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
+
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return handle_mmio_page_fault(vcpu, gva, error_code, true);
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
@@ -2896,6 +3054,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
@@ -2993,6 +3154,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
 	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
 }
 
+static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
+			   int *nr_present)
+{
+	if (unlikely(is_mmio_spte(*sptep))) {
+		if (gfn != get_mmio_spte_gfn(*sptep)) {
+			mmu_spte_clear_no_track(sptep);
+			return true;
+		}
+
+		(*nr_present)++;
+		mark_mmio_spte(sptep, gfn, access);
+		return true;
+	}
+
+	return false;
+}
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 05310b105dac..e374db9af021 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,6 +49,8 @@
 #define PFERR_FETCH_MASK (1U << 4)
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 67998d3be084..507e2b844cfa 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -577,6 +577,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return handle_mmio_page_fault(vcpu, addr, error_code,
+					      mmu_is_nested(vcpu));
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
@@ -684,7 +688,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 					--vcpu->kvm->stat.lpages;
 				drop_spte(vcpu->kvm, sptep);
 				need_flush = 1;
-			}
+			} else if (is_mmio_spte(*sptep))
+				mmu_spte_clear_no_track(sptep);
 
 			break;
 		}
@@ -780,7 +785,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		gpa_t pte_gpa;
 		gfn_t gfn;
 
-		if (!is_shadow_present_pte(sp->spt[i]))
+		if (!sp->spt[i])
 			continue;
 
 		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
@@ -789,13 +794,18 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 					  sizeof(pt_element_t)))
 			return -EINVAL;
 
-		gfn = gpte_to_gfn(gpte);
-
 		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
 			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
 
+		gfn = gpte_to_gfn(gpte);
+		pte_access = sp->role.access;
+		pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
+
+		if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
+			continue;
+
 		if (gfn != sp->gfns[i]) {
 			drop_spte(vcpu->kvm, &sp->spt[i]);
 			vcpu->kvm->tlbs_dirty++;
@@ -803,8 +813,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		}
 
 		nr_present++;
-		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
-							  true);
+
 		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
 
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a644acb6ed80..e65a158dee64 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3594,6 +3594,17 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 	return exec_control;
 }
 
+static void ept_set_mmio_spte_mask(void)
+{
+	/*
+	 * EPT Misconfigurations can be generated if the value of bits 2:0
+	 * of an EPT paging-structure entry is 110b (write/execute).
+	 * Also, magic bits (0xffull << 49) are set to quickly identify mmio
+	 * spte.
+	 */
+	kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
+}
+
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -4671,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
 	u64 sptes[4];
-	int nr_sptes, i;
+	int nr_sptes, i, ret;
 	gpa_t gpa;
 
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 
+	ret = handle_mmio_page_fault_common(vcpu, gpa, true);
+	if (likely(ret == 1))
+		return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
+					      EMULATE_DONE;
+	if (unlikely(!ret))
+		return 1;
+
+	/* It is the real ept misconfig */
 	printk(KERN_ERR "EPT: Misconfiguration.\n");
 	printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
 
@@ -7102,6 +7121,7 @@ static int __init vmx_init(void)
 	if (enable_ept) {
 		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
 				VMX_EPT_EXECUTABLE_MASK);
+		ept_set_mmio_spte_mask();
 		kvm_enable_tdp();
 	} else
 		kvm_disable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 64c42d90112b..2c9661f230a9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5062,6 +5062,30 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
 
+static void kvm_set_mmio_spte_mask(void)
+{
+	u64 mask;
+	int maxphyaddr = boot_cpu_data.x86_phys_bits;
+
+	/*
+	 * Set the reserved bits and the present bit of a paging-structure
+	 * entry to generate a page fault with PFER.RSV = 1.
+	 */
+	mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
+	mask |= 1ull;
+
+#ifdef CONFIG_X86_64
+	/*
+	 * If reserved bit is not supported, clear the present bit to disable
+	 * mmio page fault.
+	 */
+	if (maxphyaddr == 52)
+		mask &= ~1ull;
+#endif
+
+	kvm_mmu_set_mmio_spte_mask(mask);
+}
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
@@ -5088,6 +5112,7 @@ int kvm_arch_init(void *opaque)
 	if (r)
 		goto out;
 
+	kvm_set_mmio_spte_mask();
 	kvm_init_msr_list();
 
 	kvm_x86_ops = ops;
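
As a worked example of the arithmetic in kvm_set_mmio_spte_mask() above, here is a small standalone C program (the maxphyaddr value of 40 is a hypothetical example, not taken from the patch) that prints the mask the function would install:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int maxphyaddr = 40;	/* hypothetical boot_cpu_data.x86_phys_bits */
	uint64_t mask;

	/* Reserved bits maxphyaddr..62 of the spte ... */
	mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
	mask |= 1ull;		/* ... plus the present bit */

	/* Prints 0x7fffff0000000001: a present spte with any of these
	 * reserved bits set faults with PFERR_RSVD_MASK, which the new
	 * handlers treat as a cached mmio access. */
	printf("mmio spte mask: %#llx\n", (unsigned long long)mask);
	return 0;
}
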