Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--  arch/x86/kvm/mmu.c  301
1 file changed, 245 insertions, 56 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 004cc87b781c..0d094da49541 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,15 +197,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
-static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+/*
+ * spte bits 3 ~ 11 are used as the low 9 bits of the generation
+ * number, and bits 52 ~ 61 are used as the high 10 bits of the
+ * generation number.
+ */
+#define MMIO_SPTE_GEN_LOW_SHIFT         3
+#define MMIO_SPTE_GEN_HIGH_SHIFT        52
+
+#define MMIO_GEN_SHIFT                  19
+#define MMIO_GEN_LOW_SHIFT              9
+#define MMIO_GEN_LOW_MASK               ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_MASK                   ((1 << MMIO_GEN_SHIFT) - 1)
+#define MMIO_MAX_GEN                    ((1 << MMIO_GEN_SHIFT) - 1)
+
+static u64 generation_mmio_spte_mask(unsigned int gen)
 {
-        struct kvm_mmu_page *sp = page_header(__pa(sptep));
+        u64 mask;
+
+        WARN_ON(gen > MMIO_MAX_GEN);
+
+        mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
+        mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+        return mask;
+}
+
+static unsigned int get_mmio_spte_generation(u64 spte)
+{
+        unsigned int gen;
+
+        spte &= ~shadow_mmio_mask;
+
+        gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
+        gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
+        return gen;
+}
+
+static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+{
+        /*
+         * Init the kvm generation close to MMIO_MAX_GEN to easily test the
+         * code that handles generation-number wrap-around.
+         */
+        return (kvm_memslots(kvm)->generation +
+                      MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
+}
+
+static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+                           unsigned access)
+{
+        unsigned int gen = kvm_current_mmio_generation(kvm);
+        u64 mask = generation_mmio_spte_mask(gen);
 
         access &= ACC_WRITE_MASK | ACC_USER_MASK;
+        mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
 
-        sp->mmio_cached = true;
-        trace_mark_mmio_spte(sptep, gfn, access);
-        mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+        trace_mark_mmio_spte(sptep, gfn, access, gen);
+        mmu_spte_set(sptep, mask);
 }
 
 static bool is_mmio_spte(u64 spte)
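Reviewer note: the encoding above splits a 19-bit generation number across the spare spte bits 3..11 and 52..61. The stand-alone sketch below (user-space C, reusing the patch's constants but none of its kernel dependencies) checks that the pack/unpack pair round-trips every possible generation value; it is an illustration, not kernel code.

/* Standalone sketch: how the 19-bit MMIO generation is packed into an spte. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MMIO_SPTE_GEN_LOW_SHIFT   3
#define MMIO_SPTE_GEN_HIGH_SHIFT  52
#define MMIO_GEN_LOW_SHIFT        9
#define MMIO_GEN_LOW_MASK         ((1 << MMIO_GEN_LOW_SHIFT) - 1)
#define MMIO_MAX_GEN              ((1 << 19) - 1)

/* Pack the low 9 bits into spte bits 3..11 and the high 10 bits into 52..61. */
static uint64_t gen_to_mask(unsigned int gen)
{
        uint64_t mask;

        mask  = (uint64_t)(gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
        mask |= ((uint64_t)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
        return mask;
}

/* Recover the generation from the two bit ranges (ignoring shadow_mmio_mask). */
static unsigned int mask_to_gen(uint64_t spte)
{
        unsigned int gen;

        gen  = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
        gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
        return gen;
}

int main(void)
{
        unsigned int gen;

        for (gen = 0; gen <= MMIO_MAX_GEN; gen++)
                assert(mask_to_gen(gen_to_mask(gen)) == gen);

        printf("all %u generation values round-trip\n", MMIO_MAX_GEN + 1);
        return 0;
}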
@@ -215,24 +263,38 @@ static bool is_mmio_spte(u64 spte)
 
 static gfn_t get_mmio_spte_gfn(u64 spte)
 {
-        return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+        u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+        return (spte & ~mask) >> PAGE_SHIFT;
 }
 
 static unsigned get_mmio_spte_access(u64 spte)
 {
-        return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+        u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+        return (spte & ~mask) & ~PAGE_MASK;
 }
 
-static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+                          pfn_t pfn, unsigned access)
 {
         if (unlikely(is_noslot_pfn(pfn))) {
-                mark_mmio_spte(sptep, gfn, access);
+                mark_mmio_spte(kvm, sptep, gfn, access);
                 return true;
         }
 
         return false;
 }
 
+static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+{
+        unsigned int kvm_gen, spte_gen;
+
+        kvm_gen = kvm_current_mmio_generation(kvm);
+        spte_gen = get_mmio_spte_generation(spte);
+
+        trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+        return likely(kvm_gen == spte_gen);
+}
+
 static inline u64 rsvd_bits(int s, int e)
 {
         return ((1ULL << (e - s + 1)) - 1) << s;
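Because the generation now occupies bits inside the spte, get_mmio_spte_gfn() and get_mmio_spte_access() must strip those bits in addition to shadow_mmio_mask, which is what the new generation_mmio_spte_mask(MMIO_MAX_GEN) term does. A small user-space sketch of that point follows; the shadow-mmio mask value, the access bits and the gfn are made-up example values, not the kernel's.

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT        12
#define PAGE_MASK         (~(((uint64_t)1 << PAGE_SHIFT) - 1))
#define SHADOW_MMIO_MASK  (3ULL << 62)          /* example value only */
#define GEN_MAX           ((1u << 19) - 1)

/* Generation bits live at 3..11 (low 9) and 52..61 (high 10). */
static uint64_t gen_bits(unsigned int gen)
{
        return ((uint64_t)(gen & 0x1ff) << 3) | ((uint64_t)(gen >> 9) << 52);
}

int main(void)
{
        uint64_t gfn = 0x12345, access = 0x6;   /* stand-ins for a gfn and ACC_* bits */
        unsigned int gen = GEN_MAX;             /* worst case: every gen bit set */

        /* What mark_mmio_spte() conceptually stores. */
        uint64_t spte = gen_bits(gen) | SHADOW_MMIO_MASK | access |
                        (gfn << PAGE_SHIFT);

        /* Stripping only the mmio mask (the old code) leaks gen bits into the gfn. */
        uint64_t stale_mask = SHADOW_MMIO_MASK;
        /* Stripping the generation bits too (the new code) recovers both fields. */
        uint64_t new_mask = gen_bits(GEN_MAX) | SHADOW_MMIO_MASK;

        assert(((spte & ~stale_mask) >> PAGE_SHIFT) != gfn);
        assert(((spte & ~new_mask) >> PAGE_SHIFT) == gfn);
        assert(((spte & ~new_mask) & ~PAGE_MASK) == access);
        return 0;
}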
@@ -404,9 +466,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 /*
  * The idea using the light way get the spte on x86_32 guest is from
  * gup_get_pte(arch/x86/mm/gup.c).
- * The difference is we can not catch the spte tlb flush if we leave
- * guest mode, so we emulate it by increase clear_spte_count when spte
- * is cleared.
+ *
+ * An spte tlb flush may be pending, because kvm_set_pte_rmapp
+ * coalesces them and we are running outside of the MMU lock.  Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte.  The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race.  This is done using clear_spte_count.
+ */
 static u64 __get_spte_lockless(u64 *sptep)
 {
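The comment describes what is effectively a sequence counter limited to present->non-present transitions. A minimal user-space sketch of the reader side is shown below; it uses C11 atomics and invented names (clear_count, split_spte) purely for illustration, whereas the kernel relies on smp_rmb() and the per-page clear_spte_count.

#include <stdatomic.h>
#include <stdint.h>

struct split_spte {
        _Atomic uint32_t low;                   /* holds the present bit */
        _Atomic uint32_t high;
};

static _Atomic unsigned int clear_count;        /* bumped on present->non-present */

static uint64_t get_spte_lockless(struct split_spte *sptep)
{
        uint32_t low, high;
        unsigned int count;

retry:
        count = atomic_load_explicit(&clear_count, memory_order_acquire);

        low  = atomic_load_explicit(&sptep->low, memory_order_acquire);
        high = atomic_load_explicit(&sptep->high, memory_order_acquire);

        /*
         * If a clear happened meanwhile, or the low half changed under us,
         * the high half we read may belong to a different mapping: retry.
         */
        if (count != atomic_load_explicit(&clear_count, memory_order_acquire) ||
            low != atomic_load_explicit(&sptep->low, memory_order_relaxed))
                goto retry;

        return ((uint64_t)high << 32) | low;
}

int main(void)
{
        struct split_spte s;

        atomic_store(&s.low, 0x1u);             /* present */
        atomic_store(&s.high, 0xabcd0000u);
        return get_spte_lockless(&s) == 0xabcd000000000001ull ? 0 : 1;
}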
@@ -1511,6 +1584,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
         if (!direct)
                 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+        /*
+         * active_mmu_pages is a FIFO list; do not move a page until it
+         * is zapped.  kvm_zap_obsolete_pages() depends on this ordering.
+         * See the comments in kvm_zap_obsolete_pages().
+         */
         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
         sp->parent_ptes = 0;
         mmu_page_add_parent_pte(vcpu, sp, parent_pte);
@@ -1648,6 +1727,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                     struct list_head *invalid_list);
 
+/*
+ * NOTE: be careful with zapped-obsolete pages
+ * (is_obsolete_sp(sp) && sp->role.invalid) when walking the hash list,
+ * since such a page has been removed from active_mmu_pages but can
+ * still be found on the hash list.
+ *
+ * for_each_gfn_indirect_valid_sp skips that kind of page, and
+ * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), skips
+ * all obsolete pages.
+ */
 #define for_each_gfn_sp(_kvm, _sp, _gfn)                                \
         hlist_for_each_entry(_sp,                                       \
           &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
@@ -1838,6 +1927,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
         __clear_sp_write_flooding_count(sp);
 }
 
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+        return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                                              gfn_t gfn,
                                              gva_t gaddr,
@@ -1864,6 +1958,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                 role.quadrant = quadrant;
         }
         for_each_gfn_sp(vcpu->kvm, sp, gfn) {
+                if (is_obsolete_sp(vcpu->kvm, sp))
+                        continue;
+
                 if (!need_sync && sp->unsync)
                         need_sync = true;
 
@@ -1900,6 +1997,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 
                 account_shadowed(vcpu->kvm, gfn);
         }
+        sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
         init_shadow_page_table(sp);
         trace_kvm_mmu_get_page(sp, true);
         return sp;
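Together with the is_obsolete_sp() helper introduced earlier, this one added line is the heart of the fast-invalidation scheme: every shadow page is tagged with the mmu_valid_gen it was created under, so bumping the VM-wide counter obsoletes all existing pages at once. A toy sketch of the pattern, with simplified stand-in structures rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

struct vm   { unsigned long mmu_valid_gen; };
struct page { unsigned long mmu_valid_gen; };

static void page_init(struct vm *vm, struct page *p)
{
        p->mmu_valid_gen = vm->mmu_valid_gen;   /* tag at creation time */
}

static bool page_is_obsolete(struct vm *vm, struct page *p)
{
        return p->mmu_valid_gen != vm->mmu_valid_gen;
}

int main(void)
{
        struct vm vm = { .mmu_valid_gen = 0 };
        struct page p;

        page_init(&vm, &p);
        printf("obsolete before invalidation: %d\n", page_is_obsolete(&vm, &p));

        vm.mmu_valid_gen++;                     /* what invalidation does, in O(1) */
        printf("obsolete after invalidation:  %d\n", page_is_obsolete(&vm, &p));
        return 0;
}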
@@ -2070,8 +2168,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
         ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
         kvm_mmu_page_unlink_children(kvm, sp);
         kvm_mmu_unlink_parents(kvm, sp);
+
         if (!sp->role.invalid && !sp->role.direct)
                 unaccount_shadowed(kvm, sp->gfn);
+
         if (sp->unsync)
                 kvm_unlink_unsync_page(kvm, sp);
         if (!sp->root_count) {
@@ -2081,7 +2181,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
                 kvm_mod_used_mmu_pages(kvm, -1);
         } else {
                 list_move(&sp->link, &kvm->arch.active_mmu_pages);
-                kvm_reload_remote_mmus(kvm);
+
+                /*
+                 * Obsolete pages cannot be used on any vcpu.
+                 * See the comments in kvm_mmu_invalidate_zap_all_pages().
+                 */
+                if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
+                        kvm_reload_remote_mmus(kvm);
         }
 
         sp->role.invalid = 1;
@@ -2331,7 +2437,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
         u64 spte;
         int ret = 0;
 
-        if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+        if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access))
                 return 0;
 
         spte = PT_PRESENT_MASK;
@@ -2869,22 +2975,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 
         if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
                 return;
-        spin_lock(&vcpu->kvm->mmu_lock);
+
         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
             (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
              vcpu->arch.mmu.direct_map)) {
                 hpa_t root = vcpu->arch.mmu.root_hpa;
 
+                spin_lock(&vcpu->kvm->mmu_lock);
                 sp = page_header(root);
                 --sp->root_count;
                 if (!sp->root_count && sp->role.invalid) {
                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
                         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
                 }
-                vcpu->arch.mmu.root_hpa = INVALID_PAGE;
                 spin_unlock(&vcpu->kvm->mmu_lock);
+                vcpu->arch.mmu.root_hpa = INVALID_PAGE;
                 return;
         }
+
+        spin_lock(&vcpu->kvm->mmu_lock);
         for (i = 0; i < 4; ++i) {
                 hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -3148,17 +3257,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
         return spte;
 }
 
-/*
- * If it is a real mmio page fault, return 1 and emulat the instruction
- * directly, return 0 to let CPU fault again on the address, -1 is
- * returned if bug is detected.
- */
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 {
         u64 spte;
 
         if (quickly_check_mmio_pf(vcpu, addr, direct))
-                return 1;
+                return RET_MMIO_PF_EMULATE;
 
         spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
 
@@ -3166,12 +3270,15 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
                 gfn_t gfn = get_mmio_spte_gfn(spte);
                 unsigned access = get_mmio_spte_access(spte);
 
+                if (!check_mmio_spte(vcpu->kvm, spte))
+                        return RET_MMIO_PF_INVALID;
+
                 if (direct)
                         addr = 0;
 
                 trace_handle_mmio_page_fault(addr, gfn, access);
                 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
-                return 1;
+                return RET_MMIO_PF_EMULATE;
         }
 
         /*
@@ -3179,13 +3286,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
          * it's a BUG if the gfn is not a mmio page.
          */
         if (direct && !check_direct_spte_mmio_pf(spte))
-                return -1;
+                return RET_MMIO_PF_BUG;
 
         /*
          * If the page table is zapped by other cpus, let CPU fault again on
          * the address.
          */
-        return 0;
+        return RET_MMIO_PF_RETRY;
 }
 EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
 
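The magic return values 1/0/-1 are replaced by named RET_MMIO_PF_* codes, including the new RET_MMIO_PF_INVALID used when the cached MMIO spte carries a stale generation. The sketch below shows how a caller is expected to dispatch on them; the enum values are believed to match the mmu.h definitions in this series but are reproduced here only for illustration.

#include <stdio.h>

/* Illustrative values; the real enum lives in arch/x86/kvm/mmu.h. */
enum {
        RET_MMIO_PF_RETRY   = 0,    /* spte zapped meanwhile, fault again   */
        RET_MMIO_PF_EMULATE = 1,    /* genuine MMIO access, emulate it      */
        RET_MMIO_PF_INVALID = 2,    /* stale generation, take the slow path */
        RET_MMIO_PF_BUG     = -1,   /* inconsistent state                   */
};

/* Sketch of the caller-side dispatch, mirroring nonpaging_page_fault() below. */
static int fault_path(int mmio_ret)
{
        if (mmio_ret != RET_MMIO_PF_INVALID)
                return mmio_ret;            /* EMULATE, RETRY or BUG: done  */

        /*
         * RET_MMIO_PF_INVALID: the cached MMIO spte belongs to an older
         * memslot generation, so fall through to the normal page-fault
         * path, which rebuilds the spte with the current generation.
         */
        printf("falling back to the slow page-fault path\n");
        return 0;
}

int main(void)
{
        return fault_path(RET_MMIO_PF_INVALID);
}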
@@ -3195,7 +3302,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
         int ret;
 
         ret = handle_mmio_page_fault_common(vcpu, addr, direct);
-        WARN_ON(ret < 0);
+        WARN_ON(ret == RET_MMIO_PF_BUG);
         return ret;
 }
 
@@ -3207,8 +3314,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 
         pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
 
-        if (unlikely(error_code & PFERR_RSVD_MASK))
-                return handle_mmio_page_fault(vcpu, gva, error_code, true);
+        if (unlikely(error_code & PFERR_RSVD_MASK)) {
+                r = handle_mmio_page_fault(vcpu, gva, error_code, true);
+
+                if (likely(r != RET_MMIO_PF_INVALID))
+                        return r;
+        }
 
         r = mmu_topup_memory_caches(vcpu);
         if (r)
@@ -3284,8 +3395,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
         ASSERT(vcpu);
         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
-        if (unlikely(error_code & PFERR_RSVD_MASK))
-                return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+        if (unlikely(error_code & PFERR_RSVD_MASK)) {
+                r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
+                if (likely(r != RET_MMIO_PF_INVALID))
+                        return r;
+        }
 
         r = mmu_topup_memory_caches(vcpu);
         if (r)
@@ -3391,8 +3506,8 @@ static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
         *access &= mask;
 }
 
-static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
-                           int *nr_present)
+static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+                           unsigned access, int *nr_present)
 {
         if (unlikely(is_mmio_spte(*sptep))) {
                 if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -3401,7 +3516,7 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
                 }
 
                 (*nr_present)++;
-                mark_mmio_spte(sptep, gfn, access);
+                mark_mmio_spte(kvm, sptep, gfn, access);
                 return true;
         }
 
@@ -3764,9 +3879,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
         if (r)
                 goto out;
         r = mmu_alloc_roots(vcpu);
-        spin_lock(&vcpu->kvm->mmu_lock);
-        mmu_sync_roots(vcpu);
-        spin_unlock(&vcpu->kvm->mmu_lock);
+        kvm_mmu_sync_roots(vcpu);
         if (r)
                 goto out;
         /* set_cr3() should ensure TLB has been flushed */
@@ -4179,39 +4292,107 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
         spin_unlock(&kvm->mmu_lock);
 }
 
-void kvm_mmu_zap_all(struct kvm *kvm)
+#define BATCH_ZAP_PAGES   10
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
         struct kvm_mmu_page *sp, *node;
-        LIST_HEAD(invalid_list);
+        int batch = 0;
 
-        spin_lock(&kvm->mmu_lock);
 restart:
-        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-                if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+        list_for_each_entry_safe_reverse(sp, node,
+              &kvm->arch.active_mmu_pages, link) {
+                int ret;
+
+                /*
+                 * No obsolete page exists before a newly created page, since
+                 * active_mmu_pages is a FIFO list.
+                 */
+                if (!is_obsolete_sp(kvm, sp))
+                        break;
+
+                /*
+                 * Since we are walking the list in reverse and invalid pages
+                 * are moved to its head, skipping invalid pages avoids an
+                 * endless walk of the list.
+                 */
+                if (sp->role.invalid)
+                        continue;
+
+                /*
+                 * There is no need to flush the tlb since we only zap sps
+                 * with an invalid generation number.
+                 */
+                if (batch >= BATCH_ZAP_PAGES &&
+                      cond_resched_lock(&kvm->mmu_lock)) {
+                        batch = 0;
+                        goto restart;
+                }
+
+                ret = kvm_mmu_prepare_zap_page(kvm, sp,
+                                &kvm->arch.zapped_obsolete_pages);
+                batch += ret;
+
+                if (ret)
                         goto restart;
+        }
 
-        kvm_mmu_commit_zap_page(kvm, &invalid_list);
-        spin_unlock(&kvm->mmu_lock);
+        /*
+         * Flush the tlb before freeing the page tables, since a lockless
+         * walk may still be using the pages.
+         */
+        kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
 }
 
-void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
+/*
+ * Fast-invalidate all shadow pages, using a lock-break technique to
+ * zap the obsolete pages.
+ *
+ * This is required when a memslot is being deleted or the VM is being
+ * destroyed; in these cases we must ensure that the KVM MMU no longer
+ * uses any resource of the slot being deleted (or of any slot) after
+ * the function returns.
+ */
+void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
 {
-        struct kvm_mmu_page *sp, *node;
-        LIST_HEAD(invalid_list);
-
         spin_lock(&kvm->mmu_lock);
-restart:
-        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
-                if (!sp->mmio_cached)
-                        continue;
-                if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
-                        goto restart;
-        }
+        trace_kvm_mmu_invalidate_zap_all_pages(kvm);
+        kvm->arch.mmu_valid_gen++;
 
-        kvm_mmu_commit_zap_page(kvm, &invalid_list);
+        /*
+         * Notify all vcpus to reload their shadow page tables and to
+         * flush the TLB.  All vcpus will then switch to a new shadow
+         * page table tagged with the new mmu_valid_gen.
+         *
+         * Note: this must be done under the protection of mmu_lock;
+         * otherwise a vcpu could purge its shadow pages but miss the
+         * TLB flush.
+         */
+        kvm_reload_remote_mmus(kvm);
+
+        kvm_zap_obsolete_pages(kvm);
         spin_unlock(&kvm->mmu_lock);
 }
 
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+{
+        return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
+
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
+{
+        /*
+         * The very rare case: the generation number has wrapped around,
+         * so zap all shadow pages.
+         *
+         * The max value is MMIO_MAX_GEN - 1 since it is not called when a
+         * memslot is marked invalid.
+         */
+        if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) {
+                printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
+                kvm_mmu_invalidate_zap_all_pages(kvm);
+        }
+}
+
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
         struct kvm *kvm;
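kvm_zap_obsolete_pages() combines two ideas: the FIFO ordering of active_mmu_pages lets the reverse walk stop at the first non-obsolete page, and a lock break every BATCH_ZAP_PAGES zapped pages bounds mmu_lock hold time. The user-space sketch below captures the lock-break shape with pthread and invented structures; it is a simplification (for instance, the kernel also restarts the walk after a zap that released child pages).

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

#define BATCH 10

struct item { struct item *prev; bool obsolete, zapped; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Caller holds 'lock'; need_break() plays the role of cond_resched_lock(). */
static void zap_obsolete(struct item *tail, bool (*need_break)(void))
{
        int batch = 0;
        struct item *it;

restart:
        /* Walk from the tail: a FIFO list keeps the obsolete items at the back. */
        for (it = tail; it; it = it->prev) {
                if (!it->obsolete)
                        break;                  /* everything older is done   */
                if (it->zapped)
                        continue;               /* skip already-handled items */

                if (batch >= BATCH && need_break()) {
                        pthread_mutex_unlock(&lock);
                        /* other threads may run and modify the list here */
                        pthread_mutex_lock(&lock);
                        batch = 0;
                        goto restart;           /* list may have changed      */
                }

                it->zapped = true;              /* "prepare zap" stand-in     */
                batch++;
        }
}

static bool always(void) { return true; }

int main(void)
{
        struct item a = { .prev = NULL, .obsolete = true },
                    b = { .prev = &a,  .obsolete = true };

        pthread_mutex_lock(&lock);
        zap_obsolete(&b, always);
        pthread_mutex_unlock(&lock);
        return a.zapped && b.zapped ? 0 : 1;
}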
@@ -4240,15 +4421,23 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
                  * want to shrink a VM that only started to populate its MMU
                  * anyway.
                  */
-                if (!kvm->arch.n_used_mmu_pages)
+                if (!kvm->arch.n_used_mmu_pages &&
+                      !kvm_has_zapped_obsolete_pages(kvm))
                         continue;
 
                 idx = srcu_read_lock(&kvm->srcu);
                 spin_lock(&kvm->mmu_lock);
 
+                if (kvm_has_zapped_obsolete_pages(kvm)) {
+                        kvm_mmu_commit_zap_page(kvm,
+                              &kvm->arch.zapped_obsolete_pages);
+                        goto unlock;
+                }
+
                 prepare_zap_oldest_mmu_page(kvm, &invalid_list);
                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
+unlock:
                 spin_unlock(&kvm->mmu_lock);
                 srcu_read_unlock(&kvm->srcu, idx);
 
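The shrinker change gives priority to work that is already half done: if a VM has pages sitting on zapped_obsolete_pages, committing those is cheaper than preparing new zaps of live pages. A toy sketch of that policy, with invented fields standing in for the kernel's lists and counters:

#include <stdbool.h>

struct vm_state {
        int n_used_pages;
        int n_zapped_obsolete;    /* prepared for zapping but not yet freed */
};

static bool worth_shrinking(const struct vm_state *vm)
{
        /* Skip VMs with nothing to reclaim at all. */
        return vm->n_used_pages || vm->n_zapped_obsolete;
}

static int shrink_one(struct vm_state *vm)
{
        if (vm->n_zapped_obsolete) {
                /* Cheap: just commit (free) the already-prepared pages. */
                int freed = vm->n_zapped_obsolete;
                vm->n_zapped_obsolete = 0;
                return freed;
        }

        /* Otherwise fall back to zapping the oldest live page. */
        if (vm->n_used_pages) {
                vm->n_used_pages--;
                return 1;
        }
        return 0;
}

int main(void)
{
        struct vm_state vm = { .n_used_pages = 4, .n_zapped_obsolete = 2 };

        if (!worth_shrinking(&vm))
                return 1;
        return shrink_one(&vm) == 2 ? 0 : 1;    /* commits the 2 prepared pages */
}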