Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--   arch/x86/kvm/mmu.c   168
1 file changed, 62 insertions, 106 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01d7c2ad05f5..4ed3edbe06bd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -448,7 +448,8 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
 
 static bool spte_is_locklessly_modifiable(u64 spte)
 {
-        return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+        return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
+                (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
 }
 
 static bool spte_has_volatile_bits(u64 spte)
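The rewritten check states directly that both writeable bits must be set, where the old form said the same thing through De Morgan's law. A small self-contained sketch (user-space C, with the two mask values chosen arbitrarily for illustration, not taken from the kernel headers) confirming the two expressions agree for every combination of the bits:

#include <assert.h>
#include <stdint.h>

/* Stand-ins for the real masks; the exact bit positions do not matter here. */
#define SPTE_HOST_WRITEABLE (1ULL << 51)
#define SPTE_MMU_WRITEABLE  (1ULL << 52)

static int old_check(uint64_t spte)
{
        return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
}

static int new_check(uint64_t spte)
{
        return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
               (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
}

int main(void)
{
        /* All four combinations of the two bits, plus some unrelated bits. */
        uint64_t cases[] = { 0, SPTE_HOST_WRITEABLE, SPTE_MMU_WRITEABLE,
                             SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE, 0x7 };

        for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
                assert(!!old_check(cases[i]) == !!new_check(cases[i]));
        return 0;
}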
@@ -831,8 +832,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
         if (host_level == PT_PAGE_TABLE_LEVEL)
                 return host_level;
 
-        max_level = kvm_x86_ops->get_lpage_level() < host_level ?
-                kvm_x86_ops->get_lpage_level() : host_level;
+        max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
 
         for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
                 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
@@ -1142,7 +1142,7 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
 }
 
 static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
-                                 int level, bool pt_protect)
+                                 bool pt_protect)
 {
         u64 *sptep;
         struct rmap_iterator iter;
@@ -1180,7 +1180,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
         while (mask) {
                 rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
                                       PT_PAGE_TABLE_LEVEL, slot);
-                __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
+                __rmap_write_protect(kvm, rmapp, false);
 
                 /* clear the first set bit */
                 mask &= mask - 1;
@@ -1199,7 +1199,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
         for (i = PT_PAGE_TABLE_LEVEL;
              i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
                 rmapp = __gfn_to_rmap(gfn, i, slot);
-                write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
+                write_protected |= __rmap_write_protect(kvm, rmapp, true);
         }
 
         return write_protected;
@@ -1460,28 +1460,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
         percpu_counter_add(&kvm_total_used_mmu_pages, nr);
 }
 
-/*
- * Remove the sp from shadow page cache, after call it,
- * we can not find this sp from the cache, and the shadow
- * page table is still valid.
- * It should be under the protection of mmu lock.
- */
-static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
         ASSERT(is_empty_shadow_page(sp->spt));
         hlist_del(&sp->hash_link);
-        if (!sp->role.direct)
-                free_page((unsigned long)sp->gfns);
-}
-
-/*
- * Free the shadow page table and the sp, we can do it
- * out of the protection of mmu lock.
- */
-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
-{
         list_del(&sp->link);
         free_page((unsigned long)sp->spt);
+        if (!sp->role.direct)
+                free_page((unsigned long)sp->gfns);
         kmem_cache_free(mmu_page_header_cache, sp);
 }
 
@@ -1522,7 +1508,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
         sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-        bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
         sp->parent_ptes = 0;
         mmu_page_add_parent_pte(vcpu, sp, parent_pte);
         kvm_mod_used_mmu_pages(vcpu->kvm, +1);
@@ -1973,9 +1958,9 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 {
         u64 spte;
 
-        spte = __pa(sp->spt)
-                | PT_PRESENT_MASK | PT_ACCESSED_MASK
-                | PT_WRITABLE_MASK | PT_USER_MASK;
+        spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+               shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+
         mmu_spte_set(sptep, spte);
 }
 
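Note that link_shadow_page() now composes the spte from the shadow_user/x/accessed mask variables rather than the fixed PT_USER and PT_ACCESSED bits, and the __direct_map() hunk further down (@@ -2652) drops its open-coded mmu_spte_set() in favour of calling this updated helper.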
@@ -2126,7 +2111,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
         do {
                 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
                 WARN_ON(!sp->role.invalid || sp->root_count);
-                kvm_mmu_isolate_page(sp);
                 kvm_mmu_free_page(sp);
         } while (!list_empty(invalid_list));
 }
@@ -2144,6 +2128,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
          * change the value
          */
 
+        spin_lock(&kvm->mmu_lock);
+
         if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
                 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
                         !list_empty(&kvm->arch.active_mmu_pages)) {
@@ -2158,6 +2144,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
         }
 
         kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+
+        spin_unlock(&kvm->mmu_lock);
 }
 
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
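Taken together, these two hunks make kvm_mmu_change_mmu_pages() acquire and release kvm->mmu_lock itself, so its callers are now expected to invoke it without holding the lock; the matching call-site adjustments live outside mmu.c and are not part of this diff.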
@@ -2183,14 +2171,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
-static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
-{
-        int slot = memslot_id(kvm, gfn);
-        struct kvm_mmu_page *sp = page_header(__pa(pte));
-
-        __set_bit(slot, sp->slot_bitmap);
-}
-
 /*
  * The function is based on mtrr_type_lookup() in
  * arch/x86/kernel/cpu/mtrr/generic.c
@@ -2332,9 +2312,8 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
                 if (s->role.level != PT_PAGE_TABLE_LEVEL)
                         return 1;
 
-                if (!need_unsync && !s->unsync) {
+                if (!s->unsync)
                         need_unsync = true;
-                }
         }
         if (need_unsync)
                 kvm_unsync_pages(vcpu, gfn);
@@ -2342,8 +2321,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 }
 
 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
-                    unsigned pte_access, int user_fault,
-                    int write_fault, int level,
+                    unsigned pte_access, int level,
                     gfn_t gfn, pfn_t pfn, bool speculative,
                     bool can_unsync, bool host_writable)
 {
@@ -2378,20 +2356,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
         spte |= (u64)pfn << PAGE_SHIFT;
 
-        if ((pte_access & ACC_WRITE_MASK)
-            || (!vcpu->arch.mmu.direct_map && write_fault
-                && !is_write_protection(vcpu) && !user_fault)) {
+        if (pte_access & ACC_WRITE_MASK) {
 
                 /*
-                 * There are two cases:
-                 * - the one is other vcpu creates new sp in the window
-                 *   between mapping_level() and acquiring mmu-lock.
-                 * - the another case is the new sp is created by itself
-                 *   (page-fault path) when guest uses the target gfn as
-                 *   its page table.
-                 * Both of these cases can be fixed by allowing guest to
-                 *   retry the access, it will refault, then we can establish
-                 *   the mapping by using small page.
+                 * Other vcpu creates new sp in the window between
+                 * mapping_level() and acquiring mmu-lock. We can
+                 * allow guest to retry the access, the mapping can
+                 * be fixed if guest refault.
                  */
                 if (level > PT_PAGE_TABLE_LEVEL &&
                     has_wrprotected_page(vcpu->kvm, gfn, level))
@@ -2399,19 +2370,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
                 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
 
-                if (!vcpu->arch.mmu.direct_map
-                    && !(pte_access & ACC_WRITE_MASK)) {
-                        spte &= ~PT_USER_MASK;
-                        /*
-                         * If we converted a user page to a kernel page,
-                         * so that the kernel can write to it when cr0.wp=0,
-                         * then we should prevent the kernel from executing it
-                         * if SMEP is enabled.
-                         */
-                        if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
-                                spte |= PT64_NX_MASK;
-                }
-
                 /*
                  * Optimization: for pte sync, if spte was writable the hash
                  * lookup is unnecessary (and expensive). Write protection
@@ -2441,19 +2399,15 @@ done:
 }
 
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
-                         unsigned pt_access, unsigned pte_access,
-                         int user_fault, int write_fault,
-                         int *emulate, int level, gfn_t gfn,
-                         pfn_t pfn, bool speculative,
+                         unsigned pte_access, int write_fault, int *emulate,
+                         int level, gfn_t gfn, pfn_t pfn, bool speculative,
                          bool host_writable)
 {
         int was_rmapped = 0;
         int rmap_count;
 
-        pgprintk("%s: spte %llx access %x write_fault %d"
-                 " user_fault %d gfn %llx\n",
-                 __func__, *sptep, pt_access,
-                 write_fault, user_fault, gfn);
+        pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
+                 *sptep, write_fault, gfn);
 
         if (is_rmap_spte(*sptep)) {
                 /*
@@ -2477,9 +2431,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                 was_rmapped = 1;
         }
 
-        if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
-                      level, gfn, pfn, speculative, true,
-                      host_writable)) {
+        if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
+              true, host_writable)) {
                 if (write_fault)
                         *emulate = 1;
                 kvm_mmu_flush_tlb(vcpu);
@@ -2497,7 +2450,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                 ++vcpu->kvm->stat.lpages;
 
         if (is_shadow_present_pte(*sptep)) {
-                page_header_update_slot(vcpu->kvm, sptep, gfn);
                 if (!was_rmapped) {
                         rmap_count = rmap_add(vcpu, sptep, gfn);
                         if (rmap_count > RMAP_RECYCLE_THRESHOLD)
@@ -2571,10 +2523,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
                 return -1;
 
         for (i = 0; i < ret; i++, gfn++, start++)
-                mmu_set_spte(vcpu, start, ACC_ALL,
-                             access, 0, 0, NULL,
-                             sp->role.level, gfn,
-                             page_to_pfn(pages[i]), true, true);
+                mmu_set_spte(vcpu, start, access, 0, NULL,
+                             sp->role.level, gfn, page_to_pfn(pages[i]),
+                             true, true);
 
         return 0;
 }
@@ -2633,11 +2584,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 
         for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
                 if (iterator.level == level) {
-                        unsigned pte_access = ACC_ALL;
-
-                        mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
-                                     0, write, &emulate,
-                                     level, gfn, pfn, prefault, map_writable);
+                        mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
+                                     write, &emulate, level, gfn, pfn,
+                                     prefault, map_writable);
                         direct_pte_prefetch(vcpu, iterator.sptep);
                         ++vcpu->stat.pf_fixed;
                         break;
@@ -2652,11 +2601,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                                               iterator.level - 1,
                                               1, ACC_ALL, iterator.sptep);
 
-                        mmu_spte_set(iterator.sptep,
-                                     __pa(sp->spt)
-                                     | PT_PRESENT_MASK | PT_WRITABLE_MASK
-                                     | shadow_user_mask | shadow_x_mask
-                                     | shadow_accessed_mask);
+                        link_shadow_page(iterator.sptep, sp);
                 }
         }
         return emulate;
@@ -3719,6 +3664,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
         else
                 r = paging32_init_context(vcpu, context);
 
+        vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
         vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
         vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
         vcpu->arch.mmu.base_role.smep_andnot_wp
@@ -3885,7 +3831,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
                 *gpa &= ~(gpa_t)7;
                 *bytes = 8;
-                r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
+                r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8);
                 if (r)
                         gentry = 0;
                 new = (const u8 *)&gentry;
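Since *bytes is assigned 8 on the line just above, min(*bytes, 8) could only ever evaluate to 8; passing the constant drops the dead computation without changing behaviour.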
@@ -4039,7 +3985,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                       !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
                       & mask.word) && rmap_can_add(vcpu))
                         mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
-                if (!remote_flush && need_remote_flush(entry, *spte))
+                if (need_remote_flush(entry, *spte))
                         remote_flush = true;
                 ++spte;
         }
@@ -4198,26 +4144,36 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
-        struct kvm_mmu_page *sp;
-        bool flush = false;
+        struct kvm_memory_slot *memslot;
+        gfn_t last_gfn;
+        int i;
 
-        list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
-                int i;
-                u64 *pt;
+        memslot = id_to_memslot(kvm->memslots, slot);
+        last_gfn = memslot->base_gfn + memslot->npages - 1;
 
-                if (!test_bit(slot, sp->slot_bitmap))
-                        continue;
+        spin_lock(&kvm->mmu_lock);
 
-                pt = sp->spt;
-                for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-                        if (!is_shadow_present_pte(pt[i]) ||
-                              !is_last_spte(pt[i], sp->role.level))
-                                continue;
+        for (i = PT_PAGE_TABLE_LEVEL;
+             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+                unsigned long *rmapp;
+                unsigned long last_index, index;
 
-                        spte_write_protect(kvm, &pt[i], &flush, false);
+                rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
+                last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+
+                for (index = 0; index <= last_index; ++index, ++rmapp) {
+                        if (*rmapp)
+                                __rmap_write_protect(kvm, rmapp, false);
+
+                        if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+                                kvm_flush_remote_tlbs(kvm);
+                                cond_resched_lock(&kvm->mmu_lock);
+                        }
                 }
         }
+
         kvm_flush_remote_tlbs(kvm);
+        spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_zap_all(struct kvm *kvm)
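The rewritten kvm_mmu_slot_remove_write_access() walks the slot's per-level rmap arrays instead of scanning every active shadow page for a slot bit. A rough, self-contained sketch of the indexing it relies on; gfn_to_index() is approximated here under the assumption of x86's 512-entries-per-level page-table layout, and the slot geometry is made up for illustration:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t gfn_t;

/*
 * Approximation of the kernel's gfn_to_index(): at level 1 (4K pages) every
 * gfn gets its own rmap slot, at level 2 (2M pages) one slot covers 512 gfns,
 * at level 3 (1G pages) one slot covers 512 * 512 gfns.
 */
static uint64_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
{
        unsigned int shift = (level - 1) * 9;   /* assumes 9 bits per level */

        return (gfn >> shift) - (base_gfn >> shift);
}

int main(void)
{
        gfn_t base_gfn = 0x100000;              /* hypothetical slot start */
        gfn_t npages   = 1ULL << 20;            /* hypothetical 4GB slot of 4K pages */
        gfn_t last_gfn = base_gfn + npages - 1;

        for (int level = 1; level <= 3; level++) {
                uint64_t last_index = gfn_to_index(last_gfn, base_gfn, level);

                /*
                 * The new kvm_mmu_slot_remove_write_access() visits indexes
                 * 0..last_index of this level's rmap array for the slot.
                 */
                printf("level %d: %llu rmap entries\n", level,
                       (unsigned long long)(last_index + 1));
        }
        return 0;
}

The real function then write-protects whatever is chained off each non-empty rmap entry, flushing remote TLBs and yielding mmu_lock via cond_resched_lock() when needed, as the hunk above shows.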