path: root/arch/x86/kvm/mmu.c
author		Dave Hansen <dave@linux.vnet.ibm.com>	2010-08-19 21:11:37 -0400
committer	Avi Kivity <avi@redhat.com>	2010-10-24 04:51:19 -0400
commit		45221ab6684a82a5b60208b76d6f8bfb1bbcb969
tree		bdc915bf20cc9dfb40b81b7601ed5182c047d13a /arch/x86/kvm/mmu.c
parent		49d5ca26636cb8feb05aff92fc4dba3e494ec683
KVM: create aggregate kvm_total_used_mmu_pages value
Of slab shrinkers, the VM code says:

 * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
 * querying the cache size, so a fastpath for that case is appropriate.

and it *means* it.  Look at how it calls the shrinkers:

    nr_before = (*shrinker->shrink)(0, gfp_mask);
    shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);

So, if you do anything stupid in your shrinker, the VM will doubly punish you.

The mmu_shrink() function takes the global kvm_lock, then acquires every
VM's kvm->mmu_lock in sequence.  If we have 100 VMs, then we're going to
take 101 locks.  We do it twice, so each call takes 202 locks.  If we're
under memory pressure, we can have each cpu trying to do this.  It can
get really hairy, and we've seen lock spinning in mmu_shrink() be the
dominant entry in profiles.

This is guaranteed to optimize at least half of those lock acquisitions
away.  It removes the need to take any of the locks when simply trying
to count objects.

A 'percpu_counter' can be a large object, but we only have one of these
for the entire system.  There are not any better alternatives at the
moment, especially ones that handle CPU hotplug.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Tim Pepper <lnxninja@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
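The shape of the fix is the usual shrinker fastpath pattern: answer the nr_to_scan == 0 size query from a cheap aggregate counter instead of walking and locking every VM.  A minimal sketch of that pattern against the shrinker API of this era (example_shrink and total_objects are hypothetical names used for illustration, not the KVM code; the real change follows in the diff):

    /* one system-wide aggregate, bumped wherever objects are created/freed */
    static struct percpu_counter total_objects;

    static int example_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
    {
            if (nr_to_scan == 0)
                    goto out;       /* size query: take no locks at all */

            /* ... walk instances and reclaim up to nr_to_scan objects ... */

    out:
            /* an approximate, non-negative count is all the VM needs here */
            return percpu_counter_read_positive(&total_objects);
    }

percpu_counter_read_positive() reads only the batched central count, without summing the per-cpu deltas or taking any lock, so the size query stays cheap no matter how many VMs exist.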
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--	arch/x86/kvm/mmu.c	34
1 file changed, 24 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff39b85d7a4..33d7af50cf8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -178,6 +178,7 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
+static struct percpu_counter kvm_total_used_mmu_pages;
 
 static u64 __read_mostly shadow_trap_nonpresent_pte;
 static u64 __read_mostly shadow_notrap_nonpresent_pte;
@@ -971,6 +972,18 @@ static int is_empty_shadow_page(u64 *spt)
 }
 #endif
 
+/*
+ * This value is the sum of all of the kvm instances's
+ * kvm->arch.n_used_mmu_pages values.  We need a global,
+ * aggregate version in order to make the slab shrinker
+ * faster
+ */
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
+{
+	kvm->arch.n_used_mmu_pages += nr;
+	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
+}
+
 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	ASSERT(is_empty_shadow_page(sp->spt));
@@ -980,7 +993,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	if (!sp->role.direct)
 		__free_page(virt_to_page(sp->gfns));
 	kmem_cache_free(mmu_page_header_cache, sp);
-	--kvm->arch.n_used_mmu_pages;
+	kvm_mod_used_mmu_pages(kvm, -1);
 }
 
 static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -1003,7 +1016,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	sp->multimapped = 0;
 	sp->parent_pte = parent_pte;
-	++vcpu->kvm->arch.n_used_mmu_pages;
+	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
 	return sp;
 }
 
@@ -3122,23 +3135,22 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
 	struct kvm *kvm;
 	struct kvm *kvm_freed = NULL;
-	int cache_count = 0;
+
+	if (nr_to_scan == 0)
+		goto out;
 
 	spin_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		int npages, idx, freed_pages;
+		int idx, freed_pages;
 		LIST_HEAD(invalid_list);
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
-		npages = kvm->arch.n_max_mmu_pages -
-			 kvm_mmu_available_pages(kvm);
-		cache_count += npages;
-		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
+		if (!kvm_freed && nr_to_scan > 0 &&
+		    kvm->arch.n_used_mmu_pages > 0) {
 			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
 							  &invalid_list);
-			cache_count -= freed_pages;
 			kvm_freed = kvm;
 		}
 		nr_to_scan--;
@@ -3152,7 +3164,8 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 
 	spin_unlock(&kvm_lock);
 
-	return cache_count;
+out:
+	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
 }
 
 static struct shrinker mmu_shrinker = {
@@ -3195,6 +3208,7 @@ int kvm_mmu_module_init(void)
 	if (!mmu_page_header_cache)
 		goto nomem;
 
+	percpu_counter_init(&kvm_total_used_mmu_pages, 0);
 	register_shrinker(&mmu_shrinker);
 
 	return 0;