author     Glauber Costa <glommer@parallels.com>    2012-12-18 17:22:59 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2012-12-18 18:02:14 -0500
commit     22933152934f30de6f05b600c03f8a08f853a8d2 (patch)
tree       1abc838ffd9a130d25a493091dfe631145feea26 /mm
parent     7cf2798240a2a2230cb16a391beef98d8a7ad362 (diff)
memcg/sl[au]b: shrink dead caches
This means that when we destroy a memcg cache that happened to be empty, those caches may take a lot of time to go away: removing the memcg reference won't destroy them - because there are pending references, and the empty pages will stay there, until a shrinker is called upon for any reason.

In this patch, we will call kmem_cache_shrink() for all dead caches that cannot be destroyed because of remaining pages. After shrinking, it is possible that it could be freed. If this is not the case, we'll schedule a lazy worker to keep trying.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Frederic Weisbecker <fweisbec@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: JoonSoo Kim <js1304@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
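The behaviour described above is easy to reproduce in isolation: freeing every object in a kmem_cache does not necessarily give its slab pages back, and an explicit kmem_cache_shrink() is what releases the empty ones. A minimal sketch of that (a toy module, not part of this patch; the demo_* names, object size and messages are made up) could look like this:

#include <linux/module.h>
#include <linux/slab.h>

static struct kmem_cache *demo_cachep;

static int __init demo_shrink_init(void)
{
        void *obj;

        demo_cachep = kmem_cache_create("demo_cache", 256, 0, 0, NULL);
        if (!demo_cachep)
                return -ENOMEM;

        obj = kmem_cache_alloc(demo_cachep, GFP_KERNEL);
        if (obj)
                kmem_cache_free(demo_cachep, obj);

        /*
         * The object is gone, but the slab page that backed it may still
         * belong to the cache. kmem_cache_shrink() releases such empty
         * pages; without it (or memory pressure) they simply sit there.
         */
        kmem_cache_shrink(demo_cachep);
        return 0;
}

static void __exit demo_shrink_exit(void)
{
        kmem_cache_destroy(demo_cachep);
}

module_init(demo_shrink_init);
module_exit(demo_shrink_exit);
MODULE_LICENSE("GPL");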
Diffstat (limited to 'mm')
-rw-r--r--  mm/memcontrol.c | 46
1 file changed, 43 insertions(+), 3 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4b68ec2c8df6..7633e0d429e0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3080,7 +3080,27 @@ static void kmem_cache_destroy_work_func(struct work_struct *w)
 
         cachep = memcg_params_to_cache(p);
 
-        if (!atomic_read(&cachep->memcg_params->nr_pages))
+        /*
+         * If we get down to 0 after shrink, we could delete right away.
+         * However, memcg_release_pages() already puts us back in the workqueue
+         * in that case. If we proceed deleting, we'll get a dangling
+         * reference, and removing the object from the workqueue in that case
+         * is unnecessary complication. We are not a fast path.
+         *
+         * Note that this case is fundamentally different from racing with
+         * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
+         * kmem_cache_shrink, not only we would be reinserting a dead cache
+         * into the queue, but doing so from inside the worker racing to
+         * destroy it.
+         *
+         * So if we aren't down to zero, we'll just schedule a worker and try
+         * again
+         */
+        if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
+                kmem_cache_shrink(cachep);
+                if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
+                        return;
+        } else
                 kmem_cache_destroy(cachep);
 }
 
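The hunk above turns the destroy worker into a retry loop that never requeues itself: shrink, and if pages remain, return and rely on memcg_release_pages() to schedule the work again once the last page is gone. A stand-alone sketch of the same pattern, with hypothetical demo_* names and an atomic_t standing in for the page count:

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/atomic.h>

static atomic_t demo_nr_pages = ATOMIC_INIT(2);
static struct work_struct demo_destroy_work;

static void demo_destroy_work_func(struct work_struct *w)
{
        if (atomic_read(&demo_nr_pages) != 0) {
                /*
                 * Still busy: try to make progress, but do not free anything
                 * and do not requeue ourselves; the release path below will
                 * schedule us again when the count finally reaches zero.
                 */
                pr_info("demo: %d pages left, waiting\n",
                        atomic_read(&demo_nr_pages));
                return;
        }
        pr_info("demo: nothing left, destroying for real\n");
}

/* Mirrors memcg_release_pages(): dropping the last page requeues the worker. */
static void demo_release_page(void)
{
        if (atomic_dec_and_test(&demo_nr_pages))
                schedule_work(&demo_destroy_work);
}

static int __init demo_retry_init(void)
{
        INIT_WORK(&demo_destroy_work, demo_destroy_work_func);
        schedule_work(&demo_destroy_work);      /* first attempt: pages remain */
        demo_release_page();
        demo_release_page();                    /* last page: worker runs again */
        return 0;
}

static void __exit demo_retry_exit(void)
{
        cancel_work_sync(&demo_destroy_work);
}

module_init(demo_retry_init);
module_exit(demo_retry_exit);
MODULE_LICENSE("GPL");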
@@ -3090,6 +3110,26 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
                 return;
 
         /*
+         * There are many ways in which we can get here.
+         *
+         * We can get to a memory-pressure situation while the delayed work is
+         * still pending to run. The vmscan shrinkers can then release all
+         * cache memory and get us to destruction. If this is the case, we'll
+         * be executed twice, which is a bug (the second time will execute over
+         * bogus data). In this case, cancelling the work should be fine.
+         *
+         * But we can also get here from the worker itself, if
+         * kmem_cache_shrink is enough to shake all the remaining objects and
+         * get the page count to 0. In this case, we'll deadlock if we try to
+         * cancel the work (the worker runs with an internal lock held, which
+         * is the same lock we would hold for cancel_work_sync().)
+         *
+         * Since we can't possibly know who got us here, just refrain from
+         * running if there is already work pending
+         */
+        if (work_pending(&cachep->memcg_params->destroy))
+                return;
+        /*
          * We have to defer the actual destroying to a workqueue, because
          * we might currently be in a context that cannot sleep.
          */
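The work_pending() test added above is the generic guard for this: if a destroy request is already queued, neither requeueing it nor cancelling it from inside the worker is safe. In isolation the enqueue path looks roughly like this (demo_* names are hypothetical, and the work item is assumed to be initialised elsewhere with INIT_WORK()):

#include <linux/workqueue.h>

static struct work_struct demo_destroy_work;    /* INIT_WORK() done elsewhere */

static void demo_request_destroy(void)
{
        /*
         * We may be called from reclaim, or from the destroy worker itself.
         * If the work is already queued it will run (or is running) anyway;
         * cancelling it from inside the worker would deadlock, so simply
         * back off when something is pending.
         */
        if (work_pending(&demo_destroy_work))
                return;

        schedule_work(&demo_destroy_work);
}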
@@ -3217,7 +3257,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
                  * set, so flip it down to guarantee we are in control.
                  */
                 c->memcg_params->dead = false;
-                cancel_delayed_work_sync(&c->memcg_params->destroy);
+                cancel_work_sync(&c->memcg_params->destroy);
                 kmem_cache_destroy(c);
         }
         mutex_unlock(&set_limit_mutex);
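The one-line change above also matches the API to the type: memcg_params->destroy is a plain struct work_struct (it is set up with INIT_WORK() and schedule_work() elsewhere in this file), so cancel_work_sync() is the matching primitive, while cancel_delayed_work_sync() pairs with struct delayed_work. A generic form of the teardown sequence used here, again with hypothetical demo_* names:

#include <linux/workqueue.h>

static struct work_struct demo_destroy_work;
static bool demo_dead;

static void demo_teardown(void)
{
        /*
         * Clear the dead flag first so a late shrink cannot requeue the
         * worker, wait for any queued instance to finish, then free the
         * object synchronously.
         */
        demo_dead = false;
        cancel_work_sync(&demo_destroy_work);
        /* ... destroy the object itself here ... */
}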
@@ -3242,7 +3282,7 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
                 cachep = memcg_params_to_cache(params);
                 cachep->memcg_params->dead = true;
                 INIT_WORK(&cachep->memcg_params->destroy,
                                 kmem_cache_destroy_work_func);
                 schedule_work(&cachep->memcg_params->destroy);
         }
         mutex_unlock(&memcg->slab_caches_mutex);