aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorGlauber Costa <glommer@parallels.com>2012-12-18 17:22:13 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-18 18:02:13 -0500
commitc8b2a36fb1597e9390cf4c1a7f2dd394dc7d7b17 (patch)
treeb43700c24ea8737138e523ebf492f3b29eb23ecf /mm
parentbea207c86e4eb158bf20f7b36bee72da9272e8d3 (diff)
memcg: execute the whole memcg freeing in free_worker()
A lot of the initialization we do in mem_cgroup_create() is done with softirqs enabled. This includes grabbing a css id, which holds &ss->id_lock->rlock, and the per-zone trees, which holds rtpz->lock->rlock. All of those signal to the lockdep mechanism that those locks can be used in SOFTIRQ-ON-W context. This means that the freeing of memcg structure must happen in a compatible context, otherwise we'll get a deadlock, like the one below, caught by lockdep: free_accounted_pages+0x47/0x4c free_task+0x31/0x5c __put_task_struct+0xc2/0xdb put_task_struct+0x1e/0x22 delayed_put_task_struct+0x7a/0x98 __rcu_process_callbacks+0x269/0x3df rcu_process_callbacks+0x31/0x5b __do_softirq+0x122/0x277 This usage pattern could not be triggered before kmem came into play. With the introduction of kmem stack handling, it is possible that we call the last mem_cgroup_put() from the task destructor, which is run in an rcu callback. Such callbacks are run with softirqs disabled, leading to the offensive usage pattern. In general, we have little, if any, means to guarantee in which context the last memcg_put will happen. The best we can do is test it and try to make sure no invalid context releases are happening. But as we add more code to memcg, the possible interactions grow in number and expose more ways to get context conflicts. One thing to keep in mind, is that part of the freeing process is already deferred to a worker, such as vfree(), that can only be called from process context. For the moment, the only two functions we really need moved away are: * free_css_id(), and * mem_cgroup_remove_from_trees(). But because the latter accesses per-zone info, free_mem_cgroup_per_zone_info() needs to be moved as well. With that, we are left with the per_cpu stats only. Better move it all. 
Signed-off-by: Glauber Costa <glommer@parallels.com> Tested-by: Greg Thelen <gthelen@google.com> Acked-by: Michal Hocko <mhocko@suse.cz> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Frederic Weisbecker <fweisbec@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: JoonSoo Kim <js1304@gmail.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: Rik van Riel <riel@redhat.com> Cc: Suleiman Souhlal <suleiman@google.com> Cc: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/memcontrol.c66
1 files changed, 34 insertions, 32 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f96ccc90fa66..e16694d5e118 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5247,16 +5247,29 @@ out_free:
5247} 5247}
5248 5248
5249/* 5249/*
5250 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, 5250 * At destroying mem_cgroup, references from swap_cgroup can remain.
5251 * but in process context. The work_freeing structure is overlaid 5251 * (scanning all at force_empty is too costly...)
5252 * on the rcu_freeing structure, which itself is overlaid on memsw. 5252 *
5253 * Instead of clearing all references at force_empty, we remember
5254 * the number of reference from swap_cgroup and free mem_cgroup when
5255 * it goes down to 0.
5256 *
5257 * Removal of cgroup itself succeeds regardless of refs from swap.
5253 */ 5258 */
5254static void free_work(struct work_struct *work) 5259
5260static void __mem_cgroup_free(struct mem_cgroup *memcg)
5255{ 5261{
5256 struct mem_cgroup *memcg; 5262 int node;
5257 int size = sizeof(struct mem_cgroup); 5263 int size = sizeof(struct mem_cgroup);
5258 5264
5259 memcg = container_of(work, struct mem_cgroup, work_freeing); 5265 mem_cgroup_remove_from_trees(memcg);
5266 free_css_id(&mem_cgroup_subsys, &memcg->css);
5267
5268 for_each_node(node)
5269 free_mem_cgroup_per_zone_info(memcg, node);
5270
5271 free_percpu(memcg->stat);
5272
5260 /* 5273 /*
5261 * We need to make sure that (at least for now), the jump label 5274 * We need to make sure that (at least for now), the jump label
5262 * destruction code runs outside of the cgroup lock. This is because 5275 * destruction code runs outside of the cgroup lock. This is because
@@ -5275,38 +5288,27 @@ static void free_work(struct work_struct *work)
5275 vfree(memcg); 5288 vfree(memcg);
5276} 5289}
5277 5290
5278static void free_rcu(struct rcu_head *rcu_head)
5279{
5280 struct mem_cgroup *memcg;
5281
5282 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
5283 INIT_WORK(&memcg->work_freeing, free_work);
5284 schedule_work(&memcg->work_freeing);
5285}
5286 5291
5287/* 5292/*
5288 * At destroying mem_cgroup, references from swap_cgroup can remain. 5293 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
5289 * (scanning all at force_empty is too costly...) 5294 * but in process context. The work_freeing structure is overlaid
5290 * 5295 * on the rcu_freeing structure, which itself is overlaid on memsw.
5291 * Instead of clearing all references at force_empty, we remember
5292 * the number of reference from swap_cgroup and free mem_cgroup when
5293 * it goes down to 0.
5294 *
5295 * Removal of cgroup itself succeeds regardless of refs from swap.
5296 */ 5296 */
5297 5297static void free_work(struct work_struct *work)
5298static void __mem_cgroup_free(struct mem_cgroup *memcg)
5299{ 5298{
5300 int node; 5299 struct mem_cgroup *memcg;
5301 5300
5302 mem_cgroup_remove_from_trees(memcg); 5301 memcg = container_of(work, struct mem_cgroup, work_freeing);
5303 free_css_id(&mem_cgroup_subsys, &memcg->css); 5302 __mem_cgroup_free(memcg);
5303}
5304 5304
5305 for_each_node(node) 5305static void free_rcu(struct rcu_head *rcu_head)
5306 free_mem_cgroup_per_zone_info(memcg, node); 5306{
5307 struct mem_cgroup *memcg;
5307 5308
5308 free_percpu(memcg->stat); 5309 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
5309 call_rcu(&memcg->rcu_freeing, free_rcu); 5310 INIT_WORK(&memcg->work_freeing, free_work);
5311 schedule_work(&memcg->work_freeing);
5310} 5312}
5311 5313
5312static void mem_cgroup_get(struct mem_cgroup *memcg) 5314static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -5318,7 +5320,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
5318{ 5320{
5319 if (atomic_sub_and_test(count, &memcg->refcnt)) { 5321 if (atomic_sub_and_test(count, &memcg->refcnt)) {
5320 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5322 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5321 __mem_cgroup_free(memcg); 5323 call_rcu(&memcg->rcu_freeing, free_rcu);
5322 if (parent) 5324 if (parent)
5323 mem_cgroup_put(parent); 5325 mem_cgroup_put(parent);
5324 } 5326 }