aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>2009-04-02 19:57:38 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-04-02 22:04:55 -0400
commit0b7f569e45bb6be142d87017030669a6a7d327a1 (patch)
tree8df7877b95c093ebf4cb4e1006cea16f75fc79b7
parent81d39c20f5ee2437d71709beb82597e2a38efbbc (diff)
memcg: fix OOM killer under memcg
This patch tries to fix OOM Killer problems caused by hierarchy. Now, memcg itself has an OOM KILL function (in oom_kill.c) and tries to kill a task in the memcg. But when hierarchy is used, it's broken and the correct task cannot be killed. For example, in the following cgroup layout: /groupA/ (hierarchy=1, limit=1G), /groupA/01 (no limit), /groupA/02 (no limit). All tasks' memory usage under /groupA, /groupA/01 and /groupA/02 is limited to groupA's 1Gbytes, but the OOM Killer just kills tasks in groupA. This patch makes the bad process be selected from all tasks under the hierarchy. BTW, currently, oom_jiffies is updated only against groupA in the above case. oom_jiffies of the whole tree should be updated. To see how oom_jiffies is used, please check mem_cgroup_oom_called() callers. [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: const fix] Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Paul Menage <menage@google.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/cgroups/memcg_test.txt20
-rw-r--r--include/linux/cgroup.h2
-rw-r--r--kernel/cgroup.c2
-rw-r--r--mm/memcontrol.c30
4 files changed, 49 insertions, 5 deletions
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt
index 523a9c16c400..8a11caf417a0 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -1,5 +1,5 @@
1Memory Resource Controller(Memcg) Implementation Memo. 1Memory Resource Controller(Memcg) Implementation Memo.
2Last Updated: 2009/1/19 2Last Updated: 2009/1/20
3Base Kernel Version: based on 2.6.29-rc2. 3Base Kernel Version: based on 2.6.29-rc2.
4 4
5Because VM is getting complex (one of reasons is memcg...), memcg's behavior 5Because VM is getting complex (one of reasons is memcg...), memcg's behavior
@@ -360,3 +360,21 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
360 # kill malloc task. 360 # kill malloc task.
361 361
362 Of course, tmpfs v.s. swapoff test should be tested, too. 362 Of course, tmpfs v.s. swapoff test should be tested, too.
363
364 9.8 OOM-Killer
365 Out-of-memory caused by memcg's limit will kill tasks under
366 the memcg. When hierarchy is used, a task under hierarchy
367 will be killed by the kernel.
368 In this case, panic_on_oom shouldn't be invoked and tasks
369 in other groups shouldn't be killed.
370
371 It's not difficult to cause OOM under memcg as following.
372 Case A) when you can swapoff
373 #swapoff -a
374 #echo 50M > /memory.limit_in_bytes
375 run 51M of malloc
376
377 Case B) when you use mem+swap limitation.
378 #echo 50M > memory.limit_in_bytes
379 #echo 50M > memory.memsw.limit_in_bytes
380 run 51M of malloc
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b2816fba5306..43763bd772b9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -503,7 +503,7 @@ struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id,
503 503
504/* Returns true if root is ancestor of cg */ 504/* Returns true if root is ancestor of cg */
505bool css_is_ancestor(struct cgroup_subsys_state *cg, 505bool css_is_ancestor(struct cgroup_subsys_state *cg,
506 struct cgroup_subsys_state *root); 506 const struct cgroup_subsys_state *root);
507 507
508/* Get id and depth of css */ 508/* Get id and depth of css */
509unsigned short css_id(struct cgroup_subsys_state *css); 509unsigned short css_id(struct cgroup_subsys_state *css);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f2a3f5c9936c..382109b5baeb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3405,7 +3405,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
3405} 3405}
3406 3406
3407bool css_is_ancestor(struct cgroup_subsys_state *child, 3407bool css_is_ancestor(struct cgroup_subsys_state *child,
3408 struct cgroup_subsys_state *root) 3408 const struct cgroup_subsys_state *root)
3409{ 3409{
3410 struct css_id *child_id = rcu_dereference(child->id); 3410 struct css_id *child_id = rcu_dereference(child->id);
3411 struct css_id *root_id = rcu_dereference(root->id); 3411 struct css_id *root_id = rcu_dereference(root->id);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6f6a575e77ad..025f8abfae2d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -295,6 +295,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
295static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 295static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
296{ 296{
297 struct mem_cgroup *mem = NULL; 297 struct mem_cgroup *mem = NULL;
298
299 if (!mm)
300 return NULL;
298 /* 301 /*
299 * Because we have no locks, mm->owner's may be being moved to other 302 * Because we have no locks, mm->owner's may be being moved to other
300 * cgroup. We use css_tryget() here even if this looks 303 * cgroup. We use css_tryget() here even if this looks
@@ -486,10 +489,20 @@ void mem_cgroup_move_lists(struct page *page,
486int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 489int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
487{ 490{
488 int ret; 491 int ret;
492 struct mem_cgroup *curr = NULL;
489 493
490 task_lock(task); 494 task_lock(task);
491 ret = task->mm && mm_match_cgroup(task->mm, mem); 495 rcu_read_lock();
496 curr = try_get_mem_cgroup_from_mm(task->mm);
497 rcu_read_unlock();
492 task_unlock(task); 498 task_unlock(task);
499 if (!curr)
500 return 0;
501 if (curr->use_hierarchy)
502 ret = css_is_ancestor(&curr->css, &mem->css);
503 else
504 ret = (curr == mem);
505 css_put(&curr->css);
493 return ret; 506 return ret;
494} 507}
495 508
@@ -820,6 +833,19 @@ bool mem_cgroup_oom_called(struct task_struct *task)
820 rcu_read_unlock(); 833 rcu_read_unlock();
821 return ret; 834 return ret;
822} 835}
836
837static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
838{
839 mem->last_oom_jiffies = jiffies;
840 return 0;
841}
842
843static void record_last_oom(struct mem_cgroup *mem)
844{
845 mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
846}
847
848
823/* 849/*
824 * Unlike exported interface, "oom" parameter is added. if oom==true, 850 * Unlike exported interface, "oom" parameter is added. if oom==true,
825 * oom-killer can be invoked. 851 * oom-killer can be invoked.
@@ -902,7 +928,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
902 mutex_lock(&memcg_tasklist); 928 mutex_lock(&memcg_tasklist);
903 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 929 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
904 mutex_unlock(&memcg_tasklist); 930 mutex_unlock(&memcg_tasklist);
905 mem_over_limit->last_oom_jiffies = jiffies; 931 record_last_oom(mem_over_limit);
906 } 932 }
907 goto nomem; 933 goto nomem;
908 } 934 }