diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2009-04-02 19:57:38 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-04-02 22:04:55 -0400 |
commit | 0b7f569e45bb6be142d87017030669a6a7d327a1 (patch) | |
tree | 8df7877b95c093ebf4cb4e1006cea16f75fc79b7 | |
parent | 81d39c20f5ee2437d71709beb82597e2a38efbbc (diff) |
memcg: fix OOM killer under memcg
This patch tries to fix OOM Killer problems caused by hierarchy.
Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to
kill a task in memcg.
But when hierarchy is used, this is broken and the correct task cannot
be killed. For example, in the following cgroup:
/groupA/ hierarchy=1, limit=1G,
01 nolimit
02 nolimit
All tasks' memory usage under /groupA, /groupA/01, /groupA/02 is limited to
groupA's 1 Gbyte, but the OOM Killer only kills tasks in groupA itself.
This patch makes the bad process be selected from all tasks
under the hierarchy. BTW, currently, oom_jiffies is updated only against groupA
in the above case. oom_jiffies of the whole tree should be updated.
To see how oom_jiffies is used, please check mem_cgroup_oom_called()
callers.
[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: const fix]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/cgroups/memcg_test.txt | 20 | ||||
-rw-r--r-- | include/linux/cgroup.h | 2 | ||||
-rw-r--r-- | kernel/cgroup.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 30 |
4 files changed, 49 insertions, 5 deletions
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index 523a9c16c400..8a11caf417a0 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | Memory Resource Controller(Memcg) Implementation Memo. | 1 | Memory Resource Controller(Memcg) Implementation Memo. |
2 | Last Updated: 2009/1/19 | 2 | Last Updated: 2009/1/20 |
3 | Base Kernel Version: based on 2.6.29-rc2. | 3 | Base Kernel Version: based on 2.6.29-rc2. |
4 | 4 | ||
5 | Because VM is getting complex (one of reasons is memcg...), memcg's behavior | 5 | Because VM is getting complex (one of reasons is memcg...), memcg's behavior |
@@ -360,3 +360,21 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. | |||
360 | # kill malloc task. | 360 | # kill malloc task. |
361 | 361 | ||
362 | Of course, tmpfs v.s. swapoff test should be tested, too. | 362 | Of course, tmpfs v.s. swapoff test should be tested, too. |
363 | |||
364 | 9.8 OOM-Killer | ||
365 | Out-of-memory caused by memcg's limit will kill tasks under | ||
366 | the memcg. When hierarchy is used, a task under hierarchy | ||
367 | will be killed by the kernel. | ||
368 | In this case, panic_on_oom shouldn't be invoked and tasks | ||
369 | in other groups shouldn't be killed. | ||
370 | |||
371 | It's not difficult to cause OOM under memcg as following. | ||
372 | Case A) when you can swapoff | ||
373 | #swapoff -a | ||
374 | #echo 50M > /memory.limit_in_bytes | ||
375 | run 51M of malloc | ||
376 | |||
377 | Case B) when you use mem+swap limitation. | ||
378 | #echo 50M > memory.limit_in_bytes | ||
379 | #echo 50M > memory.memsw.limit_in_bytes | ||
380 | run 51M of malloc | ||
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b2816fba5306..43763bd772b9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -503,7 +503,7 @@ struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, | |||
503 | 503 | ||
504 | /* Returns true if root is ancestor of cg */ | 504 | /* Returns true if root is ancestor of cg */ |
505 | bool css_is_ancestor(struct cgroup_subsys_state *cg, | 505 | bool css_is_ancestor(struct cgroup_subsys_state *cg, |
506 | struct cgroup_subsys_state *root); | 506 | const struct cgroup_subsys_state *root); |
507 | 507 | ||
508 | /* Get id and depth of css */ | 508 | /* Get id and depth of css */ |
509 | unsigned short css_id(struct cgroup_subsys_state *css); | 509 | unsigned short css_id(struct cgroup_subsys_state *css); |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f2a3f5c9936c..382109b5baeb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -3405,7 +3405,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) | |||
3405 | } | 3405 | } |
3406 | 3406 | ||
3407 | bool css_is_ancestor(struct cgroup_subsys_state *child, | 3407 | bool css_is_ancestor(struct cgroup_subsys_state *child, |
3408 | struct cgroup_subsys_state *root) | 3408 | const struct cgroup_subsys_state *root) |
3409 | { | 3409 | { |
3410 | struct css_id *child_id = rcu_dereference(child->id); | 3410 | struct css_id *child_id = rcu_dereference(child->id); |
3411 | struct css_id *root_id = rcu_dereference(root->id); | 3411 | struct css_id *root_id = rcu_dereference(root->id); |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6f6a575e77ad..025f8abfae2d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -295,6 +295,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
295 | static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | 295 | static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) |
296 | { | 296 | { |
297 | struct mem_cgroup *mem = NULL; | 297 | struct mem_cgroup *mem = NULL; |
298 | |||
299 | if (!mm) | ||
300 | return NULL; | ||
298 | /* | 301 | /* |
299 | * Because we have no locks, mm->owner's may be being moved to other | 302 | * Because we have no locks, mm->owner's may be being moved to other |
300 | * cgroup. We use css_tryget() here even if this looks | 303 | * cgroup. We use css_tryget() here even if this looks |
@@ -486,10 +489,20 @@ void mem_cgroup_move_lists(struct page *page, | |||
486 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 489 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) |
487 | { | 490 | { |
488 | int ret; | 491 | int ret; |
492 | struct mem_cgroup *curr = NULL; | ||
489 | 493 | ||
490 | task_lock(task); | 494 | task_lock(task); |
491 | ret = task->mm && mm_match_cgroup(task->mm, mem); | 495 | rcu_read_lock(); |
496 | curr = try_get_mem_cgroup_from_mm(task->mm); | ||
497 | rcu_read_unlock(); | ||
492 | task_unlock(task); | 498 | task_unlock(task); |
499 | if (!curr) | ||
500 | return 0; | ||
501 | if (curr->use_hierarchy) | ||
502 | ret = css_is_ancestor(&curr->css, &mem->css); | ||
503 | else | ||
504 | ret = (curr == mem); | ||
505 | css_put(&curr->css); | ||
493 | return ret; | 506 | return ret; |
494 | } | 507 | } |
495 | 508 | ||
@@ -820,6 +833,19 @@ bool mem_cgroup_oom_called(struct task_struct *task) | |||
820 | rcu_read_unlock(); | 833 | rcu_read_unlock(); |
821 | return ret; | 834 | return ret; |
822 | } | 835 | } |
836 | |||
837 | static int record_last_oom_cb(struct mem_cgroup *mem, void *data) | ||
838 | { | ||
839 | mem->last_oom_jiffies = jiffies; | ||
840 | return 0; | ||
841 | } | ||
842 | |||
843 | static void record_last_oom(struct mem_cgroup *mem) | ||
844 | { | ||
845 | mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); | ||
846 | } | ||
847 | |||
848 | |||
823 | /* | 849 | /* |
824 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 850 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
825 | * oom-killer can be invoked. | 851 | * oom-killer can be invoked. |
@@ -902,7 +928,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
902 | mutex_lock(&memcg_tasklist); | 928 | mutex_lock(&memcg_tasklist); |
903 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | 929 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); |
904 | mutex_unlock(&memcg_tasklist); | 930 | mutex_unlock(&memcg_tasklist); |
905 | mem_over_limit->last_oom_jiffies = jiffies; | 931 | record_last_oom(mem_over_limit); |
906 | } | 932 | } |
907 | goto nomem; | 933 | goto nomem; |
908 | } | 934 | } |