author	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2010-05-26 17:42:37 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-05-27 12:12:43 -0400
commit	3c11ecf448eff8f12922c498b8274ce98587eb74 (patch)
tree	0a28aeef7759ead93ff58fd7db2fe61797523f7e /mm/memcontrol.c
parent	9490ff275606da012d5b373342a49610ad61cb81 (diff)
memcg: oom kill disable and oom status
This adds a feature to disable the OOM killer for a memcg. If it is
disabled, tasks in that memcg will of course stall when they hit the
limit instead of being killed.

But we now have an OOM notifier for memcg, and the rest of the system
is not under out-of-memory; a memcg OOM only means the memcg has hit
its limit. An administrator or a management daemon can then recover
the situation by:

	- killing some process
	- enlarging the limit, or adding more swap
	- migrating some tasks
	- removing file cache on tmpfs (difficult?)

Unlike the OOM killer, this lets you gather enough information (with
gcore, ps, etc.) before killing any task.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	113
1 files changed, 94 insertions, 19 deletions
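
For illustration only (not part of the patch): a minimal userspace sketch of how the new memory.oom_control file might be used. The cgroup mount point (/cgroup/memory) and group name (A) are example paths for this sketch, not anything mandated by the change. Writing 1 disables the OOM killer for the group (0 re-enables it); reading the file reports oom_kill_disable and under_oom.

/*
 * Illustrative sketch only, not part of this patch.  Assumes the memory
 * controller is mounted at /cgroup/memory and a group "A" already
 * exists; adjust the paths to the local setup.
 */
#include <stdio.h>

int main(void)
{
	FILE *f;
	char line[128];

	/* Disable the OOM killer for the group ("0" would re-enable it). */
	f = fopen("/cgroup/memory/A/memory.oom_control", "w");
	if (!f) {
		perror("memory.oom_control");
		return 1;
	}
	fprintf(f, "1\n");
	fclose(f);

	/* Read back the status: "oom_kill_disable 0|1" and "under_oom 0|1". */
	f = fopen("/cgroup/memory/A/memory.oom_control", "r");
	if (!f) {
		perror("memory.oom_control");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
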
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index da2ed3913316..53eb30ebdb49 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -214,6 +214,8 @@ struct mem_cgroup {
 	atomic_t refcnt;
 
 	unsigned int swappiness;
+	/* OOM-Killer disable */
+	int oom_kill_disable;
 
 	/* set when res.limit == memsw.limit */
 	bool memsw_is_minimum;
@@ -235,7 +237,6 @@ struct mem_cgroup {
 	 * mem_cgroup ? And what type of charges should we move ?
 	 */
 	unsigned long move_charge_at_immigrate;
-
 	/*
 	 * percpu counter.
 	 */
@@ -1342,20 +1343,26 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
 }
 
+static void memcg_oom_recover(struct mem_cgroup *mem)
+{
+	if (mem->oom_kill_disable && atomic_read(&mem->oom_lock))
+		memcg_wakeup_oom(mem);
+}
+
 /*
  * try to call OOM killer. returns false if we should exit memory-reclaim loop.
  */
 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 {
 	struct oom_wait_info owait;
-	bool locked;
+	bool locked, need_to_kill;
 
 	owait.mem = mem;
 	owait.wait.flags = 0;
 	owait.wait.func = memcg_oom_wake_function;
 	owait.wait.private = current;
 	INIT_LIST_HEAD(&owait.wait.task_list);
-
+	need_to_kill = true;
 	/* At first, try to OOM lock hierarchy under mem.*/
 	mutex_lock(&memcg_oom_mutex);
 	locked = mem_cgroup_oom_lock(mem);
@@ -1364,15 +1371,17 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
 	 * under OOM is always welcomed, use TASK_KILLABLE here.
 	 */
-	if (!locked)
-		prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	else
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	if (!locked || mem->oom_kill_disable)
+		need_to_kill = false;
+	if (locked)
 		mem_cgroup_oom_notify(mem);
 	mutex_unlock(&memcg_oom_mutex);
 
-	if (locked)
+	if (need_to_kill) {
+		finish_wait(&memcg_oom_waitq, &owait.wait);
 		mem_cgroup_out_of_memory(mem, mask);
-	else {
+	} else {
 		schedule();
 		finish_wait(&memcg_oom_waitq, &owait.wait);
 	}
@@ -2162,15 +2171,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
 	/* If swapout, usage of swap doesn't decrease */
 	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		uncharge_memsw = false;
-	/*
-	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
-	 * In those cases, all pages freed continously can be expected to be in
-	 * the same cgroup and we have chance to coalesce uncharges.
-	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
-	 * because we want to do uncharge as soon as possible.
-	 */
-	if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
-		goto direct_uncharge;
 
 	batch = &current->memcg_batch;
 	/*
@@ -2181,6 +2181,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
 	if (!batch->memcg)
 		batch->memcg = mem;
 	/*
+	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
+	 * In those cases, all pages freed continously can be expected to be in
+	 * the same cgroup and we have chance to coalesce uncharges.
+	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
+	 * because we want to do uncharge as soon as possible.
+	 */
+
+	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
+		goto direct_uncharge;
+
+	/*
 	 * In typical case, batch->memcg == mem. This means we can
 	 * merge a series of uncharges to an uncharge of res_counter.
 	 * If not, we uncharge res_counter ony by one.
@@ -2196,6 +2207,8 @@ direct_uncharge:
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	if (uncharge_memsw)
 		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	if (unlikely(batch->memcg != mem))
+		memcg_oom_recover(mem);
 	return;
 }
 
@@ -2332,6 +2345,7 @@ void mem_cgroup_uncharge_end(void)
 		res_counter_uncharge(&batch->memcg->res, batch->bytes);
 	if (batch->memsw_bytes)
 		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+	memcg_oom_recover(batch->memcg);
 	/* forget this pointer (for sanity check) */
 	batch->memcg = NULL;
 }
@@ -2568,10 +2582,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 				unsigned long long val)
 {
 	int retry_count;
-	u64 memswlimit;
+	u64 memswlimit, memlimit;
 	int ret = 0;
 	int children = mem_cgroup_count_children(memcg);
 	u64 curusage, oldusage;
+	int enlarge;
 
 	/*
 	 * For keeping hierarchical_reclaim simple, how long we should retry
@@ -2582,6 +2597,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 
 	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 
+	enlarge = 0;
 	while (retry_count) {
 		if (signal_pending(current)) {
 			ret = -EINTR;
@@ -2599,6 +2615,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 			mutex_unlock(&set_limit_mutex);
 			break;
 		}
+
+		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+		if (memlimit < val)
+			enlarge = 1;
+
 		ret = res_counter_set_limit(&memcg->res, val);
 		if (!ret) {
 			if (memswlimit == val)
@@ -2620,6 +2641,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		else
 			oldusage = curusage;
 	}
+	if (!ret && enlarge)
+		memcg_oom_recover(memcg);
 
 	return ret;
 }
@@ -2628,9 +2651,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 				unsigned long long val)
 {
 	int retry_count;
-	u64 memlimit, oldusage, curusage;
+	u64 memlimit, memswlimit, oldusage, curusage;
 	int children = mem_cgroup_count_children(memcg);
 	int ret = -EBUSY;
+	int enlarge = 0;
 
 	/* see mem_cgroup_resize_res_limit */
 	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
@@ -2652,6 +2676,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 			mutex_unlock(&set_limit_mutex);
 			break;
 		}
+		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+		if (memswlimit < val)
+			enlarge = 1;
 		ret = res_counter_set_limit(&memcg->memsw, val);
 		if (!ret) {
 			if (memlimit == val)
@@ -2674,6 +2701,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 		else
 			oldusage = curusage;
 	}
+	if (!ret && enlarge)
+		memcg_oom_recover(memcg);
 	return ret;
 }
 
@@ -2865,6 +2894,7 @@ move_account:
 		if (ret)
 			break;
 	}
+	memcg_oom_recover(mem);
 	/* it seems parent cgroup doesn't have enough mem */
 	if (ret == -ENOMEM)
 		goto try_to_free;
@@ -3645,6 +3675,46 @@ static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 	return 0;
 }
 
+static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
+	struct cftype *cft, struct cgroup_map_cb *cb)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+
+	cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
+
+	if (atomic_read(&mem->oom_lock))
+		cb->fill(cb, "under_oom", 1);
+	else
+		cb->fill(cb, "under_oom", 0);
+	return 0;
+}
+
+/*
+ */
+static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
+	struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+	struct mem_cgroup *parent;
+
+	/* cannot set to root cgroup and only 0 and 1 are allowed */
+	if (!cgrp->parent || !((val == 0) || (val == 1)))
+		return -EINVAL;
+
+	parent = mem_cgroup_from_cont(cgrp->parent);
+
+	cgroup_lock();
+	/* oom-kill-disable is a flag for subhierarchy. */
+	if ((parent->use_hierarchy) ||
+	    (mem->use_hierarchy && !list_empty(&cgrp->children))) {
+		cgroup_unlock();
+		return -EINVAL;
+	}
+	mem->oom_kill_disable = val;
+	cgroup_unlock();
+	return 0;
+}
+
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -3702,6 +3772,8 @@ static struct cftype mem_cgroup_files[] = {
 	},
 	{
 		.name = "oom_control",
+		.read_map = mem_cgroup_oom_control_read,
+		.write_u64 = mem_cgroup_oom_control_write,
 		.register_event = mem_cgroup_oom_register_event,
 		.unregister_event = mem_cgroup_oom_unregister_event,
 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
@@ -3943,6 +4015,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
+		mem->oom_kill_disable = parent->oom_kill_disable;
 	}
 
 	if (parent && parent->use_hierarchy) {
@@ -4215,6 +4288,7 @@ static void mem_cgroup_clear_mc(void)
 	if (mc.precharge) {
 		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
 		mc.precharge = 0;
+		memcg_oom_recover(mc.to);
 	}
 	/*
 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4223,6 +4297,7 @@ static void mem_cgroup_clear_mc(void)
 	if (mc.moved_charge) {
 		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
 		mc.moved_charge = 0;
+		memcg_oom_recover(mc.from);
 	}
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {
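
Also for illustration (not part of the patch): a sketch of the management-daemon workflow described in the changelog, combining oom_kill_disable with the eventfd-based memcg OOM notifier added by the parent commit 9490ff275606. The cgroup path is again an example, and error handling is minimal.

/*
 * Illustrative sketch only, not part of this patch.  A minimal
 * management-daemon loop: disable the OOM killer for a group, register
 * for OOM notification via cgroup.event_control, and react when the
 * group stalls at its limit.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/eventfd.h>

#define GRP "/cgroup/memory/A"	/* example path */

int main(void)
{
	char buf[64];
	int efd, oom_fd, ctrl_fd;
	uint64_t events;

	efd = eventfd(0, 0);
	oom_fd = open(GRP "/memory.oom_control", O_RDONLY);
	ctrl_fd = open(GRP "/cgroup.event_control", O_WRONLY);
	if (efd < 0 || oom_fd < 0 || ctrl_fd < 0) {
		perror("open");
		return 1;
	}

	/* Register the notifier: "<eventfd> <fd of memory.oom_control>". */
	snprintf(buf, sizeof(buf), "%d %d", efd, oom_fd);
	write(ctrl_fd, buf, strlen(buf));

	for (;;) {
		/*
		 * Blocks until the group enters OOM; with oom_kill_disable
		 * set, its tasks are stopped rather than killed.
		 */
		read(efd, &events, sizeof(events));
		printf("group under OOM: inspect tasks, enlarge the limit,\n"
		       "add swap, migrate or kill something, then it recovers.\n");
	}
	return 0;
}
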