diff options
-rw-r--r-- | Documentation/cgroups/memory.txt | 23 | ||||
-rw-r--r-- | mm/memcontrol.c | 113 |
2 files changed, 117 insertions, 19 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index eac22d3b2f7b..44e7ded33448 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -493,6 +493,8 @@ It's applicable for root and non-root cgroup. | |||
493 | 493 | ||
494 | 10. OOM Control | 494 | 10. OOM Control |
495 | 495 | ||
496 | memory.oom_control file is for OOM notification and other controls. | ||
497 | |||
496 | Memory controller implements oom notifier using cgroup notification | 498 | Memory controller implements oom notifier using cgroup notification |
497 | API (See cgroups.txt). It allows to register multiple oom notification | 499 | API (See cgroups.txt). It allows to register multiple oom notification |
498 | delivery and gets notification when oom happens. | 500 | delivery and gets notification when oom happens. |
@@ -505,6 +507,27 @@ To register a notifier, application need: | |||
505 | Application will be notified through eventfd when oom happens. | 507 | Application will be notified through eventfd when oom happens. |
506 | OOM notification doesn't work for root cgroup. | 508 | OOM notification doesn't work for root cgroup. |
507 | 509 | ||
510 | You can disable oom-killer by writing "1" to memory.oom_control file. | ||
511 | For example: | ||
512 | #echo 1 > memory.oom_control | ||
513 | |||
514 | This operation is only allowed on the top cgroup of a subhierarchy. | ||
515 | If oom-killer is disabled, tasks under cgroup will hang/sleep | ||
516 | in memcg's oom-waitq when they request accountable memory. | ||
517 | |||
518 | For running them, you have to relax the memcg's oom situation by | ||
519 | * enlarge limit or reduce usage. | ||
520 | To reduce usage, | ||
521 | * kill some tasks. | ||
522 | * move some tasks to other group with account migration. | ||
523 | * remove some files (on tmpfs?) | ||
524 | |||
525 | Then, stopped tasks will work again. | ||
526 | |||
527 | At reading, current status of OOM is shown. | ||
528 | oom_kill_disable 0 or 1 (if 1, oom-killer is disabled) | ||
529 | under_oom 0 or 1 (if 1, the memcg is under OOM, tasks may | ||
530 | be stopped.) | ||
508 | 531 | ||
509 | 11. TODO | 532 | 11. TODO |
510 | 533 | ||
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index da2ed3913316..53eb30ebdb49 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -214,6 +214,8 @@ struct mem_cgroup { | |||
214 | atomic_t refcnt; | 214 | atomic_t refcnt; |
215 | 215 | ||
216 | unsigned int swappiness; | 216 | unsigned int swappiness; |
217 | /* OOM-Killer disable */ | ||
218 | int oom_kill_disable; | ||
217 | 219 | ||
218 | /* set when res.limit == memsw.limit */ | 220 | /* set when res.limit == memsw.limit */ |
219 | bool memsw_is_minimum; | 221 | bool memsw_is_minimum; |
@@ -235,7 +237,6 @@ struct mem_cgroup { | |||
235 | * mem_cgroup ? And what type of charges should we move ? | 237 | * mem_cgroup ? And what type of charges should we move ? |
236 | */ | 238 | */ |
237 | unsigned long move_charge_at_immigrate; | 239 | unsigned long move_charge_at_immigrate; |
238 | |||
239 | /* | 240 | /* |
240 | * percpu counter. | 241 | * percpu counter. |
241 | */ | 242 | */ |
@@ -1342,20 +1343,26 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem) | |||
1342 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); | 1343 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); |
1343 | } | 1344 | } |
1344 | 1345 | ||
1346 | static void memcg_oom_recover(struct mem_cgroup *mem) | ||
1347 | { | ||
1348 | if (mem->oom_kill_disable && atomic_read(&mem->oom_lock)) | ||
1349 | memcg_wakeup_oom(mem); | ||
1350 | } | ||
1351 | |||
1345 | /* | 1352 | /* |
1346 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 1353 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. |
1347 | */ | 1354 | */ |
1348 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | 1355 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) |
1349 | { | 1356 | { |
1350 | struct oom_wait_info owait; | 1357 | struct oom_wait_info owait; |
1351 | bool locked; | 1358 | bool locked, need_to_kill; |
1352 | 1359 | ||
1353 | owait.mem = mem; | 1360 | owait.mem = mem; |
1354 | owait.wait.flags = 0; | 1361 | owait.wait.flags = 0; |
1355 | owait.wait.func = memcg_oom_wake_function; | 1362 | owait.wait.func = memcg_oom_wake_function; |
1356 | owait.wait.private = current; | 1363 | owait.wait.private = current; |
1357 | INIT_LIST_HEAD(&owait.wait.task_list); | 1364 | INIT_LIST_HEAD(&owait.wait.task_list); |
1358 | 1365 | need_to_kill = true; | |
1359 | /* At first, try to OOM lock hierarchy under mem.*/ | 1366 | /* At first, try to OOM lock hierarchy under mem.*/ |
1360 | mutex_lock(&memcg_oom_mutex); | 1367 | mutex_lock(&memcg_oom_mutex); |
1361 | locked = mem_cgroup_oom_lock(mem); | 1368 | locked = mem_cgroup_oom_lock(mem); |
@@ -1364,15 +1371,17 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1364 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | 1371 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL |
1365 | * under OOM is always welcomed, use TASK_KILLABLE here. | 1372 | * under OOM is always welcomed, use TASK_KILLABLE here. |
1366 | */ | 1373 | */ |
1367 | if (!locked) | 1374 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
1368 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | 1375 | if (!locked || mem->oom_kill_disable) |
1369 | else | 1376 | need_to_kill = false; |
1377 | if (locked) | ||
1370 | mem_cgroup_oom_notify(mem); | 1378 | mem_cgroup_oom_notify(mem); |
1371 | mutex_unlock(&memcg_oom_mutex); | 1379 | mutex_unlock(&memcg_oom_mutex); |
1372 | 1380 | ||
1373 | if (locked) | 1381 | if (need_to_kill) { |
1382 | finish_wait(&memcg_oom_waitq, &owait.wait); | ||
1374 | mem_cgroup_out_of_memory(mem, mask); | 1383 | mem_cgroup_out_of_memory(mem, mask); |
1375 | else { | 1384 | } else { |
1376 | schedule(); | 1385 | schedule(); |
1377 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1386 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1378 | } | 1387 | } |
@@ -2162,15 +2171,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
2162 | /* If swapout, usage of swap doesn't decrease */ | 2171 | /* If swapout, usage of swap doesn't decrease */ |
2163 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2172 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
2164 | uncharge_memsw = false; | 2173 | uncharge_memsw = false; |
2165 | /* | ||
2166 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
2167 | * In those cases, all pages freed continously can be expected to be in | ||
2168 | * the same cgroup and we have chance to coalesce uncharges. | ||
2169 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
2170 | * because we want to do uncharge as soon as possible. | ||
2171 | */ | ||
2172 | if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) | ||
2173 | goto direct_uncharge; | ||
2174 | 2174 | ||
2175 | batch = ¤t->memcg_batch; | 2175 | batch = ¤t->memcg_batch; |
2176 | /* | 2176 | /* |
@@ -2181,6 +2181,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
2181 | if (!batch->memcg) | 2181 | if (!batch->memcg) |
2182 | batch->memcg = mem; | 2182 | batch->memcg = mem; |
2183 | /* | 2183 | /* |
2184 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
2185 | * In those cases, all pages freed continously can be expected to be in | ||
2186 | * the same cgroup and we have chance to coalesce uncharges. | ||
2187 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
2188 | * because we want to do uncharge as soon as possible. | ||
2189 | */ | ||
2190 | |||
2191 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) | ||
2192 | goto direct_uncharge; | ||
2193 | |||
2194 | /* | ||
2184 | * In typical case, batch->memcg == mem. This means we can | 2195 | * In typical case, batch->memcg == mem. This means we can |
2185 | * merge a series of uncharges to an uncharge of res_counter. | 2196 | * merge a series of uncharges to an uncharge of res_counter. |
2186 | * If not, we uncharge res_counter ony by one. | 2197 | * If not, we uncharge res_counter ony by one. |
@@ -2196,6 +2207,8 @@ direct_uncharge: | |||
2196 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2207 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
2197 | if (uncharge_memsw) | 2208 | if (uncharge_memsw) |
2198 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 2209 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); |
2210 | if (unlikely(batch->memcg != mem)) | ||
2211 | memcg_oom_recover(mem); | ||
2199 | return; | 2212 | return; |
2200 | } | 2213 | } |
2201 | 2214 | ||
@@ -2332,6 +2345,7 @@ void mem_cgroup_uncharge_end(void) | |||
2332 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | 2345 | res_counter_uncharge(&batch->memcg->res, batch->bytes); |
2333 | if (batch->memsw_bytes) | 2346 | if (batch->memsw_bytes) |
2334 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | 2347 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); |
2348 | memcg_oom_recover(batch->memcg); | ||
2335 | /* forget this pointer (for sanity check) */ | 2349 | /* forget this pointer (for sanity check) */ |
2336 | batch->memcg = NULL; | 2350 | batch->memcg = NULL; |
2337 | } | 2351 | } |
@@ -2568,10 +2582,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2568 | unsigned long long val) | 2582 | unsigned long long val) |
2569 | { | 2583 | { |
2570 | int retry_count; | 2584 | int retry_count; |
2571 | u64 memswlimit; | 2585 | u64 memswlimit, memlimit; |
2572 | int ret = 0; | 2586 | int ret = 0; |
2573 | int children = mem_cgroup_count_children(memcg); | 2587 | int children = mem_cgroup_count_children(memcg); |
2574 | u64 curusage, oldusage; | 2588 | u64 curusage, oldusage; |
2589 | int enlarge; | ||
2575 | 2590 | ||
2576 | /* | 2591 | /* |
2577 | * For keeping hierarchical_reclaim simple, how long we should retry | 2592 | * For keeping hierarchical_reclaim simple, how long we should retry |
@@ -2582,6 +2597,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2582 | 2597 | ||
2583 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2598 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
2584 | 2599 | ||
2600 | enlarge = 0; | ||
2585 | while (retry_count) { | 2601 | while (retry_count) { |
2586 | if (signal_pending(current)) { | 2602 | if (signal_pending(current)) { |
2587 | ret = -EINTR; | 2603 | ret = -EINTR; |
@@ -2599,6 +2615,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2599 | mutex_unlock(&set_limit_mutex); | 2615 | mutex_unlock(&set_limit_mutex); |
2600 | break; | 2616 | break; |
2601 | } | 2617 | } |
2618 | |||
2619 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
2620 | if (memlimit < val) | ||
2621 | enlarge = 1; | ||
2622 | |||
2602 | ret = res_counter_set_limit(&memcg->res, val); | 2623 | ret = res_counter_set_limit(&memcg->res, val); |
2603 | if (!ret) { | 2624 | if (!ret) { |
2604 | if (memswlimit == val) | 2625 | if (memswlimit == val) |
@@ -2620,6 +2641,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2620 | else | 2641 | else |
2621 | oldusage = curusage; | 2642 | oldusage = curusage; |
2622 | } | 2643 | } |
2644 | if (!ret && enlarge) | ||
2645 | memcg_oom_recover(memcg); | ||
2623 | 2646 | ||
2624 | return ret; | 2647 | return ret; |
2625 | } | 2648 | } |
@@ -2628,9 +2651,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
2628 | unsigned long long val) | 2651 | unsigned long long val) |
2629 | { | 2652 | { |
2630 | int retry_count; | 2653 | int retry_count; |
2631 | u64 memlimit, oldusage, curusage; | 2654 | u64 memlimit, memswlimit, oldusage, curusage; |
2632 | int children = mem_cgroup_count_children(memcg); | 2655 | int children = mem_cgroup_count_children(memcg); |
2633 | int ret = -EBUSY; | 2656 | int ret = -EBUSY; |
2657 | int enlarge = 0; | ||
2634 | 2658 | ||
2635 | /* see mem_cgroup_resize_res_limit */ | 2659 | /* see mem_cgroup_resize_res_limit */ |
2636 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; | 2660 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; |
@@ -2652,6 +2676,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
2652 | mutex_unlock(&set_limit_mutex); | 2676 | mutex_unlock(&set_limit_mutex); |
2653 | break; | 2677 | break; |
2654 | } | 2678 | } |
2679 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
2680 | if (memswlimit < val) | ||
2681 | enlarge = 1; | ||
2655 | ret = res_counter_set_limit(&memcg->memsw, val); | 2682 | ret = res_counter_set_limit(&memcg->memsw, val); |
2656 | if (!ret) { | 2683 | if (!ret) { |
2657 | if (memlimit == val) | 2684 | if (memlimit == val) |
@@ -2674,6 +2701,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
2674 | else | 2701 | else |
2675 | oldusage = curusage; | 2702 | oldusage = curusage; |
2676 | } | 2703 | } |
2704 | if (!ret && enlarge) | ||
2705 | memcg_oom_recover(memcg); | ||
2677 | return ret; | 2706 | return ret; |
2678 | } | 2707 | } |
2679 | 2708 | ||
@@ -2865,6 +2894,7 @@ move_account: | |||
2865 | if (ret) | 2894 | if (ret) |
2866 | break; | 2895 | break; |
2867 | } | 2896 | } |
2897 | memcg_oom_recover(mem); | ||
2868 | /* it seems parent cgroup doesn't have enough mem */ | 2898 | /* it seems parent cgroup doesn't have enough mem */ |
2869 | if (ret == -ENOMEM) | 2899 | if (ret == -ENOMEM) |
2870 | goto try_to_free; | 2900 | goto try_to_free; |
@@ -3645,6 +3675,46 @@ static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | |||
3645 | return 0; | 3675 | return 0; |
3646 | } | 3676 | } |
3647 | 3677 | ||
3678 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | ||
3679 | struct cftype *cft, struct cgroup_map_cb *cb) | ||
3680 | { | ||
3681 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3682 | |||
3683 | cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); | ||
3684 | |||
3685 | if (atomic_read(&mem->oom_lock)) | ||
3686 | cb->fill(cb, "under_oom", 1); | ||
3687 | else | ||
3688 | cb->fill(cb, "under_oom", 0); | ||
3689 | return 0; | ||
3690 | } | ||
3691 | |||
3692 | /* | ||
3693 | */ | ||
3694 | static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | ||
3695 | struct cftype *cft, u64 val) | ||
3696 | { | ||
3697 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3698 | struct mem_cgroup *parent; | ||
3699 | |||
3700 | /* cannot set to root cgroup and only 0 and 1 are allowed */ | ||
3701 | if (!cgrp->parent || !((val == 0) || (val == 1))) | ||
3702 | return -EINVAL; | ||
3703 | |||
3704 | parent = mem_cgroup_from_cont(cgrp->parent); | ||
3705 | |||
3706 | cgroup_lock(); | ||
3707 | /* oom-kill-disable is a flag for subhierarchy. */ | ||
3708 | if ((parent->use_hierarchy) || | ||
3709 | (mem->use_hierarchy && !list_empty(&cgrp->children))) { | ||
3710 | cgroup_unlock(); | ||
3711 | return -EINVAL; | ||
3712 | } | ||
3713 | mem->oom_kill_disable = val; | ||
3714 | cgroup_unlock(); | ||
3715 | return 0; | ||
3716 | } | ||
3717 | |||
3648 | static struct cftype mem_cgroup_files[] = { | 3718 | static struct cftype mem_cgroup_files[] = { |
3649 | { | 3719 | { |
3650 | .name = "usage_in_bytes", | 3720 | .name = "usage_in_bytes", |
@@ -3702,6 +3772,8 @@ static struct cftype mem_cgroup_files[] = { | |||
3702 | }, | 3772 | }, |
3703 | { | 3773 | { |
3704 | .name = "oom_control", | 3774 | .name = "oom_control", |
3775 | .read_map = mem_cgroup_oom_control_read, | ||
3776 | .write_u64 = mem_cgroup_oom_control_write, | ||
3705 | .register_event = mem_cgroup_oom_register_event, | 3777 | .register_event = mem_cgroup_oom_register_event, |
3706 | .unregister_event = mem_cgroup_oom_unregister_event, | 3778 | .unregister_event = mem_cgroup_oom_unregister_event, |
3707 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | 3779 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), |
@@ -3943,6 +4015,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3943 | } else { | 4015 | } else { |
3944 | parent = mem_cgroup_from_cont(cont->parent); | 4016 | parent = mem_cgroup_from_cont(cont->parent); |
3945 | mem->use_hierarchy = parent->use_hierarchy; | 4017 | mem->use_hierarchy = parent->use_hierarchy; |
4018 | mem->oom_kill_disable = parent->oom_kill_disable; | ||
3946 | } | 4019 | } |
3947 | 4020 | ||
3948 | if (parent && parent->use_hierarchy) { | 4021 | if (parent && parent->use_hierarchy) { |
@@ -4215,6 +4288,7 @@ static void mem_cgroup_clear_mc(void) | |||
4215 | if (mc.precharge) { | 4288 | if (mc.precharge) { |
4216 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); | 4289 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); |
4217 | mc.precharge = 0; | 4290 | mc.precharge = 0; |
4291 | memcg_oom_recover(mc.to); | ||
4218 | } | 4292 | } |
4219 | /* | 4293 | /* |
4220 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so | 4294 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so |
@@ -4223,6 +4297,7 @@ static void mem_cgroup_clear_mc(void) | |||
4223 | if (mc.moved_charge) { | 4297 | if (mc.moved_charge) { |
4224 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | 4298 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); |
4225 | mc.moved_charge = 0; | 4299 | mc.moved_charge = 0; |
4300 | memcg_oom_recover(mc.from); | ||
4226 | } | 4301 | } |
4227 | /* we must fixup refcnts and charges */ | 4302 | /* we must fixup refcnts and charges */ |
4228 | if (mc.moved_swap) { | 4303 | if (mc.moved_swap) { |