 Documentation/cgroups/memory.txt |  23
 mm/memcontrol.c                  | 113
 2 files changed, 117 insertions, 19 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index eac22d3b2f7b..44e7ded33448 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -493,6 +493,8 @@ It's applicable for root and non-root cgroup.
 
 10. OOM Control
 
+memory.oom_control file is for OOM notification and other controls.
+
 Memory controler implements oom notifier using cgroup notification
 API (See cgroups.txt). It allows to register multiple oom notification
 delivery and gets notification when oom happens.
@@ -505,6 +507,27 @@ To register a notifier, application need:
 Application will be notifier through eventfd when oom happens.
 OOM notification doesn't work for root cgroup.
 
+You can disable the OOM killer by writing "1" to the memory.oom_control
+file, as in:
+	# echo 1 > memory.oom_control
+
+This operation is only allowed for the top cgroup of a sub-hierarchy.
+If the OOM killer is disabled, tasks under the cgroup will hang/sleep
+in the memcg's OOM waitqueue when they request accountable memory.
+
+To make them run again, you have to relax the memcg's OOM situation by
+	* enlarging the limit or reducing usage.
+To reduce usage,
+	* kill some tasks.
+	* move some tasks to another group with account migration.
+	* remove some files (on tmpfs?)
+
+Then, the stopped tasks will run again.
+
+Reading the file shows the current OOM status:
+	oom_kill_disable 0 or 1 (if 1, the OOM killer is disabled)
+	under_oom	 0 or 1 (if 1, the memcg is under OOM and tasks may
+			  be stopped)
 
 11. TODO
 
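The interface documented above is easiest to see end to end from userspace. Below is a minimal C sketch, not part of the patch, assuming the memory controller is mounted at /sys/fs/cgroup/memory and the target group is named "mygroup" (both assumptions): it disables the OOM killer for the group and then blocks on an eventfd for the OOM notification described in section 10.

/*
 * Minimal userspace sketch of the memory.oom_control interface described
 * above.  The cgroup mount point and group name are assumptions, not taken
 * from the patch; adjust them to the local setup.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/eventfd.h>

#define CGDIR "/sys/fs/cgroup/memory/mygroup"	/* assumed path */

int main(void)
{
	char buf[64];
	uint64_t cnt;
	int oom_fd, ctl_fd, efd;

	oom_fd = open(CGDIR "/memory.oom_control", O_RDWR);
	ctl_fd = open(CGDIR "/cgroup.event_control", O_WRONLY);
	efd = eventfd(0, 0);
	if (oom_fd < 0 || ctl_fd < 0 || efd < 0) {
		perror("open/eventfd");
		return 1;
	}

	/* Disable the OOM killer: equivalent of "echo 1 > memory.oom_control". */
	if (write(oom_fd, "1", 1) != 1)
		perror("disable oom-killer");

	/* Register "<eventfd fd> <memory.oom_control fd>" for OOM notification. */
	snprintf(buf, sizeof(buf), "%d %d", efd, oom_fd);
	if (write(ctl_fd, buf, strlen(buf)) < 0) {
		perror("register oom notifier");
		return 1;
	}

	/*
	 * Blocks until the group hits its limit; with oom_kill_disable set,
	 * the charging tasks are now sleeping in the memcg's OOM waitqueue.
	 */
	if (read(efd, &cnt, sizeof(cnt)) == sizeof(cnt))
		printf("under_oom: enlarge the limit or reduce usage\n");
	return 0;
}

Once the read() returns, a monitor is expected to relax the OOM situation, for example by enlarging memory.limit_in_bytes or by killing/migrating tasks; the memcg_oom_recover() helper added in the mm/memcontrol.c changes below then wakes the tasks sleeping on the OOM waitqueue.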
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index da2ed3913316..53eb30ebdb49 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -214,6 +214,8 @@ struct mem_cgroup {
 	atomic_t	refcnt;
 
 	unsigned int	swappiness;
+	/* OOM-Killer disable */
+	int		oom_kill_disable;
 
 	/* set when res.limit == memsw.limit */
 	bool		memsw_is_minimum;
@@ -235,7 +237,6 @@ struct mem_cgroup {
 	 * mem_cgroup ? And what type of charges should we move ?
 	 */
 	unsigned long	move_charge_at_immigrate;
-
 	/*
 	 * percpu counter.
 	 */
@@ -1342,20 +1343,26 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
 }
 
+static void memcg_oom_recover(struct mem_cgroup *mem)
+{
+	if (mem->oom_kill_disable && atomic_read(&mem->oom_lock))
+		memcg_wakeup_oom(mem);
+}
+
 /*
  * try to call OOM killer. returns false if we should exit memory-reclaim loop.
  */
 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 {
 	struct oom_wait_info owait;
-	bool locked;
+	bool locked, need_to_kill;
 
 	owait.mem = mem;
 	owait.wait.flags = 0;
 	owait.wait.func = memcg_oom_wake_function;
 	owait.wait.private = current;
 	INIT_LIST_HEAD(&owait.wait.task_list);
-
+	need_to_kill = true;
 	/* At first, try to OOM lock hierarchy under mem.*/
 	mutex_lock(&memcg_oom_mutex);
 	locked = mem_cgroup_oom_lock(mem);
@@ -1364,15 +1371,17 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
 	 * under OOM is always welcomed, use TASK_KILLABLE here.
 	 */
-	if (!locked)
-		prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	else
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	if (!locked || mem->oom_kill_disable)
+		need_to_kill = false;
+	if (locked)
 		mem_cgroup_oom_notify(mem);
 	mutex_unlock(&memcg_oom_mutex);
 
-	if (locked)
+	if (need_to_kill) {
+		finish_wait(&memcg_oom_waitq, &owait.wait);
 		mem_cgroup_out_of_memory(mem, mask);
-	else {
+	} else {
 		schedule();
 		finish_wait(&memcg_oom_waitq, &owait.wait);
 	}
@@ -2162,15 +2171,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
 	/* If swapout, usage of swap doesn't decrease */
 	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		uncharge_memsw = false;
-	/*
-	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
-	 * In those cases, all pages freed continously can be expected to be in
-	 * the same cgroup and we have chance to coalesce uncharges.
-	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
-	 * because we want to do uncharge as soon as possible.
-	 */
-	if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
-		goto direct_uncharge;
 
 	batch = &current->memcg_batch;
 	/*
@@ -2181,6 +2181,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
 	if (!batch->memcg)
 		batch->memcg = mem;
 	/*
+	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
+	 * In those cases, all pages freed continously can be expected to be in
+	 * the same cgroup and we have chance to coalesce uncharges.
+	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
+	 * because we want to do uncharge as soon as possible.
+	 */
+
+	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
+		goto direct_uncharge;
+
+	/*
 	 * In typical case, batch->memcg == mem. This means we can
 	 * merge a series of uncharges to an uncharge of res_counter.
 	 * If not, we uncharge res_counter ony by one.
@@ -2196,6 +2207,8 @@ direct_uncharge:
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	if (uncharge_memsw)
 		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	if (unlikely(batch->memcg != mem))
+		memcg_oom_recover(mem);
 	return;
 }
2201 2214
@@ -2332,6 +2345,7 @@ void mem_cgroup_uncharge_end(void)
 		res_counter_uncharge(&batch->memcg->res, batch->bytes);
 	if (batch->memsw_bytes)
 		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+	memcg_oom_recover(batch->memcg);
 	/* forget this pointer (for sanity check) */
 	batch->memcg = NULL;
 }
@@ -2568,10 +2582,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 				unsigned long long val)
 {
 	int retry_count;
-	u64 memswlimit;
+	u64 memswlimit, memlimit;
 	int ret = 0;
 	int children = mem_cgroup_count_children(memcg);
 	u64 curusage, oldusage;
+	int enlarge;
 
 	/*
 	 * For keeping hierarchical_reclaim simple, how long we should retry
@@ -2582,6 +2597,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 
 	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 
+	enlarge = 0;
 	while (retry_count) {
 		if (signal_pending(current)) {
 			ret = -EINTR;
@@ -2599,6 +2615,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 			mutex_unlock(&set_limit_mutex);
 			break;
 		}
+
+		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+		if (memlimit < val)
+			enlarge = 1;
+
 		ret = res_counter_set_limit(&memcg->res, val);
 		if (!ret) {
 			if (memswlimit == val)
@@ -2620,6 +2641,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		else
 			oldusage = curusage;
 	}
+	if (!ret && enlarge)
+		memcg_oom_recover(memcg);
 
 	return ret;
 }
@@ -2628,9 +2651,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 					unsigned long long val)
 {
 	int retry_count;
-	u64 memlimit, oldusage, curusage;
+	u64 memlimit, memswlimit, oldusage, curusage;
 	int children = mem_cgroup_count_children(memcg);
 	int ret = -EBUSY;
+	int enlarge = 0;
 
 	/* see mem_cgroup_resize_res_limit */
 	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
@@ -2652,6 +2676,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 			mutex_unlock(&set_limit_mutex);
 			break;
 		}
+		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+		if (memswlimit < val)
+			enlarge = 1;
 		ret = res_counter_set_limit(&memcg->memsw, val);
 		if (!ret) {
 			if (memlimit == val)
@@ -2674,6 +2701,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 		else
 			oldusage = curusage;
 	}
+	if (!ret && enlarge)
+		memcg_oom_recover(memcg);
 	return ret;
 }
 
@@ -2865,6 +2894,7 @@ move_account:
 			if (ret)
 				break;
 		}
+		memcg_oom_recover(mem);
 		/* it seems parent cgroup doesn't have enough mem */
 		if (ret == -ENOMEM)
 			goto try_to_free;
@@ -3645,6 +3675,46 @@ static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 	return 0;
 }
 
+static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
+	struct cftype *cft, struct cgroup_map_cb *cb)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+
+	cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
+
+	if (atomic_read(&mem->oom_lock))
+		cb->fill(cb, "under_oom", 1);
+	else
+		cb->fill(cb, "under_oom", 0);
+	return 0;
+}
+
+/*
+ */
+static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
+	struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+	struct mem_cgroup *parent;
+
+	/* cannot set to root cgroup and only 0 and 1 are allowed */
+	if (!cgrp->parent || !((val == 0) || (val == 1)))
+		return -EINVAL;
+
+	parent = mem_cgroup_from_cont(cgrp->parent);
+
+	cgroup_lock();
+	/* oom-kill-disable is a flag for subhierarchy. */
+	if ((parent->use_hierarchy) ||
+	    (mem->use_hierarchy && !list_empty(&cgrp->children))) {
+		cgroup_unlock();
+		return -EINVAL;
+	}
+	mem->oom_kill_disable = val;
+	cgroup_unlock();
+	return 0;
+}
+
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -3702,6 +3772,8 @@ static struct cftype mem_cgroup_files[] = {
 	},
 	{
 		.name = "oom_control",
+		.read_map = mem_cgroup_oom_control_read,
+		.write_u64 = mem_cgroup_oom_control_write,
 		.register_event = mem_cgroup_oom_register_event,
 		.unregister_event = mem_cgroup_oom_unregister_event,
 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
@@ -3943,6 +4015,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
+		mem->oom_kill_disable = parent->oom_kill_disable;
 	}
 
 	if (parent && parent->use_hierarchy) {
@@ -4215,6 +4288,7 @@ static void mem_cgroup_clear_mc(void)
 	if (mc.precharge) {
 		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
 		mc.precharge = 0;
+		memcg_oom_recover(mc.to);
 	}
 	/*
 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4223,6 +4297,7 @@ static void mem_cgroup_clear_mc(void)
 	if (mc.moved_charge) {
 		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
 		mc.moved_charge = 0;
+		memcg_oom_recover(mc.from);
 	}
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {
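The recovery side can be driven from userspace as well. The sketch below is a companion to the notification example after the documentation hunk, again assuming the /sys/fs/cgroup/memory/mygroup path and a simple 50% growth policy (both assumptions, not part of the patch): it reacts to an OOM notification by enlarging memory.limit_in_bytes, which goes through mem_cgroup_resize_limit() above; that path now records 'enlarge' and calls memcg_oom_recover(), so the tasks parked in the memcg's OOM waitqueue resume.

/*
 * Companion sketch: relax the OOM situation by enlarging the hard limit.
 * The path and the 50% growth policy are illustrative assumptions only.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#define LIMIT_FILE "/sys/fs/cgroup/memory/mygroup/memory.limit_in_bytes"

int main(void)
{
	char buf[32];
	long long limit;
	ssize_t n;
	int fd;

	/* Read the current hard limit. */
	fd = open(LIMIT_FILE, O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n <= 0)
		return 1;
	buf[n] = '\0';
	limit = atoll(buf);

	/*
	 * Grow the limit by 50%.  In the kernel, mem_cgroup_resize_limit()
	 * sees the larger value, sets 'enlarge' and calls memcg_oom_recover(),
	 * waking the tasks that were sleeping in the memcg's OOM waitqueue.
	 */
	fd = open(LIMIT_FILE, O_WRONLY);
	if (fd < 0)
		return 1;
	snprintf(buf, sizeof(buf), "%lld", limit + limit / 2);
	if (write(fd, buf, strlen(buf)) < 0)
		perror("enlarge limit");
	close(fd);
	return 0;
}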