author     KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>  2010-03-10 18:22:39 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>      2010-03-12 18:52:38 -0500
commit     867578cbccb0893cc14fc29c670f7185809c90d6 (patch)
tree       e4d0cefac265fc64399223bc82ed714a88ebe20c
parent     0263c12c12ccc90edc9d856fa839f8936183e6d1 (diff)
memcg: fix oom kill behavior
In the current page-fault code the flow is:

	handle_mm_fault()
		-> ... -> mem_cgroup_charge() -> map page or handle error
		-> check return code

If the page fault's return code is VM_FAULT_OOM, pagefault_out_of_memory() is
called. But if the fault was caused by memcg, the OOM killer has already been
invoked by then. For that case I added a patch,
a636b327f731143ccc544b966cfd8de6cb6d72c6, which records last_oom_jiffies for
memcg's sub-hierarchy and prevents pagefault_out_of_memory() from being
invoked again in the near future. But Nishimura-san reported that the jiffies
check is not enough when the system is terribly heavy.

This patch changes memcg's oom logic as follows:

 * If memcg causes an OOM kill, continue to retry.
 * Remove the jiffies check that is used now.
 * Add a memcg-oom-lock which works like the per-zone oom lock.
 * If current is killed (as a process), bypass the charge.

Something more sophisticated can be added later, but this patch does the
fundamental things. (A minimal userspace sketch of the locking idea follows
below, after the sign-offs.)

TODO:
 - add an oom notifier
 - add a per-memcg disable-oom-kill flag and a freezer at oom
 - more chances to wake up oom waiters (when changing the memory limit, etc.)

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
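The key mechanism above is the hierarchical memcg-oom-lock. Here is a minimal,
self-contained userspace sketch of the counting idea, not the kernel
implementation: memcg_node, memcg_oom_trylock() and memcg_oom_unlock() are
hypothetical names, a single-child chain stands in for mem_cgroup_walk_tree(),
and plain ints stand in for atomic_t.

	#include <stdbool.h>
	#include <stdio.h>

	struct memcg_node {
		int oom_lock;			/* atomic_t in the kernel */
		struct memcg_node *child;	/* toy stand-in for the hierarchy walk */
	};

	static bool memcg_oom_trylock(struct memcg_node *mem)
	{
		int max_seen = 0;

		/* Bump every counter in the sub-hierarchy, remember the maximum. */
		for (struct memcg_node *n = mem; n; n = n->child) {
			++n->oom_lock;		/* atomic_inc_return() in the kernel */
			if (n->oom_lock > max_seen)
				max_seen = n->oom_lock;
		}
		/* Any counter already non-zero means another OOM kill is running. */
		return max_seen == 1;
	}

	static void memcg_oom_unlock(struct memcg_node *mem)
	{
		for (struct memcg_node *n = mem; n; n = n->child)
			if (n->oom_lock > 0)	/* atomic_add_unless(..., -1, 0) */
				--n->oom_lock;
	}

	int main(void)
	{
		struct memcg_node child = { 0, NULL };
		struct memcg_node parent = { 0, &child };

		printf("first locker wins:   %d\n", memcg_oom_trylock(&parent)); /* 1 */
		printf("second locker loses: %d\n", memcg_oom_trylock(&child));  /* 0 */
		memcg_oom_unlock(&child);
		memcg_oom_unlock(&parent);
		return 0;
	}

Counting rather than setting a boolean is what lets a losing contender (and
children created while the hierarchy was already under OOM) simply decrement
on the way out, which is also why the unlock path below uses
atomic_add_unless(..., -1, 0).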
-rw-r--r--  include/linux/memcontrol.h |   6
-rw-r--r--  mm/memcontrol.c            | 134
-rw-r--r--  mm/oom_kill.c              |   8
3 files changed, 107 insertions(+), 41 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1f9b119f4ace..44301c6affa8 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -124,7 +124,6 @@ static inline bool mem_cgroup_disabled(void)
 	return false;
 }
 
-extern bool mem_cgroup_oom_called(struct task_struct *task);
 void mem_cgroup_update_file_mapped(struct page *page, int val);
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask, int nid,
@@ -258,11 +257,6 @@ static inline bool mem_cgroup_disabled(void)
 	return true;
 }
 
-static inline bool mem_cgroup_oom_called(struct task_struct *task)
-{
-	return false;
-}
-
 static inline int
 mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f7b910fc14fb..7973b5221fb8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -203,7 +203,7 @@ struct mem_cgroup {
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
 	bool use_hierarchy;
-	unsigned long	last_oom_jiffies;
+	atomic_t	oom_lock;
 	atomic_t	refcnt;
 
 	unsigned int	swappiness;
@@ -1246,32 +1246,102 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	return total;
 }
 
-bool mem_cgroup_oom_called(struct task_struct *task)
+static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
 {
-	bool ret = false;
-	struct mem_cgroup *mem;
-	struct mm_struct *mm;
+	int *val = (int *)data;
+	int x;
+	/*
+	 * Logically, we could stop scanning as soon as we find a locked
+	 * memcg. But considering unlock ops and creation/removal of
+	 * memcgs, scanning all of them is the simpler operation.
+	 */
+	x = atomic_inc_return(&mem->oom_lock);
+	*val = max(x, *val);
+	return 0;
+}
+/*
+ * Check whether the OOM killer is already running under our hierarchy.
+ * If someone is running, return false.
+ */
+static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+{
+	int lock_count = 0;
 
-	rcu_read_lock();
-	mm = task->mm;
-	if (!mm)
-		mm = &init_mm;
-	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
-	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
-		ret = true;
-	rcu_read_unlock();
-	return ret;
+	mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
+
+	if (lock_count == 1)
+		return true;
+	return false;
 }
 
-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
 {
-	mem->last_oom_jiffies = jiffies;
+	/*
+	 * When a new child is created while the hierarchy is under oom,
+	 * mem_cgroup_oom_lock() may not be called. We have to use
+	 * atomic_add_unless() here.
+	 */
+	atomic_add_unless(&mem->oom_lock, -1, 0);
 	return 0;
 }
 
-static void record_last_oom(struct mem_cgroup *mem)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
 {
-	mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
+	mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
+}
+
+static DEFINE_MUTEX(memcg_oom_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+/*
+ * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop.
+ */
+bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
+{
+	DEFINE_WAIT(wait);
+	bool locked;
+
+	/* At first, try to OOM-lock the hierarchy under mem. */
+	mutex_lock(&memcg_oom_mutex);
+	locked = mem_cgroup_oom_lock(mem);
+	/*
+	 * Even if signal_pending(), we can't quit the charge() loop without
+	 * accounting, so UNINTERRUPTIBLE would be appropriate. But SIGKILL
+	 * under OOM is always welcome, so use TASK_KILLABLE here.
+	 */
+	if (!locked)
+		prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
+	mutex_unlock(&memcg_oom_mutex);
+
+	if (locked)
+		mem_cgroup_out_of_memory(mem, mask);
+	else {
+		schedule();
+		finish_wait(&memcg_oom_waitq, &wait);
+	}
+	mutex_lock(&memcg_oom_mutex);
+	mem_cgroup_oom_unlock(mem);
+	/*
+	 * Here we use a global waitq ... would a finer-grained waitq be better?
+	 * Assume the following hierarchy:
+	 *	A/
+	 *	  01
+	 *	  02
+	 * and assume OOM happens both in A and in 01 at the same time. They
+	 * are made mutually exclusive by the lock. (A kill in 01 helps A.)
+	 * With a per-memcg waitq we would have to wake up waiters on A and 02
+	 * in addition to waiters on 01. We use a global waitq to avoid that
+	 * mess; it will not be a big problem.
+	 * (And a task may be moved to other groups while it's waiting for OOM.)
+	 */
+	wake_up_all(&memcg_oom_waitq);
+	mutex_unlock(&memcg_oom_mutex);
+
+	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+		return false;
+	/* Give the dying process a chance to run. */
+	schedule_timeout(1);
+	return true;
 }
 
 /*
@@ -1443,11 +1513,14 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	struct res_counter *fail_res;
 	int csize = CHARGE_SIZE;
 
-	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
-		/* Don't account this! */
-		*memcg = NULL;
-		return 0;
-	}
+	/*
+	 * Unlike the global VM's OOM kill, we are not in a system-level
+	 * memory shortage. So, in addition to a MEMDIE process, let a
+	 * dying process go ahead.
+	 */
+	if (unlikely(test_thread_flag(TIF_MEMDIE)
+		     || fatal_signal_pending(current)))
+		goto bypass;
 
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
@@ -1560,11 +1633,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		}
 
 		if (!nr_retries--) {
-			if (oom) {
-				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
-				record_last_oom(mem_over_limit);
+			if (!oom)
+				goto nomem;
+			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
+				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+				continue;
 			}
-			goto nomem;
+			/* When we reach here, the current task is dying. */
+			css_put(&mem->css);
+			goto bypass;
 		}
 	}
 	if (csize > PAGE_SIZE)
@@ -1574,6 +1651,9 @@ done:
 nomem:
 	css_put(&mem->css);
 	return -ENOMEM;
+bypass:
+	*memcg = NULL;
+	return 0;
 }
 
 /*
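The serialization that mem_cgroup_handle_oom() implements above is: one task
OOM-locks the hierarchy and runs the killer, everyone else sleeps on a single
global waitq, and on wake-up everybody retries the charge. A rough userspace
analogue of that pattern, assuming a pthread mutex and condition variable in
place of memcg_oom_mutex and memcg_oom_waitq (the kernel's
prepare_to_wait()/schedule() with TASK_KILLABLE has no exact pthread
equivalent):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t oom_mutex = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t oom_waitq = PTHREAD_COND_INITIALIZER;
	static bool oom_in_progress;	/* stand-in for the oom_lock walk */

	static void *charge_path(void *arg)
	{
		pthread_mutex_lock(&oom_mutex);
		if (!oom_in_progress) {
			oom_in_progress = true;
			pthread_mutex_unlock(&oom_mutex);

			printf("thread %ld: invoking OOM killer\n", (long)arg);

			pthread_mutex_lock(&oom_mutex);
			oom_in_progress = false;
			/* wake_up_all(&memcg_oom_waitq) */
			pthread_cond_broadcast(&oom_waitq);
		} else {
			/* prepare_to_wait() + schedule(): sleep until the kill is done */
			while (oom_in_progress)
				pthread_cond_wait(&oom_waitq, &oom_mutex);
			printf("thread %ld: woken, retrying charge\n", (long)arg);
		}
		pthread_mutex_unlock(&oom_mutex);
		return NULL;
	}

	int main(void)
	{
		pthread_t t[3];

		for (long i = 0; i < 3; i++)
			pthread_create(&t[i], NULL, charge_path, (void *)i);
		for (int i = 0; i < 3; i++)
			pthread_join(t[i], NULL);
		return 0;
	}

As the in-code comment notes, a single global wakeup is coarse (waiters in
unrelated hierarchies also wake and re-test), but it avoids the bookkeeping
of per-memcg wait queues.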
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 71d10bf52dc8..9b223af6a147 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -603,13 +603,6 @@ void pagefault_out_of_memory(void)
 		/* Got some memory back in the last second. */
 		return;
 
-	/*
-	 * If this is from memcg, oom-killer is already invoked.
-	 * and not worth to go system-wide-oom.
-	 */
-	if (mem_cgroup_oom_called(current))
-		goto rest_and_return;
-
 	if (sysctl_panic_on_oom)
 		panic("out of memory from page fault. panic_on_oom is selected.\n");
 
@@ -621,7 +614,6 @@ void pagefault_out_of_memory(void)
 	 * Give "p" a good chance of killing itself before we
 	 * retry to allocate memory.
 	 */
-rest_and_return:
 	if (!test_thread_flag(TIF_MEMDIE))
 		schedule_timeout_uninterruptible(1);
 }
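Taken together, the memcontrol.c and oom_kill.c changes explain why the memcg
special case could simply be deleted from pagefault_out_of_memory(): with the
new charge path, an OOM-enabled charge never returns -ENOMEM; it either
succeeds, keeps retrying after an OOM kill, or bypasses for a dying task. A
toy model of that contract, using hypothetical helper names rather than the
kernel API:

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define NR_RETRIES 3

	static bool task_is_dying;	/* stands in for fatal_signal_pending() */

	/* hypothetical stand-ins for the real memcg internals */
	static bool try_charge_once(void) { return false; }	/* always over limit */
	static bool handle_oom(void) { return !task_is_dying; }	/* kill, then retry */

	/* returns 0 on charge or bypass; -ENOMEM only when oom == false */
	static int try_charge(bool oom, bool *bypassed)
	{
		int nr_retries = NR_RETRIES;

		*bypassed = false;
		while (!try_charge_once()) {
			if (!nr_retries--) {
				if (!oom)
					return -ENOMEM;
				if (handle_oom()) {	/* OOM killer ran; retry */
					nr_retries = NR_RETRIES;
					continue;
				}
				*bypassed = true;	/* current task is dying */
				return 0;
			}
		}
		return 0;
	}

	int main(void)
	{
		bool bypassed;

		printf("oom=false -> %d\n", try_charge(false, &bypassed)); /* -ENOMEM */
		task_is_dying = true;	/* pretend the OOM killer chose us */
		printf("oom=true  -> %d (bypassed=%d)\n",
		       try_charge(true, &bypassed), bypassed);
		return 0;
	}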