author     KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>  2010-03-10 18:22:39 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>      2010-03-12 18:52:38 -0500
commit     867578cbccb0893cc14fc29c670f7185809c90d6 (patch)
tree       e4d0cefac265fc64399223bc82ed714a88ebe20c
parent     0263c12c12ccc90edc9d856fa839f8936183e6d1 (diff)
memcg: fix oom kill behavior
In the current page-fault code the flow is:

	handle_mm_fault()
		-> ... -> mem_cgroup_charge() -> map page or handle error
		-> check return code

If the page fault's return code is VM_FAULT_OOM, pagefault_out_of_memory() is
called. But if the fault was caused by memcg, the OOM killer has already been
invoked by then. For that case I added a patch,
a636b327f731143ccc544b966cfd8de6cb6d72c6, which records last_oom_jiffies for
memcg's sub-hierarchy and prevents pagefault_out_of_memory() from being
invoked again in the near future. But Nishimura-san reported that the jiffies
check is not enough when the system is terribly heavy.

This patch changes memcg's oom logic as follows:

 * If memcg causes an OOM kill, continue to retry.
 * Remove the jiffies check that is used now.
 * Add a memcg-oom-lock which works like the per-zone oom lock.
 * If current is killed (as a process), bypass the charge.

Something more sophisticated can be added later, but this patch does the
fundamental things. (A minimal userspace sketch of the locking idea follows
below, after the sign-offs.)

TODO:
 - add an oom notifier
 - add a per-memcg disable-oom-kill flag and a freezer at oom
 - more chances to wake up oom waiters (when changing the memory limit, etc.)

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
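The key mechanism above is the hierarchical memcg-oom-lock. Here is a minimal,
self-contained userspace sketch of the counting idea, not the kernel
implementation: memcg_node, memcg_oom_trylock() and memcg_oom_unlock() are
hypothetical names, a single-child chain stands in for mem_cgroup_walk_tree(),
and plain ints stand in for atomic_t.

	#include <stdbool.h>
	#include <stdio.h>

	struct memcg_node {
		int oom_lock;			/* atomic_t in the kernel */
		struct memcg_node *child;	/* toy stand-in for the hierarchy walk */
	};

	static bool memcg_oom_trylock(struct memcg_node *mem)
	{
		int max_seen = 0;

		/* Bump every counter in the sub-hierarchy, remember the maximum. */
		for (struct memcg_node *n = mem; n; n = n->child) {
			++n->oom_lock;		/* atomic_inc_return() in the kernel */
			if (n->oom_lock > max_seen)
				max_seen = n->oom_lock;
		}
		/* Any counter already non-zero means another OOM kill is running. */
		return max_seen == 1;
	}

	static void memcg_oom_unlock(struct memcg_node *mem)
	{
		for (struct memcg_node *n = mem; n; n = n->child)
			if (n->oom_lock > 0)	/* atomic_add_unless(..., -1, 0) */
				--n->oom_lock;
	}

	int main(void)
	{
		struct memcg_node child = { 0, NULL };
		struct memcg_node parent = { 0, &child };

		printf("first locker wins:   %d\n", memcg_oom_trylock(&parent)); /* 1 */
		printf("second locker loses: %d\n", memcg_oom_trylock(&child));  /* 0 */
		memcg_oom_unlock(&child);
		memcg_oom_unlock(&parent);
		return 0;
	}

Counting rather than setting a boolean is what lets a losing contender (and
children created while the hierarchy was already under OOM) simply decrement
on the way out, which is also why the unlock path below uses
atomic_add_unless(..., -1, 0).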
-rw-r--r--  include/linux/memcontrol.h |   6
-rw-r--r--  mm/memcontrol.c            | 134
-rw-r--r--  mm/oom_kill.c              |   8
3 files changed, 107 insertions(+), 41 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1f9b119f4ace..44301c6affa8 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -124,7 +124,6 @@ static inline bool mem_cgroup_disabled(void)
 	return false;
 }
 
-extern bool mem_cgroup_oom_called(struct task_struct *task);
 void mem_cgroup_update_file_mapped(struct page *page, int val);
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask, int nid,
@@ -258,11 +257,6 @@ static inline bool mem_cgroup_disabled(void)
 	return true;
 }
 
-static inline bool mem_cgroup_oom_called(struct task_struct *task)
-{
-	return false;
-}
-
 static inline int
 mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f7b910fc14fb..7973b5221fb8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -203,7 +203,7 @@ struct mem_cgroup {
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
 	bool use_hierarchy;
-	unsigned long	last_oom_jiffies;
+	atomic_t	oom_lock;
 	atomic_t	refcnt;
 
 	unsigned int	swappiness;
@@ -1246,32 +1246,102 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	return total;
 }
 
-bool mem_cgroup_oom_called(struct task_struct *task)
+static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
 {
-	bool ret = false;
-	struct mem_cgroup *mem;
-	struct mm_struct *mm;
+	int *val = (int *)data;
+	int x;
+	/*
+	 * Logically, we could stop scanning as soon as we find a locked
+	 * memcg. But considering unlock ops and creation/removal of
+	 * memcgs, scanning all of them is the simpler operation.
+	 */
+	x = atomic_inc_return(&mem->oom_lock);
+	*val = max(x, *val);
+	return 0;
+}
+/*
+ * Check whether the OOM killer is already running under our hierarchy.
+ * If someone is running, return false.
+ */
+static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+{
+	int lock_count = 0;
 
-	rcu_read_lock();
-	mm = task->mm;
-	if (!mm)
-		mm = &init_mm;
-	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
-	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
-		ret = true;
-	rcu_read_unlock();
-	return ret;
+	mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
+
+	if (lock_count == 1)
+		return true;
+	return false;
 }
 
-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
 {
-	mem->last_oom_jiffies = jiffies;
+	/*
+	 * When a new child is created while the hierarchy is under oom,
+	 * mem_cgroup_oom_lock() may not be called. We have to use
+	 * atomic_add_unless() here.
+	 */
+	atomic_add_unless(&mem->oom_lock, -1, 0);
 	return 0;
 }
 
-static void record_last_oom(struct mem_cgroup *mem)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
 {
-	mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
+	mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
+}
+
+static DEFINE_MUTEX(memcg_oom_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+/*
+ * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop.
+ */
+bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
+{
+	DEFINE_WAIT(wait);
+	bool locked;
+
+	/* At first, try to OOM-lock the hierarchy under mem. */
+	mutex_lock(&memcg_oom_mutex);
+	locked = mem_cgroup_oom_lock(mem);
+	/*
+	 * Even if signal_pending(), we can't quit the charge() loop without
+	 * accounting, so UNINTERRUPTIBLE would be appropriate. But SIGKILL
+	 * under OOM is always welcome, so use TASK_KILLABLE here.
+	 */
+	if (!locked)
+		prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
+	mutex_unlock(&memcg_oom_mutex);
+
+	if (locked)
+		mem_cgroup_out_of_memory(mem, mask);
+	else {
+		schedule();
+		finish_wait(&memcg_oom_waitq, &wait);
+	}
+	mutex_lock(&memcg_oom_mutex);
+	mem_cgroup_oom_unlock(mem);
+	/*
+	 * Here we use a global waitq ... would a finer-grained waitq be better?
+	 * Assume the following hierarchy:
+	 *	A/
+	 *	  01
+	 *	  02
+	 * and assume OOM happens both in A and in 01 at the same time. They
+	 * are made mutually exclusive by the lock. (A kill in 01 helps A.)
+	 * With a per-memcg waitq we would have to wake up waiters on A and 02
+	 * in addition to waiters on 01. We use a global waitq to avoid that
+	 * mess; it will not be a big problem.
+	 * (And a task may be moved to other groups while it's waiting for OOM.)
+	 */
+	wake_up_all(&memcg_oom_waitq);
+	mutex_unlock(&memcg_oom_mutex);
+
+	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+		return false;
+	/* Give the dying process a chance to run. */
+	schedule_timeout(1);
+	return true;
 }
 
 /*
@@ -1443,11 +1513,14 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	struct res_counter *fail_res;
 	int csize = CHARGE_SIZE;
 
-	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
-		/* Don't account this! */
-		*memcg = NULL;
-		return 0;
-	}
+	/*
+	 * Unlike the global VM's OOM kill, we are not in a system-level
+	 * memory shortage. So, in addition to a MEMDIE process, let a
+	 * dying process go ahead.
+	 */
+	if (unlikely(test_thread_flag(TIF_MEMDIE)
+		     || fatal_signal_pending(current)))
+		goto bypass;
 
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
@@ -1560,11 +1633,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		}
 
 		if (!nr_retries--) {
-			if (oom) {
-				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
-				record_last_oom(mem_over_limit);
+			if (!oom)
+				goto nomem;
+			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
+				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+				continue;
 			}
-			goto nomem;
+			/* When we reach here, the current task is dying. */
+			css_put(&mem->css);
+			goto bypass;
 		}
 	}
 	if (csize > PAGE_SIZE)
@@ -1574,6 +1651,9 @@ done:
 nomem:
 	css_put(&mem->css);
 	return -ENOMEM;
+bypass:
+	*memcg = NULL;
+	return 0;
 }
 
 /*
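The serialization that mem_cgroup_handle_oom() implements above is: one task
OOM-locks the hierarchy and runs the killer, everyone else sleeps on a single
global waitq, and on wake-up everybody retries the charge. A rough userspace
analogue of that pattern, assuming a pthread mutex and condition variable in
place of memcg_oom_mutex and memcg_oom_waitq (the kernel's
prepare_to_wait()/schedule() with TASK_KILLABLE has no exact pthread
equivalent):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t oom_mutex = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t oom_waitq = PTHREAD_COND_INITIALIZER;
	static bool oom_in_progress;	/* stand-in for the oom_lock walk */

	static void *charge_path(void *arg)
	{
		pthread_mutex_lock(&oom_mutex);
		if (!oom_in_progress) {
			oom_in_progress = true;
			pthread_mutex_unlock(&oom_mutex);

			printf("thread %ld: invoking OOM killer\n", (long)arg);

			pthread_mutex_lock(&oom_mutex);
			oom_in_progress = false;
			/* wake_up_all(&memcg_oom_waitq) */
			pthread_cond_broadcast(&oom_waitq);
		} else {
			/* prepare_to_wait() + schedule(): sleep until the kill is done */
			while (oom_in_progress)
				pthread_cond_wait(&oom_waitq, &oom_mutex);
			printf("thread %ld: woken, retrying charge\n", (long)arg);
		}
		pthread_mutex_unlock(&oom_mutex);
		return NULL;
	}

	int main(void)
	{
		pthread_t t[3];

		for (long i = 0; i < 3; i++)
			pthread_create(&t[i], NULL, charge_path, (void *)i);
		for (int i = 0; i < 3; i++)
			pthread_join(t[i], NULL);
		return 0;
	}

As the in-code comment notes, a single global wakeup is coarse (waiters in
unrelated hierarchies also wake and re-test), but it avoids the bookkeeping
of per-memcg wait queues.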
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 71d10bf52dc8..9b223af6a147 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -603,13 +603,6 @@ void pagefault_out_of_memory(void)
 		/* Got some memory back in the last second. */
 		return;
 
-	/*
-	 * If this is from memcg, oom-killer is already invoked.
-	 * and not worth to go system-wide-oom.
-	 */
-	if (mem_cgroup_oom_called(current))
-		goto rest_and_return;
-
 	if (sysctl_panic_on_oom)
 		panic("out of memory from page fault. panic_on_oom is selected.\n");
 
@@ -621,7 +614,6 @@ void pagefault_out_of_memory(void)
 	 * Give "p" a good chance of killing itself before we
 	 * retry to allocate memory.
 	 */
-rest_and_return:
 	if (!test_thread_flag(TIF_MEMDIE))
 		schedule_timeout_uninterruptible(1);
 }
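Taken together, the memcontrol.c and oom_kill.c changes explain why the memcg
special case could simply be deleted from pagefault_out_of_memory(): with the
new charge path, an OOM-enabled charge never returns -ENOMEM; it either
succeeds, keeps retrying after an OOM kill, or bypasses for a dying task. A
toy model of that contract, using hypothetical helper names rather than the
kernel API:

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define NR_RETRIES 3

	static bool task_is_dying;	/* stands in for fatal_signal_pending() */

	/* hypothetical stand-ins for the real memcg internals */
	static bool try_charge_once(void) { return false; }	/* always over limit */
	static bool handle_oom(void) { return !task_is_dying; }	/* kill, then retry */

	/* returns 0 on charge or bypass; -ENOMEM only when oom == false */
	static int try_charge(bool oom, bool *bypassed)
	{
		int nr_retries = NR_RETRIES;

		*bypassed = false;
		while (!try_charge_once()) {
			if (!nr_retries--) {
				if (!oom)
					return -ENOMEM;
				if (handle_oom()) {	/* OOM killer ran; retry */
					nr_retries = NR_RETRIES;
					continue;
				}
				*bypassed = true;	/* current task is dying */
				return 0;
			}
		}
		return 0;
	}

	int main(void)
	{
		bool bypassed;

		printf("oom=false -> %d\n", try_charge(false, &bypassed)); /* -ENOMEM */
		task_is_dying = true;	/* pretend the OOM killer chose us */
		printf("oom=true  -> %d (bypassed=%d)\n",
		       try_charge(true, &bypassed), bypassed);
		return 0;
	}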