author     Johannes Weiner <hannes@cmpxchg.org>            2013-10-16 16:46:59 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-10-17 00:35:53 -0400
commit     4942642080ea82d99ab5b653abb9a12b7ba31f4a (patch)
tree       7ec12b61f0bfdd1f1466e5233b67432828b25c33 /mm/memcontrol.c
parent     c88b05b2cd07221cdefd56f7f7422c1459eb60c9 (diff)
mm: memcg: handle non-error OOM situations more gracefully
Commit 3812c8c8f395 ("mm: memcg: do not trap chargers with full callstack on
OOM") assumed that only a few places that can trigger a memcg OOM situation do
not return VM_FAULT_OOM, like optional page cache readahead.  But there are
many more and it's impractical to annotate them all.

First of all, we don't want to invoke the OOM killer when the failed
allocation is gracefully handled, so defer the actual kill to the end of the
fault handling as well.  This simplifies the code quite a bit for added bonus.

Second, since a failed allocation might not be the abrupt end of the fault,
the memcg OOM handler needs to be re-entrant until the fault finishes for
subsequent allocation attempts.  If an allocation is attempted after the task
already OOMed, allow it to bypass the limit so that it can quickly finish the
fault and invoke the OOM killer.

Reported-by: azurIt <azurit@pobox.sk>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
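
For orientation, the mm/memcontrol.c hunks below only record the OOM context
in the task; the kill-or-wait decision is deferred until the fault has
unwound.  The following is a rough sketch of how that stored state is meant to
be consumed at the end of the fault, modeled on the companion mm/memory.c and
mm/oom_kill.c changes in this patch series.  fault_epilogue_sketch() is a
hypothetical name used only for illustration; task_in_memcg_oom(),
mem_cgroup_oom_synchronize() and pagefault_out_of_memory() are the real
interfaces involved.

/*
 * Illustrative sketch only: the real hooks live in mm/memory.c and
 * mm/oom_kill.c; fault_epilogue_sketch() is a made-up wrapper name.
 */
static int fault_epilogue_sketch(int ret)
{
        /*
         * The charge attempt may have recorded a memcg OOM on the task,
         * but if the fault completed without VM_FAULT_OOM the failure
         * was handled gracefully; only the recorded state is cleaned up.
         */
        if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
                mem_cgroup_oom_synchronize(false);
        return ret;
}

void pagefault_out_of_memory(void)
{
        /*
         * The fault did return VM_FAULT_OOM.  With the stack unwound
         * and all locks released, kill or wait on the pending memcg
         * OOM; fall back to the global OOM killer only if no memcg
         * OOM was recorded.
         */
        if (mem_cgroup_oom_synchronize(true))
                return;
        /* ... global OOM handling ... */
}
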
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  139
1 file changed, 51 insertions(+), 88 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5335b2b6be77..65fc6a449841 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2161,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
         memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-        bool locked;
-        int wakeups;
-
         if (!current->memcg_oom.may_oom)
                 return;
-
-        current->memcg_oom.in_memcg_oom = 1;
-
         /*
-         * As with any blocking lock, a contender needs to start
-         * listening for wakeups before attempting the trylock,
-         * otherwise it can miss the wakeup from the unlock and sleep
-         * indefinitely. This is just open-coded because our locking
-         * is so particular to memcg hierarchies.
+         * We are in the middle of the charge context here, so we
+         * don't want to block when potentially sitting on a callstack
+         * that holds all kinds of filesystem and mm locks.
+         *
+         * Also, the caller may handle a failed allocation gracefully
+         * (like optional page cache readahead) and so an OOM killer
+         * invocation might not even be necessary.
+         *
+         * That's why we don't do anything here except remember the
+         * OOM context and then deal with it at the end of the page
+         * fault when the stack is unwound, the locks are released,
+         * and when we know whether the fault was overall successful.
         */
-        wakeups = atomic_read(&memcg->oom_wakeups);
-        mem_cgroup_mark_under_oom(memcg);
-
-        locked = mem_cgroup_oom_trylock(memcg);
-
-        if (locked)
-                mem_cgroup_oom_notify(memcg);
-
-        if (locked && !memcg->oom_kill_disable) {
-                mem_cgroup_unmark_under_oom(memcg);
-                mem_cgroup_out_of_memory(memcg, mask, order);
-                mem_cgroup_oom_unlock(memcg);
-                /*
-                 * There is no guarantee that an OOM-lock contender
-                 * sees the wakeups triggered by the OOM kill
-                 * uncharges. Wake any sleepers explicitely.
-                 */
-                memcg_oom_recover(memcg);
-        } else {
-                /*
-                 * A system call can just return -ENOMEM, but if this
-                 * is a page fault and somebody else is handling the
-                 * OOM already, we need to sleep on the OOM waitqueue
-                 * for this memcg until the situation is resolved.
-                 * Which can take some time because it might be
-                 * handled by a userspace task.
-                 *
-                 * However, this is the charge context, which means
-                 * that we may sit on a large call stack and hold
-                 * various filesystem locks, the mmap_sem etc. and we
-                 * don't want the OOM handler to deadlock on them
-                 * while we sit here and wait. Store the current OOM
-                 * context in the task_struct, then return -ENOMEM.
-                 * At the end of the page fault handler, with the
-                 * stack unwound, pagefault_out_of_memory() will check
-                 * back with us by calling
-                 * mem_cgroup_oom_synchronize(), possibly putting the
-                 * task to sleep.
-                 */
-                current->memcg_oom.oom_locked = locked;
-                current->memcg_oom.wakeups = wakeups;
-                css_get(&memcg->css);
-                current->memcg_oom.wait_on_memcg = memcg;
-        }
+        css_get(&memcg->css);
+        current->memcg_oom.memcg = memcg;
+        current->memcg_oom.gfp_mask = mask;
+        current->memcg_oom.order = order;
 }
 
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
  *
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
  *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
  * sleep on a waitqueue until the userspace task resolves the
  * situation. Sleeping directly in the charge context with all kinds
  * of locks held is not a good idea, instead we remember an OOM state
  * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
  *
  * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
  */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+        struct mem_cgroup *memcg = current->memcg_oom.memcg;
         struct oom_wait_info owait;
-        struct mem_cgroup *memcg;
+        bool locked;
 
         /* OOM is global, do not handle */
-        if (!current->memcg_oom.in_memcg_oom)
-                return false;
-
-        /*
-         * We invoked the OOM killer but there is a chance that a kill
-         * did not free up any charges. Everybody else might already
-         * be sleeping, so restart the fault and keep the rampage
-         * going until some charges are released.
-         */
-        memcg = current->memcg_oom.wait_on_memcg;
         if (!memcg)
-                goto out;
+                return false;
 
-        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-                goto out_memcg;
+        if (!handle)
+                goto cleanup;
 
         owait.memcg = memcg;
         owait.wait.flags = 0;
@@ -2273,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void)
         INIT_LIST_HEAD(&owait.wait.task_list);
 
         prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-        /* Only sleep if we didn't miss any wakeups since OOM */
-        if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+        mem_cgroup_mark_under_oom(memcg);
+
+        locked = mem_cgroup_oom_trylock(memcg);
+
+        if (locked)
+                mem_cgroup_oom_notify(memcg);
+
+        if (locked && !memcg->oom_kill_disable) {
+                mem_cgroup_unmark_under_oom(memcg);
+                finish_wait(&memcg_oom_waitq, &owait.wait);
+                mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+                                         current->memcg_oom.order);
+        } else {
                 schedule();
-        finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
-        mem_cgroup_unmark_under_oom(memcg);
-        if (current->memcg_oom.oom_locked) {
+                mem_cgroup_unmark_under_oom(memcg);
+                finish_wait(&memcg_oom_waitq, &owait.wait);
+        }
+
+        if (locked) {
                 mem_cgroup_oom_unlock(memcg);
                 /*
                  * There is no guarantee that an OOM-lock contender
@@ -2288,10 +2249,9 @@ out_memcg:
                  */
                 memcg_oom_recover(memcg);
         }
+cleanup:
+        current->memcg_oom.memcg = NULL;
         css_put(&memcg->css);
-        current->memcg_oom.wait_on_memcg = NULL;
-out:
-        current->memcg_oom.in_memcg_oom = 0;
         return true;
 }
 
@@ -2705,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                     || fatal_signal_pending(current)))
                 goto bypass;
 
+        if (unlikely(task_in_memcg_oom(current)))
+                goto bypass;
+
         /*
          * We always charge the cgroup the mm_struct belongs to.
          * The mm_struct's mem_cgroup changes on task migration if the