Diffstat (limited to 'mm/memcontrol.c')
 mm/memcontrol.c | 143 ++++++++++++++++++-----------------------------
 1 file changed, 55 insertions(+), 88 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1c52ddbc839b..34d3ca9572d6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -866,6 +866,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
         unsigned long val = 0;
         int cpu;
 
+        get_online_cpus();
         for_each_online_cpu(cpu)
                 val += per_cpu(memcg->stat->events[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
@@ -873,6 +874,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
         val += memcg->nocpu_base.events[idx];
         spin_unlock(&memcg->pcp_counter_lock);
 #endif
+        put_online_cpus();
         return val;
 }
 
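Taken together, these two hunks bracket the whole summation with get_online_cpus()/put_online_cpus(), so a CPU can no longer go offline between the for_each_online_cpu() walk and the CONFIG_HOTPLUG_CPU fold-in of counters left behind by departed CPUs, a window in which events could otherwise be missed. As a rough sketch, the function should now read as follows; note the spin_lock() line and the exact prototype are inferred from the surrounding context, since the hunks do not show them:

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
                                            enum mem_cgroup_events_index idx)
{
        unsigned long val = 0;
        int cpu;

        get_online_cpus();
        for_each_online_cpu(cpu)
                val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
        /* fold in events of CPUs that have since gone offline */
        spin_lock(&memcg->pcp_counter_lock);
        val += memcg->nocpu_base.events[idx];
        spin_unlock(&memcg->pcp_counter_lock);
#endif
        put_online_cpus();
        return val;
}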
@@ -2159,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
         memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-        bool locked;
-        int wakeups;
-
         if (!current->memcg_oom.may_oom)
                 return;
-
-        current->memcg_oom.in_memcg_oom = 1;
-
         /*
-         * As with any blocking lock, a contender needs to start
-         * listening for wakeups before attempting the trylock,
-         * otherwise it can miss the wakeup from the unlock and sleep
-         * indefinitely. This is just open-coded because our locking
-         * is so particular to memcg hierarchies.
+         * We are in the middle of the charge context here, so we
+         * don't want to block when potentially sitting on a callstack
+         * that holds all kinds of filesystem and mm locks.
+         *
+         * Also, the caller may handle a failed allocation gracefully
+         * (like optional page cache readahead) and so an OOM killer
+         * invocation might not even be necessary.
+         *
+         * That's why we don't do anything here except remember the
+         * OOM context and then deal with it at the end of the page
+         * fault when the stack is unwound, the locks are released,
+         * and when we know whether the fault was overall successful.
          */
-        wakeups = atomic_read(&memcg->oom_wakeups);
-        mem_cgroup_mark_under_oom(memcg);
-
-        locked = mem_cgroup_oom_trylock(memcg);
-
-        if (locked)
-                mem_cgroup_oom_notify(memcg);
-
-        if (locked && !memcg->oom_kill_disable) {
-                mem_cgroup_unmark_under_oom(memcg);
-                mem_cgroup_out_of_memory(memcg, mask, order);
-                mem_cgroup_oom_unlock(memcg);
-                /*
-                 * There is no guarantee that an OOM-lock contender
-                 * sees the wakeups triggered by the OOM kill
-                 * uncharges. Wake any sleepers explicitely.
-                 */
-                memcg_oom_recover(memcg);
-        } else {
-                /*
-                 * A system call can just return -ENOMEM, but if this
-                 * is a page fault and somebody else is handling the
-                 * OOM already, we need to sleep on the OOM waitqueue
-                 * for this memcg until the situation is resolved.
-                 * Which can take some time because it might be
-                 * handled by a userspace task.
-                 *
-                 * However, this is the charge context, which means
-                 * that we may sit on a large call stack and hold
-                 * various filesystem locks, the mmap_sem etc. and we
-                 * don't want the OOM handler to deadlock on them
-                 * while we sit here and wait. Store the current OOM
-                 * context in the task_struct, then return -ENOMEM.
-                 * At the end of the page fault handler, with the
-                 * stack unwound, pagefault_out_of_memory() will check
-                 * back with us by calling
-                 * mem_cgroup_oom_synchronize(), possibly putting the
-                 * task to sleep.
-                 */
-                current->memcg_oom.oom_locked = locked;
-                current->memcg_oom.wakeups = wakeups;
-                css_get(&memcg->css);
-                current->memcg_oom.wait_on_memcg = memcg;
-        }
+        css_get(&memcg->css);
+        current->memcg_oom.memcg = memcg;
+        current->memcg_oom.gfp_mask = mask;
+        current->memcg_oom.order = order;
 }
 
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
  *
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
  *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
  * sleep on a waitqueue until the userspace task resolves the
  * situation. Sleeping directly in the charge context with all kinds
  * of locks held is not a good idea, instead we remember an OOM state
  * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
  *
  * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
  */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+        struct mem_cgroup *memcg = current->memcg_oom.memcg;
         struct oom_wait_info owait;
-        struct mem_cgroup *memcg;
+        bool locked;
 
         /* OOM is global, do not handle */
-        if (!current->memcg_oom.in_memcg_oom)
-                return false;
-
-        /*
-         * We invoked the OOM killer but there is a chance that a kill
-         * did not free up any charges. Everybody else might already
-         * be sleeping, so restart the fault and keep the rampage
-         * going until some charges are released.
-         */
-        memcg = current->memcg_oom.wait_on_memcg;
         if (!memcg)
-                goto out;
+                return false;
 
-        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-                goto out_memcg;
+        if (!handle)
+                goto cleanup;
 
         owait.memcg = memcg;
         owait.wait.flags = 0;
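The rewritten mem_cgroup_oom() above no longer kills or sleeps; it only records the OOM context in the task. The task_struct side is not part of this file, but from the fields referenced in this diff (may_oom, memcg, gfp_mask, order) the per-task state presumably looks something like this sketch; the field set is taken from the accesses above, the comments are guesses:

/* sketch of the implied per-task state, normally embedded in task_struct */
struct memcg_oom_info {
        struct mem_cgroup *memcg;       /* memcg that hit its limit, NULL if none */
        gfp_t gfp_mask;                 /* gfp mask of the failed charge */
        int order;                      /* order of the failed charge */
        unsigned int may_oom:1;         /* is memcg OOM handling enabled? */
} memcg_oom;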
@@ -2271,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void)
         INIT_LIST_HEAD(&owait.wait.task_list);
 
         prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-        /* Only sleep if we didn't miss any wakeups since OOM */
-        if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+        mem_cgroup_mark_under_oom(memcg);
+
+        locked = mem_cgroup_oom_trylock(memcg);
+
+        if (locked)
+                mem_cgroup_oom_notify(memcg);
+
+        if (locked && !memcg->oom_kill_disable) {
+                mem_cgroup_unmark_under_oom(memcg);
+                finish_wait(&memcg_oom_waitq, &owait.wait);
+                mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+                                         current->memcg_oom.order);
+        } else {
                 schedule();
-        finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
-        mem_cgroup_unmark_under_oom(memcg);
-        if (current->memcg_oom.oom_locked) {
+                mem_cgroup_unmark_under_oom(memcg);
+                finish_wait(&memcg_oom_waitq, &owait.wait);
+        }
+
+        if (locked) {
                 mem_cgroup_oom_unlock(memcg);
                 /*
                  * There is no guarantee that an OOM-lock contender
@@ -2286,10 +2249,9 @@ out_memcg:
                  */
                 memcg_oom_recover(memcg);
         }
+cleanup:
+        current->memcg_oom.memcg = NULL;
         css_put(&memcg->css);
-        current->memcg_oom.wait_on_memcg = NULL;
-out:
-        current->memcg_oom.in_memcg_oom = 0;
         return true;
 }
 
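The deleted comment already named the caller: pagefault_out_of_memory() checks back once the stack is unwound. A hedged sketch of how the new @handle parameter is meant to be used; the surrounding code is an assumption, only mem_cgroup_oom_synchronize() itself is defined in this diff:

/* end of a fault that returned VM_FAULT_OOM: actually kill or wait */
void pagefault_out_of_memory(void)
{
        if (mem_cgroup_oom_synchronize(true))
                return;         /* memcg OOM handled; skip the global OOM killer */

        /* ... otherwise fall through to the global OOM killer ... */
}

A fault that recovers without VM_FAULT_OOM would instead call mem_cgroup_oom_synchronize(false), which takes the cleanup: path above and merely clears current->memcg_oom.memcg and drops the css reference.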
@@ -2703,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
             || fatal_signal_pending(current)))
                 goto bypass;
 
+        if (unlikely(task_in_memcg_oom(current)))
+                goto bypass;
+
         /*
          * We always charge the cgroup the mm_struct belongs to.
          * The mm_struct's mem_cgroup changes on task migration if the
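task_in_memcg_oom() is not defined in this hunk; given that mem_cgroup_oom_synchronize() treats a non-NULL current->memcg_oom.memcg as "OOM in progress", the predicate is presumably just:

/* sketch, assuming the per-task state outlined earlier */
static inline bool task_in_memcg_oom(struct task_struct *p)
{
        return p->memcg_oom.memcg != NULL;
}

Bypassing the charge to root_mem_cgroup here keeps a task that is already in a memcg OOM situation from re-entering reclaim and the OOM machinery; it can finish unwinding the page fault first.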
@@ -2801,6 +2766,8 @@ done:
         return 0;
 nomem:
         *ptr = NULL;
+        if (gfp_mask & __GFP_NOFAIL)
+                return 0;
         return -ENOMEM;
 bypass:
         *ptr = root_mem_cgroup;
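With this last hunk the failure tail of __mem_cgroup_try_charge() distinguishes three outcomes. Annotated for clarity (the bypass return value lies outside the hunk):

nomem:
        *ptr = NULL;
        if (gfp_mask & __GFP_NOFAIL)
                return 0;       /* caller cannot handle failure: report success,
                                 * the page simply remains uncharged */
        return -ENOMEM;         /* ordinary charge failure */
bypass:
        *ptr = root_mem_cgroup; /* charge is redirected to the root group */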