-rw-r--r--  include/linux/memcontrol.h |  50
-rw-r--r--  include/linux/sched.h      |   7
-rw-r--r--  mm/filemap.c               |  11
-rw-r--r--  mm/memcontrol.c            | 139
-rw-r--r--  mm/memory.c                |  18
-rw-r--r--  mm/oom_kill.c              |   2
6 files changed, 79 insertions, 148 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ecc82b37c4cc..b3e7a667e03c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -137,47 +137,24 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
 					struct page *newpage);
 
-/**
- * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task
- * @new: true to enable, false to disable
- *
- * Toggle whether a failed memcg charge should invoke the OOM killer
- * or just return -ENOMEM. Returns the previous toggle state.
- *
- * NOTE: Any path that enables the OOM killer before charging must
- *       call mem_cgroup_oom_synchronize() afterward to finalize the
- *       OOM handling and clean up.
- */
-static inline bool mem_cgroup_toggle_oom(bool new)
+static inline void mem_cgroup_oom_enable(void)
 {
-	bool old;
-
-	old = current->memcg_oom.may_oom;
-	current->memcg_oom.may_oom = new;
-
-	return old;
+	WARN_ON(current->memcg_oom.may_oom);
+	current->memcg_oom.may_oom = 1;
 }
 
-static inline void mem_cgroup_enable_oom(void)
+static inline void mem_cgroup_oom_disable(void)
 {
-	bool old = mem_cgroup_toggle_oom(true);
-
-	WARN_ON(old == true);
-}
-
-static inline void mem_cgroup_disable_oom(void)
-{
-	bool old = mem_cgroup_toggle_oom(false);
-
-	WARN_ON(old == false);
+	WARN_ON(!current->memcg_oom.may_oom);
+	current->memcg_oom.may_oom = 0;
 }
 
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
-	return p->memcg_oom.in_memcg_oom;
+	return p->memcg_oom.memcg;
 }
 
-bool mem_cgroup_oom_synchronize(void);
+bool mem_cgroup_oom_synchronize(bool wait);
 
 #ifdef CONFIG_MEMCG_SWAP
 extern int do_swap_account;
@@ -402,16 +379,11 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page,
 {
 }
 
-static inline bool mem_cgroup_toggle_oom(bool new)
-{
-	return false;
-}
-
-static inline void mem_cgroup_enable_oom(void)
+static inline void mem_cgroup_oom_enable(void)
 {
 }
 
-static inline void mem_cgroup_disable_oom(void)
+static inline void mem_cgroup_oom_disable(void)
 {
 }
 
@@ -420,7 +392,7 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
 	return false;
 }
 
-static inline bool mem_cgroup_oom_synchronize(void)
+static inline bool mem_cgroup_oom_synchronize(bool wait)
 {
 	return false;
 }
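
The hunks above replace the save/restore helper mem_cgroup_toggle_oom() with a strict mem_cgroup_oom_enable()/mem_cgroup_oom_disable() pair, and make task_in_memcg_oom() report whether an OOM context has been recorded on the task rather than testing a separate flag. The following is a minimal userspace sketch of that contract, not kernel code: current->memcg_oom is modelled by a plain global and WARN_ON() by assert(), purely to illustrate that enable/disable must not nest and that "in OOM" now simply means "a memcg OOM context is recorded".

/* Userspace model of the reworked task-local memcg OOM interface. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct mem_cgroup { const char *name; };	/* stand-in for the kernel type */

static struct {
	struct mem_cgroup *memcg;	/* recorded OOM context, NULL if none */
	unsigned int gfp_mask;
	int order;
	bool may_oom;			/* set only for the duration of a user fault */
} memcg_oom;				/* models current->memcg_oom */

static void mem_cgroup_oom_enable(void)
{
	assert(!memcg_oom.may_oom);	/* models WARN_ON(): enable must not nest */
	memcg_oom.may_oom = true;
}

static void mem_cgroup_oom_disable(void)
{
	assert(memcg_oom.may_oom);
	memcg_oom.may_oom = false;
}

static bool task_in_memcg_oom(void)
{
	return memcg_oom.memcg != NULL;	/* "in OOM" == a context was recorded */
}

int main(void)
{
	struct mem_cgroup cg = { "example" };

	mem_cgroup_oom_enable();	/* start of a userspace page fault */
	memcg_oom.memcg = &cg;		/* a failed charge records, never kills here */
	memcg_oom.gfp_mask = 0;
	memcg_oom.order = 0;
	mem_cgroup_oom_disable();	/* fault body done */

	printf("memcg OOM pending after fault: %d\n", task_in_memcg_oom());
	return 0;
}
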
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6682da36b293..e27baeeda3f4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1394,11 +1394,10 @@ struct task_struct {
 	} memcg_batch;
 	unsigned int memcg_kmem_skip_account;
 	struct memcg_oom_info {
+		struct mem_cgroup *memcg;
+		gfp_t gfp_mask;
+		int order;
 		unsigned int may_oom:1;
-		unsigned int in_memcg_oom:1;
-		unsigned int oom_locked:1;
-		int wakeups;
-		struct mem_cgroup *wait_on_memcg;
 	} memcg_oom;
 #endif
 #ifdef CONFIG_UPROBES
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6aec4a2d2e..ae4846ff4849 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = mapping->host;
 	pgoff_t offset = vmf->pgoff;
 	struct page *page;
-	bool memcg_oom;
 	pgoff_t size;
 	int ret = 0;
 
@@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	/*
-	 * Do we have something in the page cache already? Either
-	 * way, try readahead, but disable the memcg OOM killer for it
-	 * as readahead is optional and no errors are propagated up
-	 * the fault stack. The OOM killer is enabled while trying to
-	 * instantiate the faulting page individually below.
+	 * Do we have something in the page cache already?
 	 */
 	page = find_get_page(mapping, offset);
 	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
@@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
-		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_async_mmap_readahead(vma, ra, file, page, offset);
-		mem_cgroup_toggle_oom(memcg_oom);
 	} else if (!page) {
 		/* No page in the page cache at all */
-		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_sync_mmap_readahead(vma, ra, file, offset);
-		mem_cgroup_toggle_oom(memcg_oom);
 		count_vm_event(PGMAJFAULT);
 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5335b2b6be77..65fc6a449841 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2161,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 	memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	bool locked;
-	int wakeups;
-
 	if (!current->memcg_oom.may_oom)
 		return;
-
-	current->memcg_oom.in_memcg_oom = 1;
-
 	/*
-	 * As with any blocking lock, a contender needs to start
-	 * listening for wakeups before attempting the trylock,
-	 * otherwise it can miss the wakeup from the unlock and sleep
-	 * indefinitely. This is just open-coded because our locking
-	 * is so particular to memcg hierarchies.
+	 * We are in the middle of the charge context here, so we
+	 * don't want to block when potentially sitting on a callstack
+	 * that holds all kinds of filesystem and mm locks.
+	 *
+	 * Also, the caller may handle a failed allocation gracefully
+	 * (like optional page cache readahead) and so an OOM killer
+	 * invocation might not even be necessary.
+	 *
+	 * That's why we don't do anything here except remember the
+	 * OOM context and then deal with it at the end of the page
+	 * fault when the stack is unwound, the locks are released,
+	 * and when we know whether the fault was overall successful.
 	 */
-	wakeups = atomic_read(&memcg->oom_wakeups);
-	mem_cgroup_mark_under_oom(memcg);
-
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
-		mem_cgroup_oom_notify(memcg);
-
-	if (locked && !memcg->oom_kill_disable) {
-		mem_cgroup_unmark_under_oom(memcg);
-		mem_cgroup_out_of_memory(memcg, mask, order);
-		mem_cgroup_oom_unlock(memcg);
-		/*
-		 * There is no guarantee that an OOM-lock contender
-		 * sees the wakeups triggered by the OOM kill
-		 * uncharges. Wake any sleepers explicitely.
-		 */
-		memcg_oom_recover(memcg);
-	} else {
-		/*
-		 * A system call can just return -ENOMEM, but if this
-		 * is a page fault and somebody else is handling the
-		 * OOM already, we need to sleep on the OOM waitqueue
-		 * for this memcg until the situation is resolved.
-		 * Which can take some time because it might be
-		 * handled by a userspace task.
-		 *
-		 * However, this is the charge context, which means
-		 * that we may sit on a large call stack and hold
-		 * various filesystem locks, the mmap_sem etc. and we
-		 * don't want the OOM handler to deadlock on them
-		 * while we sit here and wait. Store the current OOM
-		 * context in the task_struct, then return -ENOMEM.
-		 * At the end of the page fault handler, with the
-		 * stack unwound, pagefault_out_of_memory() will check
-		 * back with us by calling
-		 * mem_cgroup_oom_synchronize(), possibly putting the
-		 * task to sleep.
-		 */
-		current->memcg_oom.oom_locked = locked;
-		current->memcg_oom.wakeups = wakeups;
-		css_get(&memcg->css);
-		current->memcg_oom.wait_on_memcg = memcg;
-	}
+	css_get(&memcg->css);
+	current->memcg_oom.memcg = memcg;
+	current->memcg_oom.gfp_mask = mask;
+	current->memcg_oom.order = order;
 }
 
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
  *
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
  *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
  * sleep on a waitqueue until the userspace task resolves the
  * situation. Sleeping directly in the charge context with all kinds
  * of locks held is not a good idea, instead we remember an OOM state
  * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
  *
  * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
  */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+	struct mem_cgroup *memcg = current->memcg_oom.memcg;
 	struct oom_wait_info owait;
-	struct mem_cgroup *memcg;
+	bool locked;
 
 	/* OOM is global, do not handle */
-	if (!current->memcg_oom.in_memcg_oom)
-		return false;
-
-	/*
-	 * We invoked the OOM killer but there is a chance that a kill
-	 * did not free up any charges. Everybody else might already
-	 * be sleeping, so restart the fault and keep the rampage
-	 * going until some charges are released.
-	 */
-	memcg = current->memcg_oom.wait_on_memcg;
 	if (!memcg)
-		goto out;
+		return false;
 
-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-		goto out_memcg;
+	if (!handle)
+		goto cleanup;
 
 	owait.memcg = memcg;
 	owait.wait.flags = 0;
@@ -2273,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void)
 	INIT_LIST_HEAD(&owait.wait.task_list);
 
 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	/* Only sleep if we didn't miss any wakeups since OOM */
-	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
+	if (locked)
+		mem_cgroup_oom_notify(memcg);
+
+	if (locked && !memcg->oom_kill_disable) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+					 current->memcg_oom.order);
+	} else {
 		schedule();
-	finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
-	mem_cgroup_unmark_under_oom(memcg);
-	if (current->memcg_oom.oom_locked) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+	}
+
+	if (locked) {
 		mem_cgroup_oom_unlock(memcg);
 		/*
 		 * There is no guarantee that an OOM-lock contender
@@ -2288,10 +2249,9 @@ out_memcg:
 		 */
 		memcg_oom_recover(memcg);
 	}
+cleanup:
+	current->memcg_oom.memcg = NULL;
 	css_put(&memcg->css);
-	current->memcg_oom.wait_on_memcg = NULL;
-out:
-	current->memcg_oom.in_memcg_oom = 0;
 	return true;
 }
 
@@ -2705,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		|| fatal_signal_pending(current)))
 		goto bypass;
 
+	if (unlikely(task_in_memcg_oom(current)))
+		goto bypass;
+
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the
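
The memcontrol.c changes above split memcg OOM handling into two phases: mem_cgroup_oom() only records the OOM context on the task during the charge (and __mem_cgroup_try_charge() bypasses further charges for a task that already has one recorded), while mem_cgroup_oom_synchronize() does the actual kill-or-wait once the page fault stack has unwound. Below is a compressed userspace model of that two-phase flow; the OOM lock, waitqueue and notifier machinery are deliberately reduced to print statements, so this is an illustration of the control flow, not the kernel implementation.

/* Two-phase model: remember the OOM context while charging, act on it later. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct mem_cgroup { bool oom_kill_disable; };

static struct {
	struct mem_cgroup *memcg;	/* models current->memcg_oom.memcg */
	bool may_oom;			/* models current->memcg_oom.may_oom */
} memcg_oom;

/* Phase 1: called from the charge path; never blocks, never kills. */
static void mem_cgroup_oom(struct mem_cgroup *memcg)
{
	if (!memcg_oom.may_oom)
		return;
	memcg_oom.memcg = memcg;	/* just remember the context */
}

/* Phase 2: called after the fault unwinds, with no charge-path locks held. */
static bool mem_cgroup_oom_synchronize(bool handle)
{
	struct mem_cgroup *memcg = memcg_oom.memcg;

	if (!memcg)
		return false;		/* OOM is global, not ours to handle */
	if (handle) {
		if (!memcg->oom_kill_disable)
			printf("invoke the memcg OOM killer\n");
		else
			printf("sleep until a userspace OOM handler resolves it\n");
	}
	memcg_oom.memcg = NULL;		/* cleanup happens in both cases */
	return true;
}

int main(void)
{
	struct mem_cgroup cg = { .oom_kill_disable = false };

	memcg_oom.may_oom = true;	/* userspace fault in progress */
	mem_cgroup_oom(&cg);		/* the charge failed */
	memcg_oom.may_oom = false;	/* charge context unwound */
	mem_cgroup_oom_synchronize(true); /* fault ended in VM_FAULT_OOM */
	return 0;
}
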
diff --git a/mm/memory.c b/mm/memory.c
index f7b7692c05ed..1311f26497e6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3865,15 +3865,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * space. Kernel faults are handled more gracefully.
 	 */
 	if (flags & FAULT_FLAG_USER)
-		mem_cgroup_enable_oom();
+		mem_cgroup_oom_enable();
 
 	ret = __handle_mm_fault(mm, vma, address, flags);
 
-	if (flags & FAULT_FLAG_USER)
-		mem_cgroup_disable_oom();
-
-	if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
-		mem_cgroup_oom_synchronize();
+	if (flags & FAULT_FLAG_USER) {
+		mem_cgroup_oom_disable();
+		/*
+		 * The task may have entered a memcg OOM situation but
+		 * if the allocation error was handled gracefully (no
+		 * VM_FAULT_OOM), there is no need to kill anything.
+		 * Just clean up the OOM state peacefully.
+		 */
+		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+			mem_cgroup_oom_synchronize(false);
+	}
 
 	return ret;
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 314e9d274381..6738c47f1f72 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -680,7 +680,7 @@ void pagefault_out_of_memory(void)
 {
 	struct zonelist *zonelist;
 
-	if (mem_cgroup_oom_synchronize())
+	if (mem_cgroup_oom_synchronize(true))
 		return;
 
 	zonelist = node_zonelist(first_online_node, GFP_KERNEL);
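
Taken together, the memory.c and oom_kill.c hunks show the two callers of the new mem_cgroup_oom_synchronize(): handle_mm_fault() passes false to quietly drop a recorded OOM context when the fault still succeeded, and pagefault_out_of_memory() passes true to kill or wait before considering the global OOM killer. The sketch below models just that decision, with the recorded state reduced to a boolean (memcg_oom_pending stands in for task_in_memcg_oom(current)); it is an assumption-laden illustration, not kernel code.

/* Who calls mem_cgroup_oom_synchronize(), and with which argument. */
#include <stdbool.h>
#include <stdio.h>

#define VM_FAULT_OOM 0x0001

static bool memcg_oom_pending;		/* stands in for task_in_memcg_oom(current) */

static bool mem_cgroup_oom_synchronize(bool handle)
{
	if (!memcg_oom_pending)
		return false;
	fputs(handle ? "kill or wait on the memcg OOM\n"
		     : "drop the stale memcg OOM context\n", stdout);
	memcg_oom_pending = false;
	return true;
}

/* Tail of handle_mm_fault(): the fault was handled, so clean up quietly. */
static void fault_tail(int ret)
{
	if (memcg_oom_pending && !(ret & VM_FAULT_OOM))
		mem_cgroup_oom_synchronize(false);
}

/*
 * pagefault_out_of_memory(): the fault returned VM_FAULT_OOM, so handle the
 * memcg OOM for real and fall back to the global OOM killer only if the
 * failure was not a memcg one.
 */
static void pagefault_out_of_memory(void)
{
	if (mem_cgroup_oom_synchronize(true))
		return;
	puts("global OOM killer");
}

int main(void)
{
	memcg_oom_pending = true;	/* a charge failed during the fault */
	fault_tail(VM_FAULT_OOM);	/* nothing dropped: the fault is OOM */
	pagefault_out_of_memory();	/* the memcg OOM is handled here */
	return 0;
}
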