author	Michal Hocko <mhocko@suse.cz>	2015-02-11 18:26:24 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-11 20:06:03 -0500
commit	c32b3cbe0d067a9cfae85aa70ba1e97ceba0ced7 (patch)
tree	ea807199ce92eed21239e5279033dbeb83b9dde1
parent	401e4a7cf67d993bae02efdf1a234d7e2dbd2df2 (diff)
oom, PM: make OOM detection in the freezer path raceless
Commit 5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM suspend") left a race window: the OOM killer can manage to call note_oom_kill() after freeze_processes() has checked the counter. The race window is quite small and really unlikely, and a partial solution was deemed sufficient at the time of submission.

Tejun wasn't happy about this partial solution, though, and insisted on a full one. That requires full exclusion between the OOM killer and the freezer's task freezing. This is what this patch does: it introduces an oom_sem RW semaphore and turns oom_killer_disable() into a full OOM barrier.

The oom_killer_disabled check is moved from the allocation path to the OOM level, and oom_sem is taken for reading around both the check and the whole OOM invocation. oom_killer_disable() takes oom_sem for writing, so it waits for all currently running OOM killer invocations. It then disables all further OOMs by setting oom_killer_disabled and checks for any remaining OOM victims. Victims are counted via mark_tsk_oom_victim() and unmark_oom_victim(); the last victim wakes up all waiters enqueued by oom_killer_disable(). The function therefore acts as a full OOM barrier.

The page fault path is covered now as well, although it was assumed to be safe before. As per Tejun, "We used to have freezing points deep in file system code which may be reachable from page fault," so it is better and more robust not to rely on freezing points here. The same applies to the memcg OOM killer.

out_of_memory() now tells the caller whether the OOM killer was allowed to trigger, and the callers are supposed to handle the situation: the page allocation path simply fails the allocation, same as before; the page fault path will retry the fault (more on that later); and the sysrq OOM trigger will simply complain to the log.

Normally there won't be any unfrozen user tasks after try_to_freeze_tasks(), so oom_killer_disable() will not block. But if an OOM killer was racing with try_to_freeze_tasks() and the OOM victim hasn't finished yet, we have to wait for it. This should complete in finite time, because:

- the victim cannot loop in the page fault handler (it would die on the way out of the exception);
- it cannot loop in the page allocator, because all further allocations would fail and __GFP_NOFAIL allocations are not acceptable at this stage;
- it shouldn't be blocked on any locks held by frozen tasks (try_to_freeze() expects a lockless context), and kernel threads and workqueues are not frozen yet.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Suggested-by: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
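To make the scheme concrete before the diff, below is a minimal userspace model of the barrier. This is an illustrative sketch only: pthreads stand in for the kernel's rwsem and wait queue, a mutex-protected counter for the atomic, and nothing here is the kernel code itself — the names merely mirror the patch.

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_rwlock_t oom_sem = PTHREAD_RWLOCK_INITIALIZER;
	static pthread_mutex_t victims_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t oom_victims_wait = PTHREAD_COND_INITIALIZER;
	static int oom_victims;		/* victims still in flight */
	static bool oom_killer_disabled;

	/* OOM invocation: check and kill run entirely under the read side. */
	static bool out_of_memory(void)
	{
		bool ret = false;

		pthread_rwlock_rdlock(&oom_sem);
		if (!oom_killer_disabled) {
			/* __out_of_memory() would pick a victim here ... */
			pthread_mutex_lock(&victims_lock);
			oom_victims++;	/* ... and mark_tsk_oom_victim() counts it */
			pthread_mutex_unlock(&victims_lock);
			ret = true;
		}
		pthread_rwlock_unlock(&oom_sem);
		return ret;
	}

	/* Victim exit path: the last victim wakes all waiters. */
	static void unmark_oom_victim(void)
	{
		pthread_rwlock_rdlock(&oom_sem);
		pthread_mutex_lock(&victims_lock);
		if (--oom_victims == 0 && oom_killer_disabled)
			pthread_cond_broadcast(&oom_victims_wait);
		pthread_mutex_unlock(&victims_lock);
		pthread_rwlock_unlock(&oom_sem);
	}

	/* The barrier: flush running OOM kills, then wait out the victims. */
	static bool oom_killer_disable(void)
	{
		pthread_rwlock_wrlock(&oom_sem);	/* waits for readers above */
		oom_killer_disabled = true;
		pthread_rwlock_unlock(&oom_sem);

		pthread_mutex_lock(&victims_lock);
		while (oom_victims > 0)			/* wait_event() analogue */
			pthread_cond_wait(&oom_victims_wait, &victims_lock);
		pthread_mutex_unlock(&victims_lock);
		return true;
	}

	int main(void)
	{
		out_of_memory();	/* one victim in flight */
		unmark_oom_victim();	/* victim exits; count drops to zero */
		oom_killer_disable();	/* returns at once: no victims left */
		printf("OOM allowed while disabled: %d\n", out_of_memory());
		return 0;
	}

The write lock flushes any invocation that is already past the disabled check; the victim count then covers tasks that were marked before the flag was set but have not exited yet. (The kernel version additionally fails if current is itself a TIF_MEMDIE victim, which the sketch omits.)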
-rw-r--r--	drivers/tty/sysrq.c	5
-rw-r--r--	include/linux/oom.h	14
-rw-r--r--	kernel/exit.c	3
-rw-r--r--	kernel/power/process.c	50
-rw-r--r--	mm/memcontrol.c	2
-rw-r--r--	mm/oom_kill.c	132
-rw-r--r--	mm/page_alloc.c	17
7 files changed, 132 insertions, 91 deletions
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 0071469ecbf1..259a4d5a4e8f 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -355,8 +355,9 @@ static struct sysrq_key_op sysrq_term_op = {
 
 static void moom_callback(struct work_struct *ignored)
 {
-	out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL,
-		      0, NULL, true);
+	if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
+			   GFP_KERNEL, 0, NULL, true))
+		pr_info("OOM request ignored because killer is disabled\n");
 }
 
 static DECLARE_WORK(moom_work, moom_callback);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index b42b80f88c3a..d5771bed59c9 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -72,22 +72,14 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 						unsigned long totalpages, const nodemask_t *nodemask,
 						bool force_kill);
 
-extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *mask, bool force_kill);
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
 
 extern bool oom_killer_disabled;
-
-static inline void oom_killer_disable(void)
-{
-	oom_killer_disabled = true;
-}
-
-static inline void oom_killer_enable(void)
-{
-	oom_killer_disabled = false;
-}
+extern bool oom_killer_disable(void);
+extern void oom_killer_enable(void);
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 02b3d1ab2ec0..feff10bbb307 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk)
 	task_unlock(tsk);
 	mm_update_next_owner(mm);
 	mmput(mm);
-	unmark_oom_victim();
+	if (test_thread_flag(TIF_MEMDIE))
+		unmark_oom_victim();
 }
 
 static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 3ac45f192e9f..564f786df470 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -108,30 +108,6 @@ static int try_to_freeze_tasks(bool user_only)
 	return todo ? -EBUSY : 0;
 }
 
-static bool __check_frozen_processes(void)
-{
-	struct task_struct *g, *p;
-
-	for_each_process_thread(g, p)
-		if (p != current && !freezer_should_skip(p) && !frozen(p))
-			return false;
-
-	return true;
-}
-
-/*
- * Returns true if all freezable tasks (except for current) are frozen already
- */
-static bool check_frozen_processes(void)
-{
-	bool ret;
-
-	read_lock(&tasklist_lock);
-	ret = __check_frozen_processes();
-	read_unlock(&tasklist_lock);
-	return ret;
-}
-
 /**
  * freeze_processes - Signal user space processes to enter the refrigerator.
  * The current thread will not be frozen. The same process that calls
@@ -142,7 +118,6 @@ static bool check_frozen_processes(void)
 int freeze_processes(void)
 {
 	int error;
-	int oom_kills_saved;
 
 	error = __usermodehelper_disable(UMH_FREEZING);
 	if (error)
@@ -157,29 +132,22 @@ int freeze_processes(void)
 	pm_wakeup_clear();
 	pr_info("Freezing user space processes ... ");
 	pm_freezing = true;
-	oom_kills_saved = oom_kills_count();
 	error = try_to_freeze_tasks(true);
 	if (!error) {
 		__usermodehelper_set_disable_depth(UMH_DISABLED);
-		oom_killer_disable();
-
-		/*
-		 * There might have been an OOM kill while we were
-		 * freezing tasks and the killed task might be still
-		 * on the way out so we have to double check for race.
-		 */
-		if (oom_kills_count() != oom_kills_saved &&
-		    !check_frozen_processes()) {
-			__usermodehelper_set_disable_depth(UMH_ENABLED);
-			pr_cont("OOM in progress.");
-			error = -EBUSY;
-		} else {
-			pr_cont("done.");
-		}
+		pr_cont("done.");
 	}
 	pr_cont("\n");
 	BUG_ON(in_atomic());
 
+	/*
+	 * Now that the whole userspace is frozen we need to disable
+	 * the OOM killer to disallow any further interference with
+	 * killable tasks.
+	 */
+	if (!error && !oom_killer_disable())
+		error = -EBUSY;
+
 	if (error)
 		thaw_processes();
 	return error;
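Condensed, the tail of freeze_processes() after these hunks behaves as sketched below. This is a readability sketch, not the verbatim result: the usermodehelper handling and progress messages shown in the diff are elided.

	int freeze_processes(void)
	{
		int error = try_to_freeze_tasks(true);	/* freeze userspace */

		BUG_ON(in_atomic());

		/*
		 * Userspace is frozen; now turn the function into a full
		 * OOM barrier. oom_killer_disable() blocks until every
		 * in-flight OOM kill has finished and all victims exited.
		 */
		if (!error && !oom_killer_disable())
			error = -EBUSY;	/* current is itself an OOM victim */

		if (error)
			thaw_processes();
		return error;
	}

Note the ordering: the barrier is taken only after try_to_freeze_tasks() succeeds, so by the time oom_killer_disable() waits, the only possible unfrozen user task is a racing OOM victim, which exits in finite time as argued in the commit message.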
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fe4d258ef32b..fbf64e6f64e4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1930,7 +1930,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
 	if (!memcg)
 		return false;
 
-	if (!handle)
+	if (!handle || oom_killer_disabled)
 		goto cleanup;
 
 	owait.memcg = memcg;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3cbd76b8c13b..b8df76ee2be3 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -398,30 +398,27 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 }
 
 /*
- * Number of OOM killer invocations (including memcg OOM killer).
- * Primarily used by PM freezer to check for potential races with
- * OOM killed frozen task.
+ * Number of OOM victims in flight
  */
-static atomic_t oom_kills = ATOMIC_INIT(0);
+static atomic_t oom_victims = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
 
-int oom_kills_count(void)
-{
-	return atomic_read(&oom_kills);
-}
-
-void note_oom_kill(void)
-{
-	atomic_inc(&oom_kills);
-}
+bool oom_killer_disabled __read_mostly;
+static DECLARE_RWSEM(oom_sem);
 
 /**
  * mark_tsk_oom_victim - marks the given task as OOM victim.
  * @tsk: task to mark
+ *
+ * Has to be called with oom_sem taken for read and never after
+ * oom has been disabled already.
  */
 void mark_tsk_oom_victim(struct task_struct *tsk)
 {
-	set_tsk_thread_flag(tsk, TIF_MEMDIE);
-
+	WARN_ON(oom_killer_disabled);
+	/* OOM killer might race with memcg OOM */
+	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
+		return;
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
 	 * if it is frozen because OOM killer wouldn't be able to free
@@ -429,14 +426,70 @@ void mark_tsk_oom_victim(struct task_struct *tsk)
 	 * that TIF_MEMDIE tasks should be ignored.
 	 */
 	__thaw_task(tsk);
+	atomic_inc(&oom_victims);
 }
 
 /**
  * unmark_oom_victim - unmarks the current task as OOM victim.
+ *
+ * Wakes up all waiters in oom_killer_disable()
  */
 void unmark_oom_victim(void)
 {
-	clear_thread_flag(TIF_MEMDIE);
+	if (!test_and_clear_thread_flag(TIF_MEMDIE))
+		return;
+
+	down_read(&oom_sem);
+	/*
+	 * There is no need to signal the last oom_victim if there
+	 * is nobody who cares.
+	 */
+	if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
+		wake_up_all(&oom_victims_wait);
+	up_read(&oom_sem);
+}
+
+/**
+ * oom_killer_disable - disable OOM killer
+ *
+ * Forces all page allocations to fail rather than trigger OOM killer.
+ * Will block and wait until all OOM victims are killed.
+ *
+ * The function cannot be called when there are runnable user tasks because
+ * the userspace would see unexpected allocation failures as a result. Any
+ * new usage of this function should be consulted with MM people.
+ *
+ * Returns true if successful and false if the OOM killer cannot be
+ * disabled.
+ */
+bool oom_killer_disable(void)
+{
+	/*
+	 * Make sure to not race with an ongoing OOM killer
+	 * and that the current task is not the victim.
+	 */
+	down_write(&oom_sem);
+	if (test_thread_flag(TIF_MEMDIE)) {
+		up_write(&oom_sem);
+		return false;
+	}
+
+	oom_killer_disabled = true;
+	up_write(&oom_sem);
+
+	wait_event(oom_victims_wait, !atomic_read(&oom_victims));
+
+	return true;
+}
+
+/**
+ * oom_killer_enable - enable OOM killer
+ */
+void oom_killer_enable(void)
+{
+	down_write(&oom_sem);
+	oom_killer_disabled = false;
+	up_write(&oom_sem);
 }
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
@@ -637,7 +690,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
 }
 
 /**
- * out_of_memory - kill the "best" process when we run out of memory
+ * __out_of_memory - kill the "best" process when we run out of memory
  * @zonelist: zonelist pointer
  * @gfp_mask: memory allocation flags
  * @order: amount of memory being requested as a power of 2
@@ -649,7 +702,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *nodemask, bool force_kill)
 {
 	const nodemask_t *mpol_mask;
@@ -718,6 +771,32 @@ out:
 	schedule_timeout_killable(1);
 }
 
+/**
+ * out_of_memory - tries to invoke OOM killer.
+ * @zonelist: zonelist pointer
+ * @gfp_mask: memory allocation flags
+ * @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
+ * @force_kill: true if a task must be killed, even if others are exiting
+ *
+ * Invokes __out_of_memory() and returns true unless the OOM killer has
+ * been disabled by oom_killer_disable(), in which case it returns false.
+ */
+bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+		int order, nodemask_t *nodemask, bool force_kill)
+{
+	bool ret = false;
+
+	down_read(&oom_sem);
+	if (!oom_killer_disabled) {
+		__out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
+		ret = true;
+	}
+	up_read(&oom_sem);
+
+	return ret;
+}
+
 /*
  * The pagefault handler calls here because it is out of memory, so kill a
  * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
@@ -727,12 +806,25 @@ void pagefault_out_of_memory(void)
 {
 	struct zonelist *zonelist;
 
+	down_read(&oom_sem);
 	if (mem_cgroup_oom_synchronize(true))
-		return;
+		goto unlock;
 
 	zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
 	if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
-		out_of_memory(NULL, 0, 0, NULL, false);
+		if (!oom_killer_disabled)
+			__out_of_memory(NULL, 0, 0, NULL, false);
+		else
+			/*
+			 * There shouldn't be any user tasks runnable while
+			 * the OOM killer is disabled, so the current task
+			 * has to be a racing OOM victim which
+			 * oom_killer_disable() is waiting for.
+			 */
+			WARN_ON(test_thread_flag(TIF_MEMDIE));
+
 		oom_zonelist_unlock(zonelist, GFP_KERNEL);
 	}
+unlock:
+	up_read(&oom_sem);
 }
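One subtlety worth spelling out is why unmark_oom_victim() takes oom_sem for read at all. The read section serializes the decrement-and-wake sequence against the write section of oom_killer_disable(), so only the two orderings below are possible. This is an illustrative timeline, not code from the patch:

	/*
	 *   victim (unmark_oom_victim)        PM (oom_killer_disable)
	 *   --------------------------        -----------------------
	 *   down_read(&oom_sem)
	 *   atomic_dec_return() -> 0          down_write(&oom_sem)  -- blocks
	 *   oom_killer_disabled still false,
	 *   so no wakeup is needed
	 *   up_read(&oom_sem)
	 *                                     -- write lock acquired --
	 *                                     oom_killer_disabled = true
	 *                                     up_write(&oom_sem)
	 *                                     wait_event(): oom_victims == 0,
	 *                                     returns without sleeping
	 *
	 * In the opposite ordering, the victim enters its read section only
	 * after up_write(), is therefore guaranteed to see
	 * oom_killer_disabled == true, and issues the wakeup.  Without the
	 * read lock, the victim could miss the flag while the waiter misses
	 * the decrement, leaving oom_killer_disable() blocked forever.
	 */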
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 641d5a9a8617..134e25525044 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -244,8 +244,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
 					PB_migrate, PB_migrate_end);
 }
 
-bool oom_killer_disabled __read_mostly;
-
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
@@ -2317,9 +2315,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 
 	*did_some_progress = 0;
 
-	if (oom_killer_disabled)
-		return NULL;
-
 	/*
 	 * Acquire the per-zone oom lock for each zone. If that
 	 * fails, somebody else is making progress for us.
@@ -2331,14 +2326,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	}
 
 	/*
-	 * PM-freezer should be notified that there might be an OOM killer on
-	 * its way to kill and wake somebody up. This is too early and we might
-	 * end up not killing anything but false positives are acceptable.
-	 * See freeze_processes.
-	 */
-	note_oom_kill();
-
-	/*
 	 * Go through the zonelist yet one more time, keep very high watermark
 	 * here, this is only to catch a parallel oom killing, we must fail if
 	 * we're still under heavy pressure.
@@ -2372,8 +2359,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 		goto out;
 	}
 	/* Exhausted what can be done so it's blamo time */
-	out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false);
-	*did_some_progress = 1;
+	if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
+		*did_some_progress = 1;
 out:
 	oom_zonelist_unlock(ac->zonelist, gfp_mask);
 	return page;