oom: kill all threads sharing oom killed task's mm

It's necessary to kill all threads that share an oom killed task's mm if the goal is to lead to future memory freeing.

This patch reintroduces the code removed in 8c5cd6f3 (oom: oom_kill doesn't kill vfork parent (or child)) since it is obsoleted.

It's now guaranteed that any task passed to oom_kill_task() does not share an mm with any thread that is unkillable. Thus, we're safe to issue a SIGKILL to any thread sharing the same mm.

This is especially necessary to solve an mm->mmap_sem livelock issue in which an oom killed thread must acquire the lock in the exit path while another thread is holding it in the page allocator while trying to allocate memory itself (and will preempt the oom killer since a task was already killed). Since tasks with pending fatal signals are now granted access to memory reserves, the thread holding the lock may quickly allocate and release the lock so that the oom killed task may exit.

This mainly is for threads that are cloned with CLONE_VM but not CLONE_THREAD, so they are in a different thread group. Non-NPTL threads exist in the wild and this change is necessary to prevent the livelock in such cases. We care more about preventing the livelock than incurring the additional tasklist scan in the oom killer when a task has been killed. Systems that are sufficiently large to not want the tasklist scan in the oom killer in the first place already have the option of enabling /proc/sys/vm/oom_kill_allocating_task, which was designed specifically for that purpose.

This code had existed in the oom killer for over eight years dating back to the 2.4 kernel.

[akpm@linux-foundation.org: add nice comment]
Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: David Rientjes <rientjes@google.com> 2010-10-26 17:21:24 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2010-10-26 19:52:05 -0400
commit: 1e99bad0d9c12a4aaa60cd812c84ef152564bcf5 (patch)
tree: af9b070096c2ccc567247ff507eb07d3d817a1b4
parent: e18641e19a9204f241f04a5ac700168dcd18de4f (diff)
1 files changed, 24 insertions, 0 deletions
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4395f371bc7c..7dcca55ede7c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -404,16 +404,40 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 #define K(x) ((x) << (PAGE_SHIFT-10))
 static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
 {
+        struct task_struct *q;
+        struct mm_struct *mm;
        p = find_lock_task_mm(p);
        if (!p)
                return 1;
+        /* mm cannot be safely dereferenced after task_unlock(p) */
+        mm = p->mm;
        pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
                task_pid_nr(p), p->comm, K(p->mm->total_vm),
                K(get_mm_counter(p->mm, MM_ANONPAGES)),
                K(get_mm_counter(p->mm, MM_FILEPAGES)));
        task_unlock(p);
+        /*
+         * Kill all processes sharing p->mm in other thread groups, if any.
+         * They don't get access to memory reserves or a higher scheduler
+         * priority, though, to avoid depletion of all memory or task
+         * starvation.  This prevents mm->mmap_sem livelock when an oom killed
+         * task cannot exit because it requires the semaphore and its contended
+         * by another thread trying to allocate memory itself.  That thread will
+         * now get access to memory reserves since it has a pending fatal
+         * signal.
+         */
+        for_each_process(q)
+                if (q->mm == mm && !same_thread_group(q, p)) {
+                        task_lock(q);   /* Protect ->comm from prctl() */
+                        pr_err("Kill process %d (%s) sharing same memory\n",
+                                task_pid_nr(q), q->comm);
+                        task_unlock(q);
+                        force_sig(SIGKILL, q);
+                }
        set_tsk_thread_flag(p, TIF_MEMDIE);
        force_sig(SIGKILL, p);
author	David Rientjes <rientjes@google.com>	2010-10-26 17:21:24 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-26 19:52:05 -0400
commit	1e99bad0d9c12a4aaa60cd812c84ef152564bcf5 (patch)
tree	af9b070096c2ccc567247ff507eb07d3d817a1b4
parent	e18641e19a9204f241f04a5ac700168dcd18de4f (diff)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4395f371bc7c..7dcca55ede7c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -404,16 +404,40 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
404	#define K(x) ((x) << (PAGE_SHIFT-10))	404	#define K(x) ((x) << (PAGE_SHIFT-10))
405	static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)	405	static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
406	{	406	{
		407	struct task_struct *q;
		408	struct mm_struct *mm;
		409
407	p = find_lock_task_mm(p);	410	p = find_lock_task_mm(p);
408	if (!p)	411	if (!p)
409	return 1;	412	return 1;
410		413
		414	/* mm cannot be safely dereferenced after task_unlock(p) */
		415	mm = p->mm;
		416
411	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",	417	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
412	task_pid_nr(p), p->comm, K(p->mm->total_vm),	418	task_pid_nr(p), p->comm, K(p->mm->total_vm),
413	K(get_mm_counter(p->mm, MM_ANONPAGES)),	419	K(get_mm_counter(p->mm, MM_ANONPAGES)),
414	K(get_mm_counter(p->mm, MM_FILEPAGES)));	420	K(get_mm_counter(p->mm, MM_FILEPAGES)));
415	task_unlock(p);	421	task_unlock(p);
416		422
		423	/*
		424	* Kill all processes sharing p->mm in other thread groups, if any.
		425	* They don't get access to memory reserves or a higher scheduler
		426	* priority, though, to avoid depletion of all memory or task
		427	* starvation. This prevents mm->mmap_sem livelock when an oom killed
		428	* task cannot exit because it requires the semaphore and its contended
		429	* by another thread trying to allocate memory itself. That thread will
		430	* now get access to memory reserves since it has a pending fatal
		431	* signal.
		432	*/
		433	for_each_process(q)
		434	if (q->mm == mm && !same_thread_group(q, p)) {
		435	task_lock(q); /* Protect ->comm from prctl() */
		436	pr_err("Kill process %d (%s) sharing same memory\n",
		437	task_pid_nr(q), q->comm);
		438	task_unlock(q);
		439	force_sig(SIGKILL, q);
		440	}
417		441
418	set_tsk_thread_flag(p, TIF_MEMDIE);	442	set_tsk_thread_flag(p, TIF_MEMDIE);
419	force_sig(SIGKILL, p);	443	force_sig(SIGKILL, p);