aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
author: David Rientjes <rientjes@google.com> 2011-03-22 19:30:12 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2011-03-22 20:43:58 -0400
commit: edd45544c6f09550df0a5491aa8a07af24767e73 (patch)
tree: a44ec26701ed430b68ee3b045b6575462d15af71 /mm
parent: 30e2b41f20b6238f51e7cffb879c7a0f0073f5fe (diff)
oom: avoid deferring oom killer if exiting task is being traced
The oom killer naturally defers killing anything if it finds an eligible task that is already exiting and has yet to detach its ->mm. This avoids unnecessarily killing tasks when one is already in the exit path and may free enough memory that the oom killer is no longer needed. This is detected by PF_EXITING, since threads that have already detached their ->mm are no longer considered at all.

The problem with always deferring when a thread is PF_EXITING, however, is that it may never actually exit when being traced, specifically if another task is tracing it with PTRACE_O_TRACEEXIT. The oom killer does not want to defer in this case since there is no guarantee that the thread will ever exit without intervention.

This patch will now only defer the oom killer when a thread is PF_EXITING and no ptracer has stopped its progress in the exit path. It also ensures that a child is sacrificed for the chosen parent only if it has a different ->mm, as the comment implies: this ensures that the thread group leader is always targeted appropriately.

Signed-off-by: David Rientjes <rientjes@google.com>
Reported-by: Oleg Nesterov <oleg@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrey Vagin <avagin@openvz.org>
Cc: <stable@kernel.org> [2.6.38.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- mm/oom_kill.c 40
1 files changed, 25 insertions, 15 deletions
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d7f345e47e73..33b58615072c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -31,6 +31,7 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/mempolicy.h> 32#include <linux/mempolicy.h>
33#include <linux/security.h> 33#include <linux/security.h>
34#include <linux/ptrace.h>
34 35
35int sysctl_panic_on_oom; 36int sysctl_panic_on_oom;
36int sysctl_oom_kill_allocating_task; 37int sysctl_oom_kill_allocating_task;
@@ -316,22 +317,29 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
316 if (test_tsk_thread_flag(p, TIF_MEMDIE)) 317 if (test_tsk_thread_flag(p, TIF_MEMDIE))
317 return ERR_PTR(-1UL); 318 return ERR_PTR(-1UL);
318 319
319 /*
320 * This is in the process of releasing memory so wait for it
321 * to finish before killing some other task by mistake.
322 *
323 * However, if p is the current task, we allow the 'kill' to
324 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
325 * which will allow it to gain access to memory reserves in
326 * the process of exiting and releasing its resources.
327 * Otherwise we could get an easy OOM deadlock.
328 */
329 if (p->flags & PF_EXITING) { 320 if (p->flags & PF_EXITING) {
330 if (p != current) 321 /*
331 return ERR_PTR(-1UL); 322 * If p is the current task and is in the process of
332 323 * releasing memory, we allow the "kill" to set
333 chosen = p; 324 * TIF_MEMDIE, which will allow it to gain access to
334 *ppoints = 1000; 325 * memory reserves. Otherwise, it may stall forever.
326 *
327 * The loop isn't broken here, however, in case other
328 * threads are found to have already been oom killed.
329 */
330 if (p == current) {
331 chosen = p;
332 *ppoints = 1000;
333 } else {
334 /*
335 * If this task is not being ptraced on exit,
336 * then wait for it to finish before killing
337 * some other task unnecessarily.
338 */
339 if (!(task_ptrace(p->group_leader) &
340 PT_TRACE_EXIT))
341 return ERR_PTR(-1UL);
342 }
335 } 343 }
336 344
337 points = oom_badness(p, mem, nodemask, totalpages); 345 points = oom_badness(p, mem, nodemask, totalpages);
@@ -493,6 +501,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
493 list_for_each_entry(child, &t->children, sibling) { 501 list_for_each_entry(child, &t->children, sibling) {
494 unsigned int child_points; 502 unsigned int child_points;
495 503
504 if (child->mm == p->mm)
505 continue;
496 /* 506 /*
497 * oom_badness() returns 0 if the thread is unkillable 507 * oom_badness() returns 0 if the thread is unkillable
498 */ 508 */