aboutsummaryrefslogtreecommitdiffstats
path: root/mm/oom_kill.c
diff options
context:
space:
mode:
author David Rientjes <rientjes@google.com> 2010-08-09 20:18:51 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2010-08-09 23:44:56 -0400
commit 5e9d834a0e0c0485dfa487281ab9650fc37a3bb5 (patch)
tree b93cf4fd46b50b18f3fc118f1739a71dbdd8f340 /mm/oom_kill.c
parent 6cf86ac6f36b638459a9a6c2576d5e655d41d451 (diff)
oom: sacrifice child with highest badness score for parent
When a task is chosen for oom kill, the oom killer first attempts to sacrifice a child not sharing its parent's memory instead. Unfortunately, this often kills in a seemingly random fashion based on the ordering of the selected task's child list. Additionally, it is not guaranteed at all to free a large amount of memory that we need to prevent additional oom killing in the very near future.

Instead, we now only attempt to sacrifice the worst child not sharing its parent's memory, if one exists. The worst child is indicated with the highest badness() score. This serves two advantages: we kill a memory-hogging task more often, and we allow the configurable /proc/pid/oom_adj value to be considered as a factor in which child to kill.

Reviewers may observe that the previous implementation would iterate through the children and attempt to kill each until one was successful and then the parent if none were found while the new code simply kills the most memory-hogging task or the parent. Note that the only time oom_kill_task() fails, however, is when a child does not have an mm or has a /proc/pid/oom_adj of OOM_DISABLE. badness() returns 0 for both cases, so the final oom_kill_task() will always succeed.

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Nick Piggin <npiggin@suse.de>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/oom_kill.c')
-rw-r--r-- mm/oom_kill.c 40
1 files changed, 29 insertions, 11 deletions
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6f6e04c40c9..7c8488f6a3f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -362,10 +362,10 @@ static void dump_tasks(const struct mem_cgroup *mem)
362static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 362static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
363 struct mem_cgroup *mem) 363 struct mem_cgroup *mem)
364{ 364{
365 task_lock(current);
365 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 366 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
366 "oom_adj=%d\n", 367 "oom_adj=%d\n",
367 current->comm, gfp_mask, order, current->signal->oom_adj); 368 current->comm, gfp_mask, order, current->signal->oom_adj);
368 task_lock(current);
369 cpuset_print_task_mems_allowed(current); 369 cpuset_print_task_mems_allowed(current);
370 task_unlock(current); 370 task_unlock(current);
371 dump_stack(); 371 dump_stack();
@@ -436,8 +436,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
436 unsigned long points, struct mem_cgroup *mem, 436 unsigned long points, struct mem_cgroup *mem,
437 const char *message) 437 const char *message)
438{ 438{
439 struct task_struct *c; 439 struct task_struct *victim = p;
440 struct task_struct *child;
440 struct task_struct *t = p; 441 struct task_struct *t = p;
442 unsigned long victim_points = 0;
443 struct timespec uptime;
441 444
442 if (printk_ratelimit()) 445 if (printk_ratelimit())
443 dump_header(p, gfp_mask, order, mem); 446 dump_header(p, gfp_mask, order, mem);
@@ -451,22 +454,37 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
451 return 0; 454 return 0;
452 } 455 }
453 456
454 printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", 457 task_lock(p);
455 message, task_pid_nr(p), p->comm, points); 458 pr_err("%s: Kill process %d (%s) score %lu or sacrifice child\n",
459 message, task_pid_nr(p), p->comm, points);
460 task_unlock(p);
456 461
457 /* Try to kill a child first */ 462 /*
463 * If any of p's children has a different mm and is eligible for kill,
464 * the one with the highest badness() score is sacrificed for its
465 * parent. This attempts to lose the minimal amount of work done while
466 * still freeing memory.
467 */
468 do_posix_clock_monotonic_gettime(&uptime);
458 do { 469 do {
459 list_for_each_entry(c, &t->children, sibling) { 470 list_for_each_entry(child, &t->children, sibling) {
460 if (c->mm == p->mm) 471 unsigned long child_points;
472
473 if (child->mm == p->mm)
461 continue; 474 continue;
462 if (mem && !task_in_mem_cgroup(c, mem)) 475 if (mem && !task_in_mem_cgroup(child, mem))
463 continue; 476 continue;
464 if (!oom_kill_task(c)) 477
465 return 0; 478 /* badness() returns 0 if the thread is unkillable */
479 child_points = badness(child, uptime.tv_sec);
480 if (child_points > victim_points) {
481 victim = child;
482 victim_points = child_points;
483 }
466 } 484 }
467 } while_each_thread(p, t); 485 } while_each_thread(p, t);
468 486
469 return oom_kill_task(p); 487 return oom_kill_task(victim);
470} 488}
471 489
472#ifdef CONFIG_CGROUP_MEM_RES_CTLR 490#ifdef CONFIG_CGROUP_MEM_RES_CTLR