path: root/mm/oom_kill.c
author     Glenn Elliott <gelliott@cs.unc.edu>   2012-03-04 19:47:13 -0500
committer  Glenn Elliott <gelliott@cs.unc.edu>   2012-03-04 19:47:13 -0500
commit     c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree       ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /mm/oom_kill.c
parent     ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent     6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
        litmus/sched_cedf.c
Diffstat (limited to 'mm/oom_kill.c')
-rw-r--r--   mm/oom_kill.c   167
1 file changed, 104 insertions(+), 63 deletions(-)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4029583a1024..e4b0991ca351 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -31,12 +31,40 @@
 #include <linux/memcontrol.h>
 #include <linux/mempolicy.h>
 #include <linux/security.h>
+#include <linux/ptrace.h>
 
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
 static DEFINE_SPINLOCK(zone_scan_lock);
 
+/**
+ * test_set_oom_score_adj() - set current's oom_score_adj and return old value
+ * @new_val: new oom_score_adj value
+ *
+ * Sets the oom_score_adj value for current to @new_val with proper
+ * synchronization and returns the old value. Usually used to temporarily
+ * set a value, save the old value in the caller, and then reinstate it later.
+ */
+int test_set_oom_score_adj(int new_val)
+{
+        struct sighand_struct *sighand = current->sighand;
+        int old_val;
+
+        spin_lock_irq(&sighand->siglock);
+        old_val = current->signal->oom_score_adj;
+        if (new_val != old_val) {
+                if (new_val == OOM_SCORE_ADJ_MIN)
+                        atomic_inc(&current->mm->oom_disable_count);
+                else if (old_val == OOM_SCORE_ADJ_MIN)
+                        atomic_dec(&current->mm->oom_disable_count);
+                current->signal->oom_score_adj = new_val;
+        }
+        spin_unlock_irq(&sighand->siglock);
+
+        return old_val;
+}
+
 #ifdef CONFIG_NUMA
 /**
  * has_intersects_mems_allowed() - check task eligiblity for kill
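
The kernel-doc above spells out the intended usage: temporarily set a value, keep the old one, and reinstate it later. A minimal sketch of that save/restore pattern from a hypothetical in-kernel caller follows; the function name and the use of OOM_SCORE_ADJ_MAX are illustrative assumptions, only test_set_oom_score_adj() itself comes from this patch, and its declaration is assumed to be available via <linux/oom.h>.

#include <linux/oom.h>

/* Hypothetical caller: temporarily make current the preferred OOM victim. */
static void run_allocation_heavy_work(void)
{
        int old_score = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);

        /* ... allocation-heavy work that might trigger the OOM killer ... */

        /* Reinstate the saved value, as the kernel-doc suggests. */
        test_set_oom_score_adj(old_score);
}
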
@@ -83,24 +111,6 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
 #endif /* CONFIG_NUMA */
 
 /*
- * If this is a system OOM (not a memcg OOM) and the task selected to be
- * killed is not already running at high (RT) priorities, speed up the
- * recovery by boosting the dying task to the lowest FIFO priority.
- * That helps with the recovery and avoids interfering with RT tasks.
- */
-static void boost_dying_task_prio(struct task_struct *p,
-                                        struct mem_cgroup *mem)
-{
-        struct sched_param param = { .sched_priority = 1 };
-
-        if (mem)
-                return;
-
-        if (!rt_task(p))
-                sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
-}
-
-/*
  * The process p may have detached its own ->mm while exiting or through
  * use_mm(), but one or more of its subthreads may still have a valid
  * pointer. Return p, or any of its subthreads with a valid ->mm, with
@@ -162,24 +172,16 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
                 return 0;
 
         /*
-         * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
-         * need to be executed for something that cannot be killed.
+         * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
+         * so the entire heuristic doesn't need to be executed for something
+         * that cannot be killed.
          */
-        if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+        if (atomic_read(&p->mm->oom_disable_count)) {
                 task_unlock(p);
                 return 0;
         }
 
         /*
-         * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
-         * priority for oom killing.
-         */
-        if (p->flags & PF_OOM_ORIGIN) {
-                task_unlock(p);
-                return 1000;
-        }
-
-        /*
          * The memory controller may have a limit of 0 bytes, so avoid a divide
          * by zero, if necessary.
          */
@@ -188,10 +190,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
 
         /*
          * The baseline for the badness score is the proportion of RAM that each
-         * task's rss and swap space use.
+         * task's rss, pagetable and swap space use.
          */
-        points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
-                        totalpages;
+        points = get_mm_rss(p->mm) + p->mm->nr_ptes;
+        points += get_mm_counter(p->mm, MM_SWAPENTS);
+
+        points *= 1000;
+        points /= totalpages;
         task_unlock(p);
 
         /*
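
With this hunk the badness baseline becomes (rss + page-table pages + swap entries) * 1000 / totalpages, i.e. the task's memory footprint expressed in thousandths of the allowed memory. A standalone userspace sketch of that arithmetic with made-up sample values (all numbers here are hypothetical; the real code reads the counters under task_lock()):

#include <stdio.h>

int main(void)
{
        unsigned long rss = 120000;          /* hypothetical resident pages   */
        unsigned long nr_ptes = 300;         /* hypothetical page-table pages */
        unsigned long swapents = 20000;      /* hypothetical swap entries     */
        unsigned long totalpages = 2097152;  /* e.g. 8 GB of 4 KB pages       */

        unsigned long points = rss + nr_ptes + swapents;
        points *= 1000;
        points /= totalpages;

        /* Prints 66: the task uses roughly 6.7% of its allowed memory. */
        printf("badness baseline: %lu/1000\n", points);
        return 0;
}
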
@@ -291,13 +296,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
                 unsigned long totalpages, struct mem_cgroup *mem,
                 const nodemask_t *nodemask)
 {
-        struct task_struct *p;
+        struct task_struct *g, *p;
         struct task_struct *chosen = NULL;
         *ppoints = 0;
 
-        for_each_process(p) {
+        do_each_thread(g, p) {
                 unsigned int points;
 
+                if (!p->mm)
+                        continue;
                 if (oom_unkillable_task(p, mem, nodemask))
                         continue;
 
@@ -313,22 +320,29 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
                 if (test_tsk_thread_flag(p, TIF_MEMDIE))
                         return ERR_PTR(-1UL);
 
-                /*
-                 * This is in the process of releasing memory so wait for it
-                 * to finish before killing some other task by mistake.
-                 *
-                 * However, if p is the current task, we allow the 'kill' to
-                 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
-                 * which will allow it to gain access to memory reserves in
-                 * the process of exiting and releasing its resources.
-                 * Otherwise we could get an easy OOM deadlock.
-                 */
-                if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) {
-                        if (p != current)
-                                return ERR_PTR(-1UL);
-
-                        chosen = p;
-                        *ppoints = 1000;
+                if (p->flags & PF_EXITING) {
+                        /*
+                         * If p is the current task and is in the process of
+                         * releasing memory, we allow the "kill" to set
+                         * TIF_MEMDIE, which will allow it to gain access to
+                         * memory reserves. Otherwise, it may stall forever.
+                         *
+                         * The loop isn't broken here, however, in case other
+                         * threads are found to have already been oom killed.
+                         */
+                        if (p == current) {
+                                chosen = p;
+                                *ppoints = 1000;
+                        } else {
+                                /*
+                                 * If this task is not being ptraced on exit,
+                                 * then wait for it to finish before killing
+                                 * some other task unnecessarily.
+                                 */
+                                if (!(task_ptrace(p->group_leader) &
+                                                        PT_TRACE_EXIT))
+                                        return ERR_PTR(-1UL);
+                        }
                 }
 
                 points = oom_badness(p, mem, nodemask, totalpages);
@@ -336,7 +350,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
                         chosen = p;
                         *ppoints = points;
                 }
-        }
+        } while_each_thread(g, p);
 
         return chosen;
 }
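
The new PF_EXITING handling above folds three outcomes into one nested block, which reads more easily when condensed. The helper below is a paraphrase for readability only; classify_exiting_task() and the enum are hypothetical and do not exist in the patch, while task_ptrace() and PT_TRACE_EXIT are the same symbols used in the hunk.

/* Hypothetical paraphrase of the PF_EXITING branch in select_bad_process(). */
enum exiting_action { PICK_CURRENT, ABORT_SELECTION, KEEP_SCANNING };

static enum exiting_action classify_exiting_task(struct task_struct *p)
{
        if (p == current)
                return PICK_CURRENT;        /* chosen = p, *ppoints = 1000  */
        if (!(task_ptrace(p->group_leader) & PT_TRACE_EXIT))
                return ABORT_SELECTION;     /* ERR_PTR(-1UL): let it exit   */
        return KEEP_SCANNING;               /* traced on exit: score it     */
}
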
@@ -395,7 +409,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
         task_unlock(current);
         dump_stack();
         mem_cgroup_print_oom_info(mem, p);
-        show_mem();
+        show_mem(SHOW_MEM_FILTER_NODES);
         if (sysctl_oom_dump_tasks)
                 dump_tasks(mem, nodemask);
 }
@@ -403,27 +417,44 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 #define K(x) ((x) << (PAGE_SHIFT-10))
 static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
 {
+        struct task_struct *q;
+        struct mm_struct *mm;
+
         p = find_lock_task_mm(p);
         if (!p)
                 return 1;
 
+        /* mm cannot be safely dereferenced after task_unlock(p) */
+        mm = p->mm;
+
         pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
                 task_pid_nr(p), p->comm, K(p->mm->total_vm),
                 K(get_mm_counter(p->mm, MM_ANONPAGES)),
                 K(get_mm_counter(p->mm, MM_FILEPAGES)));
         task_unlock(p);
 
+        /*
+         * Kill all processes sharing p->mm in other thread groups, if any.
+         * They don't get access to memory reserves or a higher scheduler
+         * priority, though, to avoid depletion of all memory or task
+         * starvation. This prevents mm->mmap_sem livelock when an oom killed
+         * task cannot exit because it requires the semaphore and its contended
+         * by another thread trying to allocate memory itself. That thread will
+         * now get access to memory reserves since it has a pending fatal
+         * signal.
+         */
+        for_each_process(q)
+                if (q->mm == mm && !same_thread_group(q, p)) {
+                        task_lock(q);   /* Protect ->comm from prctl() */
+                        pr_err("Kill process %d (%s) sharing same memory\n",
+                                task_pid_nr(q), q->comm);
+                        task_unlock(q);
+                        force_sig(SIGKILL, q);
+                }
 
         set_tsk_thread_flag(p, TIF_MEMDIE);
         force_sig(SIGKILL, p);
 
-        /*
-         * We give our sacrificial lamb high priority and access to
-         * all the memory it needs. That way it should be able to
-         * exit() and clear out its resources quickly...
-         */
-        boost_dying_task_prio(p, mem);
-
         return 0;
 }
 #undef K
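
The new for_each_process() loop targets processes that share p->mm but live in a different thread group. Such sharers exist, for example, when a process is created with clone(CLONE_VM) but without CLONE_THREAD. A small userspace sketch of how that situation arises (illustrative only; it does not interact with the OOM killer):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static int child_fn(void *arg)
{
        /* A separate process (its own thread group) sharing the parent's mm. */
        printf("child pid %d shares its parent's address space\n", getpid());
        return 0;
}

int main(void)
{
        char *stack = malloc(64 * 1024);
        if (!stack)
                return 1;

        /* CLONE_VM without CLONE_THREAD: same mm_struct, different thread group. */
        pid_t pid = clone(child_fn, stack + 64 * 1024, CLONE_VM | SIGCHLD, NULL);
        if (pid < 0)
                return 1;

        waitpid(pid, NULL, 0);
        free(stack);
        return 0;
}
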
@@ -447,7 +478,6 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
          */
         if (p->flags & PF_EXITING) {
                 set_tsk_thread_flag(p, TIF_MEMDIE);
-                boost_dying_task_prio(p, mem);
                 return 0;
         }
 
@@ -466,6 +496,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
                 list_for_each_entry(child, &t->children, sibling) {
                         unsigned int child_points;
 
+                        if (child->mm == p->mm)
+                                continue;
                         /*
                          * oom_badness() returns 0 if the thread is unkillable
                          */
@@ -512,6 +544,16 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
         unsigned int points = 0;
         struct task_struct *p;
 
+        /*
+         * If current has a pending SIGKILL, then automatically select it. The
+         * goal is to allow it to allocate so that it may quickly exit and free
+         * its memory.
+         */
+        if (fatal_signal_pending(current)) {
+                set_thread_flag(TIF_MEMDIE);
+                return;
+        }
+
         check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
         limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
         read_lock(&tasklist_lock);
@@ -664,7 +706,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
          */
         if (fatal_signal_pending(current)) {
                 set_thread_flag(TIF_MEMDIE);
-                boost_dying_task_prio(current, NULL);
                 return;
         }
 
@@ -680,7 +721,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
         read_lock(&tasklist_lock);
         if (sysctl_oom_kill_allocating_task &&
             !oom_unkillable_task(current, NULL, nodemask) &&
-            (current->signal->oom_adj != OOM_DISABLE)) {
+            current->mm && !atomic_read(&current->mm->oom_disable_count)) {
                 /*
                  * oom_kill_process() needs tasklist_lock held. If it returns
                  * non-zero, current could not be killed so we must fallback to