diff options
author | Glenn Elliott <gelliott@cs.unc.edu> | 2012-03-04 19:47:13 -0500 |
---|---|---|
committer | Glenn Elliott <gelliott@cs.unc.edu> | 2012-03-04 19:47:13 -0500 |
commit | c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch) | |
tree | ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /mm/oom_kill.c | |
parent | ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff) | |
parent | 6a00f206debf8a5c8899055726ad127dbeeed098 (diff) |
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'mm/oom_kill.c')
-rw-r--r-- | mm/oom_kill.c | 167 |
1 files changed, 104 insertions, 63 deletions
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4029583a1024..e4b0991ca351 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -31,12 +31,40 @@ | |||
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/mempolicy.h> | 32 | #include <linux/mempolicy.h> |
33 | #include <linux/security.h> | 33 | #include <linux/security.h> |
34 | #include <linux/ptrace.h> | ||
34 | 35 | ||
35 | int sysctl_panic_on_oom; | 36 | int sysctl_panic_on_oom; |
36 | int sysctl_oom_kill_allocating_task; | 37 | int sysctl_oom_kill_allocating_task; |
37 | int sysctl_oom_dump_tasks = 1; | 38 | int sysctl_oom_dump_tasks = 1; |
38 | static DEFINE_SPINLOCK(zone_scan_lock); | 39 | static DEFINE_SPINLOCK(zone_scan_lock); |
39 | 40 | ||
41 | /** | ||
42 | * test_set_oom_score_adj() - set current's oom_score_adj and return old value | ||
43 | * @new_val: new oom_score_adj value | ||
44 | * | ||
45 | * Sets the oom_score_adj value for current to @new_val with proper | ||
46 | * synchronization and returns the old value. Usually used to temporarily | ||
47 | * set a value, save the old value in the caller, and then reinstate it later. | ||
48 | */ | ||
49 | int test_set_oom_score_adj(int new_val) | ||
50 | { | ||
51 | struct sighand_struct *sighand = current->sighand; | ||
52 | int old_val; | ||
53 | |||
54 | spin_lock_irq(&sighand->siglock); | ||
55 | old_val = current->signal->oom_score_adj; | ||
56 | if (new_val != old_val) { | ||
57 | if (new_val == OOM_SCORE_ADJ_MIN) | ||
58 | atomic_inc(&current->mm->oom_disable_count); | ||
59 | else if (old_val == OOM_SCORE_ADJ_MIN) | ||
60 | atomic_dec(&current->mm->oom_disable_count); | ||
61 | current->signal->oom_score_adj = new_val; | ||
62 | } | ||
63 | spin_unlock_irq(&sighand->siglock); | ||
64 | |||
65 | return old_val; | ||
66 | } | ||
67 | |||
40 | #ifdef CONFIG_NUMA | 68 | #ifdef CONFIG_NUMA |
41 | /** | 69 | /** |
42 | * has_intersects_mems_allowed() - check task eligiblity for kill | 70 | * has_intersects_mems_allowed() - check task eligiblity for kill |
@@ -83,24 +111,6 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, | |||
83 | #endif /* CONFIG_NUMA */ | 111 | #endif /* CONFIG_NUMA */ |
84 | 112 | ||
85 | /* | 113 | /* |
86 | * If this is a system OOM (not a memcg OOM) and the task selected to be | ||
87 | * killed is not already running at high (RT) priorities, speed up the | ||
88 | * recovery by boosting the dying task to the lowest FIFO priority. | ||
89 | * That helps with the recovery and avoids interfering with RT tasks. | ||
90 | */ | ||
91 | static void boost_dying_task_prio(struct task_struct *p, | ||
92 | struct mem_cgroup *mem) | ||
93 | { | ||
94 | struct sched_param param = { .sched_priority = 1 }; | ||
95 | |||
96 | if (mem) | ||
97 | return; | ||
98 | |||
99 | if (!rt_task(p)) | ||
100 | sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * The process p may have detached its own ->mm while exiting or through | 114 | * The process p may have detached its own ->mm while exiting or through |
105 | * use_mm(), but one or more of its subthreads may still have a valid | 115 | * use_mm(), but one or more of its subthreads may still have a valid |
106 | * pointer. Return p, or any of its subthreads with a valid ->mm, with | 116 | * pointer. Return p, or any of its subthreads with a valid ->mm, with |
@@ -162,24 +172,16 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, | |||
162 | return 0; | 172 | return 0; |
163 | 173 | ||
164 | /* | 174 | /* |
165 | * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't | 175 | * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN |
166 | * need to be executed for something that cannot be killed. | 176 | * so the entire heuristic doesn't need to be executed for something |
177 | * that cannot be killed. | ||
167 | */ | 178 | */ |
168 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { | 179 | if (atomic_read(&p->mm->oom_disable_count)) { |
169 | task_unlock(p); | 180 | task_unlock(p); |
170 | return 0; | 181 | return 0; |
171 | } | 182 | } |
172 | 183 | ||
173 | /* | 184 | /* |
174 | * When the PF_OOM_ORIGIN bit is set, it indicates the task should have | ||
175 | * priority for oom killing. | ||
176 | */ | ||
177 | if (p->flags & PF_OOM_ORIGIN) { | ||
178 | task_unlock(p); | ||
179 | return 1000; | ||
180 | } | ||
181 | |||
182 | /* | ||
183 | * The memory controller may have a limit of 0 bytes, so avoid a divide | 185 | * The memory controller may have a limit of 0 bytes, so avoid a divide |
184 | * by zero, if necessary. | 186 | * by zero, if necessary. |
185 | */ | 187 | */ |
@@ -188,10 +190,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, | |||
188 | 190 | ||
189 | /* | 191 | /* |
190 | * The baseline for the badness score is the proportion of RAM that each | 192 | * The baseline for the badness score is the proportion of RAM that each |
191 | * task's rss and swap space use. | 193 | * task's rss, pagetable and swap space use. |
192 | */ | 194 | */ |
193 | points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / | 195 | points = get_mm_rss(p->mm) + p->mm->nr_ptes; |
194 | totalpages; | 196 | points += get_mm_counter(p->mm, MM_SWAPENTS); |
197 | |||
198 | points *= 1000; | ||
199 | points /= totalpages; | ||
195 | task_unlock(p); | 200 | task_unlock(p); |
196 | 201 | ||
197 | /* | 202 | /* |
@@ -291,13 +296,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
291 | unsigned long totalpages, struct mem_cgroup *mem, | 296 | unsigned long totalpages, struct mem_cgroup *mem, |
292 | const nodemask_t *nodemask) | 297 | const nodemask_t *nodemask) |
293 | { | 298 | { |
294 | struct task_struct *p; | 299 | struct task_struct *g, *p; |
295 | struct task_struct *chosen = NULL; | 300 | struct task_struct *chosen = NULL; |
296 | *ppoints = 0; | 301 | *ppoints = 0; |
297 | 302 | ||
298 | for_each_process(p) { | 303 | do_each_thread(g, p) { |
299 | unsigned int points; | 304 | unsigned int points; |
300 | 305 | ||
306 | if (!p->mm) | ||
307 | continue; | ||
301 | if (oom_unkillable_task(p, mem, nodemask)) | 308 | if (oom_unkillable_task(p, mem, nodemask)) |
302 | continue; | 309 | continue; |
303 | 310 | ||
@@ -313,22 +320,29 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
313 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) | 320 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) |
314 | return ERR_PTR(-1UL); | 321 | return ERR_PTR(-1UL); |
315 | 322 | ||
316 | /* | 323 | if (p->flags & PF_EXITING) { |
317 | * This is in the process of releasing memory so wait for it | 324 | /* |
318 | * to finish before killing some other task by mistake. | 325 | * If p is the current task and is in the process of |
319 | * | 326 | * releasing memory, we allow the "kill" to set |
320 | * However, if p is the current task, we allow the 'kill' to | 327 | * TIF_MEMDIE, which will allow it to gain access to |
321 | * go ahead if it is exiting: this will simply set TIF_MEMDIE, | 328 | * memory reserves. Otherwise, it may stall forever. |
322 | * which will allow it to gain access to memory reserves in | 329 | * |
323 | * the process of exiting and releasing its resources. | 330 | * The loop isn't broken here, however, in case other |
324 | * Otherwise we could get an easy OOM deadlock. | 331 | * threads are found to have already been oom killed. |
325 | */ | 332 | */ |
326 | if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) { | 333 | if (p == current) { |
327 | if (p != current) | 334 | chosen = p; |
328 | return ERR_PTR(-1UL); | 335 | *ppoints = 1000; |
329 | 336 | } else { | |
330 | chosen = p; | 337 | /* |
331 | *ppoints = 1000; | 338 | * If this task is not being ptraced on exit, |
339 | * then wait for it to finish before killing | ||
340 | * some other task unnecessarily. | ||
341 | */ | ||
342 | if (!(task_ptrace(p->group_leader) & | ||
343 | PT_TRACE_EXIT)) | ||
344 | return ERR_PTR(-1UL); | ||
345 | } | ||
332 | } | 346 | } |
333 | 347 | ||
334 | points = oom_badness(p, mem, nodemask, totalpages); | 348 | points = oom_badness(p, mem, nodemask, totalpages); |
@@ -336,7 +350,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
336 | chosen = p; | 350 | chosen = p; |
337 | *ppoints = points; | 351 | *ppoints = points; |
338 | } | 352 | } |
339 | } | 353 | } while_each_thread(g, p); |
340 | 354 | ||
341 | return chosen; | 355 | return chosen; |
342 | } | 356 | } |
@@ -395,7 +409,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
395 | task_unlock(current); | 409 | task_unlock(current); |
396 | dump_stack(); | 410 | dump_stack(); |
397 | mem_cgroup_print_oom_info(mem, p); | 411 | mem_cgroup_print_oom_info(mem, p); |
398 | show_mem(); | 412 | show_mem(SHOW_MEM_FILTER_NODES); |
399 | if (sysctl_oom_dump_tasks) | 413 | if (sysctl_oom_dump_tasks) |
400 | dump_tasks(mem, nodemask); | 414 | dump_tasks(mem, nodemask); |
401 | } | 415 | } |
@@ -403,27 +417,44 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
403 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 417 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
404 | static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) | 418 | static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) |
405 | { | 419 | { |
420 | struct task_struct *q; | ||
421 | struct mm_struct *mm; | ||
422 | |||
406 | p = find_lock_task_mm(p); | 423 | p = find_lock_task_mm(p); |
407 | if (!p) | 424 | if (!p) |
408 | return 1; | 425 | return 1; |
409 | 426 | ||
427 | /* mm cannot be safely dereferenced after task_unlock(p) */ | ||
428 | mm = p->mm; | ||
429 | |||
410 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | 430 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", |
411 | task_pid_nr(p), p->comm, K(p->mm->total_vm), | 431 | task_pid_nr(p), p->comm, K(p->mm->total_vm), |
412 | K(get_mm_counter(p->mm, MM_ANONPAGES)), | 432 | K(get_mm_counter(p->mm, MM_ANONPAGES)), |
413 | K(get_mm_counter(p->mm, MM_FILEPAGES))); | 433 | K(get_mm_counter(p->mm, MM_FILEPAGES))); |
414 | task_unlock(p); | 434 | task_unlock(p); |
415 | 435 | ||
436 | /* | ||
437 | * Kill all processes sharing p->mm in other thread groups, if any. | ||
438 | * They don't get access to memory reserves or a higher scheduler | ||
439 | * priority, though, to avoid depletion of all memory or task | ||
440 | * starvation. This prevents mm->mmap_sem livelock when an oom killed | ||
441 | * task cannot exit because it requires the semaphore and its contended | ||
442 | * by another thread trying to allocate memory itself. That thread will | ||
443 | * now get access to memory reserves since it has a pending fatal | ||
444 | * signal. | ||
445 | */ | ||
446 | for_each_process(q) | ||
447 | if (q->mm == mm && !same_thread_group(q, p)) { | ||
448 | task_lock(q); /* Protect ->comm from prctl() */ | ||
449 | pr_err("Kill process %d (%s) sharing same memory\n", | ||
450 | task_pid_nr(q), q->comm); | ||
451 | task_unlock(q); | ||
452 | force_sig(SIGKILL, q); | ||
453 | } | ||
416 | 454 | ||
417 | set_tsk_thread_flag(p, TIF_MEMDIE); | 455 | set_tsk_thread_flag(p, TIF_MEMDIE); |
418 | force_sig(SIGKILL, p); | 456 | force_sig(SIGKILL, p); |
419 | 457 | ||
420 | /* | ||
421 | * We give our sacrificial lamb high priority and access to | ||
422 | * all the memory it needs. That way it should be able to | ||
423 | * exit() and clear out its resources quickly... | ||
424 | */ | ||
425 | boost_dying_task_prio(p, mem); | ||
426 | |||
427 | return 0; | 458 | return 0; |
428 | } | 459 | } |
429 | #undef K | 460 | #undef K |
@@ -447,7 +478,6 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
447 | */ | 478 | */ |
448 | if (p->flags & PF_EXITING) { | 479 | if (p->flags & PF_EXITING) { |
449 | set_tsk_thread_flag(p, TIF_MEMDIE); | 480 | set_tsk_thread_flag(p, TIF_MEMDIE); |
450 | boost_dying_task_prio(p, mem); | ||
451 | return 0; | 481 | return 0; |
452 | } | 482 | } |
453 | 483 | ||
@@ -466,6 +496,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
466 | list_for_each_entry(child, &t->children, sibling) { | 496 | list_for_each_entry(child, &t->children, sibling) { |
467 | unsigned int child_points; | 497 | unsigned int child_points; |
468 | 498 | ||
499 | if (child->mm == p->mm) | ||
500 | continue; | ||
469 | /* | 501 | /* |
470 | * oom_badness() returns 0 if the thread is unkillable | 502 | * oom_badness() returns 0 if the thread is unkillable |
471 | */ | 503 | */ |
@@ -512,6 +544,16 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) | |||
512 | unsigned int points = 0; | 544 | unsigned int points = 0; |
513 | struct task_struct *p; | 545 | struct task_struct *p; |
514 | 546 | ||
547 | /* | ||
548 | * If current has a pending SIGKILL, then automatically select it. The | ||
549 | * goal is to allow it to allocate so that it may quickly exit and free | ||
550 | * its memory. | ||
551 | */ | ||
552 | if (fatal_signal_pending(current)) { | ||
553 | set_thread_flag(TIF_MEMDIE); | ||
554 | return; | ||
555 | } | ||
556 | |||
515 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); | 557 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); |
516 | limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; | 558 | limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; |
517 | read_lock(&tasklist_lock); | 559 | read_lock(&tasklist_lock); |
@@ -664,7 +706,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
664 | */ | 706 | */ |
665 | if (fatal_signal_pending(current)) { | 707 | if (fatal_signal_pending(current)) { |
666 | set_thread_flag(TIF_MEMDIE); | 708 | set_thread_flag(TIF_MEMDIE); |
667 | boost_dying_task_prio(current, NULL); | ||
668 | return; | 709 | return; |
669 | } | 710 | } |
670 | 711 | ||
@@ -680,7 +721,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
680 | read_lock(&tasklist_lock); | 721 | read_lock(&tasklist_lock); |
681 | if (sysctl_oom_kill_allocating_task && | 722 | if (sysctl_oom_kill_allocating_task && |
682 | !oom_unkillable_task(current, NULL, nodemask) && | 723 | !oom_unkillable_task(current, NULL, nodemask) && |
683 | (current->signal->oom_adj != OOM_DISABLE)) { | 724 | current->mm && !atomic_read(&current->mm->oom_disable_count)) { |
684 | /* | 725 | /* |
685 | * oom_kill_process() needs tasklist_lock held. If it returns | 726 | * oom_kill_process() needs tasklist_lock held. If it returns |
686 | * non-zero, current could not be killed so we must fallback to | 727 | * non-zero, current could not be killed so we must fallback to |