aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichal Hocko <mhocko@suse.com>2016-10-07 19:58:51 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-10-07 21:46:27 -0400
commit26db62f179d112d345031e14926a4cda9cd40d6e (patch)
tree12d1fca0155dd6f1b3e16f660a74a0b55432b204
parent8496afaba93ece80a83cbd096f0675a1020ddfc4 (diff)
oom: keep mm of the killed task available
oom_reap_task has to call exit_oom_victim in order to make sure that the oom victim will not block the oom killer forever. This is, however, opening new problems (e.g. oom_killer_disable exclusion - see commit 74070542099c ("oom, suspend: fix oom_reaper vs. oom_killer_disable race")). exit_oom_victim should be only called from the victim's context ideally. One way to achieve this would be to rely on per mm_struct flags. We already have MMF_OOM_REAPED to hide a task from the oom killer since "mm, oom: hide mm which is shared with kthread or global init". The problem is that the exit path: do_exit exit_mm tsk->mm = NULL; mmput __mmput exit_oom_victim doesn't guarantee that exit_oom_victim will get called in a bounded amount of time. At least exit_aio depends on IO which might get blocked due to lack of memory and who knows what else is lurking there. This patch takes a different approach. We remember tsk->mm into the signal_struct and bind it to the signal_struct lifetime for all oom victims. __oom_reap_task_mm as well as oom_scan_process_thread do not have to rely on find_lock_task_mm anymore and they will have a reliable reference to the mm struct. As a result all the oom specific communication inside the OOM killer can be done via tsk->signal->oom_mm. Increasing the signal_struct for something as unlikely as the oom killer is far from ideal but this approach will make the code much more reasonable and long term we even might want to move task->mm into the signal_struct anyway. In the next step we might want to make the oom killer exclusion and access to memory reserves completely independent which would be also nice.
Link: http://lkml.kernel.org/r/1472119394-11342-4-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Cc: Oleg Nesterov <oleg@redhat.com> Cc: David Rientjes <rientjes@google.com> Cc: Vladimir Davydov <vdavydov@parallels.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/sched.h2
-rw-r--r--kernel/fork.c2
-rw-r--r--mm/oom_kill.c51
3 files changed, 23 insertions, 32 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b48cd32be445..67ea79610e67 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -805,6 +805,8 @@ struct signal_struct {
805 short oom_score_adj; /* OOM kill score adjustment */ 805 short oom_score_adj; /* OOM kill score adjustment */
806 short oom_score_adj_min; /* OOM kill score adjustment min value. 806 short oom_score_adj_min; /* OOM kill score adjustment min value.
807 * Only settable by CAP_SYS_RESOURCE. */ 807 * Only settable by CAP_SYS_RESOURCE. */
808 struct mm_struct *oom_mm; /* recorded mm when the thread group got
809 * killed by the oom killer */
808 810
809 struct mutex cred_guard_mutex; /* guard against foreign influences on 811 struct mutex cred_guard_mutex; /* guard against foreign influences on
810 * credential calculations 812 * credential calculations
diff --git a/kernel/fork.c b/kernel/fork.c
index 9a05bd93f8e7..48cafe787b75 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -359,6 +359,8 @@ static inline void free_signal_struct(struct signal_struct *sig)
359{ 359{
360 taskstats_tgid_free(sig); 360 taskstats_tgid_free(sig);
361 sched_autogroup_exit(sig); 361 sched_autogroup_exit(sig);
362 if (sig->oom_mm)
363 mmdrop(sig->oom_mm);
362 kmem_cache_free(signal_cachep, sig); 364 kmem_cache_free(signal_cachep, sig);
363} 365}
364 366
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 45097f5a8f30..f16ec0840a0e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -300,14 +300,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
300 * any memory is quite low. 300 * any memory is quite low.
301 */ 301 */
302 if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { 302 if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) {
303 struct task_struct *p = find_lock_task_mm(task); 303 if (test_bit(MMF_OOM_REAPED, &task->signal->oom_mm->flags))
304 bool reaped = false;
305
306 if (p) {
307 reaped = test_bit(MMF_OOM_REAPED, &p->mm->flags);
308 task_unlock(p);
309 }
310 if (reaped)
311 goto next; 304 goto next;
312 goto abort; 305 goto abort;
313 } 306 }
@@ -537,11 +530,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
537 up_read(&mm->mmap_sem); 530 up_read(&mm->mmap_sem);
538 531
539 /* 532 /*
540 * This task can be safely ignored because we cannot do much more
541 * to release its memory.
542 */
543 set_bit(MMF_OOM_REAPED, &mm->flags);
544 /*
545 * Drop our reference but make sure the mmput slow path is called from a 533 * Drop our reference but make sure the mmput slow path is called from a
546 * different context because we shouldn't risk we get stuck there and 534 * different context because we shouldn't risk we get stuck there and
547 * put the oom_reaper out of the way. 535 * put the oom_reaper out of the way.
@@ -556,20 +544,7 @@ unlock_oom:
556static void oom_reap_task(struct task_struct *tsk) 544static void oom_reap_task(struct task_struct *tsk)
557{ 545{
558 int attempts = 0; 546 int attempts = 0;
559 struct mm_struct *mm = NULL; 547 struct mm_struct *mm = tsk->signal->oom_mm;
560 struct task_struct *p = find_lock_task_mm(tsk);
561
562 /*
563 * Make sure we find the associated mm_struct even when the particular
564 * thread has already terminated and cleared its mm.
565 * We might have race with exit path so consider our work done if there
566 * is no mm.
567 */
568 if (!p)
569 goto done;
570 mm = p->mm;
571 atomic_inc(&mm->mm_count);
572 task_unlock(p);
573 548
574 /* Retry the down_read_trylock(mmap_sem) a few times */ 549 /* Retry the down_read_trylock(mmap_sem) a few times */
575 while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm)) 550 while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
@@ -578,8 +553,6 @@ static void oom_reap_task(struct task_struct *tsk)
578 if (attempts <= MAX_OOM_REAP_RETRIES) 553 if (attempts <= MAX_OOM_REAP_RETRIES)
579 goto done; 554 goto done;
580 555
581 /* Ignore this mm because somebody can't call up_write(mmap_sem). */
582 set_bit(MMF_OOM_REAPED, &mm->flags);
583 556
584 pr_info("oom_reaper: unable to reap pid:%d (%s)\n", 557 pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
585 task_pid_nr(tsk), tsk->comm); 558 task_pid_nr(tsk), tsk->comm);
@@ -595,11 +568,14 @@ done:
595 tsk->oom_reaper_list = NULL; 568 tsk->oom_reaper_list = NULL;
596 exit_oom_victim(tsk); 569 exit_oom_victim(tsk);
597 570
571 /*
572 * Hide this mm from OOM killer because it has been either reaped or
573 * somebody can't call up_write(mmap_sem).
574 */
575 set_bit(MMF_OOM_REAPED, &mm->flags);
576
598 /* Drop a reference taken by wake_oom_reaper */ 577 /* Drop a reference taken by wake_oom_reaper */
599 put_task_struct(tsk); 578 put_task_struct(tsk);
600 /* Drop a reference taken above. */
601 if (mm)
602 mmdrop(mm);
603} 579}
604 580
605static int oom_reaper(void *unused) 581static int oom_reaper(void *unused)
@@ -665,14 +641,25 @@ static inline void wake_oom_reaper(struct task_struct *tsk)
665 * 641 *
666 * Has to be called with oom_lock held and never after 642 * Has to be called with oom_lock held and never after
667 * oom has been disabled already. 643 * oom has been disabled already.
644 *
645 * tsk->mm has to be non NULL and caller has to guarantee it is stable (either
646 * under task_lock or operate on the current).
668 */ 647 */
669static void mark_oom_victim(struct task_struct *tsk) 648static void mark_oom_victim(struct task_struct *tsk)
670{ 649{
650 struct mm_struct *mm = tsk->mm;
651
671 WARN_ON(oom_killer_disabled); 652 WARN_ON(oom_killer_disabled);
672 /* OOM killer might race with memcg OOM */ 653 /* OOM killer might race with memcg OOM */
673 if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) 654 if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
674 return; 655 return;
656
675 atomic_inc(&tsk->signal->oom_victims); 657 atomic_inc(&tsk->signal->oom_victims);
658
659 /* oom_mm is bound to the signal struct life time. */
660 if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
661 atomic_inc(&tsk->signal->oom_mm->mm_count);
662
676 /* 663 /*
677 * Make sure that the task is woken up from uninterruptible sleep 664 * Make sure that the task is woken up from uninterruptible sleep
678 * if it is frozen because OOM killer wouldn't be able to free 665 * if it is frozen because OOM killer wouldn't be able to free