author		David Rientjes <rientjes@google.com>	2018-05-11 19:02:04 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-05-11 20:28:45 -0400
commit		27ae357fa82be5ab73b2ef8d39dcb8ca2563483a (patch)
tree		78596e2b24c4923f8d17152ac23b15d2d7681c19
parent		013567be19761e2d14fc2a2676fe7686ac54c9ac (diff)
mm, oom: fix concurrent munlock and oom reaper unmap, v3
Since exit_mmap() is done without the protection of mm->mmap_sem, it is
possible for the oom reaper to concurrently operate on an mm until
MMF_OOM_SKIP is set.

This allows munlock_vma_pages_all() to concurrently run while the oom
reaper is operating on a vma.  Since munlock_vma_pages_range() depends on
clearing VM_LOCKED from vm_flags before actually doing the munlock to
determine if any other vmas are locking the same memory, the check for
VM_LOCKED in the oom reaper is racy.

This is especially noticeable on architectures such as powerpc where
clearing a huge pmd requires serialize_against_pte_lookup().  If the pmd
is zapped by the oom reaper during follow_page_mask() after the check for
pmd_none() is bypassed, this ends up dereferencing a NULL ptl or causing
a kernel oops.

Fix this by manually freeing all possible memory from the mm before doing
the munlock and then setting MMF_OOM_SKIP.  The oom reaper cannot run on
the mm anymore, so the munlock is safe to do in exit_mmap().  It also
matches the logic that the oom reaper currently uses for determining when
to set MMF_OOM_SKIP itself, so there's no new risk of excessive oom
killing.

This issue fixes CVE-2018-1000200.

Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1804241526320.238665@chino.kir.corp.google.com
Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently")
Signed-off-by: David Rientjes <rientjes@google.com>
Suggested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org>	[4.14+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
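[Editor's note: the fix hinges on an ordering guarantee rather than on
holding a lock across the reap: exit_mmap() publishes MMF_OOM_SKIP first,
then takes and immediately releases mmap_sem for write, so any reaper that
raced in under the read lock has drained before teardown proceeds, and any
later reaper observes the flag and backs off.  Below is a minimal userspace
sketch of that guarantee, not part of the patch: pthread_rwlock_t stands in
for mm->mmap_sem, an atomic flag for MMF_OOM_SKIP, and all names are
illustrative rather than kernel API.

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
	static atomic_bool oom_skip;	/* stands in for MMF_OOM_SKIP */

	static void *reaper(void *arg)
	{
		/* Like the oom reaper: only work under the read lock. */
		if (pthread_rwlock_tryrdlock(&mmap_sem) != 0)
			return NULL;		/* exit path holds it for write */
		if (atomic_load(&oom_skip)) {	/* exit path already reaped */
			pthread_rwlock_unlock(&mmap_sem);
			return NULL;
		}
		printf("reaper: safe to operate on mm\n");
		pthread_rwlock_unlock(&mmap_sem);
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		/* exit_mmap() side: reap, then publish the skip flag ... */
		atomic_store(&oom_skip, true);
		/* ... then cycle the lock for write to flush out any reader. */
		pthread_rwlock_wrlock(&mmap_sem);
		pthread_rwlock_unlock(&mmap_sem);

		/* Any reaper running from here on sees the flag and backs off. */
		pthread_create(&t, NULL, reaper, NULL);
		pthread_join(t, NULL);
		return 0;
	}

Built with cc -pthread, the reaper thread either fails the trylock or sees
the flag; it can never touch the mm after the write-lock cycle completes.]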
-rw-r--r--	include/linux/oom.h	 2
-rw-r--r--	mm/mmap.c		44
-rw-r--r--	mm/oom_kill.c		81
3 files changed, 71 insertions, 56 deletions
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 5bad038ac012..6adac113e96d 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -95,6 +95,8 @@ static inline int check_stable_address_space(struct mm_struct *mm)
 	return 0;
 }
 
+void __oom_reap_task_mm(struct mm_struct *mm);
+
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 		unsigned long totalpages);
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d5968d1e8e3..d6836566e4e5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3024,6 +3024,32 @@ void exit_mmap(struct mm_struct *mm)
 	/* mm's last user has gone, and its about to be pulled down */
 	mmu_notifier_release(mm);
 
+	if (unlikely(mm_is_oom_victim(mm))) {
+		/*
+		 * Manually reap the mm to free as much memory as possible.
+		 * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
+		 * this mm from further consideration.  Taking mm->mmap_sem for
+		 * write after setting MMF_OOM_SKIP will guarantee that the oom
+		 * reaper will not run on this mm again after mmap_sem is
+		 * dropped.
+		 *
+		 * Nothing can be holding mm->mmap_sem here and the above call
+		 * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
+		 * __oom_reap_task_mm() will not block.
+		 *
+		 * This needs to be done before calling munlock_vma_pages_all(),
+		 * which clears VM_LOCKED, otherwise the oom reaper cannot
+		 * reliably test it.
+		 */
+		mutex_lock(&oom_lock);
+		__oom_reap_task_mm(mm);
+		mutex_unlock(&oom_lock);
+
+		set_bit(MMF_OOM_SKIP, &mm->flags);
+		down_write(&mm->mmap_sem);
+		up_write(&mm->mmap_sem);
+	}
+
 	if (mm->locked_vm) {
 		vma = mm->mmap;
 		while (vma) {
@@ -3045,24 +3071,6 @@ void exit_mmap(struct mm_struct *mm)
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	unmap_vmas(&tlb, vma, 0, -1);
-
-	if (unlikely(mm_is_oom_victim(mm))) {
-		/*
-		 * Wait for oom_reap_task() to stop working on this
-		 * mm. Because MMF_OOM_SKIP is already set before
-		 * calling down_read(), oom_reap_task() will not run
-		 * on this "mm" post up_write().
-		 *
-		 * mm_is_oom_victim() cannot be set from under us
-		 * either because victim->mm is already set to NULL
-		 * under task_lock before calling mmput and oom_mm is
-		 * set not NULL by the OOM killer only if victim->mm
-		 * is found not NULL while holding the task_lock.
-		 */
-		set_bit(MMF_OOM_SKIP, &mm->flags);
-		down_write(&mm->mmap_sem);
-		up_write(&mm->mmap_sem);
-	}
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
 	tlb_finish_mmu(&tlb, 0, -1);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ff992fa8760a..8ba6cb88cf58 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -469,7 +469,6 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 	return false;
 }
 
-
 #ifdef CONFIG_MMU
 /*
  * OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -480,16 +479,54 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
-static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+void __oom_reap_task_mm(struct mm_struct *mm)
 {
-	struct mmu_gather tlb;
 	struct vm_area_struct *vma;
+
+	/*
+	 * Tell all users of get_user/copy_from_user etc... that the content
+	 * is no longer stable. No barriers really needed because unmapping
+	 * should imply barriers already and the reader would hit a page fault
+	 * if it stumbled over a reaped memory.
+	 */
+	set_bit(MMF_UNSTABLE, &mm->flags);
+
+	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+		if (!can_madv_dontneed_vma(vma))
+			continue;
+
+		/*
+		 * Only anonymous pages have a good chance to be dropped
+		 * without additional steps which we cannot afford as we
+		 * are OOM already.
+		 *
+		 * We do not even care about fs backed pages because all
+		 * which are reclaimable have already been reclaimed and
+		 * we do not want to block exit_mmap by keeping mm ref
+		 * count elevated without a good reason.
+		 */
+		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
+			const unsigned long start = vma->vm_start;
+			const unsigned long end = vma->vm_end;
+			struct mmu_gather tlb;
+
+			tlb_gather_mmu(&tlb, mm, start, end);
+			mmu_notifier_invalidate_range_start(mm, start, end);
+			unmap_page_range(&tlb, vma, start, end, NULL);
+			mmu_notifier_invalidate_range_end(mm, start, end);
+			tlb_finish_mmu(&tlb, start, end);
+		}
+	}
+}
+
+static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+{
 	bool ret = true;
 
 	/*
 	 * We have to make sure to not race with the victim exit path
 	 * and cause premature new oom victim selection:
-	 * __oom_reap_task_mm		exit_mm
+	 * oom_reap_task_mm		exit_mm
 	 *   mmget_not_zero
 	 *				  mmput
 	 *				    atomic_dec_and_test
@@ -534,39 +571,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 
 	trace_start_task_reaping(tsk->pid);
 
-	/*
-	 * Tell all users of get_user/copy_from_user etc... that the content
-	 * is no longer stable. No barriers really needed because unmapping
-	 * should imply barriers already and the reader would hit a page fault
-	 * if it stumbled over a reaped memory.
-	 */
-	set_bit(MMF_UNSTABLE, &mm->flags);
-
-	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
-		if (!can_madv_dontneed_vma(vma))
-			continue;
-
-		/*
-		 * Only anonymous pages have a good chance to be dropped
-		 * without additional steps which we cannot afford as we
-		 * are OOM already.
-		 *
-		 * We do not even care about fs backed pages because all
-		 * which are reclaimable have already been reclaimed and
-		 * we do not want to block exit_mmap by keeping mm ref
-		 * count elevated without a good reason.
-		 */
-		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
-			const unsigned long start = vma->vm_start;
-			const unsigned long end = vma->vm_end;
-
-			tlb_gather_mmu(&tlb, mm, start, end);
-			mmu_notifier_invalidate_range_start(mm, start, end);
-			unmap_page_range(&tlb, vma, start, end, NULL);
-			mmu_notifier_invalidate_range_end(mm, start, end);
-			tlb_finish_mmu(&tlb, start, end);
-		}
-	}
+	__oom_reap_task_mm(mm);
+
 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
 		task_pid_nr(tsk), tsk->comm,
 		K(get_mm_counter(mm, MM_ANONPAGES)),
@@ -587,14 +593,13 @@ static void oom_reap_task(struct task_struct *tsk)
 	struct mm_struct *mm = tsk->signal->oom_mm;
 
 	/* Retry the down_read_trylock(mmap_sem) a few times */
-	while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
+	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
 		schedule_timeout_idle(HZ/10);
 
 	if (attempts <= MAX_OOM_REAP_RETRIES ||
 	    test_bit(MMF_OOM_SKIP, &mm->flags))
 		goto done;
 
-
 	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
 		task_pid_nr(tsk), tsk->comm);
 	debug_show_all_locks();
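[Editor's note: for readers unfamiliar with reaping, __oom_reap_task_mm() is
essentially an in-kernel MADV_DONTNEED applied to the victim's eligible
anonymous VMAs (hence the can_madv_dontneed_vma() filter above).  A minimal
userspace sketch of the same effect, using plain madvise(2) and not taken
from the patch:

	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 4096 * 16;
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		memset(p, 0xaa, len);		/* populate anonymous pages */

		/* Drop the pages, as the oom reaper does for eligible VMAs. */
		if (madvise(p, len, MADV_DONTNEED))
			return 1;

		/* The mapping survives, but the memory is gone: reads fault
		 * in fresh zero pages, so this prints 0, not 0xaa. */
		printf("first byte after MADV_DONTNEED: %#x\n",
		       (unsigned char)p[0]);
		munmap(p, len);
		return 0;
	}

This is why reaping is safe without unmapping the VMAs themselves: the
victim's address space stays intact while its anonymous memory is freed,
and MMF_UNSTABLE warns the rest of the kernel that the contents are gone.]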