author     David Rientjes <rientjes@google.com>    2018-05-11 19:02:04 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2018-05-11 20:28:45 -0400
commit     27ae357fa82be5ab73b2ef8d39dcb8ca2563483a
tree       78596e2b24c4923f8d17152ac23b15d2d7681c19
parent     013567be19761e2d14fc2a2676fe7686ac54c9ac
mm, oom: fix concurrent munlock and oom reaper unmap, v3
Since exit_mmap() is done without the protection of mm->mmap_sem, it is
possible for the oom reaper to concurrently operate on an mm until
MMF_OOM_SKIP is set.
This allows munlock_vma_pages_all() to concurrently run while the oom
reaper is operating on a vma. Since munlock_vma_pages_range() depends
on clearing VM_LOCKED from vm_flags before actually doing the munlock to
determine if any other vmas are locking the same memory, the check for
VM_LOCKED in the oom reaper is racy.
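To make the interleaving concrete, here is a schematic of the race (an
illustration for this changelog, not code from the patch; the reaper's vma
filter is can_madv_dontneed_vma(), which skips VM_LOCKED vmas):

        exit_mmap()                             oom reaper
        -----------                             ----------
        munlock_vma_pages_all()
          vma->vm_flags &= ~VM_LOCKED
                                                can_madv_dontneed_vma(vma)
                                                  /* VM_LOCKED already cleared,
                                                     vma looks safe to reap */
                                                unmap_page_range(...)
          munlock walks the same pages
          and page tables being zapped          /* concurrent zap vs. munlock */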
This is especially noticeable on architectures such as powerpc where
clearing a huge pmd requires serialize_against_pte_lookup(). If the pmd
is zapped by the oom reaper during follow_page_mask() after the check
for pmd_none() is bypassed, this can end up dereferencing a NULL ptl or
triggering a kernel oops.
Fix this by manually freeing all possible memory from the mm before
doing the munlock and then setting MMF_OOM_SKIP. The oom reaper cannot
run on the mm anymore, so the munlock is safe to do in exit_mmap(). It
also matches the logic that the oom reaper currently uses for
determining when to set MMF_OOM_SKIP itself, so there's no new risk of
excessive oom killing.
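The guarantee rests on the reaper's locking protocol: it only operates under
down_read_trylock(&mm->mmap_sem) and tests MMF_OOM_SKIP while holding the read
lock, so the empty write-lock critical section added to exit_mmap() (visible in
the diff below) flushes out any reaper that took the lock before the bit was
set:

        set_bit(MMF_OOM_SKIP, &mm->flags);
        down_write(&mm->mmap_sem);      /* waits out a reaper inside down_read() */
        up_write(&mm->mmap_sem);        /* later reapers see MMF_OOM_SKIP and bail */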
This patch fixes CVE-2018-1000200.
Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1804241526320.238665@chino.kir.corp.google.com
Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently")
Signed-off-by: David Rientjes <rientjes@google.com>
Suggested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org> [4.14+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  include/linux/oom.h |  2
-rw-r--r--  mm/mmap.c           | 44
-rw-r--r--  mm/oom_kill.c       | 81
3 files changed, 71 insertions(+), 56 deletions(-)
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 5bad038ac012..6adac113e96d 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -95,6 +95,8 @@ static inline int check_stable_address_space(struct mm_struct *mm)
 	return 0;
 }
 
+void __oom_reap_task_mm(struct mm_struct *mm);
+
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 		unsigned long totalpages);
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3024,6 +3024,32 @@ void exit_mmap(struct mm_struct *mm)
 	/* mm's last user has gone, and its about to be pulled down */
 	mmu_notifier_release(mm);
 
+	if (unlikely(mm_is_oom_victim(mm))) {
+		/*
+		 * Manually reap the mm to free as much memory as possible.
+		 * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
+		 * this mm from further consideration. Taking mm->mmap_sem for
+		 * write after setting MMF_OOM_SKIP will guarantee that the oom
+		 * reaper will not run on this mm again after mmap_sem is
+		 * dropped.
+		 *
+		 * Nothing can be holding mm->mmap_sem here and the above call
+		 * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
+		 * __oom_reap_task_mm() will not block.
+		 *
+		 * This needs to be done before calling munlock_vma_pages_all(),
+		 * which clears VM_LOCKED, otherwise the oom reaper cannot
+		 * reliably test it.
+		 */
+		mutex_lock(&oom_lock);
+		__oom_reap_task_mm(mm);
+		mutex_unlock(&oom_lock);
+
+		set_bit(MMF_OOM_SKIP, &mm->flags);
+		down_write(&mm->mmap_sem);
+		up_write(&mm->mmap_sem);
+	}
+
 	if (mm->locked_vm) {
 		vma = mm->mmap;
 		while (vma) {
@@ -3045,24 +3071,6 @@ void exit_mmap(struct mm_struct *mm)
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	unmap_vmas(&tlb, vma, 0, -1);
-
-	if (unlikely(mm_is_oom_victim(mm))) {
-		/*
-		 * Wait for oom_reap_task() to stop working on this
-		 * mm. Because MMF_OOM_SKIP is already set before
-		 * calling down_read(), oom_reap_task() will not run
-		 * on this "mm" post up_write().
-		 *
-		 * mm_is_oom_victim() cannot be set from under us
-		 * either because victim->mm is already set to NULL
-		 * under task_lock before calling mmput and oom_mm is
-		 * set not NULL by the OOM killer only if victim->mm
-		 * is found not NULL while holding the task_lock.
-		 */
-		set_bit(MMF_OOM_SKIP, &mm->flags);
-		down_write(&mm->mmap_sem);
-		up_write(&mm->mmap_sem);
-	}
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
 	tlb_finish_mmu(&tlb, 0, -1);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ff992fa8760a..8ba6cb88cf58 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -469,7 +469,6 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 	return false;
 }
 
-
 #ifdef CONFIG_MMU
 /*
  * OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -480,16 +479,54 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
-static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+void __oom_reap_task_mm(struct mm_struct *mm)
 {
-	struct mmu_gather tlb;
 	struct vm_area_struct *vma;
+
+	/*
+	 * Tell all users of get_user/copy_from_user etc... that the content
+	 * is no longer stable. No barriers really needed because unmapping
+	 * should imply barriers already and the reader would hit a page fault
+	 * if it stumbled over a reaped memory.
+	 */
+	set_bit(MMF_UNSTABLE, &mm->flags);
+
+	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+		if (!can_madv_dontneed_vma(vma))
+			continue;
+
+		/*
+		 * Only anonymous pages have a good chance to be dropped
+		 * without additional steps which we cannot afford as we
+		 * are OOM already.
+		 *
+		 * We do not even care about fs backed pages because all
+		 * which are reclaimable have already been reclaimed and
+		 * we do not want to block exit_mmap by keeping mm ref
+		 * count elevated without a good reason.
+		 */
+		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
+			const unsigned long start = vma->vm_start;
+			const unsigned long end = vma->vm_end;
+			struct mmu_gather tlb;
+
+			tlb_gather_mmu(&tlb, mm, start, end);
+			mmu_notifier_invalidate_range_start(mm, start, end);
+			unmap_page_range(&tlb, vma, start, end, NULL);
+			mmu_notifier_invalidate_range_end(mm, start, end);
+			tlb_finish_mmu(&tlb, start, end);
+		}
+	}
+}
+
+static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+{
 	bool ret = true;
 
 	/*
 	 * We have to make sure to not race with the victim exit path
 	 * and cause premature new oom victim selection:
-	 * __oom_reap_task_mm		exit_mm
+	 * oom_reap_task_mm		exit_mm
 	 *   mmget_not_zero
 	 *				  mmput
 	 *				    atomic_dec_and_test
@@ -534,39 +571,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 
 	trace_start_task_reaping(tsk->pid);
 
-	/*
-	 * Tell all users of get_user/copy_from_user etc... that the content
-	 * is no longer stable. No barriers really needed because unmapping
-	 * should imply barriers already and the reader would hit a page fault
-	 * if it stumbled over a reaped memory.
-	 */
-	set_bit(MMF_UNSTABLE, &mm->flags);
-
-	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
-		if (!can_madv_dontneed_vma(vma))
-			continue;
-
-		/*
-		 * Only anonymous pages have a good chance to be dropped
-		 * without additional steps which we cannot afford as we
-		 * are OOM already.
-		 *
-		 * We do not even care about fs backed pages because all
-		 * which are reclaimable have already been reclaimed and
-		 * we do not want to block exit_mmap by keeping mm ref
-		 * count elevated without a good reason.
-		 */
-		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
-			const unsigned long start = vma->vm_start;
-			const unsigned long end = vma->vm_end;
-
-			tlb_gather_mmu(&tlb, mm, start, end);
-			mmu_notifier_invalidate_range_start(mm, start, end);
-			unmap_page_range(&tlb, vma, start, end, NULL);
-			mmu_notifier_invalidate_range_end(mm, start, end);
-			tlb_finish_mmu(&tlb, start, end);
-		}
-	}
+	__oom_reap_task_mm(mm);
+
 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
 		task_pid_nr(tsk), tsk->comm,
 		K(get_mm_counter(mm, MM_ANONPAGES)),
@@ -587,14 +593,13 @@ static void oom_reap_task(struct task_struct *tsk)
 	struct mm_struct *mm = tsk->signal->oom_mm;
 
 	/* Retry the down_read_trylock(mmap_sem) a few times */
-	while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
+	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
 		schedule_timeout_idle(HZ/10);
 
 	if (attempts <= MAX_OOM_REAP_RETRIES ||
 	    test_bit(MMF_OOM_SKIP, &mm->flags))
 		goto done;
 
-
 	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
 		task_pid_nr(tsk), tsk->comm);
 	debug_show_all_locks();