author    Michal Hocko <mhocko@suse.com>  2017-08-18 18:16:15 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2017-08-18 18:32:01 -0400
commit    6b31d5955cb29a51c5baffee382f213d75e98fb8 (patch)
tree      86405062da720e3cc0f60c9c0e48e21e3ab0189c
parent    5b53a6ea886700a128b697a6fe8375340dea2c30 (diff)
mm, oom: fix potential data corruption when oom_reaper races with writer
Wenwei Tao has noticed that our current assumption that the oom victim is dying and never doing any visible changes after it dies, and so the oom_reaper can tear it down, is not entirely true.

__task_will_free_mem considers a task dying when SIGNAL_GROUP_EXIT is set, but do_group_exit sends SIGKILL to all threads _after_ the flag is set. So there is a race window during which some threads do not yet have fatal_signal_pending set while the oom_reaper could already start unmapping the address space. Moreover, some paths might not check for fatal signals before each PF/g-u-p/copy_from_user.

We already have protection for oom_reaper vs. PF races by checking MMF_UNSTABLE. This has, however, been checked only for kernel threads (use_mm users) which can outlive the oom victim. A simple fix would be to extend the current check in handle_mm_fault to all tasks, but that wouldn't be sufficient, because the current check assumes that a kernel thread will bail out after EFAULT from get_user*/copy_from_user and never re-read the same address, which would otherwise succeed because the PF path has already established the page tables. This seems to hold for the only existing use_mm user (the virtio driver), but it is fragile in general, and even more fragile for more complex paths such as generic_perform_write, which can re-read the same address multiple times (e.g. iov_iter_copy_from_user_atomic can fail and then iov_iter_fault_in_readable is called on the retry).

Therefore we have to implement the MMF_UNSTABLE protection in a robust way and never make potentially corrupted content visible. That requires hooking deeper into the PF path and checking the flag _every time_ before a pte for anonymous memory is established (that means all !VM_SHARED mappings).

The corruption can be triggered artificially (http://lkml.kernel.org/r/201708040646.v746kkhC024636@www262.sakura.ne.jp) but there doesn't seem to be any real-life bug report. The race window is presumably too tight to trigger most of the time.

Link: http://lkml.kernel.org/r/20170807113839.16695-3-mhocko@kernel.org
Fixes: aac453635549 ("mm, oom: introduce oom reaper")
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Wenwei Tao <wenwei.tww@alibaba-inc.com>
Tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andrea Argangeli <andrea@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
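Why a one-shot EFAULT check is fragile, as a simplified sketch of the retry pattern described above (loosely modeled on generic_perform_write(); variables and control flow are abridged for illustration, this is not the actual kernel code):

again:
	/* first attempt: may find the page unmapped after the reaper ran */
	copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
	if (unlikely(!copied)) {
		/*
		 * Fault the user page in and retry. If the oom_reaper
		 * already tore down the mapping, this refault instantiates
		 * a fresh zero page and the retry silently reads zeros
		 * instead of the original content.
		 */
		if (iov_iter_fault_in_readable(i, bytes))
			return -EFAULT;	/* genuine failure: bail out */
		goto again;
	}

With the fix, the refault itself fails with VM_FAULT_SIGBUS once MMF_UNSTABLE is set, so the retry can never observe zeroed content.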
-rw-r--r--  include/linux/oom.h  | 22
-rw-r--r--  mm/huge_memory.c     | 30
-rw-r--r--  mm/memory.c          | 46
3 files changed, 64 insertions(+), 34 deletions(-)
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 8a266e2be5a6..76aac4ce39bc 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -6,6 +6,8 @@
 #include <linux/types.h>
 #include <linux/nodemask.h>
 #include <uapi/linux/oom.h>
+#include <linux/sched/coredump.h> /* MMF_* */
+#include <linux/mm.h> /* VM_FAULT* */
 
 struct zonelist;
 struct notifier_block;
@@ -63,6 +65,26 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
 	return tsk->signal->oom_mm;
 }
 
+/*
+ * Checks whether a page fault on the given mm is still reliable.
+ * This is no longer true if the oom reaper started to reap the
+ * address space which is reflected by MMF_UNSTABLE flag set in
+ * the mm. At that moment any !shared mapping would lose the content
+ * and could cause a memory corruption (zero pages instead of the
+ * original content).
+ *
+ * User should call this before establishing a page table entry for
+ * a !shared mapping and under the proper page table lock.
+ *
+ * Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise.
+ */
+static inline int check_stable_address_space(struct mm_struct *mm)
+{
+	if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags)))
+		return VM_FAULT_SIGBUS;
+	return 0;
+}
+
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 		unsigned long totalpages);
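The intended call pattern for check_stable_address_space(), as a minimal sketch mirroring the do_anonymous_page() hunk further below (labels and surrounding code abridged):

	/*
	 * The check must run under the page table lock, after pte_none()
	 * has confirmed the slot is still empty and before the new pte
	 * becomes visible; only then can a concurrent oom_reaper never
	 * be missed.
	 */
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
			vmf->address, &vmf->ptl);
	if (!pte_none(*vmf->pte))
		goto unlock;
	ret = check_stable_address_space(vma->vm_mm);
	if (ret)
		goto unlock;		/* propagates VM_FAULT_SIGBUS */
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;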
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 216114f6ef0b..90731e3b7e58 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -32,6 +32,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/page_idle.h>
 #include <linux/shmem_fs.h>
+#include <linux/oom.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -550,6 +551,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 	struct mem_cgroup *memcg;
 	pgtable_t pgtable;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	int ret = 0;
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
@@ -561,9 +563,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 
 	pgtable = pte_alloc_one(vma->vm_mm, haddr);
 	if (unlikely(!pgtable)) {
-		mem_cgroup_cancel_charge(page, memcg, true);
-		put_page(page);
-		return VM_FAULT_OOM;
+		ret = VM_FAULT_OOM;
+		goto release;
 	}
 
 	clear_huge_page(page, haddr, HPAGE_PMD_NR);
@@ -576,13 +577,14 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 
 	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 	if (unlikely(!pmd_none(*vmf->pmd))) {
-		spin_unlock(vmf->ptl);
-		mem_cgroup_cancel_charge(page, memcg, true);
-		put_page(page);
-		pte_free(vma->vm_mm, pgtable);
+		goto unlock_release;
 	} else {
 		pmd_t entry;
 
+		ret = check_stable_address_space(vma->vm_mm);
+		if (ret)
+			goto unlock_release;
+
 		/* Deliver the page fault to userland */
 		if (userfaultfd_missing(vma)) {
 			int ret;
@@ -610,6 +612,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 	}
 
 	return 0;
+unlock_release:
+	spin_unlock(vmf->ptl);
+release:
+	if (pgtable)
+		pte_free(vma->vm_mm, pgtable);
+	mem_cgroup_cancel_charge(page, memcg, true);
+	put_page(page);
+	return ret;
+
 }
 
 /*
@@ -688,7 +699,10 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		ret = 0;
 		set = false;
 		if (pmd_none(*vmf->pmd)) {
-			if (userfaultfd_missing(vma)) {
+			ret = check_stable_address_space(vma->vm_mm);
+			if (ret) {
+				spin_unlock(vmf->ptl);
+			} else if (userfaultfd_missing(vma)) {
 				spin_unlock(vmf->ptl);
 				ret = handle_userfault(vmf, VM_UFFD_MISSING);
 				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
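A note on the restructuring in __do_huge_pmd_anonymous_page() above: instead of duplicating the cleanup at each failure site, all error paths now funnel through shared labels, so the new check_stable_address_space() failure cannot leak the memcg charge, the page reference, or the preallocated page table. Abridged shape, with explanatory comments added for this note (they are not in the kernel source):

unlock_release:
	spin_unlock(vmf->ptl);		/* reached only with the ptl held */
release:
	if (pgtable)			/* NULL when pte_alloc_one() failed */
		pte_free(vma->vm_mm, pgtable);
	mem_cgroup_cancel_charge(page, memcg, true);	/* undo the charge */
	put_page(page);			/* drop the huge page reference */
	return ret;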
diff --git a/mm/memory.c b/mm/memory.c
index c717b5bcc80e..fe2fba27ded2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -68,6 +68,7 @@
 #include <linux/debugfs.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/dax.h>
+#include <linux/oom.h>
 
 #include <asm/io.h>
 #include <asm/mmu_context.h>
@@ -2893,6 +2894,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	struct mem_cgroup *memcg;
 	struct page *page;
+	int ret = 0;
 	pte_t entry;
 
 	/* File mapping without ->vm_ops ? */
@@ -2925,6 +2927,9 @@ static int do_anonymous_page(struct vm_fault *vmf)
 			vmf->address, &vmf->ptl);
 	if (!pte_none(*vmf->pte))
 		goto unlock;
+	ret = check_stable_address_space(vma->vm_mm);
+	if (ret)
+		goto unlock;
 	/* Deliver the page fault to userland, check inside PT lock */
 	if (userfaultfd_missing(vma)) {
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2959,6 +2964,10 @@ static int do_anonymous_page(struct vm_fault *vmf)
 	if (!pte_none(*vmf->pte))
 		goto release;
 
+	ret = check_stable_address_space(vma->vm_mm);
+	if (ret)
+		goto release;
+
 	/* Deliver the page fault to userland, check inside PT lock */
 	if (userfaultfd_missing(vma)) {
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2978,7 +2987,7 @@ setpte:
 	update_mmu_cache(vma, vmf->address, vmf->pte);
 unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
-	return 0;
+	return ret;
 release:
 	mem_cgroup_cancel_charge(page, memcg, false);
 	put_page(page);
@@ -3252,7 +3261,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
 int finish_fault(struct vm_fault *vmf)
 {
 	struct page *page;
-	int ret;
+	int ret = 0;
 
 	/* Did we COW the page? */
 	if ((vmf->flags & FAULT_FLAG_WRITE) &&
@@ -3260,7 +3269,15 @@ int finish_fault(struct vm_fault *vmf)
 		page = vmf->cow_page;
 	else
 		page = vmf->page;
-	ret = alloc_set_pte(vmf, vmf->memcg, page);
+
+	/*
+	 * check even for read faults because we might have lost our CoWed
+	 * page
+	 */
+	if (!(vmf->vma->vm_flags & VM_SHARED))
+		ret = check_stable_address_space(vmf->vma->vm_mm);
+	if (!ret)
+		ret = alloc_set_pte(vmf, vmf->memcg, page);
 	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 	return ret;
@@ -3900,29 +3917,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 		mem_cgroup_oom_synchronize(false);
 	}
 
-	/*
-	 * This mm has been already reaped by the oom reaper and so the
-	 * refault cannot be trusted in general. Anonymous refaults would
-	 * lose data and give a zero page instead e.g. This is especially
-	 * problem for use_mm() because regular tasks will just die and
-	 * the corrupted data will not be visible anywhere while kthread
-	 * will outlive the oom victim and potentially propagate the date
-	 * further.
-	 */
-	if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
-				&& test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) {
-
-		/*
-		 * We are going to enforce SIGBUS but the PF path might have
-		 * dropped the mmap_sem already so take it again so that
-		 * we do not break expectations of all arch specific PF paths
-		 * and g-u-p
-		 */
-		if (ret & VM_FAULT_RETRY)
-			down_read(&vma->vm_mm->mmap_sem);
-		ret = VM_FAULT_SIGBUS;
-	}
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(handle_mm_fault);
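Taken together, the enforcement point moves from a single after-the-fact test in handle_mm_fault() (kernel threads only) to a check made under the page table lock every time an anonymous pte is about to be published. A self-contained user-space analogy of that discipline, for illustration only (pthreads stand in for the page table lock and the unstable flag for MMF_UNSTABLE; none of this is kernel code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool unstable;		/* stands in for MMF_UNSTABLE */
static int visible_value;	/* stands in for a published pte */

/* stands in for the oom_reaper marking the address space unstable */
static void *reaper(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	unstable = true;
	pthread_mutex_unlock(&lock);
	return NULL;
}

/*
 * Stands in for the fault path: the flag is re-checked under the same
 * lock that publishes the value, so a racing reaper is never missed.
 */
static int publish(int value)
{
	int ret = 0;

	pthread_mutex_lock(&lock);
	if (unstable)
		ret = -1;	/* stands in for VM_FAULT_SIGBUS */
	else
		visible_value = value;
	pthread_mutex_unlock(&lock);
	return ret;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, reaper, NULL);
	pthread_join(t, NULL);
	if (publish(42))
		puts("publish refused: state unstable, no stale data exposed");
	return 0;
}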