author     Mike Rapoport <rppt@linux.vnet.ibm.com>           2018-06-07 20:09:25 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2018-06-07 20:34:38 -0400
commit     df2cc96e77011cf7989208b206da9817e0321028 (patch)
tree       5831fc1b33713f8073c5661a36890f7822a4f484
parent     be09102b4190561b67e3809b07a7fd29c9774152 (diff)
userfaultfd: prevent non-cooperative events vs mcopy_atomic races
If a process monitored with userfaultfd changes its memory mappings or
forks() at the same time as the uffd monitor fills the process memory
with UFFDIO_COPY, the actual creation of page table entries and copying
of the data in mcopy_atomic may happen either before or after the
memory mapping modifications and there is no way for the uffd monitor
to maintain a consistent view of the process memory layout.

For instance, let's consider fork() running in parallel with
userfaultfd_copy():

process                          |  uffd monitor
---------------------------------+------------------------------
fork()                           |  userfaultfd_copy()
...                              |  ...
    dup_mmap()                   |      down_read(mmap_sem)
    down_write(mmap_sem)         |      /* create PTEs, copy data */
        dup_uffd()               |      up_read(mmap_sem)
        copy_page_range()        |
        up_write(mmap_sem)       |
        dup_uffd_complete()      |
            /* notify monitor */ |

If userfaultfd_copy() takes the mmap_sem first, the new page(s) will be
present by the time copy_page_range() is called and they will appear in
the child's memory mappings.  However, if fork() is the first to take
the mmap_sem, the new pages won't be mapped in the child's address
space.

If the pages are not present and the child tries to access them, the
monitor will get a page fault notification and everything is fine.
However, if the pages *are present*, the child can access them without
uffd noticing, and if we copy them into the child it will see the wrong
data.  Since we are talking about a background copy, we'd need to
decide whether the pages should be copied or not regardless of #PF
notifications.

Since the userfaultfd monitor has no way to determine the order of
events, let's disallow userfaultfd_copy() in parallel with the
non-cooperative events.  In that case we return -EAGAIN and the uffd
monitor can understand that userfaultfd_copy() clashed with a
non-cooperative event and take an appropriate action.

Link: http://lkml.kernel.org/r/1527061324-19949-1-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Acked-by: Pavel Emelyanov <xemul@virtuozzo.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
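As an illustration of the new behaviour from the monitor's side, here is a
minimal userspace sketch (not part of this patch) of how a uffd monitor might
retry UFFDIO_COPY when it clashes with a non-cooperative event.  The names
fill_page() and handle_uffd_events() are hypothetical; the latter stands in
for whatever code reads and processes the queued fork/remap/remove/unmap
events before the copy is retried.

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/* hypothetical helper: read and process pending non-cooperative events */
extern void handle_uffd_events(int uffd);

static int fill_page(int uffd, unsigned long dst, unsigned long src,
		     unsigned long len)
{
	struct uffdio_copy copy = {
		.dst = dst,
		.src = src,
		.len = len,
		.mode = 0,
	};

	for (;;) {
		if (ioctl(uffd, UFFDIO_COPY, &copy) == 0)
			return 0;
		/*
		 * With this change UFFDIO_COPY fails with EAGAIN while a
		 * non-cooperative event is in flight; re-read the event
		 * queue to learn what changed, then retry the copy.  A real
		 * monitor would also examine copy.copy, which reports any
		 * partial progress already made.
		 */
		if (errno == EAGAIN) {
			handle_uffd_events(uffd);
			continue;
		}
		return -1;
	}
}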
 fs/userfaultfd.c              | 22
 include/linux/userfaultfd_k.h |  6
 mm/userfaultfd.c              | 22
 3 files changed, 41 insertions(+), 9 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index cec550c8468f..123bf7d516fc 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -62,6 +62,8 @@ struct userfaultfd_ctx {
 	enum userfaultfd_state state;
 	/* released */
 	bool released;
+	/* memory mappings are changing because of non-cooperative event */
+	bool mmap_changing;
 	/* mm with one ore more vmas attached to this userfaultfd_ctx */
 	struct mm_struct *mm;
 };
@@ -641,6 +643,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 	 * already released.
 	 */
 out:
+	WRITE_ONCE(ctx->mmap_changing, false);
 	userfaultfd_ctx_put(ctx);
 }
 
@@ -686,10 +689,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 	ctx->state = UFFD_STATE_RUNNING;
 	ctx->features = octx->features;
 	ctx->released = false;
+	ctx->mmap_changing = false;
 	ctx->mm = vma->vm_mm;
 	mmgrab(ctx->mm);
 
 	userfaultfd_ctx_get(octx);
+	WRITE_ONCE(octx->mmap_changing, true);
 	fctx->orig = octx;
 	fctx->new = ctx;
 	list_add_tail(&fctx->list, fcs);
@@ -732,6 +737,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 	if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
 		vm_ctx->ctx = ctx;
 		userfaultfd_ctx_get(ctx);
+		WRITE_ONCE(ctx->mmap_changing, true);
 	}
 }
 
@@ -772,6 +778,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
 		return true;
 
 	userfaultfd_ctx_get(ctx);
+	WRITE_ONCE(ctx->mmap_changing, true);
 	up_read(&mm->mmap_sem);
 
 	msg_init(&ewq.msg);
@@ -815,6 +822,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma,
 		return -ENOMEM;
 
 	userfaultfd_ctx_get(ctx);
+	WRITE_ONCE(ctx->mmap_changing, true);
 	unmap_ctx->ctx = ctx;
 	unmap_ctx->start = start;
 	unmap_ctx->end = end;
@@ -1653,6 +1661,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 
 	user_uffdio_copy = (struct uffdio_copy __user *) arg;
 
+	ret = -EAGAIN;
+	if (READ_ONCE(ctx->mmap_changing))
+		goto out;
+
 	ret = -EFAULT;
 	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
 			   /* don't copy "copy" last field */
@@ -1674,7 +1686,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 		goto out;
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
-				   uffdio_copy.len);
+				   uffdio_copy.len, &ctx->mmap_changing);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1705,6 +1717,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 
 	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
 
+	ret = -EAGAIN;
+	if (READ_ONCE(ctx->mmap_changing))
+		goto out;
+
 	ret = -EFAULT;
 	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
 			   /* don't copy "zeropage" last field */
@@ -1721,7 +1737,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
-				     uffdio_zeropage.range.len);
+				     uffdio_zeropage.range.len,
+				     &ctx->mmap_changing);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1900,6 +1917,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
 	ctx->features = 0;
 	ctx->state = UFFD_STATE_WAIT_API;
 	ctx->released = false;
+	ctx->mmap_changing = false;
 	ctx->mm = current->mm;
 	/* prevent the mm struct to be freed */
 	mmgrab(ctx->mm);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index f2f3b68ba910..e091f0a11b11 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -31,10 +31,12 @@
 extern int handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
-			    unsigned long src_start, unsigned long len);
+			    unsigned long src_start, unsigned long len,
+			    bool *mmap_changing);
 extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
 			      unsigned long dst_start,
-			      unsigned long len);
+			      unsigned long len,
+			      bool *mmap_changing);
 
 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 39791b81ede7..5029f241908f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -404,7 +404,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 				  unsigned long dst_start,
 				  unsigned long src_start,
 				  unsigned long len,
-				  bool zeropage)
+				  bool zeropage,
+				  bool *mmap_changing)
 {
 	struct vm_area_struct *dst_vma;
 	ssize_t err;
@@ -431,6 +432,15 @@ retry:
 	down_read(&dst_mm->mmap_sem);
 
 	/*
+	 * If memory mappings are changing because of non-cooperative
+	 * operation (e.g. mremap) running in parallel, bail out and
+	 * request the user to retry later
+	 */
+	err = -EAGAIN;
+	if (mmap_changing && READ_ONCE(*mmap_changing))
+		goto out_unlock;
+
+	/*
 	 * Make sure the vma is not shared, that the dst range is
 	 * both valid and fully within a single existing vma.
 	 */
@@ -563,13 +573,15 @@ out:
 }
 
 ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
-		     unsigned long src_start, unsigned long len)
+		     unsigned long src_start, unsigned long len,
+		     bool *mmap_changing)
 {
-	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
+	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
+			      mmap_changing);
 }
 
 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
-		       unsigned long len)
+		       unsigned long len, bool *mmap_changing)
 {
-	return __mcopy_atomic(dst_mm, start, 0, len, true);
+	return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
 }