summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPavel Emelyanov <xemul@parallels.com>2017-02-22 18:42:27 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2017-02-22 19:41:28 -0500
commit893e26e61d04eac974ded0c11e1647b335c8cb7b (patch)
treed75bbbd621a08130e8794dffcedc71f4a6ff50cc
parent656031445d5a855e1c13b291dedae32579d0f3f2 (diff)
userfaultfd: non-cooperative: Add fork() event
When the mm with uffd-ed vmas fork()-s the respective vmas notify their uffds with the event which contains a descriptor with new uffd. This new descriptor can then be used to get events from the child and populate its mm with data. Note that there can be different uffd-s controlling different vmas within one mm, so first we should collect all those uffds (and ctx-s) in a list and then notify them all one by one but only once per fork(). The context is created at fork() time but the descriptor, file struct and anon inode object are created at event read time. So some trickery is added to the userfaultfd_ctx_read() to handle the ctx queues' locking vs file creation. Another thing worth noticing is that the task that fork()-s waits for the uffd event to get processed WITHOUT the mmap sem. [aarcange@redhat.com: build warning fix] Link: http://lkml.kernel.org/r/20161216144821.5183-10-aarcange@redhat.com Link: http://lkml.kernel.org/r/20161216144821.5183-9-aarcange@redhat.com Signed-off-by: Pavel Emelyanov <xemul@parallels.com> Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com> Cc: Hillf Danton <hillf.zj@alibaba-inc.com> Cc: Michael Rapoport <RAPOPORT@il.ibm.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--fs/userfaultfd.c148
-rw-r--r--include/linux/userfaultfd_k.h13
-rw-r--r--include/uapi/linux/userfaultfd.h15
-rw-r--r--kernel/fork.c10
4 files changed, 170 insertions, 16 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 87d31921b66c..6046e0b552b2 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -64,6 +64,12 @@ struct userfaultfd_ctx {
64 struct mm_struct *mm; 64 struct mm_struct *mm;
65}; 65};
66 66
67struct userfaultfd_fork_ctx {
68 struct userfaultfd_ctx *orig;
69 struct userfaultfd_ctx *new;
70 struct list_head list;
71};
72
67struct userfaultfd_wait_queue { 73struct userfaultfd_wait_queue {
68 struct uffd_msg msg; 74 struct uffd_msg msg;
69 wait_queue_t wq; 75 wait_queue_t wq;
@@ -465,9 +471,8 @@ out:
465 return ret; 471 return ret;
466} 472}
467 473
468static int __maybe_unused userfaultfd_event_wait_completion( 474static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
469 struct userfaultfd_ctx *ctx, 475 struct userfaultfd_wait_queue *ewq)
470 struct userfaultfd_wait_queue *ewq)
471{ 476{
472 int ret = 0; 477 int ret = 0;
473 478
@@ -518,6 +523,79 @@ static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
518 __remove_wait_queue(&ctx->event_wqh, &ewq->wq); 523 __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
519} 524}
520 525
526int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
527{
528 struct userfaultfd_ctx *ctx = NULL, *octx;
529 struct userfaultfd_fork_ctx *fctx;
530
531 octx = vma->vm_userfaultfd_ctx.ctx;
532 if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
533 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
534 vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
535 return 0;
536 }
537
538 list_for_each_entry(fctx, fcs, list)
539 if (fctx->orig == octx) {
540 ctx = fctx->new;
541 break;
542 }
543
544 if (!ctx) {
545 fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
546 if (!fctx)
547 return -ENOMEM;
548
549 ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
550 if (!ctx) {
551 kfree(fctx);
552 return -ENOMEM;
553 }
554
555 atomic_set(&ctx->refcount, 1);
556 ctx->flags = octx->flags;
557 ctx->state = UFFD_STATE_RUNNING;
558 ctx->features = octx->features;
559 ctx->released = false;
560 ctx->mm = vma->vm_mm;
561 atomic_inc(&ctx->mm->mm_users);
562
563 userfaultfd_ctx_get(octx);
564 fctx->orig = octx;
565 fctx->new = ctx;
566 list_add_tail(&fctx->list, fcs);
567 }
568
569 vma->vm_userfaultfd_ctx.ctx = ctx;
570 return 0;
571}
572
573static int dup_fctx(struct userfaultfd_fork_ctx *fctx)
574{
575 struct userfaultfd_ctx *ctx = fctx->orig;
576 struct userfaultfd_wait_queue ewq;
577
578 msg_init(&ewq.msg);
579
580 ewq.msg.event = UFFD_EVENT_FORK;
581 ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
582
583 return userfaultfd_event_wait_completion(ctx, &ewq);
584}
585
586void dup_userfaultfd_complete(struct list_head *fcs)
587{
588 int ret = 0;
589 struct userfaultfd_fork_ctx *fctx, *n;
590
591 list_for_each_entry_safe(fctx, n, fcs, list) {
592 if (!ret)
593 ret = dup_fctx(fctx);
594 list_del(&fctx->list);
595 kfree(fctx);
596 }
597}
598
521static int userfaultfd_release(struct inode *inode, struct file *file) 599static int userfaultfd_release(struct inode *inode, struct file *file)
522{ 600{
523 struct userfaultfd_ctx *ctx = file->private_data; 601 struct userfaultfd_ctx *ctx = file->private_data;
@@ -653,12 +731,49 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
653 } 731 }
654} 732}
655 733
734static const struct file_operations userfaultfd_fops;
735
736static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
737 struct userfaultfd_ctx *new,
738 struct uffd_msg *msg)
739{
740 int fd;
741 struct file *file;
742 unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
743
744 fd = get_unused_fd_flags(flags);
745 if (fd < 0)
746 return fd;
747
748 file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
749 O_RDWR | flags);
750 if (IS_ERR(file)) {
751 put_unused_fd(fd);
752 return PTR_ERR(file);
753 }
754
755 fd_install(fd, file);
756 msg->arg.reserved.reserved1 = 0;
757 msg->arg.fork.ufd = fd;
758
759 return 0;
760}
761
656static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, 762static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
657 struct uffd_msg *msg) 763 struct uffd_msg *msg)
658{ 764{
659 ssize_t ret; 765 ssize_t ret;
660 DECLARE_WAITQUEUE(wait, current); 766 DECLARE_WAITQUEUE(wait, current);
661 struct userfaultfd_wait_queue *uwq; 767 struct userfaultfd_wait_queue *uwq;
768 /*
769 * Handling fork event requires sleeping operations, so
770 * we drop the event_wqh lock, then do these ops, then
771 * lock it back and wake up the waiter. While the lock is
772 * dropped the ewq may go away so we keep track of it
773 * carefully.
774 */
775 LIST_HEAD(fork_event);
776 struct userfaultfd_ctx *fork_nctx = NULL;
662 777
663 /* always take the fd_wqh lock before the fault_pending_wqh lock */ 778 /* always take the fd_wqh lock before the fault_pending_wqh lock */
664 spin_lock(&ctx->fd_wqh.lock); 779 spin_lock(&ctx->fd_wqh.lock);
@@ -716,6 +831,16 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
716 if (uwq) { 831 if (uwq) {
717 *msg = uwq->msg; 832 *msg = uwq->msg;
718 833
834 if (uwq->msg.event == UFFD_EVENT_FORK) {
835 fork_nctx = (struct userfaultfd_ctx *)
836 (unsigned long)
837 uwq->msg.arg.reserved.reserved1;
838 list_move(&uwq->wq.task_list, &fork_event);
839 spin_unlock(&ctx->event_wqh.lock);
840 ret = 0;
841 break;
842 }
843
719 userfaultfd_event_complete(ctx, uwq); 844 userfaultfd_event_complete(ctx, uwq);
720 spin_unlock(&ctx->event_wqh.lock); 845 spin_unlock(&ctx->event_wqh.lock);
721 ret = 0; 846 ret = 0;
@@ -739,6 +864,23 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
739 __set_current_state(TASK_RUNNING); 864 __set_current_state(TASK_RUNNING);
740 spin_unlock(&ctx->fd_wqh.lock); 865 spin_unlock(&ctx->fd_wqh.lock);
741 866
867 if (!ret && msg->event == UFFD_EVENT_FORK) {
868 ret = resolve_userfault_fork(ctx, fork_nctx, msg);
869
870 if (!ret) {
871 spin_lock(&ctx->event_wqh.lock);
872 if (!list_empty(&fork_event)) {
873 uwq = list_first_entry(&fork_event,
874 typeof(*uwq),
875 wq.task_list);
876 list_del(&uwq->wq.task_list);
877 __add_wait_queue(&ctx->event_wqh, &uwq->wq);
878 userfaultfd_event_complete(ctx, uwq);
879 }
880 spin_unlock(&ctx->event_wqh.lock);
881 }
882 }
883
742 return ret; 884 return ret;
743} 885}
744 886
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 11b92b047a1e..79002bca1f43 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -52,6 +52,9 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
52 return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); 52 return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
53} 53}
54 54
55extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
56extern void dup_userfaultfd_complete(struct list_head *);
57
55#else /* CONFIG_USERFAULTFD */ 58#else /* CONFIG_USERFAULTFD */
56 59
57/* mm helpers */ 60/* mm helpers */
@@ -76,6 +79,16 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
76 return false; 79 return false;
77} 80}
78 81
82static inline int dup_userfaultfd(struct vm_area_struct *vma,
83 struct list_head *l)
84{
85 return 0;
86}
87
88static inline void dup_userfaultfd_complete(struct list_head *l)
89{
90}
91
79#endif /* CONFIG_USERFAULTFD */ 92#endif /* CONFIG_USERFAULTFD */
80 93
81#endif /* _LINUX_USERFAULTFD_K_H */ 94#endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 94046b8aa6ad..c8953c84fdcc 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -18,12 +18,7 @@
18 * means the userland is reading). 18 * means the userland is reading).
19 */ 19 */
20#define UFFD_API ((__u64)0xAA) 20#define UFFD_API ((__u64)0xAA)
21/* 21#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK)
22 * After implementing the respective features it will become:
23 * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
24 * UFFD_FEATURE_EVENT_FORK)
25 */
26#define UFFD_API_FEATURES (0)
27#define UFFD_API_IOCTLS \ 22#define UFFD_API_IOCTLS \
28 ((__u64)1 << _UFFDIO_REGISTER | \ 23 ((__u64)1 << _UFFDIO_REGISTER | \
29 (__u64)1 << _UFFDIO_UNREGISTER | \ 24 (__u64)1 << _UFFDIO_UNREGISTER | \
@@ -78,6 +73,10 @@ struct uffd_msg {
78 } pagefault; 73 } pagefault;
79 74
80 struct { 75 struct {
76 __u32 ufd;
77 } fork;
78
79 struct {
81 /* unused reserved fields */ 80 /* unused reserved fields */
82 __u64 reserved1; 81 __u64 reserved1;
83 __u64 reserved2; 82 __u64 reserved2;
@@ -90,9 +89,7 @@ struct uffd_msg {
90 * Start at 0x12 and not at 0 to be more strict against bugs. 89 * Start at 0x12 and not at 0 to be more strict against bugs.
91 */ 90 */
92#define UFFD_EVENT_PAGEFAULT 0x12 91#define UFFD_EVENT_PAGEFAULT 0x12
93#if 0 /* not available yet */
94#define UFFD_EVENT_FORK 0x13 92#define UFFD_EVENT_FORK 0x13
95#endif
96 93
97/* flags for UFFD_EVENT_PAGEFAULT */ 94/* flags for UFFD_EVENT_PAGEFAULT */
98#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ 95#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
@@ -111,10 +108,8 @@ struct uffdio_api {
111 * are to be considered implicitly always enabled in all kernels as 108 * are to be considered implicitly always enabled in all kernels as
112 * long as the uffdio_api.api requested matches UFFD_API. 109 * long as the uffdio_api.api requested matches UFFD_API.
113 */ 110 */
114#if 0 /* not available yet */
115#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) 111#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
116#define UFFD_FEATURE_EVENT_FORK (1<<1) 112#define UFFD_FEATURE_EVENT_FORK (1<<1)
117#endif
118 __u64 features; 113 __u64 features;
119 114
120 __u64 ioctls; 115 __u64 ioctls;
diff --git a/kernel/fork.c b/kernel/fork.c
index ff82e24573b6..d12fcc4db8a3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -55,6 +55,7 @@
55#include <linux/rmap.h> 55#include <linux/rmap.h>
56#include <linux/ksm.h> 56#include <linux/ksm.h>
57#include <linux/acct.h> 57#include <linux/acct.h>
58#include <linux/userfaultfd_k.h>
58#include <linux/tsacct_kern.h> 59#include <linux/tsacct_kern.h>
59#include <linux/cn_proc.h> 60#include <linux/cn_proc.h>
60#include <linux/freezer.h> 61#include <linux/freezer.h>
@@ -561,6 +562,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
561 struct rb_node **rb_link, *rb_parent; 562 struct rb_node **rb_link, *rb_parent;
562 int retval; 563 int retval;
563 unsigned long charge; 564 unsigned long charge;
565 LIST_HEAD(uf);
564 566
565 uprobe_start_dup_mmap(); 567 uprobe_start_dup_mmap();
566 if (down_write_killable(&oldmm->mmap_sem)) { 568 if (down_write_killable(&oldmm->mmap_sem)) {
@@ -617,12 +619,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
617 if (retval) 619 if (retval)
618 goto fail_nomem_policy; 620 goto fail_nomem_policy;
619 tmp->vm_mm = mm; 621 tmp->vm_mm = mm;
622 retval = dup_userfaultfd(tmp, &uf);
623 if (retval)
624 goto fail_nomem_anon_vma_fork;
620 if (anon_vma_fork(tmp, mpnt)) 625 if (anon_vma_fork(tmp, mpnt))
621 goto fail_nomem_anon_vma_fork; 626 goto fail_nomem_anon_vma_fork;
622 tmp->vm_flags &= 627 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
623 ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
624 tmp->vm_next = tmp->vm_prev = NULL; 628 tmp->vm_next = tmp->vm_prev = NULL;
625 tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
626 file = tmp->vm_file; 629 file = tmp->vm_file;
627 if (file) { 630 if (file) {
628 struct inode *inode = file_inode(file); 631 struct inode *inode = file_inode(file);
@@ -678,6 +681,7 @@ out:
678 up_write(&mm->mmap_sem); 681 up_write(&mm->mmap_sem);
679 flush_tlb_mm(oldmm); 682 flush_tlb_mm(oldmm);
680 up_write(&oldmm->mmap_sem); 683 up_write(&oldmm->mmap_sem);
684 dup_userfaultfd_complete(&uf);
681fail_uprobe_end: 685fail_uprobe_end:
682 uprobe_end_dup_mmap(); 686 uprobe_end_dup_mmap();
683 return retval; 687 return retval;