about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorMike Rapoport <rppt@linux.vnet.ibm.com>2017-02-24 17:58:22 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2017-02-24 20:46:55 -0500
commit897ab3e0c49e24b62e2d54d165c7afec6bbca65b (patch)
tree5fa7e09864d6c959cef33849f6cb10ed04e459e4
parent846b1a0f1db065a8479159dd8fecddb1ebf30547 (diff)
userfaultfd: non-cooperative: add event for memory unmaps
When a non-cooperative userfaultfd monitor copies pages in the background, it may encounter regions that were already unmapped. Addition of UFFD_EVENT_UNMAP allows the uffd monitor to track precisely changes in the virtual memory layout. Since there might be different uffd contexts for the affected VMAs, we should first create a temporary representation of the unmap event for each uffd context and then deliver them one by one to the appropriate userfault file descriptors. The event notification occurs after the mmap_sem has been released. [arnd@arndb.de: fix nommu build] Link: http://lkml.kernel.org/r/20170203165141.3665284-1-arnd@arndb.de [mhocko@suse.com: fix nommu build] Link: http://lkml.kernel.org/r/20170202091503.GA22823@dhcp22.suse.cz Link: http://lkml.kernel.org/r/1485542673-24387-3-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com> Signed-off-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Pavel Emelyanov <xemul@virtuozzo.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--arch/mips/kernel/vdso.c2
-rw-r--r--arch/tile/mm/elf.c2
-rw-r--r--arch/x86/entry/vdso/vma.c2
-rw-r--r--arch/x86/mm/mpx.c4
-rw-r--r--fs/aio.c2
-rw-r--r--fs/proc/vmcore.c4
-rw-r--r--fs/userfaultfd.c65
-rw-r--r--include/linux/mm.h14
-rw-r--r--include/linux/userfaultfd_k.h18
-rw-r--r--include/uapi/linux/userfaultfd.h3
-rw-r--r--ipc/shm.c8
-rw-r--r--mm/mmap.c46
-rw-r--r--mm/mremap.c23
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/util.c5
15 files changed, 160 insertions, 45 deletions
diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c
index f9dbfb14af33..093517e85a6c 100644
--- a/arch/mips/kernel/vdso.c
+++ b/arch/mips/kernel/vdso.c
@@ -111,7 +111,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
111 base = mmap_region(NULL, STACK_TOP, PAGE_SIZE, 111 base = mmap_region(NULL, STACK_TOP, PAGE_SIZE,
112 VM_READ|VM_WRITE|VM_EXEC| 112 VM_READ|VM_WRITE|VM_EXEC|
113 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 113 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
114 0); 114 0, NULL);
115 if (IS_ERR_VALUE(base)) { 115 if (IS_ERR_VALUE(base)) {
116 ret = base; 116 ret = base;
117 goto out; 117 goto out;
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 6225cc998db1..889901824400 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -143,7 +143,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
143 unsigned long addr = MEM_USER_INTRPT; 143 unsigned long addr = MEM_USER_INTRPT;
144 addr = mmap_region(NULL, addr, INTRPT_SIZE, 144 addr = mmap_region(NULL, addr, INTRPT_SIZE,
145 VM_READ|VM_EXEC| 145 VM_READ|VM_EXEC|
146 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0); 146 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0, NULL);
147 if (addr > (unsigned long) -PAGE_SIZE) 147 if (addr > (unsigned long) -PAGE_SIZE)
148 retval = (int) addr; 148 retval = (int) addr;
149 } 149 }
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 10820f6cefbf..572cee3fccff 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -186,7 +186,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
186 186
187 if (IS_ERR(vma)) { 187 if (IS_ERR(vma)) {
188 ret = PTR_ERR(vma); 188 ret = PTR_ERR(vma);
189 do_munmap(mm, text_start, image->size); 189 do_munmap(mm, text_start, image->size, NULL);
190 } else { 190 } else {
191 current->mm->context.vdso = (void __user *)text_start; 191 current->mm->context.vdso = (void __user *)text_start;
192 current->mm->context.vdso_image = image; 192 current->mm->context.vdso_image = image;
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index aad4ac386f98..c98079684bdb 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -51,7 +51,7 @@ static unsigned long mpx_mmap(unsigned long len)
51 51
52 down_write(&mm->mmap_sem); 52 down_write(&mm->mmap_sem);
53 addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE, 53 addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
54 MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate); 54 MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL);
55 up_write(&mm->mmap_sem); 55 up_write(&mm->mmap_sem);
56 if (populate) 56 if (populate)
57 mm_populate(addr, populate); 57 mm_populate(addr, populate);
@@ -893,7 +893,7 @@ static int unmap_entire_bt(struct mm_struct *mm,
893 * avoid recursion, do_munmap() will check whether it comes 893 * avoid recursion, do_munmap() will check whether it comes
894 * from one bounds table through VM_MPX flag. 894 * from one bounds table through VM_MPX flag.
895 */ 895 */
896 return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm)); 896 return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL);
897} 897}
898 898
899static int try_unmap_single_bt(struct mm_struct *mm, 899static int try_unmap_single_bt(struct mm_struct *mm,
diff --git a/fs/aio.c b/fs/aio.c
index 873b4ca82ccb..7e2ab9c8e39c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -512,7 +512,7 @@ static int aio_setup_ring(struct kioctx *ctx)
512 512
513 ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, 513 ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
514 PROT_READ | PROT_WRITE, 514 PROT_READ | PROT_WRITE,
515 MAP_SHARED, 0, &unused); 515 MAP_SHARED, 0, &unused, NULL);
516 up_write(&mm->mmap_sem); 516 up_write(&mm->mmap_sem);
517 if (IS_ERR((void *)ctx->mmap_base)) { 517 if (IS_ERR((void *)ctx->mmap_base)) {
518 ctx->mmap_size = 0; 518 ctx->mmap_size = 0;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index f52d8e857ff7..885d445afa0d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -388,7 +388,7 @@ static int remap_oldmem_pfn_checked(struct vm_area_struct *vma,
388 } 388 }
389 return 0; 389 return 0;
390fail: 390fail:
391 do_munmap(vma->vm_mm, from, len); 391 do_munmap(vma->vm_mm, from, len, NULL);
392 return -EAGAIN; 392 return -EAGAIN;
393} 393}
394 394
@@ -481,7 +481,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
481 481
482 return 0; 482 return 0;
483fail: 483fail:
484 do_munmap(vma->vm_mm, vma->vm_start, len); 484 do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
485 return -EAGAIN; 485 return -EAGAIN;
486} 486}
487#else 487#else
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 8fe601b4875e..4c78458ea78d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -71,6 +71,13 @@ struct userfaultfd_fork_ctx {
71 struct list_head list; 71 struct list_head list;
72}; 72};
73 73
74struct userfaultfd_unmap_ctx {
75 struct userfaultfd_ctx *ctx;
76 unsigned long start;
77 unsigned long end;
78 struct list_head list;
79};
80
74struct userfaultfd_wait_queue { 81struct userfaultfd_wait_queue {
75 struct uffd_msg msg; 82 struct uffd_msg msg;
76 wait_queue_t wq; 83 wait_queue_t wq;
@@ -709,6 +716,64 @@ void userfaultfd_remove(struct vm_area_struct *vma,
709 down_read(&mm->mmap_sem); 716 down_read(&mm->mmap_sem);
710} 717}
711 718
719static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
720 unsigned long start, unsigned long end)
721{
722 struct userfaultfd_unmap_ctx *unmap_ctx;
723
724 list_for_each_entry(unmap_ctx, unmaps, list)
725 if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
726 unmap_ctx->end == end)
727 return true;
728
729 return false;
730}
731
732int userfaultfd_unmap_prep(struct vm_area_struct *vma,
733 unsigned long start, unsigned long end,
734 struct list_head *unmaps)
735{
736 for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
737 struct userfaultfd_unmap_ctx *unmap_ctx;
738 struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
739
740 if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
741 has_unmap_ctx(ctx, unmaps, start, end))
742 continue;
743
744 unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
745 if (!unmap_ctx)
746 return -ENOMEM;
747
748 userfaultfd_ctx_get(ctx);
749 unmap_ctx->ctx = ctx;
750 unmap_ctx->start = start;
751 unmap_ctx->end = end;
752 list_add_tail(&unmap_ctx->list, unmaps);
753 }
754
755 return 0;
756}
757
758void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
759{
760 struct userfaultfd_unmap_ctx *ctx, *n;
761 struct userfaultfd_wait_queue ewq;
762
763 list_for_each_entry_safe(ctx, n, uf, list) {
764 msg_init(&ewq.msg);
765
766 ewq.msg.event = UFFD_EVENT_UNMAP;
767 ewq.msg.arg.remove.start = ctx->start;
768 ewq.msg.arg.remove.end = ctx->end;
769
770 userfaultfd_event_wait_completion(ctx->ctx, &ewq);
771
772 list_del(&ctx->list);
773 kfree(ctx);
774 }
775}
776
712static int userfaultfd_release(struct inode *inode, struct file *file) 777static int userfaultfd_release(struct inode *inode, struct file *file)
713{ 778{
714 struct userfaultfd_ctx *ctx = file->private_data; 779 struct userfaultfd_ctx *ctx = file->private_data;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c65aa43b5712..c6fcba1d1ae5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2090,18 +2090,22 @@ extern int install_special_mapping(struct mm_struct *mm,
2090extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); 2090extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
2091 2091
2092extern unsigned long mmap_region(struct file *file, unsigned long addr, 2092extern unsigned long mmap_region(struct file *file, unsigned long addr,
2093 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff); 2093 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
2094 struct list_head *uf);
2094extern unsigned long do_mmap(struct file *file, unsigned long addr, 2095extern unsigned long do_mmap(struct file *file, unsigned long addr,
2095 unsigned long len, unsigned long prot, unsigned long flags, 2096 unsigned long len, unsigned long prot, unsigned long flags,
2096 vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate); 2097 vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
2097extern int do_munmap(struct mm_struct *, unsigned long, size_t); 2098 struct list_head *uf);
2099extern int do_munmap(struct mm_struct *, unsigned long, size_t,
2100 struct list_head *uf);
2098 2101
2099static inline unsigned long 2102static inline unsigned long
2100do_mmap_pgoff(struct file *file, unsigned long addr, 2103do_mmap_pgoff(struct file *file, unsigned long addr,
2101 unsigned long len, unsigned long prot, unsigned long flags, 2104 unsigned long len, unsigned long prot, unsigned long flags,
2102 unsigned long pgoff, unsigned long *populate) 2105 unsigned long pgoff, unsigned long *populate,
2106 struct list_head *uf)
2103{ 2107{
2104 return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate); 2108 return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf);
2105} 2109}
2106 2110
2107#ifdef CONFIG_MMU 2111#ifdef CONFIG_MMU
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 2521542f6c07..a40be5d0661b 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -66,6 +66,12 @@ extern void userfaultfd_remove(struct vm_area_struct *vma,
66 unsigned long start, 66 unsigned long start,
67 unsigned long end); 67 unsigned long end);
68 68
69extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,
70 unsigned long start, unsigned long end,
71 struct list_head *uf);
72extern void userfaultfd_unmap_complete(struct mm_struct *mm,
73 struct list_head *uf);
74
69#else /* CONFIG_USERFAULTFD */ 75#else /* CONFIG_USERFAULTFD */
70 76
71/* mm helpers */ 77/* mm helpers */
@@ -118,6 +124,18 @@ static inline void userfaultfd_remove(struct vm_area_struct *vma,
118 unsigned long end) 124 unsigned long end)
119{ 125{
120} 126}
127
128static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
129 unsigned long start, unsigned long end,
130 struct list_head *uf)
131{
132 return 0;
133}
134
135static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
136 struct list_head *uf)
137{
138}
121#endif /* CONFIG_USERFAULTFD */ 139#endif /* CONFIG_USERFAULTFD */
122 140
123#endif /* _LINUX_USERFAULTFD_K_H */ 141#endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index b742c40c2880..3b059530dac9 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -21,6 +21,7 @@
21#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ 21#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \
22 UFFD_FEATURE_EVENT_REMAP | \ 22 UFFD_FEATURE_EVENT_REMAP | \
23 UFFD_FEATURE_EVENT_REMOVE | \ 23 UFFD_FEATURE_EVENT_REMOVE | \
24 UFFD_FEATURE_EVENT_UNMAP | \
24 UFFD_FEATURE_MISSING_HUGETLBFS | \ 25 UFFD_FEATURE_MISSING_HUGETLBFS | \
25 UFFD_FEATURE_MISSING_SHMEM) 26 UFFD_FEATURE_MISSING_SHMEM)
26#define UFFD_API_IOCTLS \ 27#define UFFD_API_IOCTLS \
@@ -110,6 +111,7 @@ struct uffd_msg {
110#define UFFD_EVENT_FORK 0x13 111#define UFFD_EVENT_FORK 0x13
111#define UFFD_EVENT_REMAP 0x14 112#define UFFD_EVENT_REMAP 0x14
112#define UFFD_EVENT_REMOVE 0x15 113#define UFFD_EVENT_REMOVE 0x15
114#define UFFD_EVENT_UNMAP 0x16
113 115
114/* flags for UFFD_EVENT_PAGEFAULT */ 116/* flags for UFFD_EVENT_PAGEFAULT */
115#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ 117#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
@@ -158,6 +160,7 @@ struct uffdio_api {
158#define UFFD_FEATURE_EVENT_REMOVE (1<<3) 160#define UFFD_FEATURE_EVENT_REMOVE (1<<3)
159#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) 161#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4)
160#define UFFD_FEATURE_MISSING_SHMEM (1<<5) 162#define UFFD_FEATURE_MISSING_SHMEM (1<<5)
163#define UFFD_FEATURE_EVENT_UNMAP (1<<6)
161 __u64 features; 164 __u64 features;
162 165
163 __u64 ioctls; 166 __u64 ioctls;
diff --git a/ipc/shm.c b/ipc/shm.c
index 7f6537b84ef5..d7805acb44fd 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1222,7 +1222,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
1222 goto invalid; 1222 goto invalid;
1223 } 1223 }
1224 1224
1225 addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); 1225 addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate, NULL);
1226 *raddr = addr; 1226 *raddr = addr;
1227 err = 0; 1227 err = 0;
1228 if (IS_ERR_VALUE(addr)) 1228 if (IS_ERR_VALUE(addr))
@@ -1329,7 +1329,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
1329 */ 1329 */
1330 file = vma->vm_file; 1330 file = vma->vm_file;
1331 size = i_size_read(file_inode(vma->vm_file)); 1331 size = i_size_read(file_inode(vma->vm_file));
1332 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); 1332 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
1333 /* 1333 /*
1334 * We discovered the size of the shm segment, so 1334 * We discovered the size of the shm segment, so
1335 * break out of here and fall through to the next 1335 * break out of here and fall through to the next
@@ -1356,7 +1356,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
1356 if ((vma->vm_ops == &shm_vm_ops) && 1356 if ((vma->vm_ops == &shm_vm_ops) &&
1357 ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && 1357 ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) &&
1358 (vma->vm_file == file)) 1358 (vma->vm_file == file))
1359 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); 1359 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
1360 vma = next; 1360 vma = next;
1361 } 1361 }
1362 1362
@@ -1365,7 +1365,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
1365 * given 1365 * given
1366 */ 1366 */
1367 if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { 1367 if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
1368 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); 1368 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
1369 retval = 0; 1369 retval = 0;
1370 } 1370 }
1371 1371
diff --git a/mm/mmap.c b/mm/mmap.c
index 13d16a2b7623..1cec28d20583 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -176,7 +176,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
176 return next; 176 return next;
177} 177}
178 178
179static int do_brk(unsigned long addr, unsigned long len); 179static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf);
180 180
181SYSCALL_DEFINE1(brk, unsigned long, brk) 181SYSCALL_DEFINE1(brk, unsigned long, brk)
182{ 182{
@@ -185,6 +185,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
185 struct mm_struct *mm = current->mm; 185 struct mm_struct *mm = current->mm;
186 unsigned long min_brk; 186 unsigned long min_brk;
187 bool populate; 187 bool populate;
188 LIST_HEAD(uf);
188 189
189 if (down_write_killable(&mm->mmap_sem)) 190 if (down_write_killable(&mm->mmap_sem))
190 return -EINTR; 191 return -EINTR;
@@ -222,7 +223,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
222 223
223 /* Always allow shrinking brk. */ 224 /* Always allow shrinking brk. */
224 if (brk <= mm->brk) { 225 if (brk <= mm->brk) {
225 if (!do_munmap(mm, newbrk, oldbrk-newbrk)) 226 if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf))
226 goto set_brk; 227 goto set_brk;
227 goto out; 228 goto out;
228 } 229 }
@@ -232,13 +233,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
232 goto out; 233 goto out;
233 234
234 /* Ok, looks good - let it rip. */ 235 /* Ok, looks good - let it rip. */
235 if (do_brk(oldbrk, newbrk-oldbrk) < 0) 236 if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0)
236 goto out; 237 goto out;
237 238
238set_brk: 239set_brk:
239 mm->brk = brk; 240 mm->brk = brk;
240 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; 241 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
241 up_write(&mm->mmap_sem); 242 up_write(&mm->mmap_sem);
243 userfaultfd_unmap_complete(mm, &uf);
242 if (populate) 244 if (populate)
243 mm_populate(oldbrk, newbrk - oldbrk); 245 mm_populate(oldbrk, newbrk - oldbrk);
244 return brk; 246 return brk;
@@ -1304,7 +1306,8 @@ static inline int mlock_future_check(struct mm_struct *mm,
1304unsigned long do_mmap(struct file *file, unsigned long addr, 1306unsigned long do_mmap(struct file *file, unsigned long addr,
1305 unsigned long len, unsigned long prot, 1307 unsigned long len, unsigned long prot,
1306 unsigned long flags, vm_flags_t vm_flags, 1308 unsigned long flags, vm_flags_t vm_flags,
1307 unsigned long pgoff, unsigned long *populate) 1309 unsigned long pgoff, unsigned long *populate,
1310 struct list_head *uf)
1308{ 1311{
1309 struct mm_struct *mm = current->mm; 1312 struct mm_struct *mm = current->mm;
1310 int pkey = 0; 1313 int pkey = 0;
@@ -1447,7 +1450,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
1447 vm_flags |= VM_NORESERVE; 1450 vm_flags |= VM_NORESERVE;
1448 } 1451 }
1449 1452
1450 addr = mmap_region(file, addr, len, vm_flags, pgoff); 1453 addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
1451 if (!IS_ERR_VALUE(addr) && 1454 if (!IS_ERR_VALUE(addr) &&
1452 ((vm_flags & VM_LOCKED) || 1455 ((vm_flags & VM_LOCKED) ||
1453 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) 1456 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
@@ -1583,7 +1586,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1583} 1586}
1584 1587
1585unsigned long mmap_region(struct file *file, unsigned long addr, 1588unsigned long mmap_region(struct file *file, unsigned long addr,
1586 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) 1589 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
1590 struct list_head *uf)
1587{ 1591{
1588 struct mm_struct *mm = current->mm; 1592 struct mm_struct *mm = current->mm;
1589 struct vm_area_struct *vma, *prev; 1593 struct vm_area_struct *vma, *prev;
@@ -1609,7 +1613,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1609 /* Clear old maps */ 1613 /* Clear old maps */
1610 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, 1614 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
1611 &rb_parent)) { 1615 &rb_parent)) {
1612 if (do_munmap(mm, addr, len)) 1616 if (do_munmap(mm, addr, len, uf))
1613 return -ENOMEM; 1617 return -ENOMEM;
1614 } 1618 }
1615 1619
@@ -2579,7 +2583,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2579 * work. This now handles partial unmappings. 2583 * work. This now handles partial unmappings.
2580 * Jeremy Fitzhardinge <jeremy@goop.org> 2584 * Jeremy Fitzhardinge <jeremy@goop.org>
2581 */ 2585 */
2582int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) 2586int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2587 struct list_head *uf)
2583{ 2588{
2584 unsigned long end; 2589 unsigned long end;
2585 struct vm_area_struct *vma, *prev, *last; 2590 struct vm_area_struct *vma, *prev, *last;
@@ -2603,6 +2608,13 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2603 if (vma->vm_start >= end) 2608 if (vma->vm_start >= end)
2604 return 0; 2609 return 0;
2605 2610
2611 if (uf) {
2612 int error = userfaultfd_unmap_prep(vma, start, end, uf);
2613
2614 if (error)
2615 return error;
2616 }
2617
2606 /* 2618 /*
2607 * If we need to split any vma, do it now to save pain later. 2619 * If we need to split any vma, do it now to save pain later.
2608 * 2620 *
@@ -2668,12 +2680,14 @@ int vm_munmap(unsigned long start, size_t len)
2668{ 2680{
2669 int ret; 2681 int ret;
2670 struct mm_struct *mm = current->mm; 2682 struct mm_struct *mm = current->mm;
2683 LIST_HEAD(uf);
2671 2684
2672 if (down_write_killable(&mm->mmap_sem)) 2685 if (down_write_killable(&mm->mmap_sem))
2673 return -EINTR; 2686 return -EINTR;
2674 2687
2675 ret = do_munmap(mm, start, len); 2688 ret = do_munmap(mm, start, len, &uf);
2676 up_write(&mm->mmap_sem); 2689 up_write(&mm->mmap_sem);
2690 userfaultfd_unmap_complete(mm, &uf);
2677 return ret; 2691 return ret;
2678} 2692}
2679EXPORT_SYMBOL(vm_munmap); 2693EXPORT_SYMBOL(vm_munmap);
@@ -2773,7 +2787,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
2773 2787
2774 file = get_file(vma->vm_file); 2788 file = get_file(vma->vm_file);
2775 ret = do_mmap_pgoff(vma->vm_file, start, size, 2789 ret = do_mmap_pgoff(vma->vm_file, start, size,
2776 prot, flags, pgoff, &populate); 2790 prot, flags, pgoff, &populate, NULL);
2777 fput(file); 2791 fput(file);
2778out: 2792out:
2779 up_write(&mm->mmap_sem); 2793 up_write(&mm->mmap_sem);
@@ -2799,7 +2813,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
2799 * anonymous maps. eventually we may be able to do some 2813 * anonymous maps. eventually we may be able to do some
2800 * brk-specific accounting here. 2814 * brk-specific accounting here.
2801 */ 2815 */
2802static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) 2816static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf)
2803{ 2817{
2804 struct mm_struct *mm = current->mm; 2818 struct mm_struct *mm = current->mm;
2805 struct vm_area_struct *vma, *prev; 2819 struct vm_area_struct *vma, *prev;
@@ -2838,7 +2852,7 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long
2838 */ 2852 */
2839 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, 2853 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
2840 &rb_parent)) { 2854 &rb_parent)) {
2841 if (do_munmap(mm, addr, len)) 2855 if (do_munmap(mm, addr, len, uf))
2842 return -ENOMEM; 2856 return -ENOMEM;
2843 } 2857 }
2844 2858
@@ -2885,9 +2899,9 @@ out:
2885 return 0; 2899 return 0;
2886} 2900}
2887 2901
2888static int do_brk(unsigned long addr, unsigned long len) 2902static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf)
2889{ 2903{
2890 return do_brk_flags(addr, len, 0); 2904 return do_brk_flags(addr, len, 0, uf);
2891} 2905}
2892 2906
2893int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) 2907int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags)
@@ -2895,13 +2909,15 @@ int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags)
2895 struct mm_struct *mm = current->mm; 2909 struct mm_struct *mm = current->mm;
2896 int ret; 2910 int ret;
2897 bool populate; 2911 bool populate;
2912 LIST_HEAD(uf);
2898 2913
2899 if (down_write_killable(&mm->mmap_sem)) 2914 if (down_write_killable(&mm->mmap_sem))
2900 return -EINTR; 2915 return -EINTR;
2901 2916
2902 ret = do_brk_flags(addr, len, flags); 2917 ret = do_brk_flags(addr, len, flags, &uf);
2903 populate = ((mm->def_flags & VM_LOCKED) != 0); 2918 populate = ((mm->def_flags & VM_LOCKED) != 0);
2904 up_write(&mm->mmap_sem); 2919 up_write(&mm->mmap_sem);
2920 userfaultfd_unmap_complete(mm, &uf);
2905 if (populate && !ret) 2921 if (populate && !ret)
2906 mm_populate(addr, len); 2922 mm_populate(addr, len);
2907 return ret; 2923 return ret;
diff --git a/mm/mremap.c b/mm/mremap.c
index 8779928d6a70..8233b0105c82 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -252,7 +252,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
252static unsigned long move_vma(struct vm_area_struct *vma, 252static unsigned long move_vma(struct vm_area_struct *vma,
253 unsigned long old_addr, unsigned long old_len, 253 unsigned long old_addr, unsigned long old_len,
254 unsigned long new_len, unsigned long new_addr, 254 unsigned long new_len, unsigned long new_addr,
255 bool *locked, struct vm_userfaultfd_ctx *uf) 255 bool *locked, struct vm_userfaultfd_ctx *uf,
256 struct list_head *uf_unmap)
256{ 257{
257 struct mm_struct *mm = vma->vm_mm; 258 struct mm_struct *mm = vma->vm_mm;
258 struct vm_area_struct *new_vma; 259 struct vm_area_struct *new_vma;
@@ -341,7 +342,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
341 if (unlikely(vma->vm_flags & VM_PFNMAP)) 342 if (unlikely(vma->vm_flags & VM_PFNMAP))
342 untrack_pfn_moved(vma); 343 untrack_pfn_moved(vma);
343 344
344 if (do_munmap(mm, old_addr, old_len) < 0) { 345 if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
345 /* OOM: unable to split vma, just get accounts right */ 346 /* OOM: unable to split vma, just get accounts right */
346 vm_unacct_memory(excess >> PAGE_SHIFT); 347 vm_unacct_memory(excess >> PAGE_SHIFT);
347 excess = 0; 348 excess = 0;
@@ -417,7 +418,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
417 418
418static unsigned long mremap_to(unsigned long addr, unsigned long old_len, 419static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
419 unsigned long new_addr, unsigned long new_len, bool *locked, 420 unsigned long new_addr, unsigned long new_len, bool *locked,
420 struct vm_userfaultfd_ctx *uf) 421 struct vm_userfaultfd_ctx *uf,
422 struct list_head *uf_unmap)
421{ 423{
422 struct mm_struct *mm = current->mm; 424 struct mm_struct *mm = current->mm;
423 struct vm_area_struct *vma; 425 struct vm_area_struct *vma;
@@ -435,12 +437,12 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
435 if (addr + old_len > new_addr && new_addr + new_len > addr) 437 if (addr + old_len > new_addr && new_addr + new_len > addr)
436 goto out; 438 goto out;
437 439
438 ret = do_munmap(mm, new_addr, new_len); 440 ret = do_munmap(mm, new_addr, new_len, NULL);
439 if (ret) 441 if (ret)
440 goto out; 442 goto out;
441 443
442 if (old_len >= new_len) { 444 if (old_len >= new_len) {
443 ret = do_munmap(mm, addr+new_len, old_len - new_len); 445 ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
444 if (ret && old_len != new_len) 446 if (ret && old_len != new_len)
445 goto out; 447 goto out;
446 old_len = new_len; 448 old_len = new_len;
@@ -462,7 +464,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
462 if (offset_in_page(ret)) 464 if (offset_in_page(ret))
463 goto out1; 465 goto out1;
464 466
465 ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf); 467 ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
468 uf_unmap);
466 if (!(offset_in_page(ret))) 469 if (!(offset_in_page(ret)))
467 goto out; 470 goto out;
468out1: 471out1:
@@ -502,6 +505,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
502 unsigned long charged = 0; 505 unsigned long charged = 0;
503 bool locked = false; 506 bool locked = false;
504 struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; 507 struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
508 LIST_HEAD(uf_unmap);
505 509
506 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) 510 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
507 return ret; 511 return ret;
@@ -528,7 +532,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
528 532
529 if (flags & MREMAP_FIXED) { 533 if (flags & MREMAP_FIXED) {
530 ret = mremap_to(addr, old_len, new_addr, new_len, 534 ret = mremap_to(addr, old_len, new_addr, new_len,
531 &locked, &uf); 535 &locked, &uf, &uf_unmap);
532 goto out; 536 goto out;
533 } 537 }
534 538
@@ -538,7 +542,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
538 * do_munmap does all the needed commit accounting 542 * do_munmap does all the needed commit accounting
539 */ 543 */
540 if (old_len >= new_len) { 544 if (old_len >= new_len) {
541 ret = do_munmap(mm, addr+new_len, old_len - new_len); 545 ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap);
542 if (ret && old_len != new_len) 546 if (ret && old_len != new_len)
543 goto out; 547 goto out;
544 ret = addr; 548 ret = addr;
@@ -598,7 +602,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
598 } 602 }
599 603
600 ret = move_vma(vma, addr, old_len, new_len, new_addr, 604 ret = move_vma(vma, addr, old_len, new_len, new_addr,
601 &locked, &uf); 605 &locked, &uf, &uf_unmap);
602 } 606 }
603out: 607out:
604 if (offset_in_page(ret)) { 608 if (offset_in_page(ret)) {
@@ -609,5 +613,6 @@ out:
609 if (locked && new_len > old_len) 613 if (locked && new_len > old_len)
610 mm_populate(new_addr + old_len, new_len - old_len); 614 mm_populate(new_addr + old_len, new_len - old_len);
611 mremap_userfaultfd_complete(&uf, addr, new_addr, old_len); 615 mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
616 userfaultfd_unmap_complete(mm, &uf_unmap);
612 return ret; 617 return ret;
613} 618}
diff --git a/mm/nommu.c b/mm/nommu.c
index 215c62296028..fe9f4fa4a7a7 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1205,7 +1205,8 @@ unsigned long do_mmap(struct file *file,
1205 unsigned long flags, 1205 unsigned long flags,
1206 vm_flags_t vm_flags, 1206 vm_flags_t vm_flags,
1207 unsigned long pgoff, 1207 unsigned long pgoff,
1208 unsigned long *populate) 1208 unsigned long *populate,
1209 struct list_head *uf)
1209{ 1210{
1210 struct vm_area_struct *vma; 1211 struct vm_area_struct *vma;
1211 struct vm_region *region; 1212 struct vm_region *region;
@@ -1577,7 +1578,7 @@ static int shrink_vma(struct mm_struct *mm,
1577 * - under NOMMU conditions the chunk to be unmapped must be backed by a single 1578 * - under NOMMU conditions the chunk to be unmapped must be backed by a single
1578 * VMA, though it need not cover the whole VMA 1579 * VMA, though it need not cover the whole VMA
1579 */ 1580 */
1580int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) 1581int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
1581{ 1582{
1582 struct vm_area_struct *vma; 1583 struct vm_area_struct *vma;
1583 unsigned long end; 1584 unsigned long end;
@@ -1643,7 +1644,7 @@ int vm_munmap(unsigned long addr, size_t len)
1643 int ret; 1644 int ret;
1644 1645
1645 down_write(&mm->mmap_sem); 1646 down_write(&mm->mmap_sem);
1646 ret = do_munmap(mm, addr, len); 1647 ret = do_munmap(mm, addr, len, NULL);
1647 up_write(&mm->mmap_sem); 1648 up_write(&mm->mmap_sem);
1648 return ret; 1649 return ret;
1649} 1650}
diff --git a/mm/util.c b/mm/util.c
index 3cb2164f4099..b8f538863b5a 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -11,6 +11,7 @@
11#include <linux/mman.h> 11#include <linux/mman.h>
12#include <linux/hugetlb.h> 12#include <linux/hugetlb.h>
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/userfaultfd_k.h>
14 15
15#include <asm/sections.h> 16#include <asm/sections.h>
16#include <linux/uaccess.h> 17#include <linux/uaccess.h>
@@ -297,14 +298,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
297 unsigned long ret; 298 unsigned long ret;
298 struct mm_struct *mm = current->mm; 299 struct mm_struct *mm = current->mm;
299 unsigned long populate; 300 unsigned long populate;
301 LIST_HEAD(uf);
300 302
301 ret = security_mmap_file(file, prot, flag); 303 ret = security_mmap_file(file, prot, flag);
302 if (!ret) { 304 if (!ret) {
303 if (down_write_killable(&mm->mmap_sem)) 305 if (down_write_killable(&mm->mmap_sem))
304 return -EINTR; 306 return -EINTR;
305 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, 307 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
306 &populate); 308 &populate, &uf);
307 up_write(&mm->mmap_sem); 309 up_write(&mm->mmap_sem);
310 userfaultfd_unmap_complete(mm, &uf);
308 if (populate) 311 if (populate)
309 mm_populate(ret, populate); 312 mm_populate(ret, populate);
310 } 313 }