author    Mike Kravetz <mike.kravetz@oracle.com>    2017-02-22 18:43:43 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2017-02-22 19:41:28 -0500
commit    1c9e8def43a3452e7af658b340f5f4f4ecde5c38
tree      da23316c9053fe0e5a5fdd8b4cd24b14ec384473 /mm/userfaultfd.c
parent    cac673292b9b39493bb0ff526b96c83ace6fdcd0
userfaultfd: hugetlbfs: add UFFDIO_COPY support for shared mappings
When userfaultfd hugetlbfs support was originally added, it followed the
pattern of anon mappings and did not support any vmas marked VM_SHARED.
As such, support was only added for private mappings.

Remove this limitation and support shared mappings.  The primary
functional change required is adding pages to the page cache.  More
subtle changes are required for huge page reservation handling in error
paths.  A lengthy comment in the code describes the reservation handling.

[mike.kravetz@oracle.com: update]
Link: http://lkml.kernel.org/r/c9c8cafe-baa7-05b4-34ea-1dfa5523a85f@oracle.com
Link: http://lkml.kernel.org/r/1487195210-12839-1-git-send-email-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
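For context, a minimal userspace sketch (not part of this patch) of the flow this change enables: populating a missing page of a MAP_SHARED hugetlbfs mapping with UFFDIO_COPY.  The hugetlbfs path, the 2MB huge page size, and the absence of error handling are illustrative assumptions; real code would read the faulting address from the userfaultfd message and check every return value.

/*
 * Sketch only: register a shared hugetlbfs range with userfaultfd and
 * atomically fill one huge page with UFFDIO_COPY.  Path and huge page
 * size are assumptions for illustration.
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)	/* assumed huge page size */

int main(void)
{
	/* hugetlbfs file shared between processes (assumed mount point) */
	int fd = open("/dev/hugepages/uffd-test", O_CREAT | O_RDWR, 0600);
	ftruncate(fd, HPAGE_SIZE);

	char *dst = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);

	/* create the userfaultfd and register the shared hugetlb range */
	long uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
	struct uffdio_api api = { .api = UFFD_API };
	ioctl(uffd, UFFDIO_API, &api);

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)dst, .len = HPAGE_SIZE },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	/* source buffer holding the contents to install */
	char *src = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(src, 0x5a, HPAGE_SIZE);

	/* atomically populate one huge page in the shared mapping */
	struct uffdio_copy copy = {
		.dst = (unsigned long)dst,
		.src = (unsigned long)src,
		.len = HPAGE_SIZE,
		.mode = 0,
	};
	ioctl(uffd, UFFDIO_COPY, &copy);

	return 0;
}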
Diffstat (limited to 'mm/userfaultfd.c')
-rw-r--r--  mm/userfaultfd.c  74
1 file changed, 58 insertions, 16 deletions
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index a0817cc470b0..1e5c2f94e8a3 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -154,6 +154,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 					      unsigned long len,
 					      bool zeropage)
 {
+	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
+	int vm_shared = dst_vma->vm_flags & VM_SHARED;
 	ssize_t err;
 	pte_t *dst_pte;
 	unsigned long src_addr, dst_addr;
@@ -204,14 +206,14 @@ retry:
 			goto out_unlock;
 
 		/*
-		 * Make sure the vma is not shared, that the remaining dst
-		 * range is both valid and fully within a single existing vma.
+		 * Make sure the remaining dst range is both valid and
+		 * fully within a single existing vma.
 		 */
-		if (dst_vma->vm_flags & VM_SHARED)
-			goto out_unlock;
 		if (dst_start < dst_vma->vm_start ||
 		    dst_start + len > dst_vma->vm_end)
 			goto out_unlock;
+
+		vm_shared = dst_vma->vm_flags & VM_SHARED;
 	}
 
 	if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
@@ -225,11 +227,13 @@ retry:
 		goto out_unlock;
 
 	/*
-	 * Ensure the dst_vma has a anon_vma.
+	 * If not shared, ensure the dst_vma has a anon_vma.
 	 */
 	err = -ENOMEM;
-	if (unlikely(anon_vma_prepare(dst_vma)))
-		goto out_unlock;
+	if (!vm_shared) {
+		if (unlikely(anon_vma_prepare(dst_vma)))
+			goto out_unlock;
+	}
 
 	h = hstate_vma(dst_vma);
 
@@ -266,6 +270,7 @@ retry:
 						dst_addr, src_addr, &page);
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+		vm_alloc_shared = vm_shared;
 
 		cond_resched();
 
@@ -305,18 +310,49 @@ out:
 	if (page) {
 		/*
 		 * We encountered an error and are about to free a newly
-		 * allocated huge page.  It is possible that there was a
-		 * reservation associated with the page that has been
-		 * consumed.  See the routine restore_reserve_on_error
-		 * for details.  Unfortunately, we can not call
-		 * restore_reserve_on_error now as it would require holding
-		 * mmap_sem.  Clear the PagePrivate flag so that the global
+		 * allocated huge page.
+		 *
+		 * Reservation handling is very subtle, and is different for
+		 * private and shared mappings.  See the routine
+		 * restore_reserve_on_error for details.  Unfortunately, we
+		 * can not call restore_reserve_on_error now as it would
+		 * require holding mmap_sem.
+		 *
+		 * If a reservation for the page existed in the reservation
+		 * map of a private mapping, the map was modified to indicate
+		 * the reservation was consumed when the page was allocated.
+		 * We clear the PagePrivate flag now so that the global
 		 * reserve count will not be incremented in free_huge_page.
 		 * The reservation map will still indicate the reservation
 		 * was consumed and possibly prevent later page allocation.
-		 * This is better than leaking a global reservation.
+		 * This is better than leaking a global reservation.  If no
+		 * reservation existed, it is still safe to clear PagePrivate
+		 * as no adjustments to reservation counts were made during
+		 * allocation.
+		 *
+		 * The reservation map for shared mappings indicates which
+		 * pages have reservations.  When a huge page is allocated
+		 * for an address with a reservation, no change is made to
+		 * the reserve map.  In this case PagePrivate will be set
+		 * to indicate that the global reservation count should be
+		 * incremented when the page is freed.  This is the desired
+		 * behavior.  However, when a huge page is allocated for an
+		 * address without a reservation a reservation entry is added
+		 * to the reservation map, and PagePrivate will not be set.
+		 * When the page is freed, the global reserve count will NOT
+		 * be incremented and it will appear as though we have leaked
+		 * reserved page.  In this case, set PagePrivate so that the
+		 * global reserve count will be incremented to match the
+		 * reservation map entry which was created.
+		 *
+		 * Note that vm_alloc_shared is based on the flags of the vma
+		 * for which the page was originally allocated.  dst_vma could
+		 * be different or NULL on error.
 		 */
-		ClearPagePrivate(page);
+		if (vm_alloc_shared)
+			SetPagePrivate(page);
+		else
+			ClearPagePrivate(page);
 		put_page(page);
 	}
 	BUG_ON(copied < 0);
@@ -372,8 +408,14 @@ retry:
 	dst_vma = find_vma(dst_mm, dst_start);
 	if (!dst_vma)
 		goto out_unlock;
-	if (!vma_is_shmem(dst_vma) && dst_vma->vm_flags & VM_SHARED)
+	/*
+	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
+	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
+	 */
+	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
+			 dst_vma->vm_flags & VM_SHARED))
 		goto out_unlock;
+
 	if (dst_start < dst_vma->vm_start ||
 	    dst_start + len > dst_vma->vm_end)
 		goto out_unlock;