author     Mike Kravetz <mike.kravetz@oracle.com>        2016-01-15 19:57:40 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org> 2016-01-15 20:56:32 -0500
commit     4aae8d1c051ea00b456da6811bc36d1f69de5445 (patch)
tree       6a32e3ded1783842e31aaf42d69eb2a6b663975f
parent     9aacdd354d197ad64685941b36d28ea20ab88757 (diff)
mm/hugetlbfs: unmap pages if page fault raced with hole punch
Page faults can race with fallocate hole punch.  If a page fault happens
between the unmap and remove operations, the page is not removed and
remains within the hole.  This is not the desired behavior.  The race is
difficult to detect in user level code as even in the non-race case, a
page within the hole could be faulted back in before fallocate returns.
If userfaultfd is expanded to support hugetlbfs in the future, this race
will be easier to observe.

If this race is detected and a page is mapped, the remove operation
(remove_inode_hugepages) will unmap the page before removing.  The unmap
within remove_inode_hugepages occurs with the hugetlb_fault_mutex held so
that no other faults will be processed until the page is removed.

The (unmodified) routine hugetlb_vmdelete_list was moved ahead of
remove_inode_hugepages to satisfy the new reference.

[akpm@linux-foundation.org: move hugetlb_vmdelete_list()]
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  fs/hugetlbfs/inode.c  144
1 file changed, 75 insertions, 69 deletions
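For context, the race the commit message describes can be pictured from userspace: one thread keeps faulting pages of a mapped hugetlbfs file while another punches a hole over the same range with fallocate(FALLOC_FL_PUNCH_HOLE).  The sketch below is only an illustration of the two racing operations, not part of the patch; the /dev/hugepages mount point, the 2 MB huge page size, the file name, the iteration counts, and the availability of enough preallocated huge pages are all assumptions, and any bad interleaving is timing dependent rather than guaranteed.

/*
 * Illustrative sketch (not from the patch): one thread repeatedly
 * touches (faults in) pages of a hugetlbfs file while the main thread
 * punches a hole over the same range.  Assumes /dev/hugepages is a
 * hugetlbfs mount with 2MB pages and a large enough hugepage pool.
 * Build with: gcc -O2 -pthread race-sketch.c
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed huge page size */
#define NR_HPAGES	8

static char *map;

static void *toucher(void *arg)
{
	(void)arg;
	/* Fault pages back in while hole punch may be in progress. */
	for (int i = 0; i < 10000; i++)
		for (unsigned long p = 0; p < NR_HPAGES; p++)
			map[p * HPAGE_SIZE] = 1;
	return NULL;
}

int main(void)
{
	int fd = open("/dev/hugepages/race-test", O_CREAT | O_RDWR, 0600);
	pthread_t thr;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ftruncate(fd, NR_HPAGES * HPAGE_SIZE)) {
		perror("ftruncate");
		return 1;
	}
	map = mmap(NULL, NR_HPAGES * HPAGE_SIZE, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	pthread_create(&thr, NULL, toucher, NULL);

	/*
	 * Repeatedly punch a hole over the whole file.  Before this patch,
	 * a fault that slipped in between the unmap and remove steps could
	 * leave a page instantiated inside the "hole".
	 */
	for (int i = 0; i < 10000; i++)
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      0, NR_HPAGES * HPAGE_SIZE))
			perror("fallocate");

	pthread_join(thr, NULL);
	munmap(map, NR_HPAGES * HPAGE_SIZE);
	close(fd);
	unlink("/dev/hugepages/race-test");
	return 0;
}

With the patch below applied, remove_inode_hugepages re-checks page_mapped() while holding hugetlb_fault_mutex and unmaps again, so a fault that slips in between the caller's unmap and the page removal no longer leaves a mapped page inside the punched hole.
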
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9c07d2d754c9..8bbf7f3e2a27 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -324,11 +324,48 @@ static void remove_huge_page(struct page *page)
 	delete_from_page_cache(page);
 }
 
+static void
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+{
+	struct vm_area_struct *vma;
+
+	/*
+	 * end == 0 indicates that the entire range after
+	 * start should be unmapped.
+	 */
+	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+		unsigned long v_offset;
+		unsigned long v_end;
+
+		/*
+		 * Can the expression below overflow on 32-bit arches?
+		 * No, because the interval tree returns us only those vmas
+		 * which overlap the truncated area starting at pgoff,
+		 * and no vma on a 32-bit arch can span beyond the 4GB.
+		 */
+		if (vma->vm_pgoff < start)
+			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+		else
+			v_offset = 0;
+
+		if (!end)
+			v_end = vma->vm_end;
+		else {
+			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
+							+ vma->vm_start;
+			if (v_end > vma->vm_end)
+				v_end = vma->vm_end;
+		}
+
+		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
+									NULL);
+	}
+}
 
 /*
  * remove_inode_hugepages handles two distinct cases: truncation and hole
  * punch.  There are subtle differences in operation for each case.
-
+ *
  * truncation is indicated by end of range being LLONG_MAX
  *	In this case, we first scan the range and release found pages.
  *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
@@ -379,6 +416,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 	for (i = 0; i < pagevec_count(&pvec); ++i) {
 		struct page *page = pvec.pages[i];
+		bool rsv_on_error;
 		u32 hash;
 
 		/*
@@ -395,37 +433,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 						mapping, next, 0);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
-		lock_page(page);
-		if (likely(!page_mapped(page))) {
-			bool rsv_on_error = !PagePrivate(page);
-			/*
-			 * We must free the huge page and remove
-			 * from page cache (remove_huge_page) BEFORE
-			 * removing the region/reserve map
-			 * (hugetlb_unreserve_pages).  In rare out
-			 * of memory conditions, removal of the
-			 * region/reserve map could fail.  Before
-			 * free'ing the page, note PagePrivate which
-			 * is used in case of error.
-			 */
-			remove_huge_page(page);
-			freed++;
-			if (!truncate_op) {
-				if (unlikely(hugetlb_unreserve_pages(
-						inode, next,
-						next + 1, 1)))
-					hugetlb_fix_reserve_counts(
-						inode, rsv_on_error);
-			}
-		} else {
-			/*
-			 * If page is mapped, it was faulted in after
-			 * being unmapped.  It indicates a race between
-			 * hole punch and page fault.  Do nothing in
-			 * this case.  Getting here in a truncate
-			 * operation is a bug.
-			 */
+		/*
+		 * If page is mapped, it was faulted in after being
+		 * unmapped in caller.  Unmap (again) now after taking
+		 * the fault mutex.  The mutex will prevent faults
+		 * until we finish removing the page.
+		 *
+		 * This race can only happen in the hole punch case.
+		 * Getting here in a truncate operation is a bug.
+		 */
+		if (unlikely(page_mapped(page))) {
 			BUG_ON(truncate_op);
+
+			i_mmap_lock_write(mapping);
+			hugetlb_vmdelete_list(&mapping->i_mmap,
+				next * pages_per_huge_page(h),
+				(next + 1) * pages_per_huge_page(h));
+			i_mmap_unlock_write(mapping);
+		}
+
+		lock_page(page);
+		/*
+		 * We must free the huge page and remove from page
+		 * cache (remove_huge_page) BEFORE removing the
+		 * region/reserve map (hugetlb_unreserve_pages).  In
+		 * rare out of memory conditions, removal of the
+		 * region/reserve map could fail.  Before free'ing
+		 * the page, note PagePrivate which is used in case
+		 * of error.
+		 */
+		rsv_on_error = !PagePrivate(page);
+		remove_huge_page(page);
+		freed++;
+		if (!truncate_op) {
+			if (unlikely(hugetlb_unreserve_pages(inode,
+						next, next + 1, 1)))
+				hugetlb_fix_reserve_counts(inode,
+							rsv_on_error);
 		}
 
 		unlock_page(page);
@@ -452,44 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
 	clear_inode(inode);
 }
 
-static inline void
-hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
-{
-	struct vm_area_struct *vma;
-
-	/*
-	 * end == 0 indicates that the entire range after
-	 * start should be unmapped.
-	 */
-	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
-		unsigned long v_offset;
-		unsigned long v_end;
-
-		/*
-		 * Can the expression below overflow on 32-bit arches?
-		 * No, because the interval tree returns us only those vmas
-		 * which overlap the truncated area starting at pgoff,
-		 * and no vma on a 32-bit arch can span beyond the 4GB.
-		 */
-		if (vma->vm_pgoff < start)
-			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
-		else
-			v_offset = 0;
-
-		if (!end)
-			v_end = vma->vm_end;
-		else {
-			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
-							+ vma->vm_start;
-			if (v_end > vma->vm_end)
-				v_end = vma->vm_end;
-		}
-
-		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
-									NULL);
-	}
-}
-
 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 {
 	pgoff_t pgoff;