author     Mike Kravetz <mike.kravetz@oracle.com>          2019-01-08 18:23:32 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-01-08 20:15:11 -0500
commit     e7c58097793ef15d58fadf190ee58738fbf447cd
tree       7dd04f50d4daadba4f8f3550a6d39d1a0ac2cda7
parent     8ab88c7169b7fba98812ead6524b9d05bc76cf00
hugetlbfs: revert "Use i_mmap_rwsem to fix page fault/truncate race"
This reverts commit c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54.
The reverted commit caused ABBA deadlocks when file migration raced with
file eviction for specific hugetlbfs files. This was discovered with a
modified version of the LTP move_pages12 test.
The purpose of the reverted patch was to close a long-standing race
between hugetlbfs file truncation and page faults. After more analysis
of the patch and the impacted code, it was determined that i_mmap_rwsem
cannot be used for all of the required synchronization. Therefore, revert
this patch while working on another approach to the underlying issue.
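
The deadlock named above is the classic ABBA pattern: two code paths take the same pair of locks in opposite orders, each ends up holding one lock while waiting for the other, and neither can make progress. A minimal userspace sketch of that pattern, with hypothetical names and pthread mutexes standing in for the two kernel locks (an illustration of the pattern only, not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Stand-ins for the two locks involved in the inverted ordering. */
static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* Path 1 (think "migration"): takes A, then B. */
static void *path1(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock_a);
        sleep(1);                       /* widen the race window */
        pthread_mutex_lock(&lock_b);    /* blocks if path2 already holds B */
        pthread_mutex_unlock(&lock_b);
        pthread_mutex_unlock(&lock_a);
        return NULL;
}

/* Path 2 (think "eviction"): takes B, then A -- the inverted order. */
static void *path2(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock_b);
        sleep(1);
        pthread_mutex_lock(&lock_a);    /* blocks if path1 already holds A */
        pthread_mutex_unlock(&lock_a);
        pthread_mutex_unlock(&lock_b);
        return NULL;
}

int main(void)
{
        pthread_t t1, t2;

        pthread_create(&t1, NULL, path1, NULL);
        pthread_create(&t2, NULL, path2, NULL);

        /* With each thread asleep while holding its first lock, both
         * second acquisitions block forever and the joins never return. */
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        printf("no deadlock this run\n");
        return 0;
}

The usual cures are to make every path take the locks in the same order, or to stop taking one of the locks across the section that needs the other, which is roughly what this revert does.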
Link: http://lkml.kernel.org/r/20190103235452.29335-1-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Jan Stancek <jstancek@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 fs/hugetlbfs/inode.c | 61
 mm/hugetlb.c         | 21
 2 files changed, 44 insertions(+), 38 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a2fcea5f8225..32920a10100e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -383,16 +383,17 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
  * truncation is indicated by end of range being LLONG_MAX
  *      In this case, we first scan the range and release found pages.
  *      After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
- *      maps and global counts.
+ *      maps and global counts.  Page faults can not race with truncation
+ *      in this routine.  hugetlb_no_page() prevents page faults in the
+ *      truncated range.  It checks i_size before allocation, and again after
+ *      with the page table lock for the page held.  The same lock must be
+ *      acquired to unmap a page.
  * hole punch is indicated if end is not LLONG_MAX
  *      In the hole punch case we scan the range and release found pages.
  *      Only when releasing a page is the associated region/reserv map
  *      deleted.  The region/reserv map for ranges without associated
- *      pages are not modified.
- *
- * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
- * races with page faults.
- *
+ *      pages are not modified.  Page faults can race with hole punch.
+ *      This is indicated if we find a mapped page.
  * Note: If the passed end of range value is beyond the end of file, but
  *      not LLONG_MAX this routine still performs a hole punch operation.
  */
@@ -422,14 +423,32 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
                 for (i = 0; i < pagevec_count(&pvec); ++i) {
                         struct page *page = pvec.pages[i];
+                        u32 hash;
 
                         index = page->index;
+                        hash = hugetlb_fault_mutex_hash(h, current->mm,
+                                                        &pseudo_vma,
+                                                        mapping, index, 0);
+                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
                         /*
-                         * A mapped page is impossible as callers should unmap
-                         * all references before calling.  And, i_mmap_rwsem
-                         * prevents the creation of additional mappings.
+                         * If page is mapped, it was faulted in after being
+                         * unmapped in caller.  Unmap (again) now after taking
+                         * the fault mutex.  The mutex will prevent faults
+                         * until we finish removing the page.
+                         *
+                         * This race can only happen in the hole punch case.
+                         * Getting here in a truncate operation is a bug.
                          */
-                        VM_BUG_ON(page_mapped(page));
+                        if (unlikely(page_mapped(page))) {
+                                BUG_ON(truncate_op);
+
+                                i_mmap_lock_write(mapping);
+                                hugetlb_vmdelete_list(&mapping->i_mmap,
+                                        index * pages_per_huge_page(h),
+                                        (index + 1) * pages_per_huge_page(h));
+                                i_mmap_unlock_write(mapping);
+                        }
 
                         lock_page(page);
                         /*
@@ -451,6 +470,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
                         }
 
                         unlock_page(page);
+                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                 }
                 huge_pagevec_release(&pvec);
                 cond_resched();
@@ -462,20 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 static void hugetlbfs_evict_inode(struct inode *inode)
 {
-        struct address_space *mapping = inode->i_mapping;
         struct resv_map *resv_map;
 
-        /*
-         * The vfs layer guarantees that there are no other users of this
-         * inode.  Therefore, it would be safe to call remove_inode_hugepages
-         * without holding i_mmap_rwsem.  We acquire and hold here to be
-         * consistent with other callers.  Since there will be no contention
-         * on the semaphore, overhead is negligible.
-         */
-        i_mmap_lock_write(mapping);
         remove_inode_hugepages(inode, 0, LLONG_MAX);
-        i_mmap_unlock_write(mapping);
-
         resv_map = (struct resv_map *)inode->i_mapping->private_data;
         /* root inode doesn't have the resv_map, so we should check it */
         if (resv_map)
@@ -496,8 +505,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
         i_mmap_lock_write(mapping);
         if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
                 hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
-        remove_inode_hugepages(inode, offset, LLONG_MAX);
         i_mmap_unlock_write(mapping);
+        remove_inode_hugepages(inode, offset, LLONG_MAX);
         return 0;
 }
 
@@ -531,8 +540,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                         hugetlb_vmdelete_list(&mapping->i_mmap,
                                                 hole_start >> PAGE_SHIFT,
                                                 hole_end >> PAGE_SHIFT);
-                remove_inode_hugepages(inode, hole_start, hole_end);
                 i_mmap_unlock_write(mapping);
+                remove_inode_hugepages(inode, hole_start, hole_end);
                 inode_unlock(inode);
         }
 
@@ -615,11 +624,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
                 /* addr is the offset within the file (zero based) */
                 addr = index * hpage_size;
 
-                /*
-                 * fault mutex taken here, protects against fault path
-                 * and hole punch.  inode_lock previously taken protects
-                 * against truncation.
-                 */
+                /* mutex taken here, fault path and hole punch */
                 hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
                                                 index, addr);
                 mutex_lock(&hugetlb_fault_mutex_table[hash]);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 745088810965..aedc1b183cf9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3755,16 +3755,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         }
 
         /*
-         * We can not race with truncation due to holding i_mmap_rwsem.
-         * Check once here for faults beyond end of file.
+         * Use page lock to guard against racing truncation
+         * before we get page_table_lock.
          */
-        size = i_size_read(mapping->host) >> huge_page_shift(h);
-        if (idx >= size)
-                goto out;
-
 retry:
         page = find_lock_page(mapping, idx);
         if (!page) {
+                size = i_size_read(mapping->host) >> huge_page_shift(h);
+                if (idx >= size)
+                        goto out;
+
                 /*
                  * Check for page in userfault range
                  */
@@ -3854,6 +3854,9 @@ retry:
         }
 
         ptl = huge_pte_lock(h, mm, ptep);
+        size = i_size_read(mapping->host) >> huge_page_shift(h);
+        if (idx >= size)
+                goto backout;
 
         ret = 0;
         if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3956,10 +3959,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
         /*
          * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-         * until finished with ptep.  This serves two purposes:
-         * 1) It prevents huge_pmd_unshare from being called elsewhere
-         *    and making the ptep no longer valid.
-         * 2) It synchronizes us with file truncation.
+         * until finished with ptep.  This prevents huge_pmd_unshare from
+         * being called elsewhere and making the ptep no longer valid.
          *
          * ptep could have already be assigned via huge_pte_offset.  That
          * is OK, as huge_pte_alloc will return the same value unless
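
The hugetlb_no_page() hunks above restore a check/re-check scheme: i_size is read once before the slow allocation path and read again after taking the page table lock, the same lock truncation must take to unmap the page, so a fault that races with truncation is caught and backed out before a stale mapping is installed. A rough userspace sketch of that re-validation pattern, with hypothetical names and a plain mutex standing in for the page table lock (an illustration of the pattern, not kernel code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_SLOTS 16

/* Hypothetical stand-ins: file_size plays the role of i_size, table_lock
 * the page table lock, and slots[] the installed mappings. */
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static size_t file_size = NR_SLOTS;        /* measured in "pages" */
static bool slots[NR_SLOTS];

/* Fault path: cheap early check, slow "allocation", then a re-check under
 * the same lock the truncate path uses, just before installing. */
static int fault_in(size_t idx)
{
        if (idx >= file_size)              /* early check, racy on its own */
                return -1;

        /* ... slow work (allocation, zeroing, I/O) happens here, during
         * which a concurrent truncate may shrink file_size ... */

        pthread_mutex_lock(&table_lock);
        if (idx >= file_size) {            /* re-check under the lock */
                pthread_mutex_unlock(&table_lock);
                return -1;                 /* back out: raced with truncate */
        }
        slots[idx] = true;                 /* safe to install the mapping */
        pthread_mutex_unlock(&table_lock);
        return 0;
}

/* Truncate path: publishes the new size and removes mappings under the
 * same lock, so no fault can install a slot beyond the new size after
 * this returns. */
static void truncate_to(size_t new_size)
{
        pthread_mutex_lock(&table_lock);
        file_size = new_size;
        for (size_t i = new_size; i < NR_SLOTS; i++)
                slots[i] = false;
        pthread_mutex_unlock(&table_lock);
}

int main(void)
{
        printf("fault_in(4) before truncate: %d\n", fault_in(4));  /* 0  */
        truncate_to(2);
        printf("fault_in(4) after truncate:  %d\n", fault_in(4));  /* -1 */
        return 0;
}

The point of the sketch is only the ordering: the remover publishes the new size and clears entries under the same lock the inserter re-checks, so nothing can be installed beyond the new size once truncate_to() returns.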