author     Mike Kravetz <mike.kravetz@oracle.com>          2019-01-08 18:23:32 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-01-08 20:15:11 -0500
commit     e7c58097793ef15d58fadf190ee58738fbf447cd
tree       7dd04f50d4daadba4f8f3550a6d39d1a0ac2cda7
parent     8ab88c7169b7fba98812ead6524b9d05bc76cf00
hugetlbfs: revert "Use i_mmap_rwsem to fix page fault/truncate race"
This reverts commit c86aa7bbfd5568ba8a82d3635d8f7b8a8e06fe54.
The reverted commit caused ABBA deadlocks when file migration raced with
file eviction for specific hugetlbfs files. This was discovered with a
modified version of the LTP move_pages12 test.
The purpose of the reverted patch was to close a long-standing race
between hugetlbfs file truncation and page faults. After more analysis
of the patch and the impacted code, it was determined that i_mmap_rwsem
cannot be used for all of the required synchronization. Therefore, revert
this patch while working on another approach to the underlying issue.
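
The deadlock named above is the classic ABBA pattern: two code paths take the same pair of locks in opposite orders, each ends up holding one lock while waiting for the other, and neither can make progress. A minimal userspace sketch of that pattern, with hypothetical names and pthread mutexes standing in for the two kernel locks (an illustration of the pattern only, not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Stand-ins for the two locks involved in the inverted ordering. */
static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* Path 1 (think "migration"): takes A, then B. */
static void *path1(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock_a);
        sleep(1);                       /* widen the race window */
        pthread_mutex_lock(&lock_b);    /* blocks if path2 already holds B */
        pthread_mutex_unlock(&lock_b);
        pthread_mutex_unlock(&lock_a);
        return NULL;
}

/* Path 2 (think "eviction"): takes B, then A -- the inverted order. */
static void *path2(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock_b);
        sleep(1);
        pthread_mutex_lock(&lock_a);    /* blocks if path1 already holds A */
        pthread_mutex_unlock(&lock_a);
        pthread_mutex_unlock(&lock_b);
        return NULL;
}

int main(void)
{
        pthread_t t1, t2;

        pthread_create(&t1, NULL, path1, NULL);
        pthread_create(&t2, NULL, path2, NULL);

        /* With each thread asleep while holding its first lock, both
         * second acquisitions block forever and the joins never return. */
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        printf("no deadlock this run\n");
        return 0;
}

The usual cures are to make every path take the locks in the same order, or to stop taking one of the locks across the section that needs the other, which is roughly what this revert does.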
Link: http://lkml.kernel.org/r/20190103235452.29335-1-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Jan Stancek <jstancek@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 fs/hugetlbfs/inode.c | 61
 mm/hugetlb.c         | 21
 2 files changed, 44 insertions(+), 38 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a2fcea5f8225..32920a10100e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -383,16 +383,17 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
  * truncation is indicated by end of range being LLONG_MAX
  *      In this case, we first scan the range and release found pages.
  *      After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
- *      maps and global counts.
+ *      maps and global counts.  Page faults can not race with truncation
+ *      in this routine.  hugetlb_no_page() prevents page faults in the
+ *      truncated range.  It checks i_size before allocation, and again after
+ *      with the page table lock for the page held.  The same lock must be
+ *      acquired to unmap a page.
  * hole punch is indicated if end is not LLONG_MAX
  *      In the hole punch case we scan the range and release found pages.
  *      Only when releasing a page is the associated region/reserv map
  *      deleted.  The region/reserv map for ranges without associated
- *      pages are not modified.
- *
- * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
- * races with page faults.
- *
+ *      pages are not modified.  Page faults can race with hole punch.
+ *      This is indicated if we find a mapped page.
  * Note: If the passed end of range value is beyond the end of file, but
  *      not LLONG_MAX this routine still performs a hole punch operation.
  */
@@ -422,14 +423,32 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
                 for (i = 0; i < pagevec_count(&pvec); ++i) {
                         struct page *page = pvec.pages[i];
+                        u32 hash;
 
                         index = page->index;
+                        hash = hugetlb_fault_mutex_hash(h, current->mm,
+                                                        &pseudo_vma,
+                                                        mapping, index, 0);
+                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
                         /*
-                         * A mapped page is impossible as callers should unmap
-                         * all references before calling.  And, i_mmap_rwsem
-                         * prevents the creation of additional mappings.
+                         * If page is mapped, it was faulted in after being
+                         * unmapped in caller.  Unmap (again) now after taking
+                         * the fault mutex.  The mutex will prevent faults
+                         * until we finish removing the page.
+                         *
+                         * This race can only happen in the hole punch case.
+                         * Getting here in a truncate operation is a bug.
                          */
-                        VM_BUG_ON(page_mapped(page));
+                        if (unlikely(page_mapped(page))) {
+                                BUG_ON(truncate_op);
+
+                                i_mmap_lock_write(mapping);
+                                hugetlb_vmdelete_list(&mapping->i_mmap,
+                                        index * pages_per_huge_page(h),
+                                        (index + 1) * pages_per_huge_page(h));
+                                i_mmap_unlock_write(mapping);
+                        }
 
                         lock_page(page);
                         /*
@@ -451,6 +470,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
                         }
 
                         unlock_page(page);
+                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                 }
                 huge_pagevec_release(&pvec);
                 cond_resched();
@@ -462,20 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 static void hugetlbfs_evict_inode(struct inode *inode)
 {
-        struct address_space *mapping = inode->i_mapping;
         struct resv_map *resv_map;
 
-        /*
-         * The vfs layer guarantees that there are no other users of this
-         * inode.  Therefore, it would be safe to call remove_inode_hugepages
-         * without holding i_mmap_rwsem.  We acquire and hold here to be
-         * consistent with other callers.  Since there will be no contention
-         * on the semaphore, overhead is negligible.
-         */
-        i_mmap_lock_write(mapping);
         remove_inode_hugepages(inode, 0, LLONG_MAX);
-        i_mmap_unlock_write(mapping);
-
         resv_map = (struct resv_map *)inode->i_mapping->private_data;
         /* root inode doesn't have the resv_map, so we should check it */
         if (resv_map)
@@ -496,8 +505,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
         i_mmap_lock_write(mapping);
         if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
                 hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
-        remove_inode_hugepages(inode, offset, LLONG_MAX);
         i_mmap_unlock_write(mapping);
+        remove_inode_hugepages(inode, offset, LLONG_MAX);
         return 0;
 }
 
@@ -531,8 +540,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                         hugetlb_vmdelete_list(&mapping->i_mmap,
                                                 hole_start >> PAGE_SHIFT,
                                                 hole_end >> PAGE_SHIFT);
-                remove_inode_hugepages(inode, hole_start, hole_end);
                 i_mmap_unlock_write(mapping);
+                remove_inode_hugepages(inode, hole_start, hole_end);
                 inode_unlock(inode);
         }
 
@@ -615,11 +624,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
                 /* addr is the offset within the file (zero based) */
                 addr = index * hpage_size;
 
-                /*
-                 * fault mutex taken here, protects against fault path
-                 * and hole punch.  inode_lock previously taken protects
-                 * against truncation.
-                 */
+                /* mutex taken here, fault path and hole punch */
                 hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
                                                 index, addr);
                 mutex_lock(&hugetlb_fault_mutex_table[hash]);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 745088810965..aedc1b183cf9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3755,16 +3755,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         }
 
         /*
-         * We can not race with truncation due to holding i_mmap_rwsem.
-         * Check once here for faults beyond end of file.
+         * Use page lock to guard against racing truncation
+         * before we get page_table_lock.
          */
-        size = i_size_read(mapping->host) >> huge_page_shift(h);
-        if (idx >= size)
-                goto out;
-
 retry:
         page = find_lock_page(mapping, idx);
         if (!page) {
+                size = i_size_read(mapping->host) >> huge_page_shift(h);
+                if (idx >= size)
+                        goto out;
+
                 /*
                  * Check for page in userfault range
                  */
@@ -3854,6 +3854,9 @@ retry:
         }
 
         ptl = huge_pte_lock(h, mm, ptep);
+        size = i_size_read(mapping->host) >> huge_page_shift(h);
+        if (idx >= size)
+                goto backout;
 
         ret = 0;
         if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3956,10 +3959,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
         /*
          * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-         * until finished with ptep.  This serves two purposes:
-         * 1) It prevents huge_pmd_unshare from being called elsewhere
-         *    and making the ptep no longer valid.
-         * 2) It synchronizes us with file truncation.
+         * until finished with ptep.  This prevents huge_pmd_unshare from
+         * being called elsewhere and making the ptep no longer valid.
          *
          * ptep could have already be assigned via huge_pte_offset.  That
          * is OK, as huge_pte_alloc will return the same value unless
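
The hugetlb_no_page() hunks above restore a check/re-check scheme: i_size is read once before the slow allocation path and read again after taking the page table lock, the same lock truncation must take to unmap the page, so a fault that races with truncation is caught and backed out before a stale mapping is installed. A rough userspace sketch of that re-validation pattern, with hypothetical names and a plain mutex standing in for the page table lock (an illustration of the pattern, not kernel code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_SLOTS 16

/* Hypothetical stand-ins: file_size plays the role of i_size, table_lock
 * the page table lock, and slots[] the installed mappings. */
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static size_t file_size = NR_SLOTS;        /* measured in "pages" */
static bool slots[NR_SLOTS];

/* Fault path: cheap early check, slow "allocation", then a re-check under
 * the same lock the truncate path uses, just before installing. */
static int fault_in(size_t idx)
{
        if (idx >= file_size)              /* early check, racy on its own */
                return -1;

        /* ... slow work (allocation, zeroing, I/O) happens here, during
         * which a concurrent truncate may shrink file_size ... */

        pthread_mutex_lock(&table_lock);
        if (idx >= file_size) {            /* re-check under the lock */
                pthread_mutex_unlock(&table_lock);
                return -1;                 /* back out: raced with truncate */
        }
        slots[idx] = true;                 /* safe to install the mapping */
        pthread_mutex_unlock(&table_lock);
        return 0;
}

/* Truncate path: publishes the new size and removes mappings under the
 * same lock, so no fault can install a slot beyond the new size after
 * this returns. */
static void truncate_to(size_t new_size)
{
        pthread_mutex_lock(&table_lock);
        file_size = new_size;
        for (size_t i = new_size; i < NR_SLOTS; i++)
                slots[i] = false;
        pthread_mutex_unlock(&table_lock);
}

int main(void)
{
        printf("fault_in(4) before truncate: %d\n", fault_in(4));  /* 0  */
        truncate_to(2);
        printf("fault_in(4) after truncate:  %d\n", fault_in(4));  /* -1 */
        return 0;
}

The point of the sketch is only the ordering: the remover publishes the new size and clears entries under the same lock the inserter re-checks, so nothing can be installed beyond the new size once truncate_to() returns.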