author     Mike Kravetz <mike.kravetz@oracle.com>        2016-01-15 19:57:40 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org> 2016-01-15 20:56:32 -0500
commit     4aae8d1c051ea00b456da6811bc36d1f69de5445 (patch)
tree       6a32e3ded1783842e31aaf42d69eb2a6b663975f
parent     9aacdd354d197ad64685941b36d28ea20ab88757 (diff)
mm/hugetlbfs: unmap pages if page fault raced with hole punch
Page faults can race with fallocate hole punch.  If a page fault happens
between the unmap and remove operations, the page is not removed and
remains within the hole.  This is not the desired behavior.  The race is
difficult to detect in user level code as even in the non-race case, a
page within the hole could be faulted back in before fallocate returns.
If userfaultfd is expanded to support hugetlbfs in the future, this race
will be easier to observe.

If this race is detected and a page is mapped, the remove operation
(remove_inode_hugepages) will unmap the page before removing.  The unmap
within remove_inode_hugepages occurs with the hugetlb_fault_mutex held so
that no other faults will be processed until the page is removed.

The (unmodified) routine hugetlb_vmdelete_list was moved ahead of
remove_inode_hugepages to satisfy the new reference.

[akpm@linux-foundation.org: move hugetlb_vmdelete_list()]
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  fs/hugetlbfs/inode.c  144
1 file changed, 75 insertions, 69 deletions
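For context, the race the commit message describes can be pictured from userspace: one thread keeps faulting pages of a mapped hugetlbfs file while another punches a hole over the same range with fallocate(FALLOC_FL_PUNCH_HOLE).  The sketch below is only an illustration of the two racing operations, not part of the patch; the /dev/hugepages mount point, the 2 MB huge page size, the file name, the iteration counts, and the availability of enough preallocated huge pages are all assumptions, and any bad interleaving is timing dependent rather than guaranteed.

/*
 * Illustrative sketch (not from the patch): one thread repeatedly
 * touches (faults in) pages of a hugetlbfs file while the main thread
 * punches a hole over the same range.  Assumes /dev/hugepages is a
 * hugetlbfs mount with 2MB pages and a large enough hugepage pool.
 * Build with: gcc -O2 -pthread race-sketch.c
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed huge page size */
#define NR_HPAGES	8

static char *map;

static void *toucher(void *arg)
{
	(void)arg;
	/* Fault pages back in while hole punch may be in progress. */
	for (int i = 0; i < 10000; i++)
		for (unsigned long p = 0; p < NR_HPAGES; p++)
			map[p * HPAGE_SIZE] = 1;
	return NULL;
}

int main(void)
{
	int fd = open("/dev/hugepages/race-test", O_CREAT | O_RDWR, 0600);
	pthread_t thr;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ftruncate(fd, NR_HPAGES * HPAGE_SIZE)) {
		perror("ftruncate");
		return 1;
	}
	map = mmap(NULL, NR_HPAGES * HPAGE_SIZE, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	pthread_create(&thr, NULL, toucher, NULL);

	/*
	 * Repeatedly punch a hole over the whole file.  Before this patch,
	 * a fault that slipped in between the unmap and remove steps could
	 * leave a page instantiated inside the "hole".
	 */
	for (int i = 0; i < 10000; i++)
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      0, NR_HPAGES * HPAGE_SIZE))
			perror("fallocate");

	pthread_join(thr, NULL);
	munmap(map, NR_HPAGES * HPAGE_SIZE);
	close(fd);
	unlink("/dev/hugepages/race-test");
	return 0;
}

With the patch below applied, remove_inode_hugepages re-checks page_mapped() while holding hugetlb_fault_mutex and unmaps again, so a fault that slips in between the caller's unmap and the page removal no longer leaves a mapped page inside the punched hole.
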
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9c07d2d754c9..8bbf7f3e2a27 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -324,11 +324,48 @@ static void remove_huge_page(struct page *page)
 	delete_from_page_cache(page);
 }
 
+static void
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+{
+	struct vm_area_struct *vma;
+
+	/*
+	 * end == 0 indicates that the entire range after
+	 * start should be unmapped.
+	 */
+	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+		unsigned long v_offset;
+		unsigned long v_end;
+
+		/*
+		 * Can the expression below overflow on 32-bit arches?
+		 * No, because the interval tree returns us only those vmas
+		 * which overlap the truncated area starting at pgoff,
+		 * and no vma on a 32-bit arch can span beyond the 4GB.
+		 */
+		if (vma->vm_pgoff < start)
+			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+		else
+			v_offset = 0;
+
+		if (!end)
+			v_end = vma->vm_end;
+		else {
+			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
+							+ vma->vm_start;
+			if (v_end > vma->vm_end)
+				v_end = vma->vm_end;
+		}
+
+		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
+									NULL);
+	}
+}
 
 /*
  * remove_inode_hugepages handles two distinct cases: truncation and hole
  * punch.  There are subtle differences in operation for each case.
-
+ *
  * truncation is indicated by end of range being LLONG_MAX
  *	In this case, we first scan the range and release found pages.
  *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
@@ -379,6 +416,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 	for (i = 0; i < pagevec_count(&pvec); ++i) {
 		struct page *page = pvec.pages[i];
+		bool rsv_on_error;
 		u32 hash;
 
 		/*
@@ -395,37 +433,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 						mapping, next, 0);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
-		lock_page(page);
-		if (likely(!page_mapped(page))) {
-			bool rsv_on_error = !PagePrivate(page);
-			/*
-			 * We must free the huge page and remove
-			 * from page cache (remove_huge_page) BEFORE
-			 * removing the region/reserve map
-			 * (hugetlb_unreserve_pages).  In rare out
-			 * of memory conditions, removal of the
-			 * region/reserve map could fail.  Before
-			 * free'ing the page, note PagePrivate which
-			 * is used in case of error.
-			 */
-			remove_huge_page(page);
-			freed++;
-			if (!truncate_op) {
-				if (unlikely(hugetlb_unreserve_pages(
-						inode, next,
-						next + 1, 1)))
-					hugetlb_fix_reserve_counts(
-						inode, rsv_on_error);
-			}
-		} else {
-			/*
-			 * If page is mapped, it was faulted in after
-			 * being unmapped.  It indicates a race between
-			 * hole punch and page fault.  Do nothing in
-			 * this case.  Getting here in a truncate
-			 * operation is a bug.
-			 */
+		/*
+		 * If page is mapped, it was faulted in after being
+		 * unmapped in caller.  Unmap (again) now after taking
+		 * the fault mutex.  The mutex will prevent faults
+		 * until we finish removing the page.
+		 *
+		 * This race can only happen in the hole punch case.
+		 * Getting here in a truncate operation is a bug.
+		 */
+		if (unlikely(page_mapped(page))) {
 			BUG_ON(truncate_op);
+
+			i_mmap_lock_write(mapping);
+			hugetlb_vmdelete_list(&mapping->i_mmap,
+				next * pages_per_huge_page(h),
+				(next + 1) * pages_per_huge_page(h));
+			i_mmap_unlock_write(mapping);
+		}
+
+		lock_page(page);
+		/*
+		 * We must free the huge page and remove from page
+		 * cache (remove_huge_page) BEFORE removing the
+		 * region/reserve map (hugetlb_unreserve_pages).  In
+		 * rare out of memory conditions, removal of the
+		 * region/reserve map could fail.  Before free'ing
+		 * the page, note PagePrivate which is used in case
+		 * of error.
+		 */
+		rsv_on_error = !PagePrivate(page);
+		remove_huge_page(page);
+		freed++;
+		if (!truncate_op) {
+			if (unlikely(hugetlb_unreserve_pages(inode,
+						next, next + 1, 1)))
+				hugetlb_fix_reserve_counts(inode,
+							rsv_on_error);
 		}
 
 		unlock_page(page);
@@ -452,44 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
 	clear_inode(inode);
 }
 
-static inline void
-hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
-{
-	struct vm_area_struct *vma;
-
-	/*
-	 * end == 0 indicates that the entire range after
-	 * start should be unmapped.
-	 */
-	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
-		unsigned long v_offset;
-		unsigned long v_end;
-
-		/*
-		 * Can the expression below overflow on 32-bit arches?
-		 * No, because the interval tree returns us only those vmas
-		 * which overlap the truncated area starting at pgoff,
-		 * and no vma on a 32-bit arch can span beyond the 4GB.
-		 */
-		if (vma->vm_pgoff < start)
-			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
-		else
-			v_offset = 0;
-
-		if (!end)
-			v_end = vma->vm_end;
-		else {
-			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
-							+ vma->vm_start;
-			if (v_end > vma->vm_end)
-				v_end = vma->vm_end;
-		}
-
-		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
-									NULL);
-	}
-}
-
 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 {
 	pgoff_t pgoff;