mm: fix fault vs invalidate race for linear mappings

Fix the race between invalidate_inode_pages and do_no_page. Andrea Arcangeli identified a subtle race between invalidation of pages from pagecache with userspace mappings, and do_no_page. The issue is that invalidation has to shoot down all mappings to the page, before it can be discarded from the pagecache. Between shooting down ptes to a particular page, and actually dropping the struct page from the pagecache, do_no_page from any process might fault on that page and establish a new mapping to the page just before it gets discarded from the pagecache. The most common case where such invalidation is used is in file truncation. This case was catered for by doing a sort of open-coded seqlock between the file's i_size, and its truncate_count. Truncation will decrease i_size, then increment truncate_count before unmapping userspace pages; do_no_page will read truncate_count, then find the page if it is within i_size, and then check truncate_count under the page table lock and back out and retry if it had subsequently been changed (ptl will serialise against unmapping, and ensure a potentially updated truncate_count is actually visible). Complexity and documentation issues aside, the locking protocol fails in the case where we would like to invalidate pagecache inside i_size. do_no_page can come in at any time and filemap_nopage is not aware of the invalidation in progress (as it is when it is outside i_size). The end result is that dangling (->mapping == NULL) pages that appear to be from a particular file may be mapped into userspace with nonsense data. Valid mappings to the same place will see a different page. Andrea implemented two working fixes, one using a real seqlock, another using a page->flags bit. He also proposed using the page lock in do_no_page, but that was initially considered too heavyweight.
However, it is not a global or per-file lock, and the page cacheline is modified in do_no_page to increment _count and _mapcount anyway, so a further modification should not be a large performance hit. Scalability is not an issue. This patch implements this latter approach. ->nopage implementations return with the page locked if it is possible for their underlying file to be invalidated (in that case, they must set a special vm_flags bit to indicate so). do_no_page only unlocks the page after setting up the mapping completely. Invalidation is excluded because it holds the page lock during invalidation of each page (and ensures that the page is not mapped while holding the lock). This also allows significant simplifications in do_no_page, because we have the page locked in the right place in the pagecache from the start. Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Nick Piggin <npiggin@suse.de> 2007-07-19 04:46:57 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-07-19 13:04:41 -0400
commit: d00806b183152af6d24f46f0c33f14162ca1262a (patch)
tree: 36f829cf13d5410374a3f00b56ec0b1f8dc3ce3c /mm/filemap.c
parent: 589f1e81bde732dd0b1bc5d01b6bddd4bcb4527b (diff)
1 files changed, 20 insertions, 33 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 5d5449f3d41c..462cda58a18e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1325,9 +1325,10 @@ struct page *filemap_nopage(struct vm_area_struct *area,
        unsigned long size, pgoff;
        int did_readaround = 0, majmin = VM_FAULT_MINOR;
+        BUG_ON(!(area->vm_flags & VM_CAN_INVALIDATE));
        pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
-retry_all:
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if (pgoff >= size)
                goto outside_data_content;
@@ -1349,7 +1350,7 @@ retry_all:
         * Do we have something in the page cache already?
         */
 retry_find:
-        page = find_get_page(mapping, pgoff);
+        page = find_lock_page(mapping, pgoff);
        if (!page) {
                unsigned long ra_pages;
@@ -1383,7 +1384,7 @@ retry_find:
                                start = pgoff - ra_pages / 2;
                        do_page_cache_readahead(mapping, file, start, ra_pages);
                }
-                page = find_get_page(mapping, pgoff);
+                page = find_lock_page(mapping, pgoff);
                if (!page)
                        goto no_cached_page;
        }
@@ -1392,13 +1393,19 @@ retry_find:
                ra->mmap_hit++;
        /*
-         * Ok, found a page in the page cache, now we need to check
+         * We have a locked page in the page cache, now we need to check
-         * that it's up-to-date.
+         * that it's up-to-date. If not, it is going to be due to an error.
         */
-        if (!PageUptodate(page))
+        if (unlikely(!PageUptodate(page)))
                goto page_not_uptodate;
-success:
+        /* Must recheck i_size under page lock */
+        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        if (unlikely(pgoff >= size)) {
+                unlock_page(page);
+                goto outside_data_content;
+        }
        /*
         * Found the page and have a reference on it.
         */
@@ -1440,6 +1447,7 @@ no_cached_page:
        return NOPAGE_SIGBUS;
 page_not_uptodate:
+        /* IO error path */
        if (!did_readaround) {
                majmin = VM_FAULT_MAJOR;
                count_vm_event(PGMAJFAULT);
@@ -1451,37 +1459,15 @@ page_not_uptodate:
         * because there really aren't any performance issues here
         * and we need to check for errors.
         */
-        lock_page(page);
-        /* Somebody truncated the page on us? */
-        if (!page->mapping) {
-                unlock_page(page);
-                page_cache_release(page);
-                goto retry_all;
-        }
-        /* Somebody else successfully read it in? */
-        if (PageUptodate(page)) {
-                unlock_page(page);
-                goto success;
-        }
        ClearPageError(page);
        error = mapping->a_ops->readpage(file, page);
-        if (!error) {
+        page_cache_release(page);
-                wait_on_page_locked(page);
-                if (PageUptodate(page))
+        if (!error || error == AOP_TRUNCATED_PAGE)
-                        goto success;
-        } else if (error == AOP_TRUNCATED_PAGE) {
-                page_cache_release(page);
                goto retry_find;
-        }
-        /*
+        /* Things didn't work out. Return zero to tell the mm layer so. */
-         * Things didn't work out. Return zero to tell the
-         * mm layer so, possibly freeing the page cache page first.
-         */
        shrink_readahead_size_eio(file, ra);
-        page_cache_release(page);
        return NOPAGE_SIGBUS;
 }
 EXPORT_SYMBOL(filemap_nopage);
@@ -1674,6 +1660,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
                return -ENOEXEC;
        file_accessed(file);
        vma->vm_ops = &generic_file_vm_ops;
+        vma->vm_flags |= VM_CAN_INVALIDATE;
        return 0;
 }
author	Nick Piggin <npiggin@suse.de>	2007-07-19 04:46:57 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-07-19 13:04:41 -0400
commit	d00806b183152af6d24f46f0c33f14162ca1262a (patch)
tree	36f829cf13d5410374a3f00b56ec0b1f8dc3ce3c /mm/filemap.c
parent	589f1e81bde732dd0b1bc5d01b6bddd4bcb4527b (diff)

diff --git a/mm/filemap.c b/mm/filemap.c index 5d5449f3d41c..462cda58a18e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c
@@ -1325,9 +1325,10 @@ struct page *filemap_nopage(struct vm_area_struct *area,
1325	unsigned long size, pgoff;	1325	unsigned long size, pgoff;
1326	int did_readaround = 0, majmin = VM_FAULT_MINOR;	1326	int did_readaround = 0, majmin = VM_FAULT_MINOR;
1327		1327
		1328	BUG_ON(!(area->vm_flags & VM_CAN_INVALIDATE));
		1329
1328	pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;	1330	pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1329		1331
1330	retry_all:
1331	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;	1332	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1332	if (pgoff >= size)	1333	if (pgoff >= size)
1333	goto outside_data_content;	1334	goto outside_data_content;
@@ -1349,7 +1350,7 @@ retry_all:
1349	* Do we have something in the page cache already?	1350	* Do we have something in the page cache already?
1350	*/	1351	*/
1351	retry_find:	1352	retry_find:
1352	page = find_get_page(mapping, pgoff);	1353	page = find_lock_page(mapping, pgoff);
1353	if (!page) {	1354	if (!page) {
1354	unsigned long ra_pages;	1355	unsigned long ra_pages;
1355		1356
@@ -1383,7 +1384,7 @@ retry_find:
1383	start = pgoff - ra_pages / 2;	1384	start = pgoff - ra_pages / 2;
1384	do_page_cache_readahead(mapping, file, start, ra_pages);	1385	do_page_cache_readahead(mapping, file, start, ra_pages);
1385	}	1386	}
1386	page = find_get_page(mapping, pgoff);	1387	page = find_lock_page(mapping, pgoff);
1387	if (!page)	1388	if (!page)
1388	goto no_cached_page;	1389	goto no_cached_page;
1389	}	1390	}
@@ -1392,13 +1393,19 @@ retry_find:
1392	ra->mmap_hit++;	1393	ra->mmap_hit++;
1393		1394
1394	/*	1395	/*
1395	* Ok, found a page in the page cache, now we need to check	1396	* We have a locked page in the page cache, now we need to check
1396	* that it's up-to-date.	1397	* that it's up-to-date. If not, it is going to be due to an error.
1397	*/	1398	*/
1398	if (!PageUptodate(page))	1399	if (unlikely(!PageUptodate(page)))
1399	goto page_not_uptodate;	1400	goto page_not_uptodate;
1400		1401
1401	success:	1402	/* Must recheck i_size under page lock */
		1403	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
		1404	if (unlikely(pgoff >= size)) {
		1405	unlock_page(page);
		1406	goto outside_data_content;
		1407	}
		1408
1402	/*	1409	/*
1403	* Found the page and have a reference on it.	1410	* Found the page and have a reference on it.
1404	*/	1411	*/
@@ -1440,6 +1447,7 @@ no_cached_page:
1440	return NOPAGE_SIGBUS;	1447	return NOPAGE_SIGBUS;
1441		1448
1442	page_not_uptodate:	1449	page_not_uptodate:
		1450	/* IO error path */
1443	if (!did_readaround) {	1451	if (!did_readaround) {
1444	majmin = VM_FAULT_MAJOR;	1452	majmin = VM_FAULT_MAJOR;
1445	count_vm_event(PGMAJFAULT);	1453	count_vm_event(PGMAJFAULT);
@@ -1451,37 +1459,15 @@ page_not_uptodate:
1451	* because there really aren't any performance issues here	1459	* because there really aren't any performance issues here
1452	* and we need to check for errors.	1460	* and we need to check for errors.
1453	*/	1461	*/
1454	lock_page(page);
1455
1456	/* Somebody truncated the page on us? */
1457	if (!page->mapping) {
1458	unlock_page(page);
1459	page_cache_release(page);
1460	goto retry_all;
1461	}
1462
1463	/* Somebody else successfully read it in? */
1464	if (PageUptodate(page)) {
1465	unlock_page(page);
1466	goto success;
1467	}
1468	ClearPageError(page);	1462	ClearPageError(page);
1469	error = mapping->a_ops->readpage(file, page);	1463	error = mapping->a_ops->readpage(file, page);
1470	if (!error) {	1464	page_cache_release(page);
1471	wait_on_page_locked(page);	1465
1472	if (PageUptodate(page))	1466	if (!error || error == AOP_TRUNCATED_PAGE)
1473	goto success;
1474	} else if (error == AOP_TRUNCATED_PAGE) {
1475	page_cache_release(page);
1476	goto retry_find;	1467	goto retry_find;
1477	}
1478		1468
1479	/*	1469	/* Things didn't work out. Return zero to tell the mm layer so. */
1480	* Things didn't work out. Return zero to tell the
1481	* mm layer so, possibly freeing the page cache page first.
1482	*/
1483	shrink_readahead_size_eio(file, ra);	1470	shrink_readahead_size_eio(file, ra);
1484	page_cache_release(page);
1485	return NOPAGE_SIGBUS;	1471	return NOPAGE_SIGBUS;
1486	}	1472	}
1487	EXPORT_SYMBOL(filemap_nopage);	1473	EXPORT_SYMBOL(filemap_nopage);
@@ -1674,6 +1660,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1674	return -ENOEXEC;	1660	return -ENOEXEC;
1675	file_accessed(file);	1661	file_accessed(file);
1676	vma->vm_ops = &generic_file_vm_ops;	1662	vma->vm_ops = &generic_file_vm_ops;
		1663	vma->vm_flags |= VM_CAN_INVALIDATE;
1677	return 0;	1664	return 0;
1678	}	1665	}
1679		1666