author	Nick Piggin <npiggin@suse.de>	2007-07-19 04:46:57 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-07-19 13:04:41 -0400
commit	d00806b183152af6d24f46f0c33f14162ca1262a
tree	36f829cf13d5410374a3f00b56ec0b1f8dc3ce3c
parent	589f1e81bde732dd0b1bc5d01b6bddd4bcb4527b
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page.

Andrea Arcangeli identified a subtle race between invalidation of pages from pagecache with userspace mappings, and do_no_page.

The issue is that invalidation has to shoot down all mappings to the page, before it can be discarded from the pagecache. Between shooting down ptes to a particular page, and actually dropping the struct page from the pagecache, do_no_page from any process might fault on that page and establish a new mapping to the page just before it gets discarded from the pagecache.

The most common case where such invalidation is used is in file truncation. This case was catered for by doing a sort of open-coded seqlock between the file's i_size, and its truncate_count.

Truncation will decrease i_size, then increment truncate_count before unmapping userspace pages; do_no_page will read truncate_count, then find the page if it is within i_size, and then check truncate_count under the page table lock and back out and retry if it had subsequently been changed (ptl will serialise against unmapping, and ensure a potentially updated truncate_count is actually visible).

Complexity and documentation issues aside, the locking protocol fails in the case where we would like to invalidate pagecache inside i_size. do_no_page can come in anytime and filemap_nopage is not aware of the invalidation in progress (as it is when it is outside i_size). The end result is that dangling (->mapping == NULL) pages that appear to be from a particular file may be mapped into userspace with nonsense data. Valid mappings to the same place will see a different page.

Andrea implemented two working fixes, one using a real seqlock, another using a page->flags bit. He also proposed using the page lock in do_no_page, but that was initially considered too heavyweight. However, it is not a global or per-file lock, and the page cacheline is modified in do_no_page to increment _count and _mapcount anyway, so a further modification should not be a large performance hit. Scalability is not an issue.

This patch implements this latter approach. ->nopage implementations return with the page locked if it is possible for their underlying file to be invalidated (in that case, they must set a special vm_flags bit to indicate so). do_no_page only unlocks the page after setting up the mapping completely. Invalidation is excluded because it holds the page lock during invalidation of each page (and ensures that the page is not mapped while holding the lock).

This also allows significant simplifications in do_no_page, because we have the page locked in the right place in the pagecache from the start.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
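[Editor's note] For readers who want the old protocol spelled out, below is a minimal user-space model of the i_size/truncate_count ordering described above. All names here (inode_model, fault_model, and so on) are illustrative stand-ins, not kernel code; the model only shows the ordering of the checks and why they catch truncation but not invalidation inside i_size.

	#include <stdatomic.h>
	#include <stddef.h>

	/* Illustrative stand-ins for the kernel structures. */
	struct inode_model {
		_Atomic long size_pages;         /* i_size, counted in pages */
		_Atomic unsigned truncate_count; /* the open-coded "sequence" */
	};

	struct page_model { int data; };

	/* Truncation side: shrink i_size, bump truncate_count, then shoot
	 * down the userspace ptes (the shootdown itself is elided). */
	static void truncate_model(struct inode_model *inode, long new_size)
	{
		atomic_store(&inode->size_pages, new_size);  /* 1. decrease i_size */
		atomic_fetch_add(&inode->truncate_count, 1); /* 2. bump the count */
		/* 3. unmap userspace pages */
	}

	/* Fault side: sample truncate_count, bail if outside i_size, look the
	 * page up, then recheck the count under the page table lock and retry
	 * if it moved in the meantime. */
	static struct page_model *fault_model(struct inode_model *inode, long pgoff,
					      struct page_model *page)
	{
		unsigned seq;

	retry:
		seq = atomic_load(&inode->truncate_count);
		if (pgoff >= atomic_load(&inode->size_pages))
			return NULL;                 /* outside i_size: SIGBUS */
		/* ... pagecache lookup would happen here ... */
		/* under the page table lock: */
		if (atomic_load(&inode->truncate_count) != seq)
			goto retry;                  /* truncation raced with us */
		return page;                         /* establish the pte */
	}

Truncation is caught because the fault either sees the smaller i_size or sees truncate_count move between its two reads. Invalidation inside i_size changes neither value a fault starting after the pte shootdown will inspect, so nothing forces a retry, and the fault can map a page that is about to lose its ->mapping.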
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--	mm/filemap.c	53
1 file changed, 20 insertions(+), 33 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 5d5449f3d41c..462cda58a18e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1325,9 +1325,10 @@ struct page *filemap_nopage(struct vm_area_struct *area,
 	unsigned long size, pgoff;
 	int did_readaround = 0, majmin = VM_FAULT_MINOR;
 
+	BUG_ON(!(area->vm_flags & VM_CAN_INVALIDATE));
+
 	pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
 
-retry_all:
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (pgoff >= size)
 		goto outside_data_content;
@@ -1349,7 +1350,7 @@ retry_all:
 	 * Do we have something in the page cache already?
 	 */
 retry_find:
-	page = find_get_page(mapping, pgoff);
+	page = find_lock_page(mapping, pgoff);
 	if (!page) {
 		unsigned long ra_pages;
 
@@ -1383,7 +1384,7 @@ retry_find:
 				start = pgoff - ra_pages / 2;
 			do_page_cache_readahead(mapping, file, start, ra_pages);
 		}
-		page = find_get_page(mapping, pgoff);
+		page = find_lock_page(mapping, pgoff);
 		if (!page)
 			goto no_cached_page;
 	}
@@ -1392,13 +1393,19 @@ retry_find:
 		ra->mmap_hit++;
 
 	/*
-	 * Ok, found a page in the page cache, now we need to check
-	 * that it's up-to-date.
+	 * We have a locked page in the page cache, now we need to check
+	 * that it's up-to-date. If not, it is going to be due to an error.
 	 */
-	if (!PageUptodate(page))
+	if (unlikely(!PageUptodate(page)))
 		goto page_not_uptodate;
 
-success:
+	/* Must recheck i_size under page lock */
+	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (unlikely(pgoff >= size)) {
+		unlock_page(page);
+		goto outside_data_content;
+	}
+
 	/*
 	 * Found the page and have a reference on it.
 	 */
@@ -1440,6 +1447,7 @@ no_cached_page:
 	return NOPAGE_SIGBUS;
 
 page_not_uptodate:
+	/* IO error path */
 	if (!did_readaround) {
 		majmin = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
@@ -1451,37 +1459,15 @@ page_not_uptodate:
 	 * because there really aren't any performance issues here
 	 * and we need to check for errors.
 	 */
-	lock_page(page);
-
-	/* Somebody truncated the page on us? */
-	if (!page->mapping) {
-		unlock_page(page);
-		page_cache_release(page);
-		goto retry_all;
-	}
-
-	/* Somebody else successfully read it in? */
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		goto success;
-	}
 	ClearPageError(page);
 	error = mapping->a_ops->readpage(file, page);
-	if (!error) {
-		wait_on_page_locked(page);
-		if (PageUptodate(page))
-			goto success;
-	} else if (error == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
+	page_cache_release(page);
+
+	if (!error || error == AOP_TRUNCATED_PAGE)
 		goto retry_find;
-	}
 
-	/*
-	 * Things didn't work out. Return zero to tell the
-	 * mm layer so, possibly freeing the page cache page first.
-	 */
+	/* Things didn't work out. Return zero to tell the mm layer so. */
 	shrink_readahead_size_eio(file, ra);
-	page_cache_release(page);
 	return NOPAGE_SIGBUS;
 }
 EXPORT_SYMBOL(filemap_nopage);
@@ -1674,6 +1660,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 		return -ENOEXEC;
 	file_accessed(file);
 	vma->vm_ops = &generic_file_vm_ops;
+	vma->vm_flags |= VM_CAN_INVALIDATE;
 	return 0;
 }
 
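[Editor's note] To make the new ->nopage contract concrete, here is a sketch of a minimal implementation opting into it, written against the 2.6.22-era APIs this patch uses. example_nopage, example_vm_ops and example_mmap are hypothetical names, and a real implementation (filemap_nopage above being the canonical one) would also handle readahead, the uptodate check, and the i_size recheck under the page lock.

	#include <linux/fs.h>
	#include <linux/mm.h>
	#include <linux/pagemap.h>

	/* Hypothetical driver code; only the protocol steps are shown. */
	static struct page *example_nopage(struct vm_area_struct *area,
					   unsigned long address, int *type)
	{
		struct address_space *mapping = area->vm_file->f_mapping;
		unsigned long pgoff;
		struct page *page;

		pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
				+ area->vm_pgoff;

		/* find_lock_page() returns the page locked; holding the lock
		 * excludes invalidation, which locks each page while removing
		 * it from the pagecache. */
		page = find_lock_page(mapping, pgoff);
		if (!page)
			return NOPAGE_SIGBUS;

		if (type)
			*type = VM_FAULT_MINOR;
		/* Return with the page still locked: do_no_page() unlocks it
		 * only after the pte is fully established. */
		return page;
	}

	static struct vm_operations_struct example_vm_ops = {
		.nopage	= example_nopage,
	};

	static int example_mmap(struct file *file, struct vm_area_struct *vma)
	{
		vma->vm_ops = &example_vm_ops;
		/* Tell do_no_page that ->nopage hands back a locked page. */
		vma->vm_flags |= VM_CAN_INVALIDATE;
		return 0;
	}

Setting VM_CAN_INVALIDATE without returning the page locked, or vice versa, breaks the protocol; the BUG_ON added at the top of filemap_nopage enforces the flag for the generic path.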