aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Nick Piggin <npiggin@suse.de> 2007-07-19 04:46:57 -0400
committer Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-07-19 13:04:41 -0400
commit d00806b183152af6d24f46f0c33f14162ca1262a (patch)
tree 36f829cf13d5410374a3f00b56ec0b1f8dc3ce3c
parent 589f1e81bde732dd0b1bc5d01b6bddd4bcb4527b (diff)
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page. Andrea Arcangeli identified a subtle race between invalidation of pages from pagecache with userspace mappings, and do_no_page. The issue is that invalidation has to shoot down all mappings to the page, before it can be discarded from the pagecache. Between shooting down ptes to a particular page, and actually dropping the struct page from the pagecache, do_no_page from any process might fault on that page and establish a new mapping to the page just before it gets discarded from the pagecache. The most common case where such invalidation is used is in file truncation. This case was catered for by doing a sort of open-coded seqlock between the file's i_size, and its truncate_count. Truncation will decrease i_size, then increment truncate_count before unmapping userspace pages; do_no_page will read truncate_count, then find the page if it is within i_size, and then check truncate_count under the page table lock and back out and retry if it had subsequently been changed (ptl will serialise against unmapping, and ensure a potentially updated truncate_count is actually visible). Complexity and documentation issues aside, the locking protocol fails in the case where we would like to invalidate pagecache inside i_size. do_no_page can come in anytime and filemap_nopage is not aware of the invalidation in progress (as it is when it is outside i_size). The end result is that dangling (->mapping == NULL) pages that appear to be from a particular file may be mapped into userspace with nonsense data. Valid mappings to the same place will see a different page. Andrea implemented two working fixes, one using a real seqlock, another using a page->flags bit. He also proposed using the page lock in do_no_page, but that was initially considered too heavyweight. 
However, it is not a global or per-file lock, and the page cacheline is modified in do_no_page to increment _count and _mapcount anyway, so a further modification should not be a large performance hit. Scalability is not an issue. This patch implements this latter approach. ->nopage implementations return with the page locked if it is possible for their underlying file to be invalidated (in that case, they must set a special vm_flags bit to indicate so). do_no_page only unlocks the page after setting up the mapping completely. invalidation is excluded because it holds the page lock during invalidation of each page (and ensures that the page is not mapped while holding the lock). This also allows significant simplifications in do_no_page, because we have the page locked in the right place in the pagecache from the start. Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- fs/gfs2/ops_file.c 2
-rw-r--r-- fs/gfs2/ops_vm.c 2
-rw-r--r-- fs/ncpfs/mmap.c 1
-rw-r--r-- fs/ocfs2/mmap.c 1
-rw-r--r-- fs/xfs/linux-2.6/xfs_file.c 1
-rw-r--r-- include/linux/mm.h 6
-rw-r--r-- mm/filemap.c 53
-rw-r--r-- mm/memory.c 153
-rw-r--r-- mm/shmem.c 11
-rw-r--r-- mm/truncate.c 13
10 files changed, 127 insertions, 116 deletions
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 1a5e8e893d7..bad0b24cb77 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -364,6 +364,8 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
364 else 364 else
365 vma->vm_ops = &gfs2_vm_ops_private; 365 vma->vm_ops = &gfs2_vm_ops_private;
366 366
367 vma->vm_flags |= VM_CAN_INVALIDATE;
368
367 gfs2_glock_dq_uninit(&i_gh); 369 gfs2_glock_dq_uninit(&i_gh);
368 370
369 return error; 371 return error;
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
index 404b7cc9f8c..d5a98cbfebd 100644
--- a/fs/gfs2/ops_vm.c
+++ b/fs/gfs2/ops_vm.c
@@ -138,6 +138,8 @@ static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
138 if (alloc_required) { 138 if (alloc_required) {
139 error = alloc_page_backing(ip, result); 139 error = alloc_page_backing(ip, result);
140 if (error) { 140 if (error) {
141 if (area->vm_flags & VM_CAN_INVALIDATE)
142 unlock_page(result);
141 page_cache_release(result); 143 page_cache_release(result);
142 result = NULL; 144 result = NULL;
143 goto out; 145 goto out;
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 70a69115500..5416673418b 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -123,6 +123,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
123 return -EFBIG; 123 return -EFBIG;
124 124
125 vma->vm_ops = &ncp_file_mmap; 125 vma->vm_ops = &ncp_file_mmap;
126 vma->vm_flags |= VM_CAN_INVALIDATE;
126 file_accessed(file); 127 file_accessed(file);
127 return 0; 128 return 0;
128} 129}
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index d79aa12137d..904f39ff534 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -226,6 +226,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
226 ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level); 226 ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level);
227out: 227out:
228 vma->vm_ops = &ocfs2_file_vm_ops; 228 vma->vm_ops = &ocfs2_file_vm_ops;
229 vma->vm_flags |= VM_CAN_INVALIDATE;
229 return 0; 230 return 0;
230} 231}
231 232
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index cbcd40c8c2a..92b2f225712 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -310,6 +310,7 @@ xfs_file_mmap(
310 struct vm_area_struct *vma) 310 struct vm_area_struct *vma)
311{ 311{
312 vma->vm_ops = &xfs_file_vm_ops; 312 vma->vm_ops = &xfs_file_vm_ops;
313 vma->vm_flags |= VM_CAN_INVALIDATE;
313 314
314#ifdef CONFIG_XFS_DMAPI 315#ifdef CONFIG_XFS_DMAPI
315 if (vn_from_inode(filp->f_path.dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI) 316 if (vn_from_inode(filp->f_path.dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a5c451816fd..ca9536a348c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -168,6 +168,12 @@ extern unsigned int kobjsize(const void *objp);
168#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ 168#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
169#define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ 169#define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */
170 170
171#define VM_CAN_INVALIDATE 0x08000000 /* The mapping may be invalidated,
172 * eg. truncate or invalidate_inode_*.
173 * In this case, the ->nopage handler must
174 * return with the page locked.
175 */
176
171#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ 177#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
172#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS 178#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
173#endif 179#endif
diff --git a/mm/filemap.c b/mm/filemap.c
index 5d5449f3d41..462cda58a18 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1325,9 +1325,10 @@ struct page *filemap_nopage(struct vm_area_struct *area,
1325 unsigned long size, pgoff; 1325 unsigned long size, pgoff;
1326 int did_readaround = 0, majmin = VM_FAULT_MINOR; 1326 int did_readaround = 0, majmin = VM_FAULT_MINOR;
1327 1327
1328 BUG_ON(!(area->vm_flags & VM_CAN_INVALIDATE));
1329
1328 pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; 1330 pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1329 1331
1330retry_all:
1331 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1332 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1332 if (pgoff >= size) 1333 if (pgoff >= size)
1333 goto outside_data_content; 1334 goto outside_data_content;
@@ -1349,7 +1350,7 @@ retry_all:
1349 * Do we have something in the page cache already? 1350 * Do we have something in the page cache already?
1350 */ 1351 */
1351retry_find: 1352retry_find:
1352 page = find_get_page(mapping, pgoff); 1353 page = find_lock_page(mapping, pgoff);
1353 if (!page) { 1354 if (!page) {
1354 unsigned long ra_pages; 1355 unsigned long ra_pages;
1355 1356
@@ -1383,7 +1384,7 @@ retry_find:
1383 start = pgoff - ra_pages / 2; 1384 start = pgoff - ra_pages / 2;
1384 do_page_cache_readahead(mapping, file, start, ra_pages); 1385 do_page_cache_readahead(mapping, file, start, ra_pages);
1385 } 1386 }
1386 page = find_get_page(mapping, pgoff); 1387 page = find_lock_page(mapping, pgoff);
1387 if (!page) 1388 if (!page)
1388 goto no_cached_page; 1389 goto no_cached_page;
1389 } 1390 }
@@ -1392,13 +1393,19 @@ retry_find:
1392 ra->mmap_hit++; 1393 ra->mmap_hit++;
1393 1394
1394 /* 1395 /*
1395 * Ok, found a page in the page cache, now we need to check 1396 * We have a locked page in the page cache, now we need to check
1396 * that it's up-to-date. 1397 * that it's up-to-date. If not, it is going to be due to an error.
1397 */ 1398 */
1398 if (!PageUptodate(page)) 1399 if (unlikely(!PageUptodate(page)))
1399 goto page_not_uptodate; 1400 goto page_not_uptodate;
1400 1401
1401success: 1402 /* Must recheck i_size under page lock */
1403 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1404 if (unlikely(pgoff >= size)) {
1405 unlock_page(page);
1406 goto outside_data_content;
1407 }
1408
1402 /* 1409 /*
1403 * Found the page and have a reference on it. 1410 * Found the page and have a reference on it.
1404 */ 1411 */
@@ -1440,6 +1447,7 @@ no_cached_page:
1440 return NOPAGE_SIGBUS; 1447 return NOPAGE_SIGBUS;
1441 1448
1442page_not_uptodate: 1449page_not_uptodate:
1450 /* IO error path */
1443 if (!did_readaround) { 1451 if (!did_readaround) {
1444 majmin = VM_FAULT_MAJOR; 1452 majmin = VM_FAULT_MAJOR;
1445 count_vm_event(PGMAJFAULT); 1453 count_vm_event(PGMAJFAULT);
@@ -1451,37 +1459,15 @@ page_not_uptodate:
1451 * because there really aren't any performance issues here 1459 * because there really aren't any performance issues here
1452 * and we need to check for errors. 1460 * and we need to check for errors.
1453 */ 1461 */
1454 lock_page(page);
1455
1456 /* Somebody truncated the page on us? */
1457 if (!page->mapping) {
1458 unlock_page(page);
1459 page_cache_release(page);
1460 goto retry_all;
1461 }
1462
1463 /* Somebody else successfully read it in? */
1464 if (PageUptodate(page)) {
1465 unlock_page(page);
1466 goto success;
1467 }
1468 ClearPageError(page); 1462 ClearPageError(page);
1469 error = mapping->a_ops->readpage(file, page); 1463 error = mapping->a_ops->readpage(file, page);
1470 if (!error) { 1464 page_cache_release(page);
1471 wait_on_page_locked(page); 1465
1472 if (PageUptodate(page)) 1466 if (!error || error == AOP_TRUNCATED_PAGE)
1473 goto success;
1474 } else if (error == AOP_TRUNCATED_PAGE) {
1475 page_cache_release(page);
1476 goto retry_find; 1467 goto retry_find;
1477 }
1478 1468
1479 /* 1469 /* Things didn't work out. Return zero to tell the mm layer so. */
1480 * Things didn't work out. Return zero to tell the
1481 * mm layer so, possibly freeing the page cache page first.
1482 */
1483 shrink_readahead_size_eio(file, ra); 1470 shrink_readahead_size_eio(file, ra);
1484 page_cache_release(page);
1485 return NOPAGE_SIGBUS; 1471 return NOPAGE_SIGBUS;
1486} 1472}
1487EXPORT_SYMBOL(filemap_nopage); 1473EXPORT_SYMBOL(filemap_nopage);
@@ -1674,6 +1660,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1674 return -ENOEXEC; 1660 return -ENOEXEC;
1675 file_accessed(file); 1661 file_accessed(file);
1676 vma->vm_ops = &generic_file_vm_ops; 1662 vma->vm_ops = &generic_file_vm_ops;
1663 vma->vm_flags |= VM_CAN_INVALIDATE;
1677 return 0; 1664 return 0;
1678} 1665}
1679 1666
diff --git a/mm/memory.c b/mm/memory.c
index 9c6ff7fffdc..e6c99f6b564 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1831,6 +1831,13 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma,
1831 unsigned long restart_addr; 1831 unsigned long restart_addr;
1832 int need_break; 1832 int need_break;
1833 1833
1834 /*
1835 * files that support invalidating or truncating portions of the
1836 * file from under mmaped areas must set the VM_CAN_INVALIDATE flag, and
1837 * have their .nopage function return the page locked.
1838 */
1839 BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
1840
1834again: 1841again:
1835 restart_addr = vma->vm_truncate_count; 1842 restart_addr = vma->vm_truncate_count;
1836 if (is_restart_addr(restart_addr) && start_addr < restart_addr) { 1843 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
@@ -1959,17 +1966,8 @@ void unmap_mapping_range(struct address_space *mapping,
1959 1966
1960 spin_lock(&mapping->i_mmap_lock); 1967 spin_lock(&mapping->i_mmap_lock);
1961 1968
1962 /* serialize i_size write against truncate_count write */ 1969 /* Protect against endless unmapping loops */
1963 smp_wmb();
1964 /* Protect against page faults, and endless unmapping loops */
1965 mapping->truncate_count++; 1970 mapping->truncate_count++;
1966 /*
1967 * For archs where spin_lock has inclusive semantics like ia64
1968 * this smp_mb() will prevent to read pagetable contents
1969 * before the truncate_count increment is visible to
1970 * other cpus.
1971 */
1972 smp_mb();
1973 if (unlikely(is_restart_addr(mapping->truncate_count))) { 1971 if (unlikely(is_restart_addr(mapping->truncate_count))) {
1974 if (mapping->truncate_count == 0) 1972 if (mapping->truncate_count == 0)
1975 reset_vma_truncate_counts(mapping); 1973 reset_vma_truncate_counts(mapping);
@@ -2008,8 +2006,18 @@ int vmtruncate(struct inode * inode, loff_t offset)
2008 if (IS_SWAPFILE(inode)) 2006 if (IS_SWAPFILE(inode))
2009 goto out_busy; 2007 goto out_busy;
2010 i_size_write(inode, offset); 2008 i_size_write(inode, offset);
2009
2010 /*
2011 * unmap_mapping_range is called twice, first simply for efficiency
2012 * so that truncate_inode_pages does fewer single-page unmaps. However
2013 * after this first call, and before truncate_inode_pages finishes,
2014 * it is possible for private pages to be COWed, which remain after
2015 * truncate_inode_pages finishes, hence the second unmap_mapping_range
2016 * call must be made for correctness.
2017 */
2011 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); 2018 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2012 truncate_inode_pages(mapping, offset); 2019 truncate_inode_pages(mapping, offset);
2020 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2013 goto out_truncate; 2021 goto out_truncate;
2014 2022
2015do_expand: 2023do_expand:
@@ -2049,6 +2057,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2049 down_write(&inode->i_alloc_sem); 2057 down_write(&inode->i_alloc_sem);
2050 unmap_mapping_range(mapping, offset, (end - offset), 1); 2058 unmap_mapping_range(mapping, offset, (end - offset), 1);
2051 truncate_inode_pages_range(mapping, offset, end); 2059 truncate_inode_pages_range(mapping, offset, end);
2060 unmap_mapping_range(mapping, offset, (end - offset), 1);
2052 inode->i_op->truncate_range(inode, offset, end); 2061 inode->i_op->truncate_range(inode, offset, end);
2053 up_write(&inode->i_alloc_sem); 2062 up_write(&inode->i_alloc_sem);
2054 mutex_unlock(&inode->i_mutex); 2063 mutex_unlock(&inode->i_mutex);
@@ -2206,7 +2215,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2206 2215
2207 /* No need to invalidate - it was non-present before */ 2216 /* No need to invalidate - it was non-present before */
2208 update_mmu_cache(vma, address, pte); 2217 update_mmu_cache(vma, address, pte);
2209 lazy_mmu_prot_update(pte);
2210unlock: 2218unlock:
2211 pte_unmap_unlock(page_table, ptl); 2219 pte_unmap_unlock(page_table, ptl);
2212out: 2220out:
@@ -2297,10 +2305,8 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2297 int write_access) 2305 int write_access)
2298{ 2306{
2299 spinlock_t *ptl; 2307 spinlock_t *ptl;
2300 struct page *new_page; 2308 struct page *page, *nopage_page;
2301 struct address_space *mapping = NULL;
2302 pte_t entry; 2309 pte_t entry;
2303 unsigned int sequence = 0;
2304 int ret = VM_FAULT_MINOR; 2310 int ret = VM_FAULT_MINOR;
2305 int anon = 0; 2311 int anon = 0;
2306 struct page *dirty_page = NULL; 2312 struct page *dirty_page = NULL;
@@ -2308,74 +2314,53 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2308 pte_unmap(page_table); 2314 pte_unmap(page_table);
2309 BUG_ON(vma->vm_flags & VM_PFNMAP); 2315 BUG_ON(vma->vm_flags & VM_PFNMAP);
2310 2316
2311 if (vma->vm_file) { 2317 nopage_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
2312 mapping = vma->vm_file->f_mapping;
2313 sequence = mapping->truncate_count;
2314 smp_rmb(); /* serializes i_size against truncate_count */
2315 }
2316retry:
2317 new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
2318 /*
2319 * No smp_rmb is needed here as long as there's a full
2320 * spin_lock/unlock sequence inside the ->nopage callback
2321 * (for the pagecache lookup) that acts as an implicit
2322 * smp_mb() and prevents the i_size read to happen
2323 * after the next truncate_count read.
2324 */
2325
2326 /* no page was available -- either SIGBUS, OOM or REFAULT */ 2318 /* no page was available -- either SIGBUS, OOM or REFAULT */
2327 if (unlikely(new_page == NOPAGE_SIGBUS)) 2319 if (unlikely(nopage_page == NOPAGE_SIGBUS))
2328 return VM_FAULT_SIGBUS; 2320 return VM_FAULT_SIGBUS;
2329 else if (unlikely(new_page == NOPAGE_OOM)) 2321 else if (unlikely(nopage_page == NOPAGE_OOM))
2330 return VM_FAULT_OOM; 2322 return VM_FAULT_OOM;
2331 else if (unlikely(new_page == NOPAGE_REFAULT)) 2323 else if (unlikely(nopage_page == NOPAGE_REFAULT))
2332 return VM_FAULT_MINOR; 2324 return VM_FAULT_MINOR;
2333 2325
2326 BUG_ON(vma->vm_flags & VM_CAN_INVALIDATE && !PageLocked(nopage_page));
2327 /*
2328 * For consistency in subsequent calls, make the nopage_page always
2329 * locked.
2330 */
2331 if (unlikely(!(vma->vm_flags & VM_CAN_INVALIDATE)))
2332 lock_page(nopage_page);
2333
2334 /* 2334 /*
2335 * Should we do an early C-O-W break? 2335 * Should we do an early C-O-W break?
2336 */ 2336 */
2337 page = nopage_page;
2337 if (write_access) { 2338 if (write_access) {
2338 if (!(vma->vm_flags & VM_SHARED)) { 2339 if (!(vma->vm_flags & VM_SHARED)) {
2339 struct page *page; 2340 if (unlikely(anon_vma_prepare(vma))) {
2340 2341 ret = VM_FAULT_OOM;
2341 if (unlikely(anon_vma_prepare(vma))) 2342 goto out_error;
2342 goto oom; 2343 }
2343 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, 2344 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2344 vma, address); 2345 if (!page) {
2345 if (!page) 2346 ret = VM_FAULT_OOM;
2346 goto oom; 2347 goto out_error;
2347 copy_user_highpage(page, new_page, address, vma); 2348 }
2348 page_cache_release(new_page); 2349 copy_user_highpage(page, nopage_page, address, vma);
2349 new_page = page;
2350 anon = 1; 2350 anon = 1;
2351
2352 } else { 2351 } else {
2353 /* if the page will be shareable, see if the backing 2352 /* if the page will be shareable, see if the backing
2354 * address space wants to know that the page is about 2353 * address space wants to know that the page is about
2355 * to become writable */ 2354 * to become writable */
2356 if (vma->vm_ops->page_mkwrite && 2355 if (vma->vm_ops->page_mkwrite &&
2357 vma->vm_ops->page_mkwrite(vma, new_page) < 0 2356 vma->vm_ops->page_mkwrite(vma, page) < 0) {
2358 ) { 2357 ret = VM_FAULT_SIGBUS;
2359 page_cache_release(new_page); 2358 goto out_error;
2360 return VM_FAULT_SIGBUS;
2361 } 2359 }
2362 } 2360 }
2363 } 2361 }
2364 2362
2365 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2363 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2366 /*
2367 * For a file-backed vma, someone could have truncated or otherwise
2368 * invalidated this page. If unmap_mapping_range got called,
2369 * retry getting the page.
2370 */
2371 if (mapping && unlikely(sequence != mapping->truncate_count)) {
2372 pte_unmap_unlock(page_table, ptl);
2373 page_cache_release(new_page);
2374 cond_resched();
2375 sequence = mapping->truncate_count;
2376 smp_rmb();
2377 goto retry;
2378 }
2379 2364
2380 /* 2365 /*
2381 * This silly early PAGE_DIRTY setting removes a race 2366 * This silly early PAGE_DIRTY setting removes a race
@@ -2388,43 +2373,51 @@ retry:
2388 * handle that later. 2373 * handle that later.
2389 */ 2374 */
2390 /* Only go through if we didn't race with anybody else... */ 2375 /* Only go through if we didn't race with anybody else... */
2391 if (pte_none(*page_table)) { 2376 if (likely(pte_none(*page_table))) {
2392 flush_icache_page(vma, new_page); 2377 flush_icache_page(vma, page);
2393 entry = mk_pte(new_page, vma->vm_page_prot); 2378 entry = mk_pte(page, vma->vm_page_prot);
2394 if (write_access) 2379 if (write_access)
2395 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2380 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2396 set_pte_at(mm, address, page_table, entry); 2381 set_pte_at(mm, address, page_table, entry);
2397 if (anon) { 2382 if (anon) {
2398 inc_mm_counter(mm, anon_rss); 2383 inc_mm_counter(mm, anon_rss);
2399 lru_cache_add_active(new_page); 2384 lru_cache_add_active(page);
2400 page_add_new_anon_rmap(new_page, vma, address); 2385 page_add_new_anon_rmap(page, vma, address);
2401 } else { 2386 } else {
2402 inc_mm_counter(mm, file_rss); 2387 inc_mm_counter(mm, file_rss);
2403 page_add_file_rmap(new_page); 2388 page_add_file_rmap(page);
2404 if (write_access) { 2389 if (write_access) {
2405 dirty_page = new_page; 2390 dirty_page = page;
2406 get_page(dirty_page); 2391 get_page(dirty_page);
2407 } 2392 }
2408 } 2393 }
2394
2395 /* no need to invalidate: a not-present page won't be cached */
2396 update_mmu_cache(vma, address, entry);
2397 lazy_mmu_prot_update(entry);
2409 } else { 2398 } else {
2410 /* One of our sibling threads was faster, back out. */ 2399 if (anon)
2411 page_cache_release(new_page); 2400 page_cache_release(page);
2412 goto unlock; 2401 else
2402 anon = 1; /* not anon, but release nopage_page */
2413 } 2403 }
2414 2404
2415 /* no need to invalidate: a not-present page shouldn't be cached */
2416 update_mmu_cache(vma, address, entry);
2417 lazy_mmu_prot_update(entry);
2418unlock:
2419 pte_unmap_unlock(page_table, ptl); 2405 pte_unmap_unlock(page_table, ptl);
2420 if (dirty_page) { 2406
2407out:
2408 unlock_page(nopage_page);
2409 if (anon)
2410 page_cache_release(nopage_page);
2411 else if (dirty_page) {
2421 set_page_dirty_balance(dirty_page); 2412 set_page_dirty_balance(dirty_page);
2422 put_page(dirty_page); 2413 put_page(dirty_page);
2423 } 2414 }
2415
2424 return ret; 2416 return ret;
2425oom: 2417
2426 page_cache_release(new_page); 2418out_error:
2427 return VM_FAULT_OOM; 2419 anon = 1; /* relase nopage_page */
2420 goto out;
2428} 2421}
2429 2422
2430/* 2423/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 96fa79fb6ad..5808fadd394 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -83,6 +83,7 @@ enum sgp_type {
83 SGP_READ, /* don't exceed i_size, don't allocate page */ 83 SGP_READ, /* don't exceed i_size, don't allocate page */
84 SGP_CACHE, /* don't exceed i_size, may allocate page */ 84 SGP_CACHE, /* don't exceed i_size, may allocate page */
85 SGP_WRITE, /* may exceed i_size, may allocate page */ 85 SGP_WRITE, /* may exceed i_size, may allocate page */
86 SGP_NOPAGE, /* same as SGP_CACHE, return with page locked */
86}; 87};
87 88
88static int shmem_getpage(struct inode *inode, unsigned long idx, 89static int shmem_getpage(struct inode *inode, unsigned long idx,
@@ -1289,8 +1290,10 @@ repeat:
1289 } 1290 }
1290done: 1291done:
1291 if (*pagep != filepage) { 1292 if (*pagep != filepage) {
1292 unlock_page(filepage);
1293 *pagep = filepage; 1293 *pagep = filepage;
1294 if (sgp != SGP_NOPAGE)
1295 unlock_page(filepage);
1296
1294 } 1297 }
1295 return 0; 1298 return 0;
1296 1299
@@ -1310,13 +1313,15 @@ static struct page *shmem_nopage(struct vm_area_struct *vma,
1310 unsigned long idx; 1313 unsigned long idx;
1311 int error; 1314 int error;
1312 1315
1316 BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
1317
1313 idx = (address - vma->vm_start) >> PAGE_SHIFT; 1318 idx = (address - vma->vm_start) >> PAGE_SHIFT;
1314 idx += vma->vm_pgoff; 1319 idx += vma->vm_pgoff;
1315 idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; 1320 idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
1316 if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 1321 if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1317 return NOPAGE_SIGBUS; 1322 return NOPAGE_SIGBUS;
1318 1323
1319 error = shmem_getpage(inode, idx, &page, SGP_CACHE, type); 1324 error = shmem_getpage(inode, idx, &page, SGP_NOPAGE, type);
1320 if (error) 1325 if (error)
1321 return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS; 1326 return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
1322 1327
@@ -1414,6 +1419,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1414{ 1419{
1415 file_accessed(file); 1420 file_accessed(file);
1416 vma->vm_ops = &shmem_vm_ops; 1421 vma->vm_ops = &shmem_vm_ops;
1422 vma->vm_flags |= VM_CAN_INVALIDATE;
1417 return 0; 1423 return 0;
1418} 1424}
1419 1425
@@ -2596,5 +2602,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2596 fput(vma->vm_file); 2602 fput(vma->vm_file);
2597 vma->vm_file = file; 2603 vma->vm_file = file;
2598 vma->vm_ops = &shmem_vm_ops; 2604 vma->vm_ops = &shmem_vm_ops;
2605 vma->vm_flags |= VM_CAN_INVALIDATE;
2599 return 0; 2606 return 0;
2600} 2607}
diff --git a/mm/truncate.c b/mm/truncate.c
index f47e46d1be3..aed85f0b707 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -192,6 +192,11 @@ void truncate_inode_pages_range(struct address_space *mapping,
192 unlock_page(page); 192 unlock_page(page);
193 continue; 193 continue;
194 } 194 }
195 if (page_mapped(page)) {
196 unmap_mapping_range(mapping,
197 (loff_t)page_index<<PAGE_CACHE_SHIFT,
198 PAGE_CACHE_SIZE, 0);
199 }
195 truncate_complete_page(mapping, page); 200 truncate_complete_page(mapping, page);
196 unlock_page(page); 201 unlock_page(page);
197 } 202 }
@@ -229,6 +234,11 @@ void truncate_inode_pages_range(struct address_space *mapping,
229 break; 234 break;
230 lock_page(page); 235 lock_page(page);
231 wait_on_page_writeback(page); 236 wait_on_page_writeback(page);
237 if (page_mapped(page)) {
238 unmap_mapping_range(mapping,
239 (loff_t)page->index<<PAGE_CACHE_SHIFT,
240 PAGE_CACHE_SIZE, 0);
241 }
232 if (page->index > next) 242 if (page->index > next)
233 next = page->index; 243 next = page->index;
234 next++; 244 next++;
@@ -405,7 +415,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
405 break; 415 break;
406 } 416 }
407 wait_on_page_writeback(page); 417 wait_on_page_writeback(page);
408 while (page_mapped(page)) { 418 if (page_mapped(page)) {
409 if (!did_range_unmap) { 419 if (!did_range_unmap) {
410 /* 420 /*
411 * Zap the rest of the file in one hit. 421 * Zap the rest of the file in one hit.
@@ -425,6 +435,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
425 PAGE_CACHE_SIZE, 0); 435 PAGE_CACHE_SIZE, 0);
426 } 436 }
427 } 437 }
438 BUG_ON(page_mapped(page));
428 ret = do_launder_page(mapping, page); 439 ret = do_launder_page(mapping, page);
429 if (ret == 0 && !invalidate_complete_page2(mapping, page)) 440 if (ret == 0 && !invalidate_complete_page2(mapping, page))
430 ret = -EIO; 441 ret = -EIO;