about | summary | refs | log | tree | commit | diff | stats
path: root/mm/memory.c
diff options
context:
space:
mode:
author Nick Piggin <npiggin@suse.de> 2007-07-19 04:46:57 -0400
committer Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-07-19 13:04:41 -0400
commit d00806b183152af6d24f46f0c33f14162ca1262a (patch)
tree 36f829cf13d5410374a3f00b56ec0b1f8dc3ce3c /mm/memory.c
parent 589f1e81bde732dd0b1bc5d01b6bddd4bcb4527b (diff)
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page. Andrea Arcangeli identified a subtle race between invalidation of pages from pagecache with userspace mappings, and do_no_page. The issue is that invalidation has to shoot down all mappings to the page, before it can be discarded from the pagecache. Between shooting down ptes to a particular page, and actually dropping the struct page from the pagecache, do_no_page from any process might fault on that page and establish a new mapping to the page just before it gets discarded from the pagecache. The most common case where such invalidation is used is in file truncation. This case was catered for by doing a sort of open-coded seqlock between the file's i_size, and its truncate_count. Truncation will decrease i_size, then increment truncate_count before unmapping userspace pages; do_no_page will read truncate_count, then find the page if it is within i_size, and then check truncate_count under the page table lock and back out and retry if it had subsequently been changed (ptl will serialise against unmapping, and ensure a potentially updated truncate_count is actually visible). Complexity and documentation issues aside, the locking protocol fails in the case where we would like to invalidate pagecache inside i_size. do_no_page can come in anytime and filemap_nopage is not aware of the invalidation in progress (as it is when it is outside i_size). The end result is that dangling (->mapping == NULL) pages that appear to be from a particular file may be mapped into userspace with nonsense data. Valid mappings to the same place will see a different page. Andrea implemented two working fixes, one using a real seqlock, another using a page->flags bit. He also proposed using the page lock in do_no_page, but that was initially considered too heavyweight. 
However, it is not a global or per-file lock, and the page cacheline is modified in do_no_page to increment _count and _mapcount anyway, so a further modification should not be a large performance hit. Scalability is not an issue. This patch implements this latter approach. ->nopage implementations return with the page locked if it is possible for their underlying file to be invalidated (in that case, they must set a special vm_flags bit to indicate so). do_no_page only unlocks the page after setting up the mapping completely. Invalidation is excluded because it holds the page lock during invalidation of each page (and ensures that the page is not mapped while holding the lock). This also allows significant simplifications in do_no_page, because we have the page locked in the right place in the pagecache from the start.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- mm/memory.c 153
1 files changed, 73 insertions, 80 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 9c6ff7fffdc8..e6c99f6b5649 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1831,6 +1831,13 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma,
1831 unsigned long restart_addr; 1831 unsigned long restart_addr;
1832 int need_break; 1832 int need_break;
1833 1833
1834 /*
1835 * files that support invalidating or truncating portions of the
1836 * file from under mmaped areas must set the VM_CAN_INVALIDATE flag, and
1837 * have their .nopage function return the page locked.
1838 */
1839 BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
1840
1834again: 1841again:
1835 restart_addr = vma->vm_truncate_count; 1842 restart_addr = vma->vm_truncate_count;
1836 if (is_restart_addr(restart_addr) && start_addr < restart_addr) { 1843 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
@@ -1959,17 +1966,8 @@ void unmap_mapping_range(struct address_space *mapping,
1959 1966
1960 spin_lock(&mapping->i_mmap_lock); 1967 spin_lock(&mapping->i_mmap_lock);
1961 1968
1962 /* serialize i_size write against truncate_count write */ 1969 /* Protect against endless unmapping loops */
1963 smp_wmb();
1964 /* Protect against page faults, and endless unmapping loops */
1965 mapping->truncate_count++; 1970 mapping->truncate_count++;
1966 /*
1967 * For archs where spin_lock has inclusive semantics like ia64
1968 * this smp_mb() will prevent to read pagetable contents
1969 * before the truncate_count increment is visible to
1970 * other cpus.
1971 */
1972 smp_mb();
1973 if (unlikely(is_restart_addr(mapping->truncate_count))) { 1971 if (unlikely(is_restart_addr(mapping->truncate_count))) {
1974 if (mapping->truncate_count == 0) 1972 if (mapping->truncate_count == 0)
1975 reset_vma_truncate_counts(mapping); 1973 reset_vma_truncate_counts(mapping);
@@ -2008,8 +2006,18 @@ int vmtruncate(struct inode * inode, loff_t offset)
2008 if (IS_SWAPFILE(inode)) 2006 if (IS_SWAPFILE(inode))
2009 goto out_busy; 2007 goto out_busy;
2010 i_size_write(inode, offset); 2008 i_size_write(inode, offset);
2009
2010 /*
2011 * unmap_mapping_range is called twice, first simply for efficiency
2012 * so that truncate_inode_pages does fewer single-page unmaps. However
2013 * after this first call, and before truncate_inode_pages finishes,
2014 * it is possible for private pages to be COWed, which remain after
2015 * truncate_inode_pages finishes, hence the second unmap_mapping_range
2016 * call must be made for correctness.
2017 */
2011 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); 2018 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2012 truncate_inode_pages(mapping, offset); 2019 truncate_inode_pages(mapping, offset);
2020 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2013 goto out_truncate; 2021 goto out_truncate;
2014 2022
2015do_expand: 2023do_expand:
@@ -2049,6 +2057,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2049 down_write(&inode->i_alloc_sem); 2057 down_write(&inode->i_alloc_sem);
2050 unmap_mapping_range(mapping, offset, (end - offset), 1); 2058 unmap_mapping_range(mapping, offset, (end - offset), 1);
2051 truncate_inode_pages_range(mapping, offset, end); 2059 truncate_inode_pages_range(mapping, offset, end);
2060 unmap_mapping_range(mapping, offset, (end - offset), 1);
2052 inode->i_op->truncate_range(inode, offset, end); 2061 inode->i_op->truncate_range(inode, offset, end);
2053 up_write(&inode->i_alloc_sem); 2062 up_write(&inode->i_alloc_sem);
2054 mutex_unlock(&inode->i_mutex); 2063 mutex_unlock(&inode->i_mutex);
@@ -2206,7 +2215,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2206 2215
2207 /* No need to invalidate - it was non-present before */ 2216 /* No need to invalidate - it was non-present before */
2208 update_mmu_cache(vma, address, pte); 2217 update_mmu_cache(vma, address, pte);
2209 lazy_mmu_prot_update(pte);
2210unlock: 2218unlock:
2211 pte_unmap_unlock(page_table, ptl); 2219 pte_unmap_unlock(page_table, ptl);
2212out: 2220out:
@@ -2297,10 +2305,8 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2297 int write_access) 2305 int write_access)
2298{ 2306{
2299 spinlock_t *ptl; 2307 spinlock_t *ptl;
2300 struct page *new_page; 2308 struct page *page, *nopage_page;
2301 struct address_space *mapping = NULL;
2302 pte_t entry; 2309 pte_t entry;
2303 unsigned int sequence = 0;
2304 int ret = VM_FAULT_MINOR; 2310 int ret = VM_FAULT_MINOR;
2305 int anon = 0; 2311 int anon = 0;
2306 struct page *dirty_page = NULL; 2312 struct page *dirty_page = NULL;
@@ -2308,74 +2314,53 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2308 pte_unmap(page_table); 2314 pte_unmap(page_table);
2309 BUG_ON(vma->vm_flags & VM_PFNMAP); 2315 BUG_ON(vma->vm_flags & VM_PFNMAP);
2310 2316
2311 if (vma->vm_file) { 2317 nopage_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
2312 mapping = vma->vm_file->f_mapping;
2313 sequence = mapping->truncate_count;
2314 smp_rmb(); /* serializes i_size against truncate_count */
2315 }
2316retry:
2317 new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
2318 /*
2319 * No smp_rmb is needed here as long as there's a full
2320 * spin_lock/unlock sequence inside the ->nopage callback
2321 * (for the pagecache lookup) that acts as an implicit
2322 * smp_mb() and prevents the i_size read to happen
2323 * after the next truncate_count read.
2324 */
2325
2326 /* no page was available -- either SIGBUS, OOM or REFAULT */ 2318 /* no page was available -- either SIGBUS, OOM or REFAULT */
2327 if (unlikely(new_page == NOPAGE_SIGBUS)) 2319 if (unlikely(nopage_page == NOPAGE_SIGBUS))
2328 return VM_FAULT_SIGBUS; 2320 return VM_FAULT_SIGBUS;
2329 else if (unlikely(new_page == NOPAGE_OOM)) 2321 else if (unlikely(nopage_page == NOPAGE_OOM))
2330 return VM_FAULT_OOM; 2322 return VM_FAULT_OOM;
2331 else if (unlikely(new_page == NOPAGE_REFAULT)) 2323 else if (unlikely(nopage_page == NOPAGE_REFAULT))
2332 return VM_FAULT_MINOR; 2324 return VM_FAULT_MINOR;
2333 2325
2326 BUG_ON(vma->vm_flags & VM_CAN_INVALIDATE && !PageLocked(nopage_page));
2327 /*
2328 * For consistency in subsequent calls, make the nopage_page always
2329 * locked.
2330 */
2331 if (unlikely(!(vma->vm_flags & VM_CAN_INVALIDATE)))
2332 lock_page(nopage_page);
2333
2334 /* 2334 /*
2335 * Should we do an early C-O-W break? 2335 * Should we do an early C-O-W break?
2336 */ 2336 */
2337 page = nopage_page;
2337 if (write_access) { 2338 if (write_access) {
2338 if (!(vma->vm_flags & VM_SHARED)) { 2339 if (!(vma->vm_flags & VM_SHARED)) {
2339 struct page *page; 2340 if (unlikely(anon_vma_prepare(vma))) {
2340 2341 ret = VM_FAULT_OOM;
2341 if (unlikely(anon_vma_prepare(vma))) 2342 goto out_error;
2342 goto oom; 2343 }
2343 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, 2344 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2344 vma, address); 2345 if (!page) {
2345 if (!page) 2346 ret = VM_FAULT_OOM;
2346 goto oom; 2347 goto out_error;
2347 copy_user_highpage(page, new_page, address, vma); 2348 }
2348 page_cache_release(new_page); 2349 copy_user_highpage(page, nopage_page, address, vma);
2349 new_page = page;
2350 anon = 1; 2350 anon = 1;
2351
2352 } else { 2351 } else {
2353 /* if the page will be shareable, see if the backing 2352 /* if the page will be shareable, see if the backing
2354 * address space wants to know that the page is about 2353 * address space wants to know that the page is about
2355 * to become writable */ 2354 * to become writable */
2356 if (vma->vm_ops->page_mkwrite && 2355 if (vma->vm_ops->page_mkwrite &&
2357 vma->vm_ops->page_mkwrite(vma, new_page) < 0 2356 vma->vm_ops->page_mkwrite(vma, page) < 0) {
2358 ) { 2357 ret = VM_FAULT_SIGBUS;
2359 page_cache_release(new_page); 2358 goto out_error;
2360 return VM_FAULT_SIGBUS;
2361 } 2359 }
2362 } 2360 }
2363 } 2361 }
2364 2362
2365 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2363 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2366 /*
2367 * For a file-backed vma, someone could have truncated or otherwise
2368 * invalidated this page. If unmap_mapping_range got called,
2369 * retry getting the page.
2370 */
2371 if (mapping && unlikely(sequence != mapping->truncate_count)) {
2372 pte_unmap_unlock(page_table, ptl);
2373 page_cache_release(new_page);
2374 cond_resched();
2375 sequence = mapping->truncate_count;
2376 smp_rmb();
2377 goto retry;
2378 }
2379 2364
2380 /* 2365 /*
2381 * This silly early PAGE_DIRTY setting removes a race 2366 * This silly early PAGE_DIRTY setting removes a race
@@ -2388,43 +2373,51 @@ retry:
2388 * handle that later. 2373 * handle that later.
2389 */ 2374 */
2390 /* Only go through if we didn't race with anybody else... */ 2375 /* Only go through if we didn't race with anybody else... */
2391 if (pte_none(*page_table)) { 2376 if (likely(pte_none(*page_table))) {
2392 flush_icache_page(vma, new_page); 2377 flush_icache_page(vma, page);
2393 entry = mk_pte(new_page, vma->vm_page_prot); 2378 entry = mk_pte(page, vma->vm_page_prot);
2394 if (write_access) 2379 if (write_access)
2395 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2380 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2396 set_pte_at(mm, address, page_table, entry); 2381 set_pte_at(mm, address, page_table, entry);
2397 if (anon) { 2382 if (anon) {
2398 inc_mm_counter(mm, anon_rss); 2383 inc_mm_counter(mm, anon_rss);
2399 lru_cache_add_active(new_page); 2384 lru_cache_add_active(page);
2400 page_add_new_anon_rmap(new_page, vma, address); 2385 page_add_new_anon_rmap(page, vma, address);
2401 } else { 2386 } else {
2402 inc_mm_counter(mm, file_rss); 2387 inc_mm_counter(mm, file_rss);
2403 page_add_file_rmap(new_page); 2388 page_add_file_rmap(page);
2404 if (write_access) { 2389 if (write_access) {
2405 dirty_page = new_page; 2390 dirty_page = page;
2406 get_page(dirty_page); 2391 get_page(dirty_page);
2407 } 2392 }
2408 } 2393 }
2394
2395 /* no need to invalidate: a not-present page won't be cached */
2396 update_mmu_cache(vma, address, entry);
2397 lazy_mmu_prot_update(entry);
2409 } else { 2398 } else {
2410 /* One of our sibling threads was faster, back out. */ 2399 if (anon)
2411 page_cache_release(new_page); 2400 page_cache_release(page);
2412 goto unlock; 2401 else
2402 anon = 1; /* not anon, but release nopage_page */
2413 } 2403 }
2414 2404
2415 /* no need to invalidate: a not-present page shouldn't be cached */
2416 update_mmu_cache(vma, address, entry);
2417 lazy_mmu_prot_update(entry);
2418unlock:
2419 pte_unmap_unlock(page_table, ptl); 2405 pte_unmap_unlock(page_table, ptl);
2420 if (dirty_page) { 2406
2407out:
2408 unlock_page(nopage_page);
2409 if (anon)
2410 page_cache_release(nopage_page);
2411 else if (dirty_page) {
2421 set_page_dirty_balance(dirty_page); 2412 set_page_dirty_balance(dirty_page);
2422 put_page(dirty_page); 2413 put_page(dirty_page);
2423 } 2414 }
2415
2424 return ret; 2416 return ret;
2425oom: 2417
2426 page_cache_release(new_page); 2418out_error:
2427 return VM_FAULT_OOM; 2419 anon = 1; /* relase nopage_page */
2420 goto out;
2428} 2421}
2429 2422
2430/* 2423/*