From a969e903a944f69309ee5cc9e7c7b08310d1151e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Jul 2008 21:27:04 -0700 Subject: kill generic_file_direct_IO() generic_file_direct_IO is a common helper around the invocation of ->direct_IO. But there's almost nothing shared between the read and write side, so we're better off without this helper. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 117 ++++++++++++++++++++++++++--------------------------------- 1 file changed, 51 insertions(+), 66 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 65d9d9e2b755..6343f3c841b7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,9 +42,6 @@ #include -static ssize_t -generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs); /* * Shared mappings implemented 30.11.1994. It's not fully working yet, @@ -1205,8 +1202,11 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, goto out; /* skip atime */ size = i_size_read(inode); if (pos < size) { - retval = generic_file_direct_IO(READ, iocb, - iov, pos, nr_segs); + retval = filemap_write_and_wait(mapping); + if (!retval) { + retval = mapping->a_ops->direct_IO(READ, iocb, + iov, pos, nr_segs); + } if (retval > 0) *ppos = pos + retval; } @@ -2004,11 +2004,55 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t written; + size_t write_len; + pgoff_t end; if (count != ocount) *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); - written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); + /* + * Unmap all mmappings of the file up-front. + * + * This will cause any pte dirty bits to be propagated into the + * pageframes for the subsequent filemap_write_and_wait(). + */ + write_len = iov_length(iov, *nr_segs); + end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; + if (mapping_mapped(mapping)) + unmap_mapping_range(mapping, pos, write_len, 0); + + written = filemap_write_and_wait(mapping); + if (written) + goto out; + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + if (mapping->nrpages) { + written = invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + if (written) + goto out; + } + + written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); + + /* + * Finally, try again to invalidate clean pages which might have been + * cached by non-direct readahead, or faulted in by get_user_pages() + * if the source of the write was an mmap'ed region of the file + * we're writing. Either one is a pretty crazy thing to do, + * so we don't support it 100%. If this invalidation + * fails, tough, the write still worked... + */ + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + } + if (written > 0) { loff_t end = pos + written; if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { @@ -2024,6 +2068,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, * i_mutex is held, which protects generic_osync_inode() from * livelocking. AIO O_DIRECT ops attempt to sync metadata here. */ +out: if ((written >= 0 || written == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); @@ -2511,66 +2556,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } EXPORT_SYMBOL(generic_file_aio_write); -/* - * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something - * went wrong during pagecache shootdown. - */ -static ssize_t -generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - ssize_t retval; - size_t write_len; - pgoff_t end = 0; /* silence gcc */ - - /* - * If it's a write, unmap all mmappings of the file up-front. This - * will cause any pte dirty bits to be propagated into the pageframes - * for the subsequent filemap_write_and_wait(). - */ - if (rw == WRITE) { - write_len = iov_length(iov, nr_segs); - end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT; - if (mapping_mapped(mapping)) - unmap_mapping_range(mapping, offset, write_len, 0); - } - - retval = filemap_write_and_wait(mapping); - if (retval) - goto out; - - /* - * After a write we want buffered reads to be sure to go to disk to get - * the new data. We invalidate clean cached page from the region we're - * about to write. We do this *before* the write so that we can return - * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). - */ - if (rw == WRITE && mapping->nrpages) { - retval = invalidate_inode_pages2_range(mapping, - offset >> PAGE_CACHE_SHIFT, end); - if (retval) - goto out; - } - - retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); - - /* - * Finally, try again to invalidate clean pages which might have been - * cached by non-direct readahead, or faulted in by get_user_pages() - * if the source of the write was an mmap'ed region of the file - * we're writing. Either one is a pretty crazy thing to do, - * so we don't support it 100%. If this invalidation - * fails, tough, the write still worked... - */ - if (rw == WRITE && mapping->nrpages) { - invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end); - } -out: - return retval; -} - /** * try_to_release_page() - release old fs-specific metadata on a page * -- cgit v1.2.2 From 11fa977ecde652ab324dd79c179deb52e82a8df1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 23 Jul 2008 21:27:34 -0700 Subject: generic_file_aio_read() cleanups As akpm points out, there's really no need for generic_file_aio_read to make a special case of count 0: just loop through nr_segs doing nothing. And as Harvey Harrison points out, there's no need to reset retval to 0 where it's already 0. Setting count (or ocount) to 0 before calling generic_segment_checks is unnecessary too; but reluctantly I'll leave that removal to someone with a wider range of gcc versions to hand - 4.1.2 and 4.2.1 don't warn about it, but perhaps others do - I forget which are the warniest versions. Signed-off-by: Hugh Dickins Tested-by: Lawrence Greenfield Cc: Christoph Rohland Cc: Badari Pulavarty Cc: Zach Brown Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 6343f3c841b7..7675b91f4f63 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1197,7 +1197,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, mapping = filp->f_mapping; inode = mapping->host; - retval = 0; if (!count) goto out; /* skip atime */ size = i_size_read(inode); @@ -1209,33 +1208,30 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, } if (retval > 0) *ppos = pos + retval; - } - if (likely(retval != 0)) { - file_accessed(filp); - goto out; + if (retval) { + file_accessed(filp); + goto out; + } } } - retval = 0; - if (count) { - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; - desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; - if (desc.count == 0) - continue; - desc.error = 0; - do_generic_file_read(filp,ppos,&desc,file_read_actor); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; + desc.written = 0; + desc.arg.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_generic_file_read(filp, ppos, &desc, file_read_actor); + retval += desc.written; + if (desc.error) { + retval = retval ?: desc.error; + break; } + if (desc.count > 0) + break; } out: return retval; -- cgit v1.2.2 From 3f31fddfa26b7594b44ff2b34f9a04ba409e0f91 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Fri, 25 Jul 2008 01:46:22 -0700 Subject: jbd: fix race between free buffer and commit transaction journal_try_to_free_buffers() could race with jbd commit transaction when the later is holding the buffer reference while waiting for the data buffer to flush to disk. If the caller of journal_try_to_free_buffers() request tries hard to release the buffers, it will treat the failure as error and return back to the caller. We have seen the directo IO failed due to this race. Some of the caller of releasepage() also expecting the buffer to be dropped when passed with GFP_KERNEL mask to the releasepage()->journal_try_to_free_buffers(). With this patch, if the caller is passing the __GFP_WAIT and __GFP_FS to indicating this call could wait, in case of try_to_free_buffers() failed, let's waiting for journal_commit_transaction() to finish commit the current committing transaction, then try to free those buffers again. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mingming Cao Reviewed-by: Badari Pulavarty Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 7675b91f4f63..5d4c880d7cd9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2563,9 +2563,8 @@ EXPORT_SYMBOL(generic_file_aio_write); * Otherwise return zero. * * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). * - * NOTE: @gfp_mask may go away, and this function may become non-blocking. */ int try_to_release_page(struct page *page, gfp_t gfp_mask) { -- cgit v1.2.2 From 69029cd550284e32de13d6dd2f77b723c8a0e444 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:14 -0700 Subject: memcg: remove refcnt from page_cgroup memcg: performance improvements Patch Description 1/5 ... remove refcnt fron page_cgroup patch (shmem handling is fixed) 2/5 ... swapcache handling patch 3/5 ... add helper function for shmem's memory reclaim patch 4/5 ... optimize by likely/unlikely ppatch 5/5 ... remove redundunt check patch (shmem handling is fixed.) Unix bench result. == 2.6.26-rc2-mm1 + memory resource controller Execl Throughput 2915.4 lps (29.6 secs, 3 samples) C Compiler Throughput 1019.3 lpm (60.0 secs, 3 samples) Shell Scripts (1 concurrent) 5796.0 lpm (60.0 secs, 3 samples) Shell Scripts (8 concurrent) 1097.7 lpm (60.0 secs, 3 samples) Shell Scripts (16 concurrent) 565.3 lpm (60.0 secs, 3 samples) File Read 1024 bufsize 2000 maxblocks 1022128.0 KBps (30.0 secs, 3 samples) File Write 1024 bufsize 2000 maxblocks 544057.0 KBps (30.0 secs, 3 samples) File Copy 1024 bufsize 2000 maxblocks 346481.0 KBps (30.0 secs, 3 samples) File Read 256 bufsize 500 maxblocks 319325.0 KBps (30.0 secs, 3 samples) File Write 256 bufsize 500 maxblocks 148788.0 KBps (30.0 secs, 3 samples) File Copy 256 bufsize 500 maxblocks 99051.0 KBps (30.0 secs, 3 samples) File Read 4096 bufsize 8000 maxblocks 2058917.0 KBps (30.0 secs, 3 samples) File Write 4096 bufsize 8000 maxblocks 1606109.0 KBps (30.0 secs, 3 samples) File Copy 4096 bufsize 8000 maxblocks 854789.0 KBps (30.0 secs, 3 samples) Dc: sqrt(2) to 99 decimal places 126145.2 lpm (30.0 secs, 3 samples) INDEX VALUES TEST BASELINE RESULT INDEX Execl Throughput 43.0 2915.4 678.0 File Copy 1024 bufsize 2000 maxblocks 3960.0 346481.0 875.0 File Copy 256 bufsize 500 maxblocks 1655.0 99051.0 598.5 File Copy 4096 bufsize 8000 maxblocks 5800.0 854789.0 1473.8 Shell Scripts (8 concurrent) 6.0 1097.7 1829.5 ========= FINAL SCORE 991.3 == 2.6.26-rc2-mm1 + this set == Execl Throughput 3012.9 lps (29.9 secs, 3 samples) C Compiler Throughput 981.0 lpm (60.0 secs, 3 samples) Shell Scripts (1 concurrent) 5872.0 lpm (60.0 secs, 3 samples) Shell Scripts (8 concurrent) 1120.3 lpm (60.0 secs, 3 samples) Shell Scripts (16 concurrent) 578.0 lpm (60.0 secs, 3 samples) File Read 1024 bufsize 2000 maxblocks 1003993.0 KBps (30.0 secs, 3 samples) File Write 1024 bufsize 2000 maxblocks 550452.0 KBps (30.0 secs, 3 samples) File Copy 1024 bufsize 2000 maxblocks 347159.0 KBps (30.0 secs, 3 samples) File Read 256 bufsize 500 maxblocks 314644.0 KBps (30.0 secs, 3 samples) File Write 256 bufsize 500 maxblocks 151852.0 KBps (30.0 secs, 3 samples) File Copy 256 bufsize 500 maxblocks 101000.0 KBps (30.0 secs, 3 samples) File Read 4096 bufsize 8000 maxblocks 2033256.0 KBps (30.0 secs, 3 samples) File Write 4096 bufsize 8000 maxblocks 1611814.0 KBps (30.0 secs, 3 samples) File Copy 4096 bufsize 8000 maxblocks 847979.0 KBps (30.0 secs, 3 samples) Dc: sqrt(2) to 99 decimal places 128148.7 lpm (30.0 secs, 3 samples) INDEX VALUES TEST BASELINE RESULT INDEX Execl Throughput 43.0 3012.9 700.7 File Copy 1024 bufsize 2000 maxblocks 3960.0 347159.0 876.7 File Copy 256 bufsize 500 maxblocks 1655.0 101000.0 610.3 File Copy 4096 bufsize 8000 maxblocks 5800.0 847979.0 1462.0 Shell Scripts (8 concurrent) 6.0 1120.3 1867.2 ========= FINAL SCORE 1004.6 This patch: Remove refcnt from page_cgroup(). After this, * A page is charged only when !page_mapped() && no page_cgroup is assigned. * Anon page is newly mapped. * File page is added to mapping->tree. * A page is uncharged only when * Anon page is fully unmapped. * File page is removed from LRU. There is no change in behavior from user's view. This patch also removes unnecessary calls in rmap.c which was used only for refcnt mangement. [akpm@linux-foundation.org: fix warning] [hugh@veritas.com: fix shmem_unuse_inode charging] Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Li Zefan Cc: Hugh Dickins Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: David Rientjes Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 5d4c880d7cd9..2d3ec1ffc66e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -115,7 +115,7 @@ void __remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - mem_cgroup_uncharge_page(page); + mem_cgroup_uncharge_cache_page(page); radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; @@ -474,12 +474,12 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); } else - mem_cgroup_uncharge_page(page); + mem_cgroup_uncharge_cache_page(page); write_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else - mem_cgroup_uncharge_page(page); + mem_cgroup_uncharge_cache_page(page); out: return error; } -- cgit v1.2.2 From e286781d5f2e9c846e012a39653a166e9d31777d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:30 -0700 Subject: mm: speculative page references If we can be sure that elevating the page_count on a pagecache page will pin it, we can speculatively run this operation, and subsequently check to see if we hit the right page rather than relying on holding a lock or otherwise pinning a reference to the page. This can be done if get_page/put_page behaves consistently throughout the whole tree (ie. if we "get" the page after it has been used for something else, we must be able to free it with a put_page). Actually, there is a period where the count behaves differently: when the page is free or if it is a constituent page of a compound page. We need an atomic_inc_not_zero operation to ensure we don't try to grab the page in either case. This patch introduces the core locking protocol to the pagecache (ie. adds page_cache_get_speculative, and tweaks some update-side code to make it work). Thanks to Hugh for pointing out an improvement to the algorithm setting page_count to zero when we have control of all references, in order to hold off speculative getters. [kamezawa.hiroyu@jp.fujitsu.com: fix migration_entry_wait()] [hugh@veritas.com: fix add_to_page_cache] [akpm@linux-foundation.org: repair a comment] Signed-off-by: Nick Piggin Cc: Jeff Garzik Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Hugh Dickins Cc: "Paul E. McKenney" Reviewed-by: Peter Zijlstra Signed-off-by: Daisuke Nishimura Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: KOSAKI Motohiro Signed-off-by: Hugh Dickins Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 2d3ec1ffc66e..4e182a9a14c0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -442,39 +442,43 @@ int filemap_write_and_wait_range(struct address_space *mapping, } /** - * add_to_page_cache - add newly allocated pagecache pages + * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add * @mapping: the page's address_space * @offset: page index * @gfp_mask: page allocation mode * - * This function is used to add newly allocated pagecache pages; - * the page is new, so we can just run SetPageLocked() against it. - * The other page state flags were set by rmqueue(). - * + * This function is used to add a page to the pagecache. It must be locked. * This function does not add the page to the LRU. The caller must do that. */ -int add_to_page_cache(struct page *page, struct address_space *mapping, +int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int error = mem_cgroup_cache_charge(page, current->mm, + int error; + + VM_BUG_ON(!PageLocked(page)); + + error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & ~__GFP_HIGHMEM); if (error) goto out; error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { + page_cache_get(page); + page->mapping = mapping; + page->index = offset; + write_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); - if (!error) { - page_cache_get(page); - SetPageLocked(page); - page->mapping = mapping; - page->index = offset; + if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); - } else + } else { + page->mapping = NULL; mem_cgroup_uncharge_cache_page(page); + page_cache_release(page); + } write_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); @@ -483,7 +487,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, out: return error; } -EXPORT_SYMBOL(add_to_page_cache); +EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) -- cgit v1.2.2 From a60637c85893e7191faaafa6a72e197c24386727 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:31 -0700 Subject: mm: lockless pagecache Combine page_cache_get_speculative with lockless radix tree lookups to introduce lockless page cache lookups (ie. no mapping->tree_lock on the read-side). The only atomicity changes this introduces is that the gang pagecache lookup functions now behave as if they are implemented with multiple find_get_page calls, rather than operating on a snapshot of the pages. In practice, this atomicity guarantee is not used anyway, and it is to replace individual lookups, so these semantics are natural. Signed-off-by: Nick Piggin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Hugh Dickins Cc: "Paul E. McKenney" Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 179 ++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 134 insertions(+), 45 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 4e182a9a14c0..feb8448d8618 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -637,15 +637,35 @@ void __lock_page_nosync(struct page *page) * Is there a pagecache struct page at the given (mapping, offset) tuple? * If yes, increment its refcount and return it; if no, return NULL. */ -struct page * find_get_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_page(struct address_space *mapping, pgoff_t offset) { + void **pagep; struct page *page; - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page) - page_cache_get(page); - read_unlock_irq(&mapping->tree_lock); + rcu_read_lock(); +repeat: + page = NULL; + pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + if (pagep) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page || page == RADIX_TREE_RETRY)) + goto repeat; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + goto repeat; + } + } + rcu_read_unlock(); + return page; } EXPORT_SYMBOL(find_get_page); @@ -660,32 +680,22 @@ EXPORT_SYMBOL(find_get_page); * * Returns zero if the page was not present. find_lock_page() may sleep. */ -struct page *find_lock_page(struct address_space *mapping, - pgoff_t offset) +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); + page = find_get_page(mapping, offset); if (page) { - page_cache_get(page); - if (TestSetPageLocked(page)) { - read_unlock_irq(&mapping->tree_lock); - __lock_page(page); - - /* Has the page been truncated while we slept? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - page_cache_release(page); - goto repeat; - } - VM_BUG_ON(page->index != offset); - goto out; + lock_page(page); + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; } + VM_BUG_ON(page->index != offset); } - read_unlock_irq(&mapping->tree_lock); -out: return page; } EXPORT_SYMBOL(find_lock_page); @@ -751,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, start, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (!page_cache_get_speculative(page)) + goto repeat; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, start, nr_pages); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); - read_unlock_irq(&mapping->tree_lock); + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + } + rcu_read_unlock(); return ret; } @@ -778,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, index, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, index, nr_pages); - for (i = 0; i < ret; i++) { - if (pages[i]->mapping == NULL || pages[i]->index != index) + if (page->mapping == NULL || page->index != index) break; - page_cache_get(pages[i]); + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; index++; } - read_unlock_irq(&mapping->tree_lock); - return i; + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(find_get_pages_contig); @@ -810,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, + (void ***)pages, *index, nr_pages, tag); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + } + rcu_read_unlock(); - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup_tag(&mapping->page_tree, - (void **)pages, *index, nr_pages, tag); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); if (ret) *index = pages[ret - 1]->index + 1; - read_unlock_irq(&mapping->tree_lock); + return ret; } EXPORT_SYMBOL(find_get_pages_tag); -- cgit v1.2.2 From 19fd6231279be3c3bdd02ed99f9b0eb195978064 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:32 -0700 Subject: mm: spinlock tree_lock mapping->tree_lock has no read lockers. convert the lock from an rwlock to a spinlock. Signed-off-by: Nick Piggin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Hugh Dickins Cc: "Paul E. McKenney" Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index feb8448d8618..2ed8b0389c51 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -109,7 +109,7 @@ /* * Remove a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold a write_lock on the mapping's tree_lock. + * is safe. The caller must hold the mapping's tree_lock. */ void __remove_from_page_cache(struct page *page) { @@ -141,9 +141,9 @@ void remove_from_page_cache(struct page *page) BUG_ON(!PageLocked(page)); - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); } static int sync_page(void *word) @@ -469,7 +469,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page->mapping = mapping; page->index = offset; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (likely(!error)) { mapping->nrpages++; @@ -480,7 +480,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page_cache_release(page); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else mem_cgroup_uncharge_cache_page(page); -- cgit v1.2.2 From 2f1936b87783a3a56c9441b27b9ba7a747f11e8e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 24 Jun 2008 16:50:14 +0200 Subject: [patch 3/5] vfs: change remove_suid() to file_remove_suid() All calls to remove_suid() are made with a file pointer, because (similarly to file_update_time) it is called when the file is written. Clean up callers by passing in a file instead of a dentry. Signed-off-by: Miklos Szeredi --- mm/filemap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 2ed8b0389c51..5de7633e1dbe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1758,8 +1758,9 @@ static int __remove_suid(struct dentry *dentry, int kill) return notify_change(dentry, &newattrs); } -int remove_suid(struct dentry *dentry) +int file_remove_suid(struct file *file) { + struct dentry *dentry = file->f_path.dentry; int killsuid = should_remove_suid(dentry); int killpriv = security_inode_need_killpriv(dentry); int error = 0; @@ -1773,7 +1774,7 @@ int remove_suid(struct dentry *dentry) return error; } -EXPORT_SYMBOL(remove_suid); +EXPORT_SYMBOL(file_remove_suid); static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) @@ -2529,7 +2530,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; - err = remove_suid(file->f_path.dentry); + err = file_remove_suid(file); if (err) goto out; -- cgit v1.2.2 From 8ab22b9abb5c55413802e4adc9aa6223324547c3 Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Mon, 28 Jul 2008 15:46:36 -0700 Subject: vfs: pagecache usage optimization for pagesize!=blocksize When we read some part of a file through pagecache, if there is a pagecache of corresponding index but this page is not uptodate, read IO is issued and this page will be uptodate. I think this is good for pagesize == blocksize environment but there is room for improvement on pagesize != blocksize environment. Because in this case a page can have multiple buffers and even if a page is not uptodate, some buffers can be uptodate. So I suggest that when all buffers which correspond to a part of a file that we want to read are uptodate, use this pagecache and copy data from this pagecache to user buffer even if a page is not uptodate. This can reduce read IO and improve system throughput. I wrote a benchmark program and got result number with this program. This benchmark do: 1: mount and open a test file. 2: create a 512MB file. 3: close a file and umount. 4: mount and again open a test file. 5: pwrite randomly 300000 times on a test file. offset is aligned by IO size(1024bytes). 6: measure time of preading randomly 100000 times on a test file. The result was: 2.6.26 330 sec 2.6.26-patched 226 sec Arch:i386 Filesystem:ext3 Blocksize:1024 bytes Memory: 1GB On ext3/4, a file is written through buffer/block. So random read/write mixed workloads or random read after random write workloads are optimized with this patch under pagesize != blocksize environment. This test result showed this. The benchmark program is as follows: #include #include #include #include #include #include #include #include #include #define LEN 1024 #define LOOP 1024*512 /* 512MB */ main(void) { unsigned long i, offset, filesize; int fd; char buf[LEN]; time_t t1, t2; if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) { perror("cannot mount\n"); exit(1); } memset(buf, 0, LEN); fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC); if (fd < 0) { perror("cannot open file\n"); exit(1); } for (i = 0; i < LOOP; i++) write(fd, buf, LEN); close(fd); if (umount("/root/test1/") < 0) { perror("cannot umount\n"); exit(1); } if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) { perror("cannot mount\n"); exit(1); } fd = open("/root/test1/testfile", O_RDWR); if (fd < 0) { perror("cannot open file\n"); exit(1); } filesize = LEN * LOOP; for (i = 0; i < 300000; i++){ offset = (random() % filesize) & (~(LEN - 1)); pwrite(fd, buf, LEN, offset); } printf("start test\n"); time(&t1); for (i = 0; i < 100000; i++){ offset = (random() % filesize) & (~(LEN - 1)); pread(fd, buf, LEN, offset); } time(&t2); printf("%ld sec\n", t2-t1); close(fd); if (umount("/root/test1/") < 0) { perror("cannot umount\n"); exit(1); } } Signed-off-by: Hisashi Hifumi Cc: Nick Piggin Cc: Christoph Hellwig Cc: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 5de7633e1dbe..42bbc6909ba4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1023,8 +1023,17 @@ find_page: ra, filp, page, index, last_index - index); } - if (!PageUptodate(page)) - goto page_not_up_to_date; + if (!PageUptodate(page)) { + if (inode->i_blkbits == PAGE_CACHE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + if (TestSetPageLocked(page)) + goto page_not_up_to_date; + if (!mapping->a_ops->is_partially_uptodate(page, + desc, offset)) + goto page_not_up_to_date_locked; + unlock_page(page); + } page_ok: /* * i_size must be checked after we know the page is Uptodate. @@ -1094,6 +1103,7 @@ page_not_up_to_date: if (lock_page_killable(page)) goto readpage_eio; +page_not_up_to_date_locked: /* Did it get truncated before we got the lock? */ if (!page->mapping) { unlock_page(page); -- cgit v1.2.2 From 94ad374a0751f40d25e22e036c37f7263569d24c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 30 Jul 2008 14:45:12 -0700 Subject: Fix off-by-one error in iov_iter_advance() The iov_iter_advance() function would look at the iov->iov_len entry even though it might have iterated over the whole array, and iov was pointing past the end. This would cause DEBUG_PAGEALLOC to trigger a kernel page fault if the allocation was at the end of a page, and the next page was unallocated. The quick fix is to just change the order of the tests: check that there is any iovec data left before we check the iov entry itself. Thanks to Alexey Dobriyan for finding this case, and testing the fix. Reported-and-tested-by: Alexey Dobriyan Cc: Nick Piggin Cc: Andrew Morton Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 42bbc6909ba4..d97d1ad55473 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1879,7 +1879,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) * The !iov->iov_len check ensures we skip over unlikely * zero-length segments (without overruning the iovec). */ - while (bytes || unlikely(!iov->iov_len && i->count)) { + while (bytes || unlikely(i->count && !iov->iov_len)) { int copy; copy = min(bytes, iov->iov_len - base); -- cgit v1.2.2 From 529ae9aaa08378cfe2a4350bded76f32cc8ff0ce Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 2 Aug 2008 12:01:03 +0200 Subject: mm: rename page trylock Converting page lock to new locking bitops requires a change of page flag operation naming, so we might as well convert it to something nicer (!TestSetPageLocked_Lock => trylock_page, SetPageLocked => set_page_locked). This also facilitates lockdeping of page lock. Signed-off-by: Nick Piggin Acked-by: KOSAKI Motohiro Acked-by: Peter Zijlstra Acked-by: Andrew Morton Acked-by: Benjamin Herrenschmidt Signed-off-by: Linus Torvalds --- mm/filemap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index d97d1ad55473..54e968650855 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -558,14 +558,14 @@ EXPORT_SYMBOL(wait_on_page_bit); * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. * * The first mb is necessary to safely close the critical section opened by the - * TestSetPageLocked(), the second mb is necessary to enforce ordering between - * the clear_bit and the read of the waitqueue (to avoid SMP races with a - * parallel wait_on_page_locked()). + * test_and_set_bit() to lock the page; the second mb is necessary to enforce + * ordering between the clear_bit and the read of the waitqueue (to avoid SMP + * races with a parallel wait_on_page_locked()). */ void unlock_page(struct page *page) { smp_mb__before_clear_bit(); - if (!TestClearPageLocked(page)) + if (!test_and_clear_bit(PG_locked, &page->flags)) BUG(); smp_mb__after_clear_bit(); wake_up_page(page, PG_locked); @@ -931,7 +931,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) struct page *page = find_get_page(mapping, index); if (page) { - if (!TestSetPageLocked(page)) + if (trylock_page(page)) return page; page_cache_release(page); return NULL; @@ -1027,7 +1027,7 @@ find_page: if (inode->i_blkbits == PAGE_CACHE_SHIFT || !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto page_not_up_to_date; if (!mapping->a_ops->is_partially_uptodate(page, desc, offset)) -- cgit v1.2.2