vfs: pagecache usage optimization for pagesize!=blocksize

When we read some part of a file through the pagecache, if there is a page at the corresponding index but that page is not uptodate, read IO is issued and the page is brought uptodate.

I think this is fine for a pagesize == blocksize environment, but there is room for improvement in a pagesize != blocksize environment, because in that case a page can have multiple buffers, and even if the page is not uptodate, some of its buffers can be uptodate.

So I suggest that when all buffers which correspond to the part of the file that we want to read are uptodate, we use this page and copy data from it to the user buffer even if the page as a whole is not uptodate. This can reduce read IO and improve system throughput.

I wrote a benchmark program and measured the results with it.

This benchmark does the following:

  1: mount and open a test file.
  2: create a 512MB file.
  3: close the file and umount.
  4: mount and open the test file again.
  5: pwrite randomly 300000 times on the test file. Each offset is aligned to the IO size (1024 bytes).
  6: measure the time taken to pread randomly 100000 times on the test file.

The result was:

  2.6.26          330 sec
  2.6.26-patched  226 sec

  Arch: i386
  Filesystem: ext3
  Blocksize: 1024 bytes
  Memory: 1GB

On ext3/4, a file is written through buffers/blocks. So random read/write mixed workloads, or random read after random write workloads, are optimized by this patch in a pagesize != blocksize environment. The test result above shows this.

The benchmark program is as follows:

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>

#define LEN 1024
#define LOOP 1024*512 /* 512MB */

main(void)
{
        unsigned long i, offset, filesize;
        int fd;
        char buf[LEN];
        time_t t1, t2;

        if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) {
                perror("cannot mount\n");
                exit(1);
        }
        memset(buf, 0, LEN);
        fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC);
        if (fd < 0) {
                perror("cannot open file\n");
                exit(1);
        }
        for (i = 0; i < LOOP; i++)
                write(fd, buf, LEN);
        close(fd);
        if (umount("/root/test1/") < 0) {
                perror("cannot umount\n");
                exit(1);
        }
        if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) {
                perror("cannot mount\n");
                exit(1);
        }
        fd = open("/root/test1/testfile", O_RDWR);
        if (fd < 0) {
                perror("cannot open file\n");
                exit(1);
        }

        filesize = LEN * LOOP;
        for (i = 0; i < 300000; i++){
                offset = (random() % filesize) & (~(LEN - 1));
                pwrite(fd, buf, LEN, offset);
        }
        printf("start test\n");
        time(&t1);
        for (i = 0; i < 100000; i++){
                offset = (random() % filesize) & (~(LEN - 1));
                pread(fd, buf, LEN, offset);
        }
        time(&t2);
        printf("%ld sec\n", t2-t1);
        close(fd);
        if (umount("/root/test1/") < 0) {
                perror("cannot umount\n");
                exit(1);
        }
}

Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jan Kara <jack@ucw.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp> 2008-07-28 18:46:36 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2008-07-28 19:30:21 -0400
commit: 8ab22b9abb5c55413802e4adc9aa6223324547c3 (patch)
tree: cff3319e1275e8a7c083d492889ec6bd0c7712d3 /fs/buffer.c
parent: d84a52f62f6a396ed77aa0052da74ca9e760b28a (diff)
1 files changed, 46 insertions, 0 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index f95805019639..ca12a6bb82b1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2096,6 +2096,52 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 EXPORT_SYMBOL(generic_write_end);
 /*
+ * block_is_partially_uptodate checks whether buffers within a page are
+ * uptodate or not.
+ *
+ * Returns true if all buffers which correspond to a file portion
+ * we want to read are uptodate.
+ */
+int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
+                                        unsigned long from)
+{
+        struct inode *inode = page->mapping->host;
+        unsigned block_start, block_end, blocksize;
+        unsigned to;
+        struct buffer_head *bh, *head;
+        int ret = 1;
+        if (!page_has_buffers(page))
+                return 0;
+        blocksize = 1 << inode->i_blkbits;
+        to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
+        to = from + to;
+        if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
+                return 0;
+        head = page_buffers(page);
+        bh = head;
+        block_start = 0;
+        do {
+                block_end = block_start + blocksize;
+                if (block_end > from && block_start < to) {
+                        if (!buffer_uptodate(bh)) {
+                                ret = 0;
+                                break;
+                        }
+                        if (block_end >= to)
+                                break;
+                }
+                block_start = block_end;
+                bh = bh->b_this_page;
+        } while (bh != head);
+        return ret;
+}
+EXPORT_SYMBOL(block_is_partially_uptodate);
+/*
 * Generic "read page" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the page asynchronously --- the unlock_buffer() and
author	Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>	2008-07-28 18:46:36 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-07-28 19:30:21 -0400
commit	8ab22b9abb5c55413802e4adc9aa6223324547c3 (patch)
tree	cff3319e1275e8a7c083d492889ec6bd0c7712d3 /fs/buffer.c
parent	d84a52f62f6a396ed77aa0052da74ca9e760b28a (diff)

diff --git a/fs/buffer.c b/fs/buffer.c
index f95805019639..ca12a6bb82b1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2096,6 +2096,52 @@ int generic_write_end(struct file *file, struct address_space *mapping,
2096	EXPORT_SYMBOL(generic_write_end);	2096	EXPORT_SYMBOL(generic_write_end);
2097		2097
2098	/*	2098	/*
		2099	* block_is_partially_uptodate checks whether buffers within a page are
		2100	* uptodate or not.
		2101	*
		2102	* Returns true if all buffers which correspond to a file portion
		2103	* we want to read are uptodate.
		2104	*/
		2105	int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
		2106	unsigned long from)
		2107	{
		2108	struct inode *inode = page->mapping->host;
		2109	unsigned block_start, block_end, blocksize;
		2110	unsigned to;
		2111	struct buffer_head *bh, *head;
		2112	int ret = 1;
		2113
		2114	if (!page_has_buffers(page))
		2115	return 0;
		2116
		2117	blocksize = 1 << inode->i_blkbits;
		2118	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
		2119	to = from + to;
		2120	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
		2121	return 0;
		2122
		2123	head = page_buffers(page);
		2124	bh = head;
		2125	block_start = 0;
		2126	do {
		2127	block_end = block_start + blocksize;
		2128	if (block_end > from && block_start < to) {
		2129	if (!buffer_uptodate(bh)) {
		2130	ret = 0;
		2131	break;
		2132	}
		2133	if (block_end >= to)
		2134	break;
		2135	}
		2136	block_start = block_end;
		2137	bh = bh->b_this_page;
		2138	} while (bh != head);
		2139
		2140	return ret;
		2141	}
		2142	EXPORT_SYMBOL(block_is_partially_uptodate);
		2143
		2144	/*
2099	* Generic "read page" function for block devices that have the normal	2145	* Generic "read page" function for block devices that have the normal
2100	* get_block functionality. This is most of the block device filesystems.	2146	* get_block functionality. This is most of the block device filesystems.
2101	* Reads the page asynchronously --- the unlock_buffer() and	2147	* Reads the page asynchronously --- the unlock_buffer() and