diff options
author | Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp> | 2008-07-28 18:46:36 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-07-28 19:30:21 -0400 |
commit | 8ab22b9abb5c55413802e4adc9aa6223324547c3 (patch) | |
tree | cff3319e1275e8a7c083d492889ec6bd0c7712d3 /fs/buffer.c | |
parent | d84a52f62f6a396ed77aa0052da74ca9e760b28a (diff) |
vfs: pagecache usage optimization for pagesize!=blocksize
When we read some part of a file through pagecache, if there is a
pagecache of corresponding index but this page is not uptodate, read IO
is issued and this page will be uptodate.
I think this is good for pagesize == blocksize environment but there is
room for improvement on pagesize != blocksize environment. Because in
this case a page can have multiple buffers and even if a page is not
uptodate, some buffers can be uptodate.
So I suggest that when all buffers which correspond to a part of a file
that we want to read are uptodate, use this pagecache and copy data from
this pagecache to user buffer even if a page is not uptodate. This can
reduce read IO and improve system throughput.
I wrote a benchmark program and got result number with this program.
This benchmark do:
1: mount and open a test file.
2: create a 512MB file.
3: close a file and umount.
4: mount and again open a test file.
5: pwrite randomly 300000 times on a test file. offset is aligned
by IO size(1024bytes).
6: measure time of preading randomly 100000 times on a test file.
The result was:
2.6.26
330 sec
2.6.26-patched
226 sec
Arch:i386
Filesystem:ext3
Blocksize:1024 bytes
Memory: 1GB
On ext3/4, a file is written through buffer/block. So random read/write
mixed workloads or random read after random write workloads are optimized
with this patch under pagesize != blocksize environment. This test result
showed this.
The benchmark program is as follows:
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#define LEN 1024
#define LOOP 1024*512 /* 512MB */
main(void)
{
unsigned long i, offset, filesize;
int fd;
char buf[LEN];
time_t t1, t2;
if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) {
perror("cannot mount\n");
exit(1);
}
memset(buf, 0, LEN);
fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC);
if (fd < 0) {
perror("cannot open file\n");
exit(1);
}
for (i = 0; i < LOOP; i++)
write(fd, buf, LEN);
close(fd);
if (umount("/root/test1/") < 0) {
perror("cannot umount\n");
exit(1);
}
if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) {
perror("cannot mount\n");
exit(1);
}
fd = open("/root/test1/testfile", O_RDWR);
if (fd < 0) {
perror("cannot open file\n");
exit(1);
}
filesize = LEN * LOOP;
for (i = 0; i < 300000; i++){
offset = (random() % filesize) & (~(LEN - 1));
pwrite(fd, buf, LEN, offset);
}
printf("start test\n");
time(&t1);
for (i = 0; i < 100000; i++){
offset = (random() % filesize) & (~(LEN - 1));
pread(fd, buf, LEN, offset);
}
time(&t2);
printf("%ld sec\n", t2-t1);
close(fd);
if (umount("/root/test1/") < 0) {
perror("cannot umount\n");
exit(1);
}
}
Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jan Kara <jack@ucw.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/buffer.c')
-rw-r--r-- | fs/buffer.c | 46 |
1 files changed, 46 insertions, 0 deletions
diff --git a/fs/buffer.c b/fs/buffer.c index f95805019639..ca12a6bb82b1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -2096,6 +2096,52 @@ int generic_write_end(struct file *file, struct address_space *mapping, | |||
2096 | EXPORT_SYMBOL(generic_write_end); | 2096 | EXPORT_SYMBOL(generic_write_end); |
2097 | 2097 | ||
2098 | /* | 2098 | /* |
2099 | * block_is_partially_uptodate checks whether buffers within a page are | ||
2100 | * uptodate or not. | ||
2101 | * | ||
2102 | * Returns true if all buffers which correspond to a file portion | ||
2103 | * we want to read are uptodate. | ||
2104 | */ | ||
2105 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, | ||
2106 | unsigned long from) | ||
2107 | { | ||
2108 | struct inode *inode = page->mapping->host; | ||
2109 | unsigned block_start, block_end, blocksize; | ||
2110 | unsigned to; | ||
2111 | struct buffer_head *bh, *head; | ||
2112 | int ret = 1; | ||
2113 | |||
2114 | if (!page_has_buffers(page)) | ||
2115 | return 0; | ||
2116 | |||
2117 | blocksize = 1 << inode->i_blkbits; | ||
2118 | to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); | ||
2119 | to = from + to; | ||
2120 | if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) | ||
2121 | return 0; | ||
2122 | |||
2123 | head = page_buffers(page); | ||
2124 | bh = head; | ||
2125 | block_start = 0; | ||
2126 | do { | ||
2127 | block_end = block_start + blocksize; | ||
2128 | if (block_end > from && block_start < to) { | ||
2129 | if (!buffer_uptodate(bh)) { | ||
2130 | ret = 0; | ||
2131 | break; | ||
2132 | } | ||
2133 | if (block_end >= to) | ||
2134 | break; | ||
2135 | } | ||
2136 | block_start = block_end; | ||
2137 | bh = bh->b_this_page; | ||
2138 | } while (bh != head); | ||
2139 | |||
2140 | return ret; | ||
2141 | } | ||
2142 | EXPORT_SYMBOL(block_is_partially_uptodate); | ||
2143 | |||
2144 | /* | ||
2099 | * Generic "read page" function for block devices that have the normal | 2145 | * Generic "read page" function for block devices that have the normal |
2100 | * get_block functionality. This is most of the block device filesystems. | 2146 | * get_block functionality. This is most of the block device filesystems. |
2101 | * Reads the page asynchronously --- the unlock_buffer() and | 2147 | * Reads the page asynchronously --- the unlock_buffer() and |