diff options
author | Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp> | 2008-07-28 18:46:36 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-07-28 19:30:21 -0400 |
commit | 8ab22b9abb5c55413802e4adc9aa6223324547c3 (patch) | |
tree | cff3319e1275e8a7c083d492889ec6bd0c7712d3 /fs | |
parent | d84a52f62f6a396ed77aa0052da74ca9e760b28a (diff) |
vfs: pagecache usage optimization for pagesize!=blocksize
When we read some part of a file through pagecache, if there is a
pagecache of corresponding index but this page is not uptodate, read IO
is issued and this page will be uptodate.
I think this is good for pagesize == blocksize environment but there is
room for improvement on pagesize != blocksize environment. Because in
this case a page can have multiple buffers and even if a page is not
uptodate, some buffers can be uptodate.
So I suggest that when all buffers which correspond to a part of a file
that we want to read are uptodate, use this pagecache and copy data from
this pagecache to user buffer even if a page is not uptodate. This can
reduce read IO and improve system throughput.
I wrote a benchmark program and got result number with this program.
This benchmark do:
1: mount and open a test file.
2: create a 512MB file.
3: close a file and umount.
4: mount and again open a test file.
5: pwrite randomly 300000 times on a test file. offset is aligned
by IO size(1024bytes).
6: measure time of preading randomly 100000 times on a test file.
The result was:
2.6.26
330 sec
2.6.26-patched
226 sec
Arch:i386
Filesystem:ext3
Blocksize:1024 bytes
Memory: 1GB
On ext3/4, a file is written through buffer/block. So random read/write
mixed workloads or random read after random write workloads are optimized
with this patch under pagesize != blocksize environment. This test result
showed this.
The benchmark program is as follows:
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#define LEN 1024
#define LOOP 1024*512 /* 512MB */
main(void)
{
unsigned long i, offset, filesize;
int fd;
char buf[LEN];
time_t t1, t2;
if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) {
perror("cannot mount\n");
exit(1);
}
memset(buf, 0, LEN);
fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC);
if (fd < 0) {
perror("cannot open file\n");
exit(1);
}
for (i = 0; i < LOOP; i++)
write(fd, buf, LEN);
close(fd);
if (umount("/root/test1/") < 0) {
perror("cannot umount\n");
exit(1);
}
if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) {
perror("cannot mount\n");
exit(1);
}
fd = open("/root/test1/testfile", O_RDWR);
if (fd < 0) {
perror("cannot open file\n");
exit(1);
}
filesize = LEN * LOOP;
for (i = 0; i < 300000; i++){
offset = (random() % filesize) & (~(LEN - 1));
pwrite(fd, buf, LEN, offset);
}
printf("start test\n");
time(&t1);
for (i = 0; i < 100000; i++){
offset = (random() % filesize) & (~(LEN - 1));
pread(fd, buf, LEN, offset);
}
time(&t2);
printf("%ld sec\n", t2-t1);
close(fd);
if (umount("/root/test1/") < 0) {
perror("cannot umount\n");
exit(1);
}
}
Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jan Kara <jack@ucw.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/buffer.c | 46 | ||||
-rw-r--r-- | fs/ext2/inode.c | 1 | ||||
-rw-r--r-- | fs/ext3/inode.c | 67 | ||||
-rw-r--r-- | fs/ext4/inode.c | 92 |
4 files changed, 130 insertions, 76 deletions
diff --git a/fs/buffer.c b/fs/buffer.c index f95805019639..ca12a6bb82b1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -2096,6 +2096,52 @@ int generic_write_end(struct file *file, struct address_space *mapping, | |||
2096 | EXPORT_SYMBOL(generic_write_end); | 2096 | EXPORT_SYMBOL(generic_write_end); |
2097 | 2097 | ||
2098 | /* | 2098 | /* |
2099 | * block_is_partially_uptodate checks whether buffers within a page are | ||
2100 | * uptodate or not. | ||
2101 | * | ||
2102 | * Returns true if all buffers which correspond to a file portion | ||
2103 | * we want to read are uptodate. | ||
2104 | */ | ||
2105 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, | ||
2106 | unsigned long from) | ||
2107 | { | ||
2108 | struct inode *inode = page->mapping->host; | ||
2109 | unsigned block_start, block_end, blocksize; | ||
2110 | unsigned to; | ||
2111 | struct buffer_head *bh, *head; | ||
2112 | int ret = 1; | ||
2113 | |||
2114 | if (!page_has_buffers(page)) | ||
2115 | return 0; | ||
2116 | |||
2117 | blocksize = 1 << inode->i_blkbits; | ||
2118 | to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); | ||
2119 | to = from + to; | ||
2120 | if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) | ||
2121 | return 0; | ||
2122 | |||
2123 | head = page_buffers(page); | ||
2124 | bh = head; | ||
2125 | block_start = 0; | ||
2126 | do { | ||
2127 | block_end = block_start + blocksize; | ||
2128 | if (block_end > from && block_start < to) { | ||
2129 | if (!buffer_uptodate(bh)) { | ||
2130 | ret = 0; | ||
2131 | break; | ||
2132 | } | ||
2133 | if (block_end >= to) | ||
2134 | break; | ||
2135 | } | ||
2136 | block_start = block_end; | ||
2137 | bh = bh->b_this_page; | ||
2138 | } while (bh != head); | ||
2139 | |||
2140 | return ret; | ||
2141 | } | ||
2142 | EXPORT_SYMBOL(block_is_partially_uptodate); | ||
2143 | |||
2144 | /* | ||
2099 | * Generic "read page" function for block devices that have the normal | 2145 | * Generic "read page" function for block devices that have the normal |
2100 | * get_block functionality. This is most of the block device filesystems. | 2146 | * get_block functionality. This is most of the block device filesystems. |
2101 | * Reads the page asynchronously --- the unlock_buffer() and | 2147 | * Reads the page asynchronously --- the unlock_buffer() and |
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 384fc0d1dd74..991d6dfeb51f 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c | |||
@@ -791,6 +791,7 @@ const struct address_space_operations ext2_aops = { | |||
791 | .direct_IO = ext2_direct_IO, | 791 | .direct_IO = ext2_direct_IO, |
792 | .writepages = ext2_writepages, | 792 | .writepages = ext2_writepages, |
793 | .migratepage = buffer_migrate_page, | 793 | .migratepage = buffer_migrate_page, |
794 | .is_partially_uptodate = block_is_partially_uptodate, | ||
794 | }; | 795 | }; |
795 | 796 | ||
796 | const struct address_space_operations ext2_aops_xip = { | 797 | const struct address_space_operations ext2_aops_xip = { |
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 3bf07d70b914..507d8689b111 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c | |||
@@ -1767,44 +1767,47 @@ static int ext3_journalled_set_page_dirty(struct page *page) | |||
1767 | } | 1767 | } |
1768 | 1768 | ||
1769 | static const struct address_space_operations ext3_ordered_aops = { | 1769 | static const struct address_space_operations ext3_ordered_aops = { |
1770 | .readpage = ext3_readpage, | 1770 | .readpage = ext3_readpage, |
1771 | .readpages = ext3_readpages, | 1771 | .readpages = ext3_readpages, |
1772 | .writepage = ext3_ordered_writepage, | 1772 | .writepage = ext3_ordered_writepage, |
1773 | .sync_page = block_sync_page, | 1773 | .sync_page = block_sync_page, |
1774 | .write_begin = ext3_write_begin, | 1774 | .write_begin = ext3_write_begin, |
1775 | .write_end = ext3_ordered_write_end, | 1775 | .write_end = ext3_ordered_write_end, |
1776 | .bmap = ext3_bmap, | 1776 | .bmap = ext3_bmap, |
1777 | .invalidatepage = ext3_invalidatepage, | 1777 | .invalidatepage = ext3_invalidatepage, |
1778 | .releasepage = ext3_releasepage, | 1778 | .releasepage = ext3_releasepage, |
1779 | .direct_IO = ext3_direct_IO, | 1779 | .direct_IO = ext3_direct_IO, |
1780 | .migratepage = buffer_migrate_page, | 1780 | .migratepage = buffer_migrate_page, |
1781 | .is_partially_uptodate = block_is_partially_uptodate, | ||
1781 | }; | 1782 | }; |
1782 | 1783 | ||
1783 | static const struct address_space_operations ext3_writeback_aops = { | 1784 | static const struct address_space_operations ext3_writeback_aops = { |
1784 | .readpage = ext3_readpage, | 1785 | .readpage = ext3_readpage, |
1785 | .readpages = ext3_readpages, | 1786 | .readpages = ext3_readpages, |
1786 | .writepage = ext3_writeback_writepage, | 1787 | .writepage = ext3_writeback_writepage, |
1787 | .sync_page = block_sync_page, | 1788 | .sync_page = block_sync_page, |
1788 | .write_begin = ext3_write_begin, | 1789 | .write_begin = ext3_write_begin, |
1789 | .write_end = ext3_writeback_write_end, | 1790 | .write_end = ext3_writeback_write_end, |
1790 | .bmap = ext3_bmap, | 1791 | .bmap = ext3_bmap, |
1791 | .invalidatepage = ext3_invalidatepage, | 1792 | .invalidatepage = ext3_invalidatepage, |
1792 | .releasepage = ext3_releasepage, | 1793 | .releasepage = ext3_releasepage, |
1793 | .direct_IO = ext3_direct_IO, | 1794 | .direct_IO = ext3_direct_IO, |
1794 | .migratepage = buffer_migrate_page, | 1795 | .migratepage = buffer_migrate_page, |
1796 | .is_partially_uptodate = block_is_partially_uptodate, | ||
1795 | }; | 1797 | }; |
1796 | 1798 | ||
1797 | static const struct address_space_operations ext3_journalled_aops = { | 1799 | static const struct address_space_operations ext3_journalled_aops = { |
1798 | .readpage = ext3_readpage, | 1800 | .readpage = ext3_readpage, |
1799 | .readpages = ext3_readpages, | 1801 | .readpages = ext3_readpages, |
1800 | .writepage = ext3_journalled_writepage, | 1802 | .writepage = ext3_journalled_writepage, |
1801 | .sync_page = block_sync_page, | 1803 | .sync_page = block_sync_page, |
1802 | .write_begin = ext3_write_begin, | 1804 | .write_begin = ext3_write_begin, |
1803 | .write_end = ext3_journalled_write_end, | 1805 | .write_end = ext3_journalled_write_end, |
1804 | .set_page_dirty = ext3_journalled_set_page_dirty, | 1806 | .set_page_dirty = ext3_journalled_set_page_dirty, |
1805 | .bmap = ext3_bmap, | 1807 | .bmap = ext3_bmap, |
1806 | .invalidatepage = ext3_invalidatepage, | 1808 | .invalidatepage = ext3_invalidatepage, |
1807 | .releasepage = ext3_releasepage, | 1809 | .releasepage = ext3_releasepage, |
1810 | .is_partially_uptodate = block_is_partially_uptodate, | ||
1808 | }; | 1811 | }; |
1809 | 1812 | ||
1810 | void ext3_set_aops(struct inode *inode) | 1813 | void ext3_set_aops(struct inode *inode) |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8ca2763df091..9843b046c235 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -2806,59 +2806,63 @@ static int ext4_journalled_set_page_dirty(struct page *page) | |||
2806 | } | 2806 | } |
2807 | 2807 | ||
2808 | static const struct address_space_operations ext4_ordered_aops = { | 2808 | static const struct address_space_operations ext4_ordered_aops = { |
2809 | .readpage = ext4_readpage, | 2809 | .readpage = ext4_readpage, |
2810 | .readpages = ext4_readpages, | 2810 | .readpages = ext4_readpages, |
2811 | .writepage = ext4_normal_writepage, | 2811 | .writepage = ext4_normal_writepage, |
2812 | .sync_page = block_sync_page, | 2812 | .sync_page = block_sync_page, |
2813 | .write_begin = ext4_write_begin, | 2813 | .write_begin = ext4_write_begin, |
2814 | .write_end = ext4_ordered_write_end, | 2814 | .write_end = ext4_ordered_write_end, |
2815 | .bmap = ext4_bmap, | 2815 | .bmap = ext4_bmap, |
2816 | .invalidatepage = ext4_invalidatepage, | 2816 | .invalidatepage = ext4_invalidatepage, |
2817 | .releasepage = ext4_releasepage, | 2817 | .releasepage = ext4_releasepage, |
2818 | .direct_IO = ext4_direct_IO, | 2818 | .direct_IO = ext4_direct_IO, |
2819 | .migratepage = buffer_migrate_page, | 2819 | .migratepage = buffer_migrate_page, |
2820 | .is_partially_uptodate = block_is_partially_uptodate, | ||
2820 | }; | 2821 | }; |
2821 | 2822 | ||
2822 | static const struct address_space_operations ext4_writeback_aops = { | 2823 | static const struct address_space_operations ext4_writeback_aops = { |
2823 | .readpage = ext4_readpage, | 2824 | .readpage = ext4_readpage, |
2824 | .readpages = ext4_readpages, | 2825 | .readpages = ext4_readpages, |
2825 | .writepage = ext4_normal_writepage, | 2826 | .writepage = ext4_normal_writepage, |
2826 | .sync_page = block_sync_page, | 2827 | .sync_page = block_sync_page, |
2827 | .write_begin = ext4_write_begin, | 2828 | .write_begin = ext4_write_begin, |
2828 | .write_end = ext4_writeback_write_end, | 2829 | .write_end = ext4_writeback_write_end, |
2829 | .bmap = ext4_bmap, | 2830 | .bmap = ext4_bmap, |
2830 | .invalidatepage = ext4_invalidatepage, | 2831 | .invalidatepage = ext4_invalidatepage, |
2831 | .releasepage = ext4_releasepage, | 2832 | .releasepage = ext4_releasepage, |
2832 | .direct_IO = ext4_direct_IO, | 2833 | .direct_IO = ext4_direct_IO, |
2833 | .migratepage = buffer_migrate_page, | 2834 | .migratepage = buffer_migrate_page, |
2835 | .is_partially_uptodate = block_is_partially_uptodate, | ||
2834 | }; | 2836 | }; |
2835 | 2837 | ||
2836 | static const struct address_space_operations ext4_journalled_aops = { | 2838 | static const struct address_space_operations ext4_journalled_aops = { |
2837 | .readpage = ext4_readpage, | 2839 | .readpage = ext4_readpage, |
2838 | .readpages = ext4_readpages, | 2840 | .readpages = ext4_readpages, |
2839 | .writepage = ext4_journalled_writepage, | 2841 | .writepage = ext4_journalled_writepage, |
2840 | .sync_page = block_sync_page, | 2842 | .sync_page = block_sync_page, |
2841 | .write_begin = ext4_write_begin, | 2843 | .write_begin = ext4_write_begin, |
2842 | .write_end = ext4_journalled_write_end, | 2844 | .write_end = ext4_journalled_write_end, |
2843 | .set_page_dirty = ext4_journalled_set_page_dirty, | 2845 | .set_page_dirty = ext4_journalled_set_page_dirty, |
2844 | .bmap = ext4_bmap, | 2846 | .bmap = ext4_bmap, |
2845 | .invalidatepage = ext4_invalidatepage, | 2847 | .invalidatepage = ext4_invalidatepage, |
2846 | .releasepage = ext4_releasepage, | 2848 | .releasepage = ext4_releasepage, |
2849 | .is_partially_uptodate = block_is_partially_uptodate, | ||
2847 | }; | 2850 | }; |
2848 | 2851 | ||
2849 | static const struct address_space_operations ext4_da_aops = { | 2852 | static const struct address_space_operations ext4_da_aops = { |
2850 | .readpage = ext4_readpage, | 2853 | .readpage = ext4_readpage, |
2851 | .readpages = ext4_readpages, | 2854 | .readpages = ext4_readpages, |
2852 | .writepage = ext4_da_writepage, | 2855 | .writepage = ext4_da_writepage, |
2853 | .writepages = ext4_da_writepages, | 2856 | .writepages = ext4_da_writepages, |
2854 | .sync_page = block_sync_page, | 2857 | .sync_page = block_sync_page, |
2855 | .write_begin = ext4_da_write_begin, | 2858 | .write_begin = ext4_da_write_begin, |
2856 | .write_end = ext4_da_write_end, | 2859 | .write_end = ext4_da_write_end, |
2857 | .bmap = ext4_bmap, | 2860 | .bmap = ext4_bmap, |
2858 | .invalidatepage = ext4_da_invalidatepage, | 2861 | .invalidatepage = ext4_da_invalidatepage, |
2859 | .releasepage = ext4_releasepage, | 2862 | .releasepage = ext4_releasepage, |
2860 | .direct_IO = ext4_direct_IO, | 2863 | .direct_IO = ext4_direct_IO, |
2861 | .migratepage = buffer_migrate_page, | 2864 | .migratepage = buffer_migrate_page, |
2865 | .is_partially_uptodate = block_is_partially_uptodate, | ||
2862 | }; | 2866 | }; |
2863 | 2867 | ||
2864 | void ext4_set_aops(struct inode *inode) | 2868 | void ext4_set_aops(struct inode *inode) |