aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>2008-07-28 18:46:36 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-07-28 19:30:21 -0400
commit8ab22b9abb5c55413802e4adc9aa6223324547c3 (patch)
treecff3319e1275e8a7c083d492889ec6bd0c7712d3
parentd84a52f62f6a396ed77aa0052da74ca9e760b28a (diff)
vfs: pagecache usage optimization for pagesize!=blocksize
When we read some part of a file through pagecache, if there is a pagecache of corresponding index but this page is not uptodate, read IO is issued and this page will be uptodate. I think this is good for pagesize == blocksize environment but there is room for improvement on pagesize != blocksize environment. Because in this case a page can have multiple buffers and even if a page is not uptodate, some buffers can be uptodate. So I suggest that when all buffers which correspond to a part of a file that we want to read are uptodate, use this pagecache and copy data from this pagecache to user buffer even if a page is not uptodate. This can reduce read IO and improve system throughput. I wrote a benchmark program and got result number with this program. This benchmark do: 1: mount and open a test file. 2: create a 512MB file. 3: close a file and umount. 4: mount and again open a test file. 5: pwrite randomly 300000 times on a test file. offset is aligned by IO size(1024bytes). 6: measure time of preading randomly 100000 times on a test file. The result was: 2.6.26 330 sec 2.6.26-patched 226 sec Arch:i386 Filesystem:ext3 Blocksize:1024 bytes Memory: 1GB On ext3/4, a file is written through buffer/block. So random read/write mixed workloads or random read after random write workloads are optimized with this patch under pagesize != blocksize environment. This test result showed this. The benchmark program is as follows: #include <stdio.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> #include <time.h> #include <stdlib.h> #include <string.h> #include <sys/mount.h> #define LEN 1024 #define LOOP 1024*512 /* 512MB */ main(void) { unsigned long i, offset, filesize; int fd; char buf[LEN]; time_t t1, t2; if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) { perror("cannot mount\n"); exit(1); } memset(buf, 0, LEN); fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC); if (fd < 0) { perror("cannot open file\n"); exit(1); } for (i = 0; i < LOOP; i++) write(fd, buf, LEN); close(fd); if (umount("/root/test1/") < 0) { perror("cannot umount\n"); exit(1); } if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) { perror("cannot mount\n"); exit(1); } fd = open("/root/test1/testfile", O_RDWR); if (fd < 0) { perror("cannot open file\n"); exit(1); } filesize = LEN * LOOP; for (i = 0; i < 300000; i++){ offset = (random() % filesize) & (~(LEN - 1)); pwrite(fd, buf, LEN, offset); } printf("start test\n"); time(&t1); for (i = 0; i < 100000; i++){ offset = (random() % filesize) & (~(LEN - 1)); pread(fd, buf, LEN, offset); } time(&t2); printf("%ld sec\n", t2-t1); close(fd); if (umount("/root/test1/") < 0) { perror("cannot umount\n"); exit(1); } } Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Christoph Hellwig <hch@infradead.org> Cc: Jan Kara <jack@ucw.cz> Cc: <linux-ext4@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--fs/buffer.c46
-rw-r--r--fs/ext2/inode.c1
-rw-r--r--fs/ext3/inode.c67
-rw-r--r--fs/ext4/inode.c92
-rw-r--r--include/linux/buffer_head.h2
-rw-r--r--include/linux/fs.h44
-rw-r--r--mm/filemap.c14
7 files changed, 167 insertions, 99 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index f95805019639..ca12a6bb82b1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2096,6 +2096,52 @@ int generic_write_end(struct file *file, struct address_space *mapping,
2096EXPORT_SYMBOL(generic_write_end); 2096EXPORT_SYMBOL(generic_write_end);
2097 2097
2098/* 2098/*
2099 * block_is_partially_uptodate checks whether buffers within a page are
2100 * uptodate or not.
2101 *
2102 * Returns true if all buffers which correspond to a file portion
2103 * we want to read are uptodate.
2104 */
2105int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2106 unsigned long from)
2107{
2108 struct inode *inode = page->mapping->host;
2109 unsigned block_start, block_end, blocksize;
2110 unsigned to;
2111 struct buffer_head *bh, *head;
2112 int ret = 1;
2113
2114 if (!page_has_buffers(page))
2115 return 0;
2116
2117 blocksize = 1 << inode->i_blkbits;
2118 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2119 to = from + to;
2120 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2121 return 0;
2122
2123 head = page_buffers(page);
2124 bh = head;
2125 block_start = 0;
2126 do {
2127 block_end = block_start + blocksize;
2128 if (block_end > from && block_start < to) {
2129 if (!buffer_uptodate(bh)) {
2130 ret = 0;
2131 break;
2132 }
2133 if (block_end >= to)
2134 break;
2135 }
2136 block_start = block_end;
2137 bh = bh->b_this_page;
2138 } while (bh != head);
2139
2140 return ret;
2141}
2142EXPORT_SYMBOL(block_is_partially_uptodate);
2143
2144/*
2099 * Generic "read page" function for block devices that have the normal 2145 * Generic "read page" function for block devices that have the normal
2100 * get_block functionality. This is most of the block device filesystems. 2146 * get_block functionality. This is most of the block device filesystems.
2101 * Reads the page asynchronously --- the unlock_buffer() and 2147 * Reads the page asynchronously --- the unlock_buffer() and
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 384fc0d1dd74..991d6dfeb51f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -791,6 +791,7 @@ const struct address_space_operations ext2_aops = {
791 .direct_IO = ext2_direct_IO, 791 .direct_IO = ext2_direct_IO,
792 .writepages = ext2_writepages, 792 .writepages = ext2_writepages,
793 .migratepage = buffer_migrate_page, 793 .migratepage = buffer_migrate_page,
794 .is_partially_uptodate = block_is_partially_uptodate,
794}; 795};
795 796
796const struct address_space_operations ext2_aops_xip = { 797const struct address_space_operations ext2_aops_xip = {
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 3bf07d70b914..507d8689b111 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1767,44 +1767,47 @@ static int ext3_journalled_set_page_dirty(struct page *page)
1767} 1767}
1768 1768
1769static const struct address_space_operations ext3_ordered_aops = { 1769static const struct address_space_operations ext3_ordered_aops = {
1770 .readpage = ext3_readpage, 1770 .readpage = ext3_readpage,
1771 .readpages = ext3_readpages, 1771 .readpages = ext3_readpages,
1772 .writepage = ext3_ordered_writepage, 1772 .writepage = ext3_ordered_writepage,
1773 .sync_page = block_sync_page, 1773 .sync_page = block_sync_page,
1774 .write_begin = ext3_write_begin, 1774 .write_begin = ext3_write_begin,
1775 .write_end = ext3_ordered_write_end, 1775 .write_end = ext3_ordered_write_end,
1776 .bmap = ext3_bmap, 1776 .bmap = ext3_bmap,
1777 .invalidatepage = ext3_invalidatepage, 1777 .invalidatepage = ext3_invalidatepage,
1778 .releasepage = ext3_releasepage, 1778 .releasepage = ext3_releasepage,
1779 .direct_IO = ext3_direct_IO, 1779 .direct_IO = ext3_direct_IO,
1780 .migratepage = buffer_migrate_page, 1780 .migratepage = buffer_migrate_page,
1781 .is_partially_uptodate = block_is_partially_uptodate,
1781}; 1782};
1782 1783
1783static const struct address_space_operations ext3_writeback_aops = { 1784static const struct address_space_operations ext3_writeback_aops = {
1784 .readpage = ext3_readpage, 1785 .readpage = ext3_readpage,
1785 .readpages = ext3_readpages, 1786 .readpages = ext3_readpages,
1786 .writepage = ext3_writeback_writepage, 1787 .writepage = ext3_writeback_writepage,
1787 .sync_page = block_sync_page, 1788 .sync_page = block_sync_page,
1788 .write_begin = ext3_write_begin, 1789 .write_begin = ext3_write_begin,
1789 .write_end = ext3_writeback_write_end, 1790 .write_end = ext3_writeback_write_end,
1790 .bmap = ext3_bmap, 1791 .bmap = ext3_bmap,
1791 .invalidatepage = ext3_invalidatepage, 1792 .invalidatepage = ext3_invalidatepage,
1792 .releasepage = ext3_releasepage, 1793 .releasepage = ext3_releasepage,
1793 .direct_IO = ext3_direct_IO, 1794 .direct_IO = ext3_direct_IO,
1794 .migratepage = buffer_migrate_page, 1795 .migratepage = buffer_migrate_page,
1796 .is_partially_uptodate = block_is_partially_uptodate,
1795}; 1797};
1796 1798
1797static const struct address_space_operations ext3_journalled_aops = { 1799static const struct address_space_operations ext3_journalled_aops = {
1798 .readpage = ext3_readpage, 1800 .readpage = ext3_readpage,
1799 .readpages = ext3_readpages, 1801 .readpages = ext3_readpages,
1800 .writepage = ext3_journalled_writepage, 1802 .writepage = ext3_journalled_writepage,
1801 .sync_page = block_sync_page, 1803 .sync_page = block_sync_page,
1802 .write_begin = ext3_write_begin, 1804 .write_begin = ext3_write_begin,
1803 .write_end = ext3_journalled_write_end, 1805 .write_end = ext3_journalled_write_end,
1804 .set_page_dirty = ext3_journalled_set_page_dirty, 1806 .set_page_dirty = ext3_journalled_set_page_dirty,
1805 .bmap = ext3_bmap, 1807 .bmap = ext3_bmap,
1806 .invalidatepage = ext3_invalidatepage, 1808 .invalidatepage = ext3_invalidatepage,
1807 .releasepage = ext3_releasepage, 1809 .releasepage = ext3_releasepage,
1810 .is_partially_uptodate = block_is_partially_uptodate,
1808}; 1811};
1809 1812
1810void ext3_set_aops(struct inode *inode) 1813void ext3_set_aops(struct inode *inode)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8ca2763df091..9843b046c235 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2806,59 +2806,63 @@ static int ext4_journalled_set_page_dirty(struct page *page)
2806} 2806}
2807 2807
2808static const struct address_space_operations ext4_ordered_aops = { 2808static const struct address_space_operations ext4_ordered_aops = {
2809 .readpage = ext4_readpage, 2809 .readpage = ext4_readpage,
2810 .readpages = ext4_readpages, 2810 .readpages = ext4_readpages,
2811 .writepage = ext4_normal_writepage, 2811 .writepage = ext4_normal_writepage,
2812 .sync_page = block_sync_page, 2812 .sync_page = block_sync_page,
2813 .write_begin = ext4_write_begin, 2813 .write_begin = ext4_write_begin,
2814 .write_end = ext4_ordered_write_end, 2814 .write_end = ext4_ordered_write_end,
2815 .bmap = ext4_bmap, 2815 .bmap = ext4_bmap,
2816 .invalidatepage = ext4_invalidatepage, 2816 .invalidatepage = ext4_invalidatepage,
2817 .releasepage = ext4_releasepage, 2817 .releasepage = ext4_releasepage,
2818 .direct_IO = ext4_direct_IO, 2818 .direct_IO = ext4_direct_IO,
2819 .migratepage = buffer_migrate_page, 2819 .migratepage = buffer_migrate_page,
2820 .is_partially_uptodate = block_is_partially_uptodate,
2820}; 2821};
2821 2822
2822static const struct address_space_operations ext4_writeback_aops = { 2823static const struct address_space_operations ext4_writeback_aops = {
2823 .readpage = ext4_readpage, 2824 .readpage = ext4_readpage,
2824 .readpages = ext4_readpages, 2825 .readpages = ext4_readpages,
2825 .writepage = ext4_normal_writepage, 2826 .writepage = ext4_normal_writepage,
2826 .sync_page = block_sync_page, 2827 .sync_page = block_sync_page,
2827 .write_begin = ext4_write_begin, 2828 .write_begin = ext4_write_begin,
2828 .write_end = ext4_writeback_write_end, 2829 .write_end = ext4_writeback_write_end,
2829 .bmap = ext4_bmap, 2830 .bmap = ext4_bmap,
2830 .invalidatepage = ext4_invalidatepage, 2831 .invalidatepage = ext4_invalidatepage,
2831 .releasepage = ext4_releasepage, 2832 .releasepage = ext4_releasepage,
2832 .direct_IO = ext4_direct_IO, 2833 .direct_IO = ext4_direct_IO,
2833 .migratepage = buffer_migrate_page, 2834 .migratepage = buffer_migrate_page,
2835 .is_partially_uptodate = block_is_partially_uptodate,
2834}; 2836};
2835 2837
2836static const struct address_space_operations ext4_journalled_aops = { 2838static const struct address_space_operations ext4_journalled_aops = {
2837 .readpage = ext4_readpage, 2839 .readpage = ext4_readpage,
2838 .readpages = ext4_readpages, 2840 .readpages = ext4_readpages,
2839 .writepage = ext4_journalled_writepage, 2841 .writepage = ext4_journalled_writepage,
2840 .sync_page = block_sync_page, 2842 .sync_page = block_sync_page,
2841 .write_begin = ext4_write_begin, 2843 .write_begin = ext4_write_begin,
2842 .write_end = ext4_journalled_write_end, 2844 .write_end = ext4_journalled_write_end,
2843 .set_page_dirty = ext4_journalled_set_page_dirty, 2845 .set_page_dirty = ext4_journalled_set_page_dirty,
2844 .bmap = ext4_bmap, 2846 .bmap = ext4_bmap,
2845 .invalidatepage = ext4_invalidatepage, 2847 .invalidatepage = ext4_invalidatepage,
2846 .releasepage = ext4_releasepage, 2848 .releasepage = ext4_releasepage,
2849 .is_partially_uptodate = block_is_partially_uptodate,
2847}; 2850};
2848 2851
2849static const struct address_space_operations ext4_da_aops = { 2852static const struct address_space_operations ext4_da_aops = {
2850 .readpage = ext4_readpage, 2853 .readpage = ext4_readpage,
2851 .readpages = ext4_readpages, 2854 .readpages = ext4_readpages,
2852 .writepage = ext4_da_writepage, 2855 .writepage = ext4_da_writepage,
2853 .writepages = ext4_da_writepages, 2856 .writepages = ext4_da_writepages,
2854 .sync_page = block_sync_page, 2857 .sync_page = block_sync_page,
2855 .write_begin = ext4_da_write_begin, 2858 .write_begin = ext4_da_write_begin,
2856 .write_end = ext4_da_write_end, 2859 .write_end = ext4_da_write_end,
2857 .bmap = ext4_bmap, 2860 .bmap = ext4_bmap,
2858 .invalidatepage = ext4_da_invalidatepage, 2861 .invalidatepage = ext4_da_invalidatepage,
2859 .releasepage = ext4_releasepage, 2862 .releasepage = ext4_releasepage,
2860 .direct_IO = ext4_direct_IO, 2863 .direct_IO = ext4_direct_IO,
2861 .migratepage = buffer_migrate_page, 2864 .migratepage = buffer_migrate_page,
2865 .is_partially_uptodate = block_is_partially_uptodate,
2862}; 2866};
2863 2867
2864void ext4_set_aops(struct inode *inode) 2868void ext4_set_aops(struct inode *inode)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 82aa36c53ea7..50cfe8ceb478 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -205,6 +205,8 @@ void block_invalidatepage(struct page *page, unsigned long offset);
205int block_write_full_page(struct page *page, get_block_t *get_block, 205int block_write_full_page(struct page *page, get_block_t *get_block,
206 struct writeback_control *wbc); 206 struct writeback_control *wbc);
207int block_read_full_page(struct page*, get_block_t*); 207int block_read_full_page(struct page*, get_block_t*);
208int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
209 unsigned long from);
208int block_write_begin(struct file *, struct address_space *, 210int block_write_begin(struct file *, struct address_space *,
209 loff_t, unsigned, unsigned, 211 loff_t, unsigned, unsigned,
210 struct page **, void **, get_block_t*); 212 struct page **, void **, get_block_t*);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8252b045e624..580b513668fe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -443,6 +443,27 @@ static inline size_t iov_iter_count(struct iov_iter *i)
443 return i->count; 443 return i->count;
444} 444}
445 445
446/*
447 * "descriptor" for what we're up to with a read.
448 * This allows us to use the same read code yet
449 * have multiple different users of the data that
450 * we read from a file.
451 *
452 * The simplest case just copies the data to user
453 * mode.
454 */
455typedef struct {
456 size_t written;
457 size_t count;
458 union {
459 char __user *buf;
460 void *data;
461 } arg;
462 int error;
463} read_descriptor_t;
464
465typedef int (*read_actor_t)(read_descriptor_t *, struct page *,
466 unsigned long, unsigned long);
446 467
447struct address_space_operations { 468struct address_space_operations {
448 int (*writepage)(struct page *page, struct writeback_control *wbc); 469 int (*writepage)(struct page *page, struct writeback_control *wbc);
@@ -484,6 +505,8 @@ struct address_space_operations {
484 int (*migratepage) (struct address_space *, 505 int (*migratepage) (struct address_space *,
485 struct page *, struct page *); 506 struct page *, struct page *);
486 int (*launder_page) (struct page *); 507 int (*launder_page) (struct page *);
508 int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
509 unsigned long);
487}; 510};
488 511
489/* 512/*
@@ -1198,27 +1221,6 @@ struct block_device_operations {
1198 struct module *owner; 1221 struct module *owner;
1199}; 1222};
1200 1223
1201/*
1202 * "descriptor" for what we're up to with a read.
1203 * This allows us to use the same read code yet
1204 * have multiple different users of the data that
1205 * we read from a file.
1206 *
1207 * The simplest case just copies the data to user
1208 * mode.
1209 */
1210typedef struct {
1211 size_t written;
1212 size_t count;
1213 union {
1214 char __user * buf;
1215 void *data;
1216 } arg;
1217 int error;
1218} read_descriptor_t;
1219
1220typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long);
1221
1222/* These macros are for out of kernel modules to test that 1224/* These macros are for out of kernel modules to test that
1223 * the kernel supports the unlocked_ioctl and compat_ioctl 1225 * the kernel supports the unlocked_ioctl and compat_ioctl
1224 * fields in struct file_operations. */ 1226 * fields in struct file_operations. */
diff --git a/mm/filemap.c b/mm/filemap.c
index 5de7633e1dbe..42bbc6909ba4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1023,8 +1023,17 @@ find_page:
1023 ra, filp, page, 1023 ra, filp, page,
1024 index, last_index - index); 1024 index, last_index - index);
1025 } 1025 }
1026 if (!PageUptodate(page)) 1026 if (!PageUptodate(page)) {
1027 goto page_not_up_to_date; 1027 if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1028 !mapping->a_ops->is_partially_uptodate)
1029 goto page_not_up_to_date;
1030 if (TestSetPageLocked(page))
1031 goto page_not_up_to_date;
1032 if (!mapping->a_ops->is_partially_uptodate(page,
1033 desc, offset))
1034 goto page_not_up_to_date_locked;
1035 unlock_page(page);
1036 }
1028page_ok: 1037page_ok:
1029 /* 1038 /*
1030 * i_size must be checked after we know the page is Uptodate. 1039 * i_size must be checked after we know the page is Uptodate.
@@ -1094,6 +1103,7 @@ page_not_up_to_date:
1094 if (lock_page_killable(page)) 1103 if (lock_page_killable(page))
1095 goto readpage_eio; 1104 goto readpage_eio;
1096 1105
1106page_not_up_to_date_locked:
1097 /* Did it get truncated before we got the lock? */ 1107 /* Did it get truncated before we got the lock? */
1098 if (!page->mapping) { 1108 if (!page->mapping) {
1099 unlock_page(page); 1109 unlock_page(page);