vfs: pagecache usage optimization for pagesize!=blocksize

When we read some part of a file through pagecache, if there is a pagecache of corresponding index but this page is not uptodate, read IO is issued and this page will be uptodate. I think this is good for pagesize == blocksize environment but there is room for improvement on pagesize != blocksize environment. Because in this case a page can have multiple buffers and even if a page is not uptodate, some buffers can be uptodate. So I suggest that when all buffers which correspond to a part of a file that we want to read are uptodate, use this pagecache and copy data from this pagecache to user buffer even if a page is not uptodate. This can reduce read IO and improve system throughput. I wrote a benchmark program and got result number with this program. This benchmark do: 1: mount and open a test file. 2: create a 512MB file. 3: close a file and umount. 4: mount and again open a test file. 5: pwrite randomly 300000 times on a test file. offset is aligned by IO size(1024bytes). 6: measure time of preading randomly 100000 times on a test file. The result was: 2.6.26 330 sec 2.6.26-patched 226 sec Arch:i386 Filesystem:ext3 Blocksize:1024 bytes Memory: 1GB On ext3/4, a file is written through buffer/block. So random read/write mixed workloads or random read after random write workloads are optimized with this patch under pagesize != blocksize environment. This test result showed this. The benchmark program is as follows: #include <stdio.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> #include <time.h> #include <stdlib.h> #include <string.h> #include <sys/mount.h> #define LEN 1024 #define LOOP 1024*512 /* 512MB */ main(void) { unsigned long i, offset, filesize; int fd; char buf[LEN]; time_t t1, t2; if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) { perror("cannot mount\n"); exit(1); } memset(buf, 0, LEN); fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC); if (fd < 0) { perror("cannot open file\n"); exit(1); } for (i = 0; i < LOOP; i++) write(fd, buf, LEN); close(fd); if (umount("/root/test1/") < 0) { perror("cannot umount\n"); exit(1); } if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) { perror("cannot mount\n"); exit(1); } fd = open("/root/test1/testfile", O_RDWR); if (fd < 0) { perror("cannot open file\n"); exit(1); } filesize = LEN * LOOP; for (i = 0; i < 300000; i++){ offset = (random() % filesize) & (~(LEN - 1)); pwrite(fd, buf, LEN, offset); } printf("start test\n"); time(&t1); for (i = 0; i < 100000; i++){ offset = (random() % filesize) & (~(LEN - 1)); pread(fd, buf, LEN, offset); } time(&t2); printf("%ld sec\n", t2-t1); close(fd); if (umount("/root/test1/") < 0) { perror("cannot umount\n"); exit(1); } } Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Christoph Hellwig <hch@infradead.org> Cc: Jan Kara <jack@ucw.cz> Cc: <linux-ext4@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp> 2008-07-28 18:46:36 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2008-07-28 19:30:21 -0400
commit: 8ab22b9abb5c55413802e4adc9aa6223324547c3 (patch)
tree: cff3319e1275e8a7c083d492889ec6bd0c7712d3 /include/linux
parent: d84a52f62f6a396ed77aa0052da74ca9e760b28a (diff)
2 files changed, 25 insertions, 21 deletions
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 82aa36c53ea7..50cfe8ceb478 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -205,6 +205,8 @@ void block_invalidatepage(struct page *page, unsigned long offset);
 int block_write_full_page(struct page *page, get_block_t *get_block,
                                struct writeback_control *wbc);
 int block_read_full_page(struct page*, get_block_t*);
+int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
+                                unsigned long from);
 int block_write_begin(struct file *, struct address_space *,
                                loff_t, unsigned, unsigned,
                                struct page **, void **, get_block_t*);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8252b045e624..580b513668fe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -443,6 +443,27 @@ static inline size_t iov_iter_count(struct iov_iter *i)
        return i->count;
 }
+/*
+ * "descriptor" for what we're up to with a read.
+ * This allows us to use the same read code yet
+ * have multiple different users of the data that
+ * we read from a file.
+ *
+ * The simplest case just copies the data to user
+ * mode.
+ */
+typedef struct {
+        size_t written;
+        size_t count;
+        union {
+                char __user *buf;
+                void *data;
+        } arg;
+        int error;
+} read_descriptor_t;
+typedef int (*read_actor_t)(read_descriptor_t *, struct page *,
+                unsigned long, unsigned long);
 struct address_space_operations {
        int (*writepage)(struct page *page, struct writeback_control *wbc);
@@ -484,6 +505,8 @@ struct address_space_operations {
        int (*migratepage) (struct address_space *,
                        struct page *, struct page *);
        int (*launder_page) (struct page *);
+        int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
+                                        unsigned long);
 };
 /*
@@ -1198,27 +1221,6 @@ struct block_device_operations {
        struct module *owner;
 };
-/*
- * "descriptor" for what we're up to with a read.
- * This allows us to use the same read code yet
- * have multiple different users of the data that
- * we read from a file.
- *
- * The simplest case just copies the data to user
- * mode.
- */
-typedef struct {
-        size_t written;
-        size_t count;
-        union {
-                char __user * buf;
-                void *data;
-        } arg;
-        int error;
-} read_descriptor_t;
-typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long);
 /* These macros are for out of kernel modules to test that
 * the kernel supports the unlocked_ioctl and compat_ioctl
 * fields in struct file_operations. */
author	Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>	2008-07-28 18:46:36 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-07-28 19:30:21 -0400
commit	8ab22b9abb5c55413802e4adc9aa6223324547c3 (patch)
tree	cff3319e1275e8a7c083d492889ec6bd0c7712d3 /include/linux
parent	d84a52f62f6a396ed77aa0052da74ca9e760b28a (diff)