diff options
author | Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp> | 2008-07-28 18:46:36 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-07-28 19:30:21 -0400 |
commit | 8ab22b9abb5c55413802e4adc9aa6223324547c3 (patch) | |
tree | cff3319e1275e8a7c083d492889ec6bd0c7712d3 /include/linux | |
parent | d84a52f62f6a396ed77aa0052da74ca9e760b28a (diff) |
vfs: pagecache usage optimization for pagesize!=blocksize
When we read some part of a file through pagecache, if there is a
pagecache of corresponding index but this page is not uptodate, read IO
is issued and this page will be uptodate.
I think this is good for pagesize == blocksize environment but there is
room for improvement on pagesize != blocksize environment. Because in
this case a page can have multiple buffers and even if a page is not
uptodate, some buffers can be uptodate.
So I suggest that when all buffers which correspond to a part of a file
that we want to read are uptodate, use this pagecache and copy data from
this pagecache to user buffer even if a page is not uptodate. This can
reduce read IO and improve system throughput.
I wrote a benchmark program and got result number with this program.
This benchmark do:
1: mount and open a test file.
2: create a 512MB file.
3: close a file and umount.
4: mount and again open a test file.
5: pwrite randomly 300000 times on a test file. offset is aligned
by IO size(1024bytes).
6: measure time of preading randomly 100000 times on a test file.
The result was:
2.6.26
330 sec
2.6.26-patched
226 sec
Arch:i386
Filesystem:ext3
Blocksize:1024 bytes
Memory: 1GB
On ext3/4, a file is written through buffer/block. So random read/write
mixed workloads or random read after random write workloads are optimized
with this patch under pagesize != blocksize environment. This test result
showed this.
The benchmark program is as follows:
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#define LEN 1024
#define LOOP 1024*512 /* 512MB */
main(void)
{
unsigned long i, offset, filesize;
int fd;
char buf[LEN];
time_t t1, t2;
if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) {
perror("cannot mount\n");
exit(1);
}
memset(buf, 0, LEN);
fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC);
if (fd < 0) {
perror("cannot open file\n");
exit(1);
}
for (i = 0; i < LOOP; i++)
write(fd, buf, LEN);
close(fd);
if (umount("/root/test1/") < 0) {
perror("cannot umount\n");
exit(1);
}
if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) {
perror("cannot mount\n");
exit(1);
}
fd = open("/root/test1/testfile", O_RDWR);
if (fd < 0) {
perror("cannot open file\n");
exit(1);
}
filesize = LEN * LOOP;
for (i = 0; i < 300000; i++){
offset = (random() % filesize) & (~(LEN - 1));
pwrite(fd, buf, LEN, offset);
}
printf("start test\n");
time(&t1);
for (i = 0; i < 100000; i++){
offset = (random() % filesize) & (~(LEN - 1));
pread(fd, buf, LEN, offset);
}
time(&t2);
printf("%ld sec\n", t2-t1);
close(fd);
if (umount("/root/test1/") < 0) {
perror("cannot umount\n");
exit(1);
}
}
Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jan Kara <jack@ucw.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'include/linux')
-rw-r--r-- | include/linux/buffer_head.h | 2 | ||||
-rw-r--r-- | include/linux/fs.h | 44 |
2 files changed, 25 insertions, 21 deletions
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 82aa36c53ea7..50cfe8ceb478 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h | |||
@@ -205,6 +205,8 @@ void block_invalidatepage(struct page *page, unsigned long offset); | |||
205 | int block_write_full_page(struct page *page, get_block_t *get_block, | 205 | int block_write_full_page(struct page *page, get_block_t *get_block, |
206 | struct writeback_control *wbc); | 206 | struct writeback_control *wbc); |
207 | int block_read_full_page(struct page*, get_block_t*); | 207 | int block_read_full_page(struct page*, get_block_t*); |
208 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, | ||
209 | unsigned long from); | ||
208 | int block_write_begin(struct file *, struct address_space *, | 210 | int block_write_begin(struct file *, struct address_space *, |
209 | loff_t, unsigned, unsigned, | 211 | loff_t, unsigned, unsigned, |
210 | struct page **, void **, get_block_t*); | 212 | struct page **, void **, get_block_t*); |
diff --git a/include/linux/fs.h b/include/linux/fs.h index 8252b045e624..580b513668fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -443,6 +443,27 @@ static inline size_t iov_iter_count(struct iov_iter *i) | |||
443 | return i->count; | 443 | return i->count; |
444 | } | 444 | } |
445 | 445 | ||
446 | /* | ||
447 | * "descriptor" for what we're up to with a read. | ||
448 | * This allows us to use the same read code yet | ||
449 | * have multiple different users of the data that | ||
450 | * we read from a file. | ||
451 | * | ||
452 | * The simplest case just copies the data to user | ||
453 | * mode. | ||
454 | */ | ||
455 | typedef struct { | ||
456 | size_t written; | ||
457 | size_t count; | ||
458 | union { | ||
459 | char __user *buf; | ||
460 | void *data; | ||
461 | } arg; | ||
462 | int error; | ||
463 | } read_descriptor_t; | ||
464 | |||
465 | typedef int (*read_actor_t)(read_descriptor_t *, struct page *, | ||
466 | unsigned long, unsigned long); | ||
446 | 467 | ||
447 | struct address_space_operations { | 468 | struct address_space_operations { |
448 | int (*writepage)(struct page *page, struct writeback_control *wbc); | 469 | int (*writepage)(struct page *page, struct writeback_control *wbc); |
@@ -484,6 +505,8 @@ struct address_space_operations { | |||
484 | int (*migratepage) (struct address_space *, | 505 | int (*migratepage) (struct address_space *, |
485 | struct page *, struct page *); | 506 | struct page *, struct page *); |
486 | int (*launder_page) (struct page *); | 507 | int (*launder_page) (struct page *); |
508 | int (*is_partially_uptodate) (struct page *, read_descriptor_t *, | ||
509 | unsigned long); | ||
487 | }; | 510 | }; |
488 | 511 | ||
489 | /* | 512 | /* |
@@ -1198,27 +1221,6 @@ struct block_device_operations { | |||
1198 | struct module *owner; | 1221 | struct module *owner; |
1199 | }; | 1222 | }; |
1200 | 1223 | ||
1201 | /* | ||
1202 | * "descriptor" for what we're up to with a read. | ||
1203 | * This allows us to use the same read code yet | ||
1204 | * have multiple different users of the data that | ||
1205 | * we read from a file. | ||
1206 | * | ||
1207 | * The simplest case just copies the data to user | ||
1208 | * mode. | ||
1209 | */ | ||
1210 | typedef struct { | ||
1211 | size_t written; | ||
1212 | size_t count; | ||
1213 | union { | ||
1214 | char __user * buf; | ||
1215 | void *data; | ||
1216 | } arg; | ||
1217 | int error; | ||
1218 | } read_descriptor_t; | ||
1219 | |||
1220 | typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long); | ||
1221 | |||
1222 | /* These macros are for out of kernel modules to test that | 1224 | /* These macros are for out of kernel modules to test that |
1223 | * the kernel supports the unlocked_ioctl and compat_ioctl | 1225 | * the kernel supports the unlocked_ioctl and compat_ioctl |
1224 | * fields in struct file_operations. */ | 1226 | * fields in struct file_operations. */ |