diff options
author | Theodore Ts'o <tytso@mit.edu> | 2008-10-09 23:53:47 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2008-10-09 23:53:47 -0400 |
commit | 240799cdf22bd789ea6852653c3b879d35ad0a6c (patch) | |
tree | e696b60cc103f23838b5c14d8d397f692abffbc3 | |
parent | 37515facd001942221d68171c81c1f46d54ffdd0 (diff) |
ext4: Use readahead when reading an inode from the inode table
With modern hard drives, reading 64k takes roughly the same time as
reading a 4k block. So request readahead for adjacent inode table
blocks to reduce the time it takes when iterating over directories
(especially when doing this in htree sort order) in a cold cache case.
With this patch, the time it takes to run "git status" on a kernel
tree after flushing the caches via "echo 3 > /proc/sys/vm/drop_caches"
is reduced by 21%.
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r-- | Documentation/filesystems/ext4.txt | 6 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 3 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 2 | ||||
-rw-r--r-- | fs/ext4/ext4_sb.h | 1 | ||||
-rw-r--r-- | fs/ext4/inode.c | 134 | ||||
-rw-r--r-- | fs/ext4/super.c | 27 |
6 files changed, 101 insertions, 72 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 0d5394920a31..289057958f90 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt | |||
@@ -177,6 +177,11 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in | |||
177 | your disks are battery-backed in one way or another, | 177 | your disks are battery-backed in one way or another, |
178 | disabling barriers may safely improve performance. | 178 | disabling barriers may safely improve performance. |
179 | 179 | ||
180 | inode_readahead_blks=n This tuning parameter controls the maximum | ||
181 | number of inode table blocks that ext4's inode | ||
182 | table readahead algorithm will pre-read into | ||
183 | the buffer cache. The default value is 32 blocks. | ||
184 | |||
180 | orlov (*) This enables the new Orlov block allocator. It is | 185 | orlov (*) This enables the new Orlov block allocator. It is |
181 | enabled by default. | 186 | enabled by default. |
182 | 187 | ||
@@ -252,6 +257,7 @@ stripe=n Number of filesystem blocks that mballoc will try | |||
252 | delalloc (*) Deferring block allocation until write-out time. | 257 | delalloc (*) Deferring block allocation until write-out time. |
253 | nodelalloc Disable delayed allocation. Blocks are allocated | 258 | nodelalloc Disable delayed allocation. Blocks are allocated |
254 | when data is copied from user to page cache. | 259 | when data is copied from user to page cache. |
260 | |||
255 | Data Mode | 261 | Data Mode |
256 | ========= | 262 | ========= |
257 | There are 3 different data modes: | 263 | There are 3 different data modes: |
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index d9ac9706735b..d831d24d2a6c 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -956,6 +956,9 @@ Table 1-10: Files in /proc/fs/ext4/<devname> | |||
956 | files are packed closely together. Each large file | 956 | files are packed closely together. Each large file |
957 | will have its blocks allocated out of its own unique | 957 | will have its blocks allocated out of its own unique |
958 | preallocation pool. | 958 | preallocation pool. |
959 | inode_readahead_blks Tuning parameter which controls the maximum number of | ||
960 | inode table blocks that ext4's inode table readahead | ||
961 | algorithm will pre-read into the buffer cache. | ||
959 | .............................................................................. | 962 | .............................................................................. |
960 | 963 | ||
961 | 964 | ||
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 163c44527dde..922d18720c9e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -790,6 +790,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) | |||
790 | #define EXT4_DEF_RESUID 0 | 790 | #define EXT4_DEF_RESUID 0 |
791 | #define EXT4_DEF_RESGID 0 | 791 | #define EXT4_DEF_RESGID 0 |
792 | 792 | ||
793 | #define EXT4_DEF_INODE_READAHEAD_BLKS 32 | ||
794 | |||
793 | /* | 795 | /* |
794 | * Default mount options | 796 | * Default mount options |
795 | */ | 797 | */ |
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index f92af01138d4..94e0757522a6 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h | |||
@@ -52,6 +52,7 @@ struct ext4_sb_info { | |||
52 | int s_desc_per_block_bits; | 52 | int s_desc_per_block_bits; |
53 | int s_inode_size; | 53 | int s_inode_size; |
54 | int s_first_ino; | 54 | int s_first_ino; |
55 | unsigned int s_inode_readahead_blks; | ||
55 | spinlock_t s_next_gen_lock; | 56 | spinlock_t s_next_gen_lock; |
56 | u32 s_next_generation; | 57 | u32 s_next_generation; |
57 | u32 s_hash_seed[4]; | 58 | u32 s_hash_seed[4]; |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 22fcbb67cd88..ef4ca3d4abc0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -3833,41 +3833,6 @@ out_stop: | |||
3833 | ext4_journal_stop(handle); | 3833 | ext4_journal_stop(handle); |
3834 | } | 3834 | } |
3835 | 3835 | ||
3836 | static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, | ||
3837 | unsigned long ino, struct ext4_iloc *iloc) | ||
3838 | { | ||
3839 | ext4_group_t block_group; | ||
3840 | unsigned long offset; | ||
3841 | ext4_fsblk_t block; | ||
3842 | struct ext4_group_desc *gdp; | ||
3843 | |||
3844 | if (!ext4_valid_inum(sb, ino)) { | ||
3845 | /* | ||
3846 | * This error is already checked for in namei.c unless we are | ||
3847 | * looking at an NFS filehandle, in which case no error | ||
3848 | * report is needed | ||
3849 | */ | ||
3850 | return 0; | ||
3851 | } | ||
3852 | |||
3853 | block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); | ||
3854 | gdp = ext4_get_group_desc(sb, block_group, NULL); | ||
3855 | if (!gdp) | ||
3856 | return 0; | ||
3857 | |||
3858 | /* | ||
3859 | * Figure out the offset within the block group inode table | ||
3860 | */ | ||
3861 | offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) * | ||
3862 | EXT4_INODE_SIZE(sb); | ||
3863 | block = ext4_inode_table(sb, gdp) + | ||
3864 | (offset >> EXT4_BLOCK_SIZE_BITS(sb)); | ||
3865 | |||
3866 | iloc->block_group = block_group; | ||
3867 | iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1); | ||
3868 | return block; | ||
3869 | } | ||
3870 | |||
3871 | /* | 3836 | /* |
3872 | * ext4_get_inode_loc returns with an extra refcount against the inode's | 3837 | * ext4_get_inode_loc returns with an extra refcount against the inode's |
3873 | * underlying buffer_head on success. If 'in_mem' is true, we have all | 3838 | * underlying buffer_head on success. If 'in_mem' is true, we have all |
@@ -3877,19 +3842,35 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, | |||
3877 | static int __ext4_get_inode_loc(struct inode *inode, | 3842 | static int __ext4_get_inode_loc(struct inode *inode, |
3878 | struct ext4_iloc *iloc, int in_mem) | 3843 | struct ext4_iloc *iloc, int in_mem) |
3879 | { | 3844 | { |
3880 | ext4_fsblk_t block; | 3845 | struct ext4_group_desc *gdp; |
3881 | struct buffer_head *bh; | 3846 | struct buffer_head *bh; |
3847 | struct super_block *sb = inode->i_sb; | ||
3848 | ext4_fsblk_t block; | ||
3849 | int inodes_per_block, inode_offset; | ||
3850 | |||
3851 | iloc->bh = 0; | ||
3852 | if (!ext4_valid_inum(sb, inode->i_ino)) | ||
3853 | return -EIO; | ||
3882 | 3854 | ||
3883 | block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc); | 3855 | iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); |
3884 | if (!block) | 3856 | gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); |
3857 | if (!gdp) | ||
3885 | return -EIO; | 3858 | return -EIO; |
3886 | 3859 | ||
3887 | bh = sb_getblk(inode->i_sb, block); | 3860 | /* |
3861 | * Figure out the offset within the block group inode table | ||
3862 | */ | ||
3863 | inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); | ||
3864 | inode_offset = ((inode->i_ino - 1) % | ||
3865 | EXT4_INODES_PER_GROUP(sb)); | ||
3866 | block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); | ||
3867 | iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); | ||
3868 | |||
3869 | bh = sb_getblk(sb, block); | ||
3888 | if (!bh) { | 3870 | if (!bh) { |
3889 | ext4_error (inode->i_sb, "ext4_get_inode_loc", | 3871 | ext4_error(sb, "ext4_get_inode_loc", "unable to read " |
3890 | "unable to read inode block - " | 3872 | "inode block - inode=%lu, block=%llu", |
3891 | "inode=%lu, block=%llu", | 3873 | inode->i_ino, block); |
3892 | inode->i_ino, block); | ||
3893 | return -EIO; | 3874 | return -EIO; |
3894 | } | 3875 | } |
3895 | if (!buffer_uptodate(bh)) { | 3876 | if (!buffer_uptodate(bh)) { |
@@ -3917,28 +3898,12 @@ static int __ext4_get_inode_loc(struct inode *inode, | |||
3917 | */ | 3898 | */ |
3918 | if (in_mem) { | 3899 | if (in_mem) { |
3919 | struct buffer_head *bitmap_bh; | 3900 | struct buffer_head *bitmap_bh; |
3920 | struct ext4_group_desc *desc; | 3901 | int i, start; |
3921 | int inodes_per_buffer; | ||
3922 | int inode_offset, i; | ||
3923 | ext4_group_t block_group; | ||
3924 | int start; | ||
3925 | |||
3926 | block_group = (inode->i_ino - 1) / | ||
3927 | EXT4_INODES_PER_GROUP(inode->i_sb); | ||
3928 | inodes_per_buffer = bh->b_size / | ||
3929 | EXT4_INODE_SIZE(inode->i_sb); | ||
3930 | inode_offset = ((inode->i_ino - 1) % | ||
3931 | EXT4_INODES_PER_GROUP(inode->i_sb)); | ||
3932 | start = inode_offset & ~(inodes_per_buffer - 1); | ||
3933 | 3902 | ||
3934 | /* Is the inode bitmap in cache? */ | 3903 | start = inode_offset & ~(inodes_per_block - 1); |
3935 | desc = ext4_get_group_desc(inode->i_sb, | ||
3936 | block_group, NULL); | ||
3937 | if (!desc) | ||
3938 | goto make_io; | ||
3939 | 3904 | ||
3940 | bitmap_bh = sb_getblk(inode->i_sb, | 3905 | /* Is the inode bitmap in cache? */ |
3941 | ext4_inode_bitmap(inode->i_sb, desc)); | 3906 | bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); |
3942 | if (!bitmap_bh) | 3907 | if (!bitmap_bh) |
3943 | goto make_io; | 3908 | goto make_io; |
3944 | 3909 | ||
@@ -3951,14 +3916,14 @@ static int __ext4_get_inode_loc(struct inode *inode, | |||
3951 | brelse(bitmap_bh); | 3916 | brelse(bitmap_bh); |
3952 | goto make_io; | 3917 | goto make_io; |
3953 | } | 3918 | } |
3954 | for (i = start; i < start + inodes_per_buffer; i++) { | 3919 | for (i = start; i < start + inodes_per_block; i++) { |
3955 | if (i == inode_offset) | 3920 | if (i == inode_offset) |
3956 | continue; | 3921 | continue; |
3957 | if (ext4_test_bit(i, bitmap_bh->b_data)) | 3922 | if (ext4_test_bit(i, bitmap_bh->b_data)) |
3958 | break; | 3923 | break; |
3959 | } | 3924 | } |
3960 | brelse(bitmap_bh); | 3925 | brelse(bitmap_bh); |
3961 | if (i == start + inodes_per_buffer) { | 3926 | if (i == start + inodes_per_block) { |
3962 | /* all other inodes are free, so skip I/O */ | 3927 | /* all other inodes are free, so skip I/O */ |
3963 | memset(bh->b_data, 0, bh->b_size); | 3928 | memset(bh->b_data, 0, bh->b_size); |
3964 | set_buffer_uptodate(bh); | 3929 | set_buffer_uptodate(bh); |
@@ -3969,6 +3934,36 @@ static int __ext4_get_inode_loc(struct inode *inode, | |||
3969 | 3934 | ||
3970 | make_io: | 3935 | make_io: |
3971 | /* | 3936 | /* |
3937 | * If we need to do any I/O, try to pre-readahead extra | ||
3938 | * blocks from the inode table. | ||
3939 | */ | ||
3940 | if (EXT4_SB(sb)->s_inode_readahead_blks) { | ||
3941 | ext4_fsblk_t b, end, table; | ||
3942 | unsigned num; | ||
3943 | |||
3944 | table = ext4_inode_table(sb, gdp); | ||
3945 | /* Make sure s_inode_readahead_blks is a power of 2 */ | ||
3946 | while (EXT4_SB(sb)->s_inode_readahead_blks & | ||
3947 | (EXT4_SB(sb)->s_inode_readahead_blks-1)) | ||
3948 | EXT4_SB(sb)->s_inode_readahead_blks = | ||
3949 | (EXT4_SB(sb)->s_inode_readahead_blks & | ||
3950 | (EXT4_SB(sb)->s_inode_readahead_blks-1)); | ||
3951 | b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); | ||
3952 | if (table > b) | ||
3953 | b = table; | ||
3954 | end = b + EXT4_SB(sb)->s_inode_readahead_blks; | ||
3955 | num = EXT4_INODES_PER_GROUP(sb); | ||
3956 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | ||
3957 | EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) | ||
3958 | num -= le16_to_cpu(gdp->bg_itable_unused); | ||
3959 | table += num / inodes_per_block; | ||
3960 | if (end > table) | ||
3961 | end = table; | ||
3962 | while (b <= end) | ||
3963 | sb_breadahead(sb, b++); | ||
3964 | } | ||
3965 | |||
3966 | /* | ||
3972 | * There are other valid inodes in the buffer, this inode | 3967 | * There are other valid inodes in the buffer, this inode |
3973 | * has in-inode xattrs, or we don't have this inode in memory. | 3968 | * has in-inode xattrs, or we don't have this inode in memory. |
3974 | * Read the block from disk. | 3969 | * Read the block from disk. |
@@ -3978,10 +3973,9 @@ make_io: | |||
3978 | submit_bh(READ_META, bh); | 3973 | submit_bh(READ_META, bh); |
3979 | wait_on_buffer(bh); | 3974 | wait_on_buffer(bh); |
3980 | if (!buffer_uptodate(bh)) { | 3975 | if (!buffer_uptodate(bh)) { |
3981 | ext4_error(inode->i_sb, "ext4_get_inode_loc", | 3976 | ext4_error(sb, __func__, |
3982 | "unable to read inode block - " | 3977 | "unable to read inode block - inode=%lu, " |
3983 | "inode=%lu, block=%llu", | 3978 | "block=%llu", inode->i_ino, block); |
3984 | inode->i_ino, block); | ||
3985 | brelse(bh); | 3979 | brelse(bh); |
3986 | return -EIO; | 3980 | return -EIO; |
3987 | } | 3981 | } |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9f5468fb06da..6583aee5177f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -515,8 +515,10 @@ static void ext4_put_super(struct super_block *sb) | |||
515 | mark_buffer_dirty(sbi->s_sbh); | 515 | mark_buffer_dirty(sbi->s_sbh); |
516 | ext4_commit_super(sb, es, 1); | 516 | ext4_commit_super(sb, es, 1); |
517 | } | 517 | } |
518 | if (sbi->s_proc) | 518 | if (sbi->s_proc) { |
519 | remove_proc_entry("inode_readahead_blks", sbi->s_proc); | ||
519 | remove_proc_entry(sb->s_id, ext4_proc_root); | 520 | remove_proc_entry(sb->s_id, ext4_proc_root); |
521 | } | ||
520 | 522 | ||
521 | for (i = 0; i < sbi->s_gdb_count; i++) | 523 | for (i = 0; i < sbi->s_gdb_count; i++) |
522 | brelse(sbi->s_group_desc[i]); | 524 | brelse(sbi->s_group_desc[i]); |
@@ -779,6 +781,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
779 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) | 781 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) |
780 | seq_puts(seq, ",data=writeback"); | 782 | seq_puts(seq, ",data=writeback"); |
781 | 783 | ||
784 | if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) | ||
785 | seq_printf(seq, ",inode_readahead_blks=%u", | ||
786 | sbi->s_inode_readahead_blks); | ||
787 | |||
782 | ext4_show_quota_options(seq, sb); | 788 | ext4_show_quota_options(seq, sb); |
783 | return 0; | 789 | return 0; |
784 | } | 790 | } |
@@ -913,6 +919,7 @@ enum { | |||
913 | Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, | 919 | Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, |
914 | Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, | 920 | Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, |
915 | Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, | 921 | Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, |
922 | Opt_inode_readahead_blks | ||
916 | }; | 923 | }; |
917 | 924 | ||
918 | static match_table_t tokens = { | 925 | static match_table_t tokens = { |
@@ -973,6 +980,7 @@ static match_table_t tokens = { | |||
973 | {Opt_resize, "resize"}, | 980 | {Opt_resize, "resize"}, |
974 | {Opt_delalloc, "delalloc"}, | 981 | {Opt_delalloc, "delalloc"}, |
975 | {Opt_nodelalloc, "nodelalloc"}, | 982 | {Opt_nodelalloc, "nodelalloc"}, |
983 | {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, | ||
976 | {Opt_err, NULL}, | 984 | {Opt_err, NULL}, |
977 | }; | 985 | }; |
978 | 986 | ||
@@ -1381,6 +1389,13 @@ set_qf_format: | |||
1381 | case Opt_delalloc: | 1389 | case Opt_delalloc: |
1382 | set_opt(sbi->s_mount_opt, DELALLOC); | 1390 | set_opt(sbi->s_mount_opt, DELALLOC); |
1383 | break; | 1391 | break; |
1392 | case Opt_inode_readahead_blks: | ||
1393 | if (match_int(&args[0], &option)) | ||
1394 | return 0; | ||
1395 | if (option < 0 || option > (1 << 30)) | ||
1396 | return 0; | ||
1397 | sbi->s_inode_readahead_blks = option; | ||
1398 | break; | ||
1384 | default: | 1399 | default: |
1385 | printk(KERN_ERR | 1400 | printk(KERN_ERR |
1386 | "EXT4-fs: Unrecognized mount option \"%s\" " | 1401 | "EXT4-fs: Unrecognized mount option \"%s\" " |
@@ -1938,6 +1953,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
1938 | sbi->s_mount_opt = 0; | 1953 | sbi->s_mount_opt = 0; |
1939 | sbi->s_resuid = EXT4_DEF_RESUID; | 1954 | sbi->s_resuid = EXT4_DEF_RESUID; |
1940 | sbi->s_resgid = EXT4_DEF_RESGID; | 1955 | sbi->s_resgid = EXT4_DEF_RESGID; |
1956 | sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; | ||
1941 | sbi->s_sb_block = sb_block; | 1957 | sbi->s_sb_block = sb_block; |
1942 | 1958 | ||
1943 | unlock_kernel(); | 1959 | unlock_kernel(); |
@@ -2234,6 +2250,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2234 | if (ext4_proc_root) | 2250 | if (ext4_proc_root) |
2235 | sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); | 2251 | sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); |
2236 | 2252 | ||
2253 | if (sbi->s_proc) | ||
2254 | proc_create_data("inode_readahead_blks", 0644, sbi->s_proc, | ||
2255 | &ext4_ui_proc_fops, | ||
2256 | &sbi->s_inode_readahead_blks); | ||
2257 | |||
2237 | bgl_lock_init(&sbi->s_blockgroup_lock); | 2258 | bgl_lock_init(&sbi->s_blockgroup_lock); |
2238 | 2259 | ||
2239 | for (i = 0; i < db_count; i++) { | 2260 | for (i = 0; i < db_count; i++) { |
@@ -2513,8 +2534,10 @@ failed_mount2: | |||
2513 | brelse(sbi->s_group_desc[i]); | 2534 | brelse(sbi->s_group_desc[i]); |
2514 | kfree(sbi->s_group_desc); | 2535 | kfree(sbi->s_group_desc); |
2515 | failed_mount: | 2536 | failed_mount: |
2516 | if (sbi->s_proc) | 2537 | if (sbi->s_proc) { |
2538 | remove_proc_entry("inode_readahead_blks", sbi->s_proc); | ||
2517 | remove_proc_entry(sb->s_id, ext4_proc_root); | 2539 | remove_proc_entry(sb->s_id, ext4_proc_root); |
2540 | } | ||
2518 | #ifdef CONFIG_QUOTA | 2541 | #ifdef CONFIG_QUOTA |
2519 | for (i = 0; i < MAXQUOTAS; i++) | 2542 | for (i = 0; i < MAXQUOTAS; i++) |
2520 | kfree(sbi->s_qf_names[i]); | 2543 | kfree(sbi->s_qf_names[i]); |