aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Theodore Ts'o <tytso@mit.edu> 2008-10-09 23:53:47 -0400
committer: Theodore Ts'o <tytso@mit.edu> 2008-10-09 23:53:47 -0400
commit: 240799cdf22bd789ea6852653c3b879d35ad0a6c (patch)
tree: e696b60cc103f23838b5c14d8d397f692abffbc3
parent: 37515facd001942221d68171c81c1f46d54ffdd0 (diff)
ext4: Use readahead when reading an inode from the inode table
With modern hard drives, reading 64k takes roughly the same time as reading a 4k block. So request readahead for adjacent inode table blocks to reduce the time it takes when iterating over directories (especially when doing this in htree sort order) in a cold cache case.

With this patch, the time it takes to run "git status" on a kernel tree after flushing the caches via "echo 3 > /proc/sys/vm/drop_caches" is reduced by 21%.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r-- Documentation/filesystems/ext4.txt | 6
-rw-r--r-- Documentation/filesystems/proc.txt | 3
-rw-r--r-- fs/ext4/ext4.h | 2
-rw-r--r-- fs/ext4/ext4_sb.h | 1
-rw-r--r-- fs/ext4/inode.c | 134
-rw-r--r-- fs/ext4/super.c | 27
6 files changed, 101 insertions, 72 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 0d5394920a31..289057958f90 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -177,6 +177,11 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in
177 your disks are battery-backed in one way or another, 177 your disks are battery-backed in one way or another,
178 disabling barriers may safely improve performance. 178 disabling barriers may safely improve performance.
179 179
180inode_readahead_blks=n This tuning parameter controls the maximum
181 number of inode table blocks that ext4's inode
182 table readahead algorithm will pre-read into
183 the buffer cache. The default value is 32 blocks.
184
180orlov (*) This enables the new Orlov block allocator. It is 185orlov (*) This enables the new Orlov block allocator. It is
181 enabled by default. 186 enabled by default.
182 187
@@ -252,6 +257,7 @@ stripe=n Number of filesystem blocks that mballoc will try
252delalloc (*) Deferring block allocation until write-out time. 257delalloc (*) Deferring block allocation until write-out time.
253nodelalloc Disable delayed allocation. Blocks are allocated 258nodelalloc Disable delayed allocation. Blocks are allocated
254 when data is copied from user to page cache. 259 when data is copied from user to page cache.
260
255Data Mode 261Data Mode
256========= 262=========
257There are 3 different data modes: 263There are 3 different data modes:
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index d9ac9706735b..d831d24d2a6c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -956,6 +956,9 @@ Table 1-10: Files in /proc/fs/ext4/<devname>
956 files are packed closely together. Each large file 956 files are packed closely together. Each large file
957 will have its blocks allocated out of its own unique 957 will have its blocks allocated out of its own unique
958 preallocation pool. 958 preallocation pool.
959inode_readahead_blks Tuning parameter which controls the maximum number of
960 inode table blocks that ext4's inode table readahead
961 algorithm will pre-read into the buffer cache
959.............................................................................. 962..............................................................................
960 963
961 964
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 163c44527dde..922d18720c9e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -790,6 +790,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
790#define EXT4_DEF_RESUID 0 790#define EXT4_DEF_RESUID 0
791#define EXT4_DEF_RESGID 0 791#define EXT4_DEF_RESGID 0
792 792
793#define EXT4_DEF_INODE_READAHEAD_BLKS 32
794
793/* 795/*
794 * Default mount options 796 * Default mount options
795 */ 797 */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index f92af01138d4..94e0757522a6 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -52,6 +52,7 @@ struct ext4_sb_info {
52 int s_desc_per_block_bits; 52 int s_desc_per_block_bits;
53 int s_inode_size; 53 int s_inode_size;
54 int s_first_ino; 54 int s_first_ino;
55 unsigned int s_inode_readahead_blks;
55 spinlock_t s_next_gen_lock; 56 spinlock_t s_next_gen_lock;
56 u32 s_next_generation; 57 u32 s_next_generation;
57 u32 s_hash_seed[4]; 58 u32 s_hash_seed[4];
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 22fcbb67cd88..ef4ca3d4abc0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3833,41 +3833,6 @@ out_stop:
3833 ext4_journal_stop(handle); 3833 ext4_journal_stop(handle);
3834} 3834}
3835 3835
3836static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
3837 unsigned long ino, struct ext4_iloc *iloc)
3838{
3839 ext4_group_t block_group;
3840 unsigned long offset;
3841 ext4_fsblk_t block;
3842 struct ext4_group_desc *gdp;
3843
3844 if (!ext4_valid_inum(sb, ino)) {
3845 /*
3846 * This error is already checked for in namei.c unless we are
3847 * looking at an NFS filehandle, in which case no error
3848 * report is needed
3849 */
3850 return 0;
3851 }
3852
3853 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
3854 gdp = ext4_get_group_desc(sb, block_group, NULL);
3855 if (!gdp)
3856 return 0;
3857
3858 /*
3859 * Figure out the offset within the block group inode table
3860 */
3861 offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
3862 EXT4_INODE_SIZE(sb);
3863 block = ext4_inode_table(sb, gdp) +
3864 (offset >> EXT4_BLOCK_SIZE_BITS(sb));
3865
3866 iloc->block_group = block_group;
3867 iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
3868 return block;
3869}
3870
3871/* 3836/*
3872 * ext4_get_inode_loc returns with an extra refcount against the inode's 3837 * ext4_get_inode_loc returns with an extra refcount against the inode's
3873 * underlying buffer_head on success. If 'in_mem' is true, we have all 3838 * underlying buffer_head on success. If 'in_mem' is true, we have all
@@ -3877,19 +3842,35 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
3877static int __ext4_get_inode_loc(struct inode *inode, 3842static int __ext4_get_inode_loc(struct inode *inode,
3878 struct ext4_iloc *iloc, int in_mem) 3843 struct ext4_iloc *iloc, int in_mem)
3879{ 3844{
3880 ext4_fsblk_t block; 3845 struct ext4_group_desc *gdp;
3881 struct buffer_head *bh; 3846 struct buffer_head *bh;
3847 struct super_block *sb = inode->i_sb;
3848 ext4_fsblk_t block;
3849 int inodes_per_block, inode_offset;
3850
3851 iloc->bh = 0;
3852 if (!ext4_valid_inum(sb, inode->i_ino))
3853 return -EIO;
3882 3854
3883 block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc); 3855 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
3884 if (!block) 3856 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
3857 if (!gdp)
3885 return -EIO; 3858 return -EIO;
3886 3859
3887 bh = sb_getblk(inode->i_sb, block); 3860 /*
3861 * Figure out the offset within the block group inode table
3862 */
3863 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
3864 inode_offset = ((inode->i_ino - 1) %
3865 EXT4_INODES_PER_GROUP(sb));
3866 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
3867 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
3868
3869 bh = sb_getblk(sb, block);
3888 if (!bh) { 3870 if (!bh) {
3889 ext4_error (inode->i_sb, "ext4_get_inode_loc", 3871 ext4_error(sb, "ext4_get_inode_loc", "unable to read "
3890 "unable to read inode block - " 3872 "inode block - inode=%lu, block=%llu",
3891 "inode=%lu, block=%llu", 3873 inode->i_ino, block);
3892 inode->i_ino, block);
3893 return -EIO; 3874 return -EIO;
3894 } 3875 }
3895 if (!buffer_uptodate(bh)) { 3876 if (!buffer_uptodate(bh)) {
@@ -3917,28 +3898,12 @@ static int __ext4_get_inode_loc(struct inode *inode,
3917 */ 3898 */
3918 if (in_mem) { 3899 if (in_mem) {
3919 struct buffer_head *bitmap_bh; 3900 struct buffer_head *bitmap_bh;
3920 struct ext4_group_desc *desc; 3901 int i, start;
3921 int inodes_per_buffer;
3922 int inode_offset, i;
3923 ext4_group_t block_group;
3924 int start;
3925
3926 block_group = (inode->i_ino - 1) /
3927 EXT4_INODES_PER_GROUP(inode->i_sb);
3928 inodes_per_buffer = bh->b_size /
3929 EXT4_INODE_SIZE(inode->i_sb);
3930 inode_offset = ((inode->i_ino - 1) %
3931 EXT4_INODES_PER_GROUP(inode->i_sb));
3932 start = inode_offset & ~(inodes_per_buffer - 1);
3933 3902
3934 /* Is the inode bitmap in cache? */ 3903 start = inode_offset & ~(inodes_per_block - 1);
3935 desc = ext4_get_group_desc(inode->i_sb,
3936 block_group, NULL);
3937 if (!desc)
3938 goto make_io;
3939 3904
3940 bitmap_bh = sb_getblk(inode->i_sb, 3905 /* Is the inode bitmap in cache? */
3941 ext4_inode_bitmap(inode->i_sb, desc)); 3906 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
3942 if (!bitmap_bh) 3907 if (!bitmap_bh)
3943 goto make_io; 3908 goto make_io;
3944 3909
@@ -3951,14 +3916,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
3951 brelse(bitmap_bh); 3916 brelse(bitmap_bh);
3952 goto make_io; 3917 goto make_io;
3953 } 3918 }
3954 for (i = start; i < start + inodes_per_buffer; i++) { 3919 for (i = start; i < start + inodes_per_block; i++) {
3955 if (i == inode_offset) 3920 if (i == inode_offset)
3956 continue; 3921 continue;
3957 if (ext4_test_bit(i, bitmap_bh->b_data)) 3922 if (ext4_test_bit(i, bitmap_bh->b_data))
3958 break; 3923 break;
3959 } 3924 }
3960 brelse(bitmap_bh); 3925 brelse(bitmap_bh);
3961 if (i == start + inodes_per_buffer) { 3926 if (i == start + inodes_per_block) {
3962 /* all other inodes are free, so skip I/O */ 3927 /* all other inodes are free, so skip I/O */
3963 memset(bh->b_data, 0, bh->b_size); 3928 memset(bh->b_data, 0, bh->b_size);
3964 set_buffer_uptodate(bh); 3929 set_buffer_uptodate(bh);
@@ -3969,6 +3934,36 @@ static int __ext4_get_inode_loc(struct inode *inode,
3969 3934
3970make_io: 3935make_io:
3971 /* 3936 /*
3937 * If we need to do any I/O, try to pre-readahead extra
3938 * blocks from the inode table.
3939 */
3940 if (EXT4_SB(sb)->s_inode_readahead_blks) {
3941 ext4_fsblk_t b, end, table;
3942 unsigned num;
3943
3944 table = ext4_inode_table(sb, gdp);
3945 /* Make sure s_inode_readahead_blks is a power of 2 */
3946 while (EXT4_SB(sb)->s_inode_readahead_blks &
3947 (EXT4_SB(sb)->s_inode_readahead_blks-1))
3948 EXT4_SB(sb)->s_inode_readahead_blks =
3949 (EXT4_SB(sb)->s_inode_readahead_blks &
3950 (EXT4_SB(sb)->s_inode_readahead_blks-1));
3951 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
3952 if (table > b)
3953 b = table;
3954 end = b + EXT4_SB(sb)->s_inode_readahead_blks;
3955 num = EXT4_INODES_PER_GROUP(sb);
3956 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3957 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3958 num -= le16_to_cpu(gdp->bg_itable_unused);
3959 table += num / inodes_per_block;
3960 if (end > table)
3961 end = table;
3962 while (b <= end)
3963 sb_breadahead(sb, b++);
3964 }
3965
3966 /*
3972 * There are other valid inodes in the buffer, this inode 3967 * There are other valid inodes in the buffer, this inode
3973 * has in-inode xattrs, or we don't have this inode in memory. 3968 * has in-inode xattrs, or we don't have this inode in memory.
3974 * Read the block from disk. 3969 * Read the block from disk.
@@ -3978,10 +3973,9 @@ make_io:
3978 submit_bh(READ_META, bh); 3973 submit_bh(READ_META, bh);
3979 wait_on_buffer(bh); 3974 wait_on_buffer(bh);
3980 if (!buffer_uptodate(bh)) { 3975 if (!buffer_uptodate(bh)) {
3981 ext4_error(inode->i_sb, "ext4_get_inode_loc", 3976 ext4_error(sb, __func__,
3982 "unable to read inode block - " 3977 "unable to read inode block - inode=%lu, "
3983 "inode=%lu, block=%llu", 3978 "block=%llu", inode->i_ino, block);
3984 inode->i_ino, block);
3985 brelse(bh); 3979 brelse(bh);
3986 return -EIO; 3980 return -EIO;
3987 } 3981 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9f5468fb06da..6583aee5177f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -515,8 +515,10 @@ static void ext4_put_super(struct super_block *sb)
515 mark_buffer_dirty(sbi->s_sbh); 515 mark_buffer_dirty(sbi->s_sbh);
516 ext4_commit_super(sb, es, 1); 516 ext4_commit_super(sb, es, 1);
517 } 517 }
518 if (sbi->s_proc) 518 if (sbi->s_proc) {
519 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
519 remove_proc_entry(sb->s_id, ext4_proc_root); 520 remove_proc_entry(sb->s_id, ext4_proc_root);
521 }
520 522
521 for (i = 0; i < sbi->s_gdb_count; i++) 523 for (i = 0; i < sbi->s_gdb_count; i++)
522 brelse(sbi->s_group_desc[i]); 524 brelse(sbi->s_group_desc[i]);
@@ -779,6 +781,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
779 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 781 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
780 seq_puts(seq, ",data=writeback"); 782 seq_puts(seq, ",data=writeback");
781 783
784 if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
785 seq_printf(seq, ",inode_readahead_blks=%u",
786 sbi->s_inode_readahead_blks);
787
782 ext4_show_quota_options(seq, sb); 788 ext4_show_quota_options(seq, sb);
783 return 0; 789 return 0;
784} 790}
@@ -913,6 +919,7 @@ enum {
913 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 919 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
914 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 920 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
915 Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, 921 Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
922 Opt_inode_readahead_blks
916}; 923};
917 924
918static match_table_t tokens = { 925static match_table_t tokens = {
@@ -973,6 +980,7 @@ static match_table_t tokens = {
973 {Opt_resize, "resize"}, 980 {Opt_resize, "resize"},
974 {Opt_delalloc, "delalloc"}, 981 {Opt_delalloc, "delalloc"},
975 {Opt_nodelalloc, "nodelalloc"}, 982 {Opt_nodelalloc, "nodelalloc"},
983 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
976 {Opt_err, NULL}, 984 {Opt_err, NULL},
977}; 985};
978 986
@@ -1381,6 +1389,13 @@ set_qf_format:
1381 case Opt_delalloc: 1389 case Opt_delalloc:
1382 set_opt(sbi->s_mount_opt, DELALLOC); 1390 set_opt(sbi->s_mount_opt, DELALLOC);
1383 break; 1391 break;
1392 case Opt_inode_readahead_blks:
1393 if (match_int(&args[0], &option))
1394 return 0;
1395 if (option < 0 || option > (1 << 30))
1396 return 0;
1397 sbi->s_inode_readahead_blks = option;
1398 break;
1384 default: 1399 default:
1385 printk(KERN_ERR 1400 printk(KERN_ERR
1386 "EXT4-fs: Unrecognized mount option \"%s\" " 1401 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1938,6 +1953,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1938 sbi->s_mount_opt = 0; 1953 sbi->s_mount_opt = 0;
1939 sbi->s_resuid = EXT4_DEF_RESUID; 1954 sbi->s_resuid = EXT4_DEF_RESUID;
1940 sbi->s_resgid = EXT4_DEF_RESGID; 1955 sbi->s_resgid = EXT4_DEF_RESGID;
1956 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
1941 sbi->s_sb_block = sb_block; 1957 sbi->s_sb_block = sb_block;
1942 1958
1943 unlock_kernel(); 1959 unlock_kernel();
@@ -2234,6 +2250,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2234 if (ext4_proc_root) 2250 if (ext4_proc_root)
2235 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); 2251 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
2236 2252
2253 if (sbi->s_proc)
2254 proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
2255 &ext4_ui_proc_fops,
2256 &sbi->s_inode_readahead_blks);
2257
2237 bgl_lock_init(&sbi->s_blockgroup_lock); 2258 bgl_lock_init(&sbi->s_blockgroup_lock);
2238 2259
2239 for (i = 0; i < db_count; i++) { 2260 for (i = 0; i < db_count; i++) {
@@ -2513,8 +2534,10 @@ failed_mount2:
2513 brelse(sbi->s_group_desc[i]); 2534 brelse(sbi->s_group_desc[i]);
2514 kfree(sbi->s_group_desc); 2535 kfree(sbi->s_group_desc);
2515failed_mount: 2536failed_mount:
2516 if (sbi->s_proc) 2537 if (sbi->s_proc) {
2538 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
2517 remove_proc_entry(sb->s_id, ext4_proc_root); 2539 remove_proc_entry(sb->s_id, ext4_proc_root);
2540 }
2518#ifdef CONFIG_QUOTA 2541#ifdef CONFIG_QUOTA
2519 for (i = 0; i < MAXQUOTAS; i++) 2542 for (i = 0; i < MAXQUOTAS; i++)
2520 kfree(sbi->s_qf_names[i]); 2543 kfree(sbi->s_qf_names[i]);