author:    Lukas Czerner <lczerner@redhat.com>  2010-10-27 21:30:05 -0400
committer: Theodore Ts'o <tytso@mit.edu>        2010-10-27 21:30:05 -0400
commit:    bfff68738f1cb5c93dab1114634cea02aae9e7ba
tree:      b6cdf3f26e86464c7088cab62d837eb32f559fb9
parent:    e6fa0be699449d28a20e815bfe9ce26725ec4962
ext4: add support for lazy inode table initialization
When the lazy_itable_init extended option is passed to mke2fs, it considerably speeds up filesystem creation because inode tables are not zeroed out. The fact that parts of the inode table are uninitialized is not a problem so long as the block group descriptors, which contain information regarding how much of the inode table has been initialized, have not been corrupted. However, if the block group checksums are not valid, e2fsck must scan the entire inode table, and the old, uninitialized data could potentially cause e2fsck to report false problems.

Hence, it is important for the inode tables to be initialized as soon as possible. This commit adds this feature so that mke2fs can safely use the lazy inode table initialization feature to speed up formatting file systems.

This is done via a new kernel thread called ext4lazyinit, which is created on demand and destroyed when it is no longer needed. There is only one such thread for all ext4 filesystems in the system. When the first filesystem with the inititable mount option is mounted, the ext4lazyinit thread is created, and the filesystem can then register its request in the request list. The thread walks through the list of requests, picking up scheduled requests and invoking ext4_init_inode_table(). The next schedule time for a request is computed by multiplying the time it took to zero out the last inode table by the wait multiplier, which can be set with the init_itable=n mount option (the default is 10). We do this so that the zeroing does not consume the whole I/O bandwidth. When the thread is no longer necessary (the request list is empty), it frees the appropriate structures and exits (and can be created again later by another filesystem).

We do not disturb regular inode allocations in any way; the allocator simply does not care whether the inode table is zeroed or not. When zeroing, however, we obviously have to skip used inodes, and we also have to prevent new inode allocations from the group while zeroing is under way. For that we take the alloc_sem lock for writing in ext4_init_inode_table() and for reading in ext4_claim_inode(), so when we are unlucky and the allocator hits a group which is currently being zeroed, it simply has to wait.

Lazy inode table initialization can be suppressed with the no_init_itable mount option.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
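The rescheduling rule described above is compact enough to sketch. The following is an illustrative sketch only; the structure and field names (lazyinit_request, lr_next_sched, and the helper itself) are hypothetical and are not the code this patch adds to fs/ext4/super.c:

    /*
     * Illustrative sketch of the rescheduling rule; all names here are
     * hypothetical, not the identifiers used by the patch.
     */
    struct lazyinit_request {
            struct super_block      *lr_super;
            ext4_group_t            lr_next_group;  /* next group to zero */
            unsigned long           lr_next_sched;  /* jiffies of next run */
    };

    static void lazyinit_reschedule(struct lazyinit_request *rq,
                                    unsigned long start,
                                    unsigned int wait_mult)
    {
            /* how long the last itable zeroout took */
            unsigned long elapsed = jiffies - start;

            /*
             * Sleep wait_mult times as long as the last zeroout took
             * (init_itable=n, default 10), so that itable initialization
             * does not consume the whole I/O bandwidth.
             */
            rq->lr_next_sched = jiffies + elapsed * wait_mult;
    }

With a multiplier of n, a zeroout that takes time T is followed by a sleep of roughly n*T, so itable initialization uses at most about 1/(n+1) of the elapsed I/O time.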
Diffstat (limited to 'fs/ext4/ialloc.c')
-rw-r--r--  fs/ext4/ialloc.c  120
1 file changed, 120 insertions(+), 0 deletions(-)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 45853e0d1f21..e428f23215c0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -107,6 +107,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
         desc = ext4_get_group_desc(sb, block_group, NULL);
         if (!desc)
                 return NULL;
+
         bitmap_blk = ext4_inode_bitmap(sb, desc);
         bh = sb_getblk(sb, bitmap_blk);
         if (unlikely(!bh)) {
@@ -123,6 +124,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                 unlock_buffer(bh);
                 return bh;
         }
+
         ext4_lock_group(sb, block_group);
         if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                 ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -133,6 +135,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                 return bh;
         }
         ext4_unlock_group(sb, block_group);
+
         if (buffer_uptodate(bh)) {
                 /*
                  * if not uninit if bh is uptodate,
@@ -712,8 +715,17 @@ static int ext4_claim_inode(struct super_block *sb,
 {
         int free = 0, retval = 0, count;
         struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
         struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
 
+        /*
+         * We have to be sure that new inode allocation does not race with
+         * inode table initialization, because otherwise we may end up
+         * allocating and writing new inode right before sb_issue_zeroout
+         * takes place and overwriting our new inode with zeroes. So we
+         * take alloc_sem to prevent it.
+         */
+        down_read(&grp->alloc_sem);
         ext4_lock_group(sb, group);
         if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
                 /* not a free inode */
@@ -724,6 +736,7 @@ static int ext4_claim_inode(struct super_block *sb,
         if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
                         ino > EXT4_INODES_PER_GROUP(sb)) {
                 ext4_unlock_group(sb, group);
+                up_read(&grp->alloc_sem);
                 ext4_error(sb, "reserved inode or inode > inodes count - "
                            "block_group = %u, inode=%lu", group,
                            ino + group * EXT4_INODES_PER_GROUP(sb));
@@ -772,6 +785,7 @@ static int ext4_claim_inode(struct super_block *sb,
         gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
         ext4_unlock_group(sb, group);
+        up_read(&grp->alloc_sem);
         return retval;
 }
 
@@ -1205,3 +1219,109 @@ unsigned long ext4_count_dirs(struct super_block * sb)
         }
         return count;
 }
+
+/*
+ * Zeroes not yet zeroed inode table - just write zeroes through the whole
+ * inode table. Must be called without any spinlock held. The only place
+ * where it is called from on active part of filesystem is ext4lazyinit
+ * thread, so we do not need any special locks, however we have to prevent
+ * inode allocation from the current group, so we take alloc_sem lock, to
+ * block ext4_claim_inode until we are finished.
+ */
+extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+                                 int barrier)
+{
+        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_desc *gdp = NULL;
+        struct buffer_head *group_desc_bh;
+        handle_t *handle;
+        ext4_fsblk_t blk;
+        int num, ret = 0, used_blks = 0;
+        unsigned long flags = BLKDEV_IFL_WAIT;
+
+        /* This should not happen, but just to be sure check this */
+        if (sb->s_flags & MS_RDONLY) {
+                ret = 1;
+                goto out;
+        }
+
+        gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+        if (!gdp)
+                goto out;
+
+        /*
+         * We do not need to lock this, because we are the only one
+         * handling this flag.
+         */
+        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
+                goto out;
+
+        handle = ext4_journal_start_sb(sb, 1);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                goto out;
+        }
+
+        down_write(&grp->alloc_sem);
+        /*
+         * If inode bitmap was already initialized there may be some
+         * used inodes so we need to skip blocks with used inodes in
+         * inode table.
+         */
+        if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+                used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
+                            ext4_itable_unused_count(sb, gdp)),
+                            sbi->s_inodes_per_block);
+
+        blk = ext4_inode_table(sb, gdp) + used_blks;
+        num = sbi->s_itb_per_group - used_blks;
+
+        BUFFER_TRACE(group_desc_bh, "get_write_access");
+        ret = ext4_journal_get_write_access(handle,
+                                            group_desc_bh);
+        if (ret)
+                goto err_out;
+
+        if (unlikely(num > EXT4_INODES_PER_GROUP(sb))) {
+                ext4_error(sb, "Something is wrong with group %u\n"
+                           "Used itable blocks: %d"
+                           "Itable blocks per group: %lu\n",
+                           group, used_blks, sbi->s_itb_per_group);
+                ret = 1;
+                goto err_out;
+        }
+
+        /*
+         * Skip zeroout if the inode table is full. But we set the ZEROED
+         * flag anyway, because obviously, when it is full it does not need
+         * further zeroing.
+         */
+        if (unlikely(num == 0))
+                goto skip_zeroout;
+
+        ext4_debug("going to zero out inode table in group %d\n",
+                   group);
+        if (barrier)
+                flags |= BLKDEV_IFL_BARRIER;
+        ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS, flags);
+        if (ret < 0)
+                goto err_out;
+
+skip_zeroout:
+        ext4_lock_group(sb, group);
+        gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
+        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+        ext4_unlock_group(sb, group);
+
+        BUFFER_TRACE(group_desc_bh,
+                     "call ext4_handle_dirty_metadata");
+        ret = ext4_handle_dirty_metadata(handle, NULL,
+                                         group_desc_bh);
+
+err_out:
+        up_write(&grp->alloc_sem);
+        ext4_journal_stop(handle);
+out:
+        return ret;
+}
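For completeness, the thread behaviour the commit message describes (walk the request list, run whichever request is due, free everything and exit when the list empties) corresponds roughly to the outline below. This is a sketch of the described design under assumed names; all identifiers except ext4_init_inode_table() are hypothetical, and lazyinit_reschedule() refers to the earlier sketch, not to the actual fs/ext4/super.c code:

    /*
     * Outline of the ext4lazyinit thread described in the commit message.
     * All names except ext4_init_inode_table() are hypothetical.
     */
    struct lazyinit_list {
            struct list_head        li_requests;    /* registered requests */
            unsigned int            li_wait_mult;   /* init_itable=n, default 10 */
    };

    static int ext4lazyinit_thread(void *arg)
    {
            struct lazyinit_list *ll = arg;

            while (!list_empty(&ll->li_requests)) {
                    struct lazyinit_request *rq;
                    unsigned long start = jiffies;

                    /* hypothetical helper: request with earliest lr_next_sched */
                    rq = next_due_request(ll);
                    if (rq) {
                            /* zero one group's itable, with barrier enabled */
                            ext4_init_inode_table(rq->lr_super,
                                                  rq->lr_next_group, 1);
                            /* sleep li_wait_mult times the zeroout cost */
                            lazyinit_reschedule(rq, start, ll->li_wait_mult);
                    }
                    /* hypothetical helper: sleep until the next request is due */
                    wait_for_next_due(ll);
            }
            /*
             * Request list is empty: free the structures and exit. The
             * thread can be created again later by another filesystem.
             */
            return 0;
    }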