author:    Lukas Czerner <lczerner@redhat.com>  2010-10-27 21:30:05 -0400
committer: Theodore Ts'o <tytso@mit.edu>        2010-10-27 21:30:05 -0400
commit:    bfff68738f1cb5c93dab1114634cea02aae9e7ba
tree:      b6cdf3f26e86464c7088cab62d837eb32f559fb9
parent:    e6fa0be699449d28a20e815bfe9ce26725ec4962
ext4: add support for lazy inode table initialization
When the lazy_itable_init extended option is passed to mke2fs, it considerably speeds up filesystem creation because inode tables are not zeroed out. The fact that parts of the inode table are uninitialized is not a problem so long as the block group descriptors, which contain information regarding how much of the inode table has been initialized, have not been corrupted. However, if the block group checksums are not valid, e2fsck must scan the entire inode table, and the old, uninitialized data could potentially cause e2fsck to report false problems.

Hence, it is important for the inode tables to be initialized as soon as possible. This commit adds this feature so that mke2fs can safely use the lazy inode table initialization feature to speed up formatting file systems.

This is done via a new kernel thread called ext4lazyinit, which is created on demand and destroyed when it is no longer needed. There is only one such thread for all ext4 filesystems in the system. When the first filesystem with the inititable mount option is mounted, the ext4lazyinit thread is created, and the filesystem can then register its request in the request list. The thread walks through the list of requests, picking up scheduled requests and invoking ext4_init_inode_table(). The next schedule time for a request is computed by multiplying the time it took to zero out the last inode table by the wait multiplier, which can be set with the init_itable=n mount option (the default is 10). We do this so that the zeroing does not consume the whole I/O bandwidth. When the thread is no longer necessary (the request list is empty), it frees the appropriate structures and exits (and can be created again later by another filesystem).

We do not disturb regular inode allocations in any way; the allocator simply does not care whether the inode table is zeroed or not. When zeroing, however, we obviously have to skip used inodes, and we also have to prevent new inode allocations from the group while zeroing is under way. For that we take the alloc_sem lock for writing in ext4_init_inode_table() and for reading in ext4_claim_inode(), so when we are unlucky and the allocator hits a group which is currently being zeroed, it simply has to wait.

Lazy inode table initialization can be suppressed with the no_init_itable mount option.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
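The rescheduling rule described above is compact enough to sketch. The following is an illustrative sketch only; the structure and field names (lazyinit_request, lr_next_sched, and the helper itself) are hypothetical and are not the code this patch adds to fs/ext4/super.c:

    /*
     * Illustrative sketch of the rescheduling rule; all names here are
     * hypothetical, not the identifiers used by the patch.
     */
    struct lazyinit_request {
            struct super_block      *lr_super;
            ext4_group_t            lr_next_group;  /* next group to zero */
            unsigned long           lr_next_sched;  /* jiffies of next run */
    };

    static void lazyinit_reschedule(struct lazyinit_request *rq,
                                    unsigned long start,
                                    unsigned int wait_mult)
    {
            /* how long the last itable zeroout took */
            unsigned long elapsed = jiffies - start;

            /*
             * Sleep wait_mult times as long as the last zeroout took
             * (init_itable=n, default 10), so that itable initialization
             * does not consume the whole I/O bandwidth.
             */
            rq->lr_next_sched = jiffies + elapsed * wait_mult;
    }

With a multiplier of n, a zeroout that takes time T is followed by a sleep of roughly n*T, so itable initialization uses at most about 1/(n+1) of the elapsed I/O time.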
Diffstat (limited to 'fs/ext4/ialloc.c')
-rw-r--r--  fs/ext4/ialloc.c  120
1 file changed, 120 insertions(+), 0 deletions(-)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 45853e0d1f21..e428f23215c0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -107,6 +107,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
         desc = ext4_get_group_desc(sb, block_group, NULL);
         if (!desc)
                 return NULL;
+
         bitmap_blk = ext4_inode_bitmap(sb, desc);
         bh = sb_getblk(sb, bitmap_blk);
         if (unlikely(!bh)) {
@@ -123,6 +124,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                 unlock_buffer(bh);
                 return bh;
         }
+
         ext4_lock_group(sb, block_group);
         if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                 ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -133,6 +135,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                 return bh;
         }
         ext4_unlock_group(sb, block_group);
+
         if (buffer_uptodate(bh)) {
                 /*
                  * if not uninit if bh is uptodate,
@@ -712,8 +715,17 @@ static int ext4_claim_inode(struct super_block *sb,
 {
         int free = 0, retval = 0, count;
         struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
         struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
 
+        /*
+         * We have to be sure that new inode allocation does not race with
+         * inode table initialization, because otherwise we may end up
+         * allocating and writing new inode right before sb_issue_zeroout
+         * takes place and overwriting our new inode with zeroes. So we
+         * take alloc_sem to prevent it.
+         */
+        down_read(&grp->alloc_sem);
         ext4_lock_group(sb, group);
         if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
                 /* not a free inode */
@@ -724,6 +736,7 @@ static int ext4_claim_inode(struct super_block *sb,
         if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
                         ino > EXT4_INODES_PER_GROUP(sb)) {
                 ext4_unlock_group(sb, group);
+                up_read(&grp->alloc_sem);
                 ext4_error(sb, "reserved inode or inode > inodes count - "
                            "block_group = %u, inode=%lu", group,
                            ino + group * EXT4_INODES_PER_GROUP(sb));
@@ -772,6 +785,7 @@ static int ext4_claim_inode(struct super_block *sb,
         gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
         ext4_unlock_group(sb, group);
+        up_read(&grp->alloc_sem);
         return retval;
 }
 
@@ -1205,3 +1219,109 @@ unsigned long ext4_count_dirs(struct super_block * sb)
         }
         return count;
 }
+
+/*
+ * Zeroes not yet zeroed inode table - just write zeroes through the whole
+ * inode table. Must be called without any spinlock held. The only place
+ * where it is called from on active part of filesystem is ext4lazyinit
+ * thread, so we do not need any special locks, however we have to prevent
+ * inode allocation from the current group, so we take alloc_sem lock, to
+ * block ext4_claim_inode until we are finished.
+ */
+extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+                                 int barrier)
+{
+        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_desc *gdp = NULL;
+        struct buffer_head *group_desc_bh;
+        handle_t *handle;
+        ext4_fsblk_t blk;
+        int num, ret = 0, used_blks = 0;
+        unsigned long flags = BLKDEV_IFL_WAIT;
+
+        /* This should not happen, but just to be sure check this */
+        if (sb->s_flags & MS_RDONLY) {
+                ret = 1;
+                goto out;
+        }
+
+        gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+        if (!gdp)
+                goto out;
+
+        /*
+         * We do not need to lock this, because we are the only one
+         * handling this flag.
+         */
+        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
+                goto out;
+
+        handle = ext4_journal_start_sb(sb, 1);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                goto out;
+        }
+
+        down_write(&grp->alloc_sem);
+        /*
+         * If inode bitmap was already initialized there may be some
+         * used inodes so we need to skip blocks with used inodes in
+         * inode table.
+         */
+        if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+                used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
+                            ext4_itable_unused_count(sb, gdp)),
+                            sbi->s_inodes_per_block);
+
+        blk = ext4_inode_table(sb, gdp) + used_blks;
+        num = sbi->s_itb_per_group - used_blks;
+
+        BUFFER_TRACE(group_desc_bh, "get_write_access");
+        ret = ext4_journal_get_write_access(handle,
+                                            group_desc_bh);
+        if (ret)
+                goto err_out;
+
+        if (unlikely(num > EXT4_INODES_PER_GROUP(sb))) {
+                ext4_error(sb, "Something is wrong with group %u\n"
+                           "Used itable blocks: %d"
+                           "Itable blocks per group: %lu\n",
+                           group, used_blks, sbi->s_itb_per_group);
+                ret = 1;
+                goto err_out;
+        }
+
+        /*
+         * Skip zeroout if the inode table is full. But we set the ZEROED
+         * flag anyway, because obviously, when it is full it does not need
+         * further zeroing.
+         */
+        if (unlikely(num == 0))
+                goto skip_zeroout;
+
+        ext4_debug("going to zero out inode table in group %d\n",
+                   group);
+        if (barrier)
+                flags |= BLKDEV_IFL_BARRIER;
+        ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS, flags);
+        if (ret < 0)
+                goto err_out;
+
+skip_zeroout:
+        ext4_lock_group(sb, group);
+        gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
+        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+        ext4_unlock_group(sb, group);
+
+        BUFFER_TRACE(group_desc_bh,
+                     "call ext4_handle_dirty_metadata");
+        ret = ext4_handle_dirty_metadata(handle, NULL,
+                                         group_desc_bh);
+
+err_out:
+        up_write(&grp->alloc_sem);
+        ext4_journal_stop(handle);
+out:
+        return ret;
+}
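For completeness, the thread behaviour the commit message describes (walk the request list, run whichever request is due, free everything and exit when the list empties) corresponds roughly to the outline below. This is a sketch of the described design under assumed names; all identifiers except ext4_init_inode_table() are hypothetical, and lazyinit_reschedule() refers to the earlier sketch, not to the actual fs/ext4/super.c code:

    /*
     * Outline of the ext4lazyinit thread described in the commit message.
     * All names except ext4_init_inode_table() are hypothetical.
     */
    struct lazyinit_list {
            struct list_head        li_requests;    /* registered requests */
            unsigned int            li_wait_mult;   /* init_itable=n, default 10 */
    };

    static int ext4lazyinit_thread(void *arg)
    {
            struct lazyinit_list *ll = arg;

            while (!list_empty(&ll->li_requests)) {
                    struct lazyinit_request *rq;
                    unsigned long start = jiffies;

                    /* hypothetical helper: request with earliest lr_next_sched */
                    rq = next_due_request(ll);
                    if (rq) {
                            /* zero one group's itable, with barrier enabled */
                            ext4_init_inode_table(rq->lr_super,
                                                  rq->lr_next_group, 1);
                            /* sleep li_wait_mult times the zeroout cost */
                            lazyinit_reschedule(rq, start, ll->li_wait_mult);
                    }
                    /* hypothetical helper: sleep until the next request is due */
                    wait_for_next_due(ll);
            }
            /*
             * Request list is empty: free the structures and exit. The
             * thread can be created again later by another filesystem.
             */
            return 0;
    }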