aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLukas Czerner <lczerner@redhat.com>2010-10-27 21:30:05 -0400
committerTheodore Ts'o <tytso@mit.edu>2010-10-27 21:30:05 -0400
commitbfff68738f1cb5c93dab1114634cea02aae9e7ba (patch)
treeb6cdf3f26e86464c7088cab62d837eb32f559fb9
parente6fa0be699449d28a20e815bfe9ce26725ec4962 (diff)
ext4: add support for lazy inode table initialization
When the lazy_itable_init extended option is passed to mke2fs, it considerably speeds up filesystem creation because inode tables are not zeroed out. The fact that parts of the inode table are uninitialized is not a problem so long as the block group descriptors, which contain information regarding how much of the inode table has been initialized, have not been corrupted. However, if the block group checksums are not valid, e2fsck must scan the entire inode table, and the old, uninitialized data could potentially cause e2fsck to report false problems. Hence, it is important for the inode tables to be initialized as soon as possible. This commit adds this feature so that mke2fs can safely use the lazy inode table initialization feature to speed up formatting file systems. This is done via a new kernel thread called ext4lazyinit, which is created on demand and destroyed when it is no longer needed. There is only one thread for all ext4 filesystems in the system. When the first filesystem with the init_itable mount option is mounted, the ext4lazyinit thread is created, then the filesystem can register its request in the request list. This thread then walks through the list of requests, picking up scheduled requests and invoking ext4_init_inode_table(). The next schedule time for the request is computed by multiplying the time it took to zero out the last inode table by the wait multiplier, which can be set with the (init_itable=n) mount option (default is 10). We are doing this so we do not take the whole I/O bandwidth. When the thread is no longer necessary (request list is empty) it frees the appropriate structures and exits (and can be created later by another filesystem). We do not disturb regular inode allocations in any way; the allocator simply does not care whether the inode table is or is not zeroed. But when zeroing, we have to skip used inodes, obviously. Also we should prevent new inode allocations from the group while zeroing is under way. 
For that we take write alloc_sem lock in ext4_init_inode_table() and read alloc_sem in ext4_claim_inode(), so when we are unlucky and the allocator hits the group which is currently being zeroed, it just has to wait. This can be suppressed using the mount option noinit_itable. Signed-off-by: Lukas Czerner <lczerner@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r--Documentation/filesystems/ext4.txt14
-rw-r--r--fs/ext4/ext4.h40
-rw-r--r--fs/ext4/ialloc.c120
-rw-r--r--fs/ext4/super.c440
4 files changed, 611 insertions, 3 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index e1def1786e50..6ab9442d7eeb 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -353,6 +353,20 @@ noauto_da_alloc replacing existing files via patterns such as
353 system crashes before the delayed allocation 353 system crashes before the delayed allocation
354 blocks are forced to disk. 354 blocks are forced to disk.
355 355
356noinit_itable Do not initialize any uninitialized inode table
357 blocks in the background. This feature may be
358 used by installation CD's so that the install
359 process can complete as quickly as possible; the
360 inode table initialization process would then be
361 deferred until the next time the file system
362 is unmounted.
363
364init_itable=n The lazy itable init code will wait n times the
365 number of milliseconds it took to zero out the
366 previous block group's inode table. This
367 minimizes the impact on the system performance
368 while file system's inode table is being initialized.
369
356discard Controls whether ext4 should issue discard/TRIM 370discard Controls whether ext4 should issue discard/TRIM
357nodiscard(*) commands to the underlying block device when 371nodiscard(*) commands to the underlying block device when
358 blocks are freed. This is useful for SSD devices 372 blocks are freed. This is useful for SSD devices
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b364b9df09b3..0fe078d368d0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -890,6 +890,7 @@ struct ext4_inode_info {
890#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 890#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
891#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ 891#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
892#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ 892#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
893#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
893 894
894#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 895#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
895#define set_opt(o, opt) o |= EXT4_MOUNT_##opt 896#define set_opt(o, opt) o |= EXT4_MOUNT_##opt
@@ -1173,6 +1174,11 @@ struct ext4_sb_info {
1173 1174
1174 /* timer for periodic error stats printing */ 1175 /* timer for periodic error stats printing */
1175 struct timer_list s_err_report; 1176 struct timer_list s_err_report;
1177
1178 /* Lazy inode table initialization info */
1179 struct ext4_li_request *s_li_request;
1180 /* Wait multiplier for lazy initialization thread */
1181 unsigned int s_li_wait_mult;
1176}; 1182};
1177 1183
1178static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1184static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1537,6 +1543,38 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
1537extern struct proc_dir_entry *ext4_proc_root; 1543extern struct proc_dir_entry *ext4_proc_root;
1538 1544
1539/* 1545/*
1546 * Timeout and state flag for lazy initialization inode thread.
1547 */
1548#define EXT4_DEF_LI_WAIT_MULT 10
1549#define EXT4_DEF_LI_MAX_START_DELAY 5
1550#define EXT4_LAZYINIT_QUIT 0x0001
1551#define EXT4_LAZYINIT_RUNNING 0x0002
1552
1553/*
1554 * Lazy inode table initialization info
1555 */
1556struct ext4_lazy_init {
1557 unsigned long li_state;
1558
1559 wait_queue_head_t li_wait_daemon;
1560 wait_queue_head_t li_wait_task;
1561 struct timer_list li_timer;
1562 struct task_struct *li_task;
1563
1564 struct list_head li_request_list;
1565 struct mutex li_list_mtx;
1566};
1567
1568struct ext4_li_request {
1569 struct super_block *lr_super;
1570 struct ext4_sb_info *lr_sbi;
1571 ext4_group_t lr_next_group;
1572 struct list_head lr_request;
1573 unsigned long lr_next_sched;
1574 unsigned long lr_timeout;
1575};
1576
1577/*
1540 * Function prototypes 1578 * Function prototypes
1541 */ 1579 */
1542 1580
@@ -1611,6 +1649,8 @@ extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
1611 ext4_group_t group, 1649 ext4_group_t group,
1612 struct ext4_group_desc *desc); 1650 struct ext4_group_desc *desc);
1613extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); 1651extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
1652extern int ext4_init_inode_table(struct super_block *sb,
1653 ext4_group_t group, int barrier);
1614 1654
1615/* mballoc.c */ 1655/* mballoc.c */
1616extern long ext4_mb_stats; 1656extern long ext4_mb_stats;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 45853e0d1f21..e428f23215c0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -107,6 +107,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
107 desc = ext4_get_group_desc(sb, block_group, NULL); 107 desc = ext4_get_group_desc(sb, block_group, NULL);
108 if (!desc) 108 if (!desc)
109 return NULL; 109 return NULL;
110
110 bitmap_blk = ext4_inode_bitmap(sb, desc); 111 bitmap_blk = ext4_inode_bitmap(sb, desc);
111 bh = sb_getblk(sb, bitmap_blk); 112 bh = sb_getblk(sb, bitmap_blk);
112 if (unlikely(!bh)) { 113 if (unlikely(!bh)) {
@@ -123,6 +124,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
123 unlock_buffer(bh); 124 unlock_buffer(bh);
124 return bh; 125 return bh;
125 } 126 }
127
126 ext4_lock_group(sb, block_group); 128 ext4_lock_group(sb, block_group);
127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 129 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
128 ext4_init_inode_bitmap(sb, bh, block_group, desc); 130 ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -133,6 +135,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
133 return bh; 135 return bh;
134 } 136 }
135 ext4_unlock_group(sb, block_group); 137 ext4_unlock_group(sb, block_group);
138
136 if (buffer_uptodate(bh)) { 139 if (buffer_uptodate(bh)) {
137 /* 140 /*
138 * if not uninit if bh is uptodate, 141 * if not uninit if bh is uptodate,
@@ -712,8 +715,17 @@ static int ext4_claim_inode(struct super_block *sb,
712{ 715{
713 int free = 0, retval = 0, count; 716 int free = 0, retval = 0, count;
714 struct ext4_sb_info *sbi = EXT4_SB(sb); 717 struct ext4_sb_info *sbi = EXT4_SB(sb);
718 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
715 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); 719 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
716 720
721 /*
722 * We have to be sure that new inode allocation does not race with
723 * inode table initialization, because otherwise we may end up
724 * allocating and writing new inode right before sb_issue_zeroout
725 * takes place and overwriting our new inode with zeroes. So we
726 * take alloc_sem to prevent it.
727 */
728 down_read(&grp->alloc_sem);
717 ext4_lock_group(sb, group); 729 ext4_lock_group(sb, group);
718 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { 730 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
719 /* not a free inode */ 731 /* not a free inode */
@@ -724,6 +736,7 @@ static int ext4_claim_inode(struct super_block *sb,
724 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 736 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
725 ino > EXT4_INODES_PER_GROUP(sb)) { 737 ino > EXT4_INODES_PER_GROUP(sb)) {
726 ext4_unlock_group(sb, group); 738 ext4_unlock_group(sb, group);
739 up_read(&grp->alloc_sem);
727 ext4_error(sb, "reserved inode or inode > inodes count - " 740 ext4_error(sb, "reserved inode or inode > inodes count - "
728 "block_group = %u, inode=%lu", group, 741 "block_group = %u, inode=%lu", group,
729 ino + group * EXT4_INODES_PER_GROUP(sb)); 742 ino + group * EXT4_INODES_PER_GROUP(sb));
@@ -772,6 +785,7 @@ static int ext4_claim_inode(struct super_block *sb,
772 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 785 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
773err_ret: 786err_ret:
774 ext4_unlock_group(sb, group); 787 ext4_unlock_group(sb, group);
788 up_read(&grp->alloc_sem);
775 return retval; 789 return retval;
776} 790}
777 791
@@ -1205,3 +1219,109 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1205 } 1219 }
1206 return count; 1220 return count;
1207} 1221}
1222
1223/*
1224 * Zeroes not yet zeroed inode table - just write zeroes through the whole
1225 * inode table. Must be called without any spinlock held. The only place
1226 * where it is called from on active part of filesystem is ext4lazyinit
1227 * thread, so we do not need any special locks, however we have to prevent
1228 * inode allocation from the current group, so we take alloc_sem lock, to
1229 * block ext4_claim_inode until we are finished.
1230 */
1231extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1232 int barrier)
1233{
1234 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
1235 struct ext4_sb_info *sbi = EXT4_SB(sb);
1236 struct ext4_group_desc *gdp = NULL;
1237 struct buffer_head *group_desc_bh;
1238 handle_t *handle;
1239 ext4_fsblk_t blk;
1240 int num, ret = 0, used_blks = 0;
1241 unsigned long flags = BLKDEV_IFL_WAIT;
1242
1243 /* This should not happen, but just to be sure check this */
1244 if (sb->s_flags & MS_RDONLY) {
1245 ret = 1;
1246 goto out;
1247 }
1248
1249 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
1250 if (!gdp)
1251 goto out;
1252
1253 /*
1254 * We do not need to lock this, because we are the only one
1255 * handling this flag.
1256 */
1257 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
1258 goto out;
1259
1260 handle = ext4_journal_start_sb(sb, 1);
1261 if (IS_ERR(handle)) {
1262 ret = PTR_ERR(handle);
1263 goto out;
1264 }
1265
1266 down_write(&grp->alloc_sem);
1267 /*
1268 * If inode bitmap was already initialized there may be some
1269 * used inodes so we need to skip blocks with used inodes in
1270 * inode table.
1271 */
1272 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
1273 used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
1274 ext4_itable_unused_count(sb, gdp)),
1275 sbi->s_inodes_per_block);
1276
1277 blk = ext4_inode_table(sb, gdp) + used_blks;
1278 num = sbi->s_itb_per_group - used_blks;
1279
1280 BUFFER_TRACE(group_desc_bh, "get_write_access");
1281 ret = ext4_journal_get_write_access(handle,
1282 group_desc_bh);
1283 if (ret)
1284 goto err_out;
1285
1286 if (unlikely(num > EXT4_INODES_PER_GROUP(sb))) {
1287 ext4_error(sb, "Something is wrong with group %u\n"
1288 "Used itable blocks: %d"
1289 "Itable blocks per group: %lu\n",
1290 group, used_blks, sbi->s_itb_per_group);
1291 ret = 1;
1292 goto err_out;
1293 }
1294
1295 /*
1296 * Skip zeroout if the inode table is full. But we set the ZEROED
1297 * flag anyway, because obviously, when it is full it does not need
1298 * further zeroing.
1299 */
1300 if (unlikely(num == 0))
1301 goto skip_zeroout;
1302
1303 ext4_debug("going to zero out inode table in group %d\n",
1304 group);
1305 if (barrier)
1306 flags |= BLKDEV_IFL_BARRIER;
1307 ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS, flags);
1308 if (ret < 0)
1309 goto err_out;
1310
1311skip_zeroout:
1312 ext4_lock_group(sb, group);
1313 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
1314 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
1315 ext4_unlock_group(sb, group);
1316
1317 BUFFER_TRACE(group_desc_bh,
1318 "call ext4_handle_dirty_metadata");
1319 ret = ext4_handle_dirty_metadata(handle, NULL,
1320 group_desc_bh);
1321
1322err_out:
1323 up_write(&grp->alloc_sem);
1324 ext4_journal_stop(handle);
1325out:
1326 return ret;
1327}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 751997d2cefe..5066537e5a38 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -41,6 +41,9 @@
41#include <linux/crc16.h> 41#include <linux/crc16.h>
42#include <asm/uaccess.h> 42#include <asm/uaccess.h>
43 43
44#include <linux/kthread.h>
45#include <linux/freezer.h>
46
44#include "ext4.h" 47#include "ext4.h"
45#include "ext4_jbd2.h" 48#include "ext4_jbd2.h"
46#include "xattr.h" 49#include "xattr.h"
@@ -52,6 +55,8 @@
52 55
53struct proc_dir_entry *ext4_proc_root; 56struct proc_dir_entry *ext4_proc_root;
54static struct kset *ext4_kset; 57static struct kset *ext4_kset;
58struct ext4_lazy_init *ext4_li_info;
59struct mutex ext4_li_mtx;
55 60
56static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 61static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
57 unsigned long journal_devnum); 62 unsigned long journal_devnum);
@@ -70,6 +75,8 @@ static void ext4_write_super(struct super_block *sb);
70static int ext4_freeze(struct super_block *sb); 75static int ext4_freeze(struct super_block *sb);
71static int ext4_get_sb(struct file_system_type *fs_type, int flags, 76static int ext4_get_sb(struct file_system_type *fs_type, int flags,
72 const char *dev_name, void *data, struct vfsmount *mnt); 77 const char *dev_name, void *data, struct vfsmount *mnt);
78static void ext4_destroy_lazyinit_thread(void);
79static void ext4_unregister_li_request(struct super_block *sb);
73 80
74#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 81#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
75static struct file_system_type ext3_fs_type = { 82static struct file_system_type ext3_fs_type = {
@@ -720,6 +727,7 @@ static void ext4_put_super(struct super_block *sb)
720 } 727 }
721 728
722 del_timer(&sbi->s_err_report); 729 del_timer(&sbi->s_err_report);
730 ext4_unregister_li_request(sb);
723 ext4_release_system_zone(sb); 731 ext4_release_system_zone(sb);
724 ext4_mb_release(sb); 732 ext4_mb_release(sb);
725 ext4_ext_release(sb); 733 ext4_ext_release(sb);
@@ -1046,6 +1054,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1046 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) 1054 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
1047 seq_puts(seq, ",block_validity"); 1055 seq_puts(seq, ",block_validity");
1048 1056
1057 if (!test_opt(sb, INIT_INODE_TABLE))
1058 seq_puts(seq, ",noinit_inode_table");
1059 else if (sbi->s_li_wait_mult)
1060 seq_printf(seq, ",init_inode_table=%u",
1061 (unsigned) sbi->s_li_wait_mult);
1062
1049 ext4_show_quota_options(seq, sb); 1063 ext4_show_quota_options(seq, sb);
1050 1064
1051 return 0; 1065 return 0;
@@ -1220,6 +1234,7 @@ enum {
1220 Opt_inode_readahead_blks, Opt_journal_ioprio, 1234 Opt_inode_readahead_blks, Opt_journal_ioprio,
1221 Opt_dioread_nolock, Opt_dioread_lock, 1235 Opt_dioread_nolock, Opt_dioread_lock,
1222 Opt_discard, Opt_nodiscard, 1236 Opt_discard, Opt_nodiscard,
1237 Opt_init_inode_table, Opt_noinit_inode_table,
1223}; 1238};
1224 1239
1225static const match_table_t tokens = { 1240static const match_table_t tokens = {
@@ -1290,6 +1305,9 @@ static const match_table_t tokens = {
1290 {Opt_dioread_lock, "dioread_lock"}, 1305 {Opt_dioread_lock, "dioread_lock"},
1291 {Opt_discard, "discard"}, 1306 {Opt_discard, "discard"},
1292 {Opt_nodiscard, "nodiscard"}, 1307 {Opt_nodiscard, "nodiscard"},
1308 {Opt_init_inode_table, "init_itable=%u"},
1309 {Opt_init_inode_table, "init_itable"},
1310 {Opt_noinit_inode_table, "noinit_itable"},
1293 {Opt_err, NULL}, 1311 {Opt_err, NULL},
1294}; 1312};
1295 1313
@@ -1760,6 +1778,20 @@ set_qf_format:
1760 case Opt_dioread_lock: 1778 case Opt_dioread_lock:
1761 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); 1779 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
1762 break; 1780 break;
1781 case Opt_init_inode_table:
1782 set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
1783 if (args[0].from) {
1784 if (match_int(&args[0], &option))
1785 return 0;
1786 } else
1787 option = EXT4_DEF_LI_WAIT_MULT;
1788 if (option < 0)
1789 return 0;
1790 sbi->s_li_wait_mult = option;
1791 break;
1792 case Opt_noinit_inode_table:
1793 clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
1794 break;
1763 default: 1795 default:
1764 ext4_msg(sb, KERN_ERR, 1796 ext4_msg(sb, KERN_ERR,
1765 "Unrecognized mount option \"%s\" " 1797 "Unrecognized mount option \"%s\" "
@@ -1943,7 +1975,8 @@ int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
1943} 1975}
1944 1976
1945/* Called at mount-time, super-block is locked */ 1977/* Called at mount-time, super-block is locked */
1946static int ext4_check_descriptors(struct super_block *sb) 1978static int ext4_check_descriptors(struct super_block *sb,
1979 ext4_group_t *first_not_zeroed)
1947{ 1980{
1948 struct ext4_sb_info *sbi = EXT4_SB(sb); 1981 struct ext4_sb_info *sbi = EXT4_SB(sb);
1949 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); 1982 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -1952,7 +1985,7 @@ static int ext4_check_descriptors(struct super_block *sb)
1952 ext4_fsblk_t inode_bitmap; 1985 ext4_fsblk_t inode_bitmap;
1953 ext4_fsblk_t inode_table; 1986 ext4_fsblk_t inode_table;
1954 int flexbg_flag = 0; 1987 int flexbg_flag = 0;
1955 ext4_group_t i; 1988 ext4_group_t i, grp = sbi->s_groups_count;
1956 1989
1957 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 1990 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
1958 flexbg_flag = 1; 1991 flexbg_flag = 1;
@@ -1968,6 +2001,10 @@ static int ext4_check_descriptors(struct super_block *sb)
1968 last_block = first_block + 2001 last_block = first_block +
1969 (EXT4_BLOCKS_PER_GROUP(sb) - 1); 2002 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1970 2003
2004 if ((grp == sbi->s_groups_count) &&
2005 !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2006 grp = i;
2007
1971 block_bitmap = ext4_block_bitmap(sb, gdp); 2008 block_bitmap = ext4_block_bitmap(sb, gdp);
1972 if (block_bitmap < first_block || block_bitmap > last_block) { 2009 if (block_bitmap < first_block || block_bitmap > last_block) {
1973 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " 2010 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -2005,6 +2042,8 @@ static int ext4_check_descriptors(struct super_block *sb)
2005 if (!flexbg_flag) 2042 if (!flexbg_flag)
2006 first_block += EXT4_BLOCKS_PER_GROUP(sb); 2043 first_block += EXT4_BLOCKS_PER_GROUP(sb);
2007 } 2044 }
2045 if (NULL != first_not_zeroed)
2046 *first_not_zeroed = grp;
2008 2047
2009 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); 2048 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
2010 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); 2049 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
@@ -2543,6 +2582,378 @@ static void print_daily_error_info(unsigned long arg)
2543 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ 2582 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
2544} 2583}
2545 2584
/*
 * Timer callback for li_timer: @data carries the task_struct pointer of
 * the thread to wake up.
 */
static void ext4_lazyinode_timeout(unsigned long data)
{
	wake_up_process((struct task_struct *)data);
}
2590
2591/* Find next suitable group and run ext4_init_inode_table */
2592static int ext4_run_li_request(struct ext4_li_request *elr)
2593{
2594 struct ext4_group_desc *gdp = NULL;
2595 ext4_group_t group, ngroups;
2596 struct super_block *sb;
2597 unsigned long timeout = 0;
2598 int ret = 0;
2599
2600 sb = elr->lr_super;
2601 ngroups = EXT4_SB(sb)->s_groups_count;
2602
2603 for (group = elr->lr_next_group; group < ngroups; group++) {
2604 gdp = ext4_get_group_desc(sb, group, NULL);
2605 if (!gdp) {
2606 ret = 1;
2607 break;
2608 }
2609
2610 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2611 break;
2612 }
2613
2614 if (group == ngroups)
2615 ret = 1;
2616
2617 if (!ret) {
2618 timeout = jiffies;
2619 ret = ext4_init_inode_table(sb, group,
2620 elr->lr_timeout ? 0 : 1);
2621 if (elr->lr_timeout == 0) {
2622 timeout = jiffies - timeout;
2623 if (elr->lr_sbi->s_li_wait_mult)
2624 timeout *= elr->lr_sbi->s_li_wait_mult;
2625 else
2626 timeout *= 20;
2627 elr->lr_timeout = timeout;
2628 }
2629 elr->lr_next_sched = jiffies + elr->lr_timeout;
2630 elr->lr_next_group = group + 1;
2631 }
2632
2633 return ret;
2634}
2635
2636/*
2637 * Remove lr_request from the list_request and free the
2638 * request tructure. Should be called with li_list_mtx held
2639 */
2640static void ext4_remove_li_request(struct ext4_li_request *elr)
2641{
2642 struct ext4_sb_info *sbi;
2643
2644 if (!elr)
2645 return;
2646
2647 sbi = elr->lr_sbi;
2648
2649 list_del(&elr->lr_request);
2650 sbi->s_li_request = NULL;
2651 kfree(elr);
2652}
2653
2654static void ext4_unregister_li_request(struct super_block *sb)
2655{
2656 struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
2657
2658 if (!ext4_li_info)
2659 return;
2660
2661 mutex_lock(&ext4_li_info->li_list_mtx);
2662 ext4_remove_li_request(elr);
2663 mutex_unlock(&ext4_li_info->li_list_mtx);
2664}
2665
2666/*
2667 * This is the function where ext4lazyinit thread lives. It walks
2668 * through the request list searching for next scheduled filesystem.
2669 * When such a fs is found, run the lazy initialization request
2670 * (ext4_rn_li_request) and keep track of the time spend in this
2671 * function. Based on that time we compute next schedule time of
2672 * the request. When walking through the list is complete, compute
2673 * next waking time and put itself into sleep.
2674 */
2675static int ext4_lazyinit_thread(void *arg)
2676{
2677 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
2678 struct list_head *pos, *n;
2679 struct ext4_li_request *elr;
2680 unsigned long next_wakeup;
2681 DEFINE_WAIT(wait);
2682 int ret;
2683
2684 BUG_ON(NULL == eli);
2685
2686 eli->li_timer.data = (unsigned long)current;
2687 eli->li_timer.function = ext4_lazyinode_timeout;
2688
2689 eli->li_task = current;
2690 wake_up(&eli->li_wait_task);
2691
2692cont_thread:
2693 while (true) {
2694 next_wakeup = MAX_JIFFY_OFFSET;
2695
2696 mutex_lock(&eli->li_list_mtx);
2697 if (list_empty(&eli->li_request_list)) {
2698 mutex_unlock(&eli->li_list_mtx);
2699 goto exit_thread;
2700 }
2701
2702 list_for_each_safe(pos, n, &eli->li_request_list) {
2703 elr = list_entry(pos, struct ext4_li_request,
2704 lr_request);
2705
2706 if (time_after_eq(jiffies, elr->lr_next_sched))
2707 ret = ext4_run_li_request(elr);
2708
2709 if (ret) {
2710 ret = 0;
2711 ext4_remove_li_request(elr);
2712 continue;
2713 }
2714
2715 if (time_before(elr->lr_next_sched, next_wakeup))
2716 next_wakeup = elr->lr_next_sched;
2717 }
2718 mutex_unlock(&eli->li_list_mtx);
2719
2720 if (freezing(current))
2721 refrigerator();
2722
2723 if (time_after_eq(jiffies, next_wakeup)) {
2724 cond_resched();
2725 continue;
2726 }
2727
2728 eli->li_timer.expires = next_wakeup;
2729 add_timer(&eli->li_timer);
2730 prepare_to_wait(&eli->li_wait_daemon, &wait,
2731 TASK_INTERRUPTIBLE);
2732 if (time_before(jiffies, next_wakeup))
2733 schedule();
2734 finish_wait(&eli->li_wait_daemon, &wait);
2735 }
2736
2737exit_thread:
2738 /*
2739 * It looks like the request list is empty, but we need
2740 * to check it under the li_list_mtx lock, to prevent any
2741 * additions into it, and of course we should lock ext4_li_mtx
2742 * to atomically free the list and ext4_li_info, because at
2743 * this point another ext4 filesystem could be registering
2744 * new one.
2745 */
2746 mutex_lock(&ext4_li_mtx);
2747 mutex_lock(&eli->li_list_mtx);
2748 if (!list_empty(&eli->li_request_list)) {
2749 mutex_unlock(&eli->li_list_mtx);
2750 mutex_unlock(&ext4_li_mtx);
2751 goto cont_thread;
2752 }
2753 mutex_unlock(&eli->li_list_mtx);
2754 del_timer_sync(&ext4_li_info->li_timer);
2755 eli->li_task = NULL;
2756 wake_up(&eli->li_wait_task);
2757
2758 kfree(ext4_li_info);
2759 ext4_li_info = NULL;
2760 mutex_unlock(&ext4_li_mtx);
2761
2762 return 0;
2763}
2764
2765static void ext4_clear_request_list(void)
2766{
2767 struct list_head *pos, *n;
2768 struct ext4_li_request *elr;
2769
2770 mutex_lock(&ext4_li_info->li_list_mtx);
2771 if (list_empty(&ext4_li_info->li_request_list))
2772 return;
2773
2774 list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
2775 elr = list_entry(pos, struct ext4_li_request,
2776 lr_request);
2777 ext4_remove_li_request(elr);
2778 }
2779 mutex_unlock(&ext4_li_info->li_list_mtx);
2780}
2781
2782static int ext4_run_lazyinit_thread(void)
2783{
2784 struct task_struct *t;
2785
2786 t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
2787 if (IS_ERR(t)) {
2788 int err = PTR_ERR(t);
2789 ext4_clear_request_list();
2790 del_timer_sync(&ext4_li_info->li_timer);
2791 kfree(ext4_li_info);
2792 ext4_li_info = NULL;
2793 printk(KERN_CRIT "EXT4: error %d creating inode table "
2794 "initialization thread\n",
2795 err);
2796 return err;
2797 }
2798 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
2799
2800 wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
2801 return 0;
2802}
2803
2804/*
2805 * Check whether it make sense to run itable init. thread or not.
2806 * If there is at least one uninitialized inode table, return
2807 * corresponding group number, else the loop goes through all
2808 * groups and return total number of groups.
2809 */
2810static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
2811{
2812 ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
2813 struct ext4_group_desc *gdp = NULL;
2814
2815 for (group = 0; group < ngroups; group++) {
2816 gdp = ext4_get_group_desc(sb, group, NULL);
2817 if (!gdp)
2818 continue;
2819
2820 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2821 break;
2822 }
2823
2824 return group;
2825}
2826
2827static int ext4_li_info_new(void)
2828{
2829 struct ext4_lazy_init *eli = NULL;
2830
2831 eli = kzalloc(sizeof(*eli), GFP_KERNEL);
2832 if (!eli)
2833 return -ENOMEM;
2834
2835 eli->li_task = NULL;
2836 INIT_LIST_HEAD(&eli->li_request_list);
2837 mutex_init(&eli->li_list_mtx);
2838
2839 init_waitqueue_head(&eli->li_wait_daemon);
2840 init_waitqueue_head(&eli->li_wait_task);
2841 init_timer(&eli->li_timer);
2842 eli->li_state |= EXT4_LAZYINIT_QUIT;
2843
2844 ext4_li_info = eli;
2845
2846 return 0;
2847}
2848
2849static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
2850 ext4_group_t start)
2851{
2852 struct ext4_sb_info *sbi = EXT4_SB(sb);
2853 struct ext4_li_request *elr;
2854 unsigned long rnd;
2855
2856 elr = kzalloc(sizeof(*elr), GFP_KERNEL);
2857 if (!elr)
2858 return NULL;
2859
2860 elr->lr_super = sb;
2861 elr->lr_sbi = sbi;
2862 elr->lr_next_group = start;
2863
2864 /*
2865 * Randomize first schedule time of the request to
2866 * spread the inode table initialization requests
2867 * better.
2868 */
2869 get_random_bytes(&rnd, sizeof(rnd));
2870 elr->lr_next_sched = jiffies + (unsigned long)rnd %
2871 (EXT4_DEF_LI_MAX_START_DELAY * HZ);
2872
2873 return elr;
2874}
2875
2876static int ext4_register_li_request(struct super_block *sb,
2877 ext4_group_t first_not_zeroed)
2878{
2879 struct ext4_sb_info *sbi = EXT4_SB(sb);
2880 struct ext4_li_request *elr;
2881 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2882 int ret = 0;
2883
2884 if (sbi->s_li_request != NULL)
2885 goto out;
2886
2887 if (first_not_zeroed == ngroups ||
2888 (sb->s_flags & MS_RDONLY) ||
2889 !test_opt(sb, INIT_INODE_TABLE)) {
2890 sbi->s_li_request = NULL;
2891 goto out;
2892 }
2893
2894 if (first_not_zeroed == ngroups) {
2895 sbi->s_li_request = NULL;
2896 goto out;
2897 }
2898
2899 elr = ext4_li_request_new(sb, first_not_zeroed);
2900 if (!elr) {
2901 ret = -ENOMEM;
2902 goto out;
2903 }
2904
2905 mutex_lock(&ext4_li_mtx);
2906
2907 if (NULL == ext4_li_info) {
2908 ret = ext4_li_info_new();
2909 if (ret)
2910 goto out;
2911 }
2912
2913 mutex_lock(&ext4_li_info->li_list_mtx);
2914 list_add(&elr->lr_request, &ext4_li_info->li_request_list);
2915 mutex_unlock(&ext4_li_info->li_list_mtx);
2916
2917 sbi->s_li_request = elr;
2918
2919 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
2920 ret = ext4_run_lazyinit_thread();
2921 if (ret)
2922 goto out;
2923 }
2924
2925 mutex_unlock(&ext4_li_mtx);
2926
2927out:
2928 if (ret) {
2929 mutex_unlock(&ext4_li_mtx);
2930 kfree(elr);
2931 }
2932 return ret;
2933}
2934
2935/*
2936 * We do not need to lock anything since this is called on
2937 * module unload.
2938 */
2939static void ext4_destroy_lazyinit_thread(void)
2940{
2941 /*
2942 * If thread exited earlier
2943 * there's nothing to be done.
2944 */
2945 if (!ext4_li_info)
2946 return;
2947
2948 ext4_clear_request_list();
2949
2950 while (ext4_li_info->li_task) {
2951 wake_up(&ext4_li_info->li_wait_daemon);
2952 wait_event(ext4_li_info->li_wait_task,
2953 ext4_li_info->li_task == NULL);
2954 }
2955}
2956
2546static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2957static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2547 __releases(kernel_lock) 2958 __releases(kernel_lock)
2548 __acquires(kernel_lock) 2959 __acquires(kernel_lock)
@@ -2568,6 +2979,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2568 __u64 blocks_count; 2979 __u64 blocks_count;
2569 int err; 2980 int err;
2570 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 2981 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
2982 ext4_group_t first_not_zeroed;
2571 2983
2572 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 2984 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
2573 if (!sbi) 2985 if (!sbi)
@@ -2630,6 +3042,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2630 3042
2631 /* Set defaults before we parse the mount options */ 3043 /* Set defaults before we parse the mount options */
2632 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 3044 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3045 set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
2633 if (def_mount_opts & EXT4_DEFM_DEBUG) 3046 if (def_mount_opts & EXT4_DEFM_DEBUG)
2634 set_opt(sbi->s_mount_opt, DEBUG); 3047 set_opt(sbi->s_mount_opt, DEBUG);
2635 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { 3048 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
@@ -2909,7 +3322,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2909 goto failed_mount2; 3322 goto failed_mount2;
2910 } 3323 }
2911 } 3324 }
2912 if (!ext4_check_descriptors(sb)) { 3325 if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
2913 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); 3326 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
2914 goto failed_mount2; 3327 goto failed_mount2;
2915 } 3328 }
@@ -3130,6 +3543,10 @@ no_journal:
3130 goto failed_mount4; 3543 goto failed_mount4;
3131 } 3544 }
3132 3545
3546 err = ext4_register_li_request(sb, first_not_zeroed);
3547 if (err)
3548 goto failed_mount4;
3549
3133 sbi->s_kobj.kset = ext4_kset; 3550 sbi->s_kobj.kset = ext4_kset;
3134 init_completion(&sbi->s_kobj_unregister); 3551 init_completion(&sbi->s_kobj_unregister);
3135 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, 3552 err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
@@ -3847,6 +4264,19 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3847 enable_quota = 1; 4264 enable_quota = 1;
3848 } 4265 }
3849 } 4266 }
4267
4268 /*
4269 * Reinitialize lazy itable initialization thread based on
4270 * current settings
4271 */
4272 if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
4273 ext4_unregister_li_request(sb);
4274 else {
4275 ext4_group_t first_not_zeroed;
4276 first_not_zeroed = ext4_has_uninit_itable(sb);
4277 ext4_register_li_request(sb, first_not_zeroed);
4278 }
4279
3850 ext4_setup_system_zone(sb); 4280 ext4_setup_system_zone(sb);
3851 if (sbi->s_journal == NULL) 4281 if (sbi->s_journal == NULL)
3852 ext4_commit_super(sb, 1); 4282 ext4_commit_super(sb, 1);
@@ -4317,6 +4747,9 @@ static int __init init_ext4_fs(void)
4317 err = register_filesystem(&ext4_fs_type); 4747 err = register_filesystem(&ext4_fs_type);
4318 if (err) 4748 if (err)
4319 goto out; 4749 goto out;
4750
4751 ext4_li_info = NULL;
4752 mutex_init(&ext4_li_mtx);
4320 return 0; 4753 return 0;
4321out: 4754out:
4322 unregister_as_ext2(); 4755 unregister_as_ext2();
@@ -4336,6 +4769,7 @@ out4:
4336 4769
4337static void __exit exit_ext4_fs(void) 4770static void __exit exit_ext4_fs(void)
4338{ 4771{
4772 ext4_destroy_lazyinit_thread();
4339 unregister_as_ext2(); 4773 unregister_as_ext2();
4340 unregister_as_ext3(); 4774 unregister_as_ext3();
4341 unregister_filesystem(&ext4_fs_type); 4775 unregister_filesystem(&ext4_fs_type);