aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/ialloc.c
diff options
context:
space:
mode:
authorAndreas Dilger <adilger@clusterfs.com>2007-10-16 18:38:25 -0400
committerTheodore Ts'o <tytso@mit.edu>2007-10-17 18:50:00 -0400
commit717d50e4971b81b96c0199c91cdf0039a8cb181a (patch)
treea8d68edbc1f064c76cbfee206e093d2c86c80ba0 /fs/ext4/ialloc.c
parent4074fe3736b1a43431dff870bf9055ac5dcf3f03 (diff)
Ext4: Uninitialized Block Groups
In pass1 of e2fsck, every inode table in the filesystem is scanned and checked, regardless of whether it is in use. This is the most time consuming part of the filesystem check. The uninitialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/<device> -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesystem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users, with a performance improvement of 2-20 times depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. 
__le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Diffstat (limited to 'fs/ext4/ialloc.c')
-rw-r--r--fs/ext4/ialloc.c146
1 files changed, 137 insertions, 9 deletions
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 38e9a0a705df..c61f37fd3f05 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -28,6 +28,7 @@
28 28
29#include "xattr.h" 29#include "xattr.h"
30#include "acl.h" 30#include "acl.h"
31#include "group.h"
31 32
32/* 33/*
33 * ialloc.c contains the inodes allocation and deallocation routines 34 * ialloc.c contains the inodes allocation and deallocation routines
@@ -43,6 +44,52 @@
43 * the free blocks count in the block. 44 * the free blocks count in the block.
44 */ 45 */
45 46
47/*
48 * To avoid calling the atomic setbit hundreds or thousands of times, we only
49 * need to use it within a single byte (to ensure we get endianness right).
50 * We can use memset for the rest of the bitmap as there are no other users.
51 */
52void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
53{
54 int i;
55
56 if (start_bit >= end_bit)
57 return;
58
59 ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
60 for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
61 ext4_set_bit(i, bitmap);
62 if (i < end_bit)
63 memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
64}
65
66/* Initializes an uninitialized inode bitmap */
67unsigned ext4_init_inode_bitmap(struct super_block *sb,
68 struct buffer_head *bh, int block_group,
69 struct ext4_group_desc *gdp)
70{
71 struct ext4_sb_info *sbi = EXT4_SB(sb);
72
73 J_ASSERT_BH(bh, buffer_locked(bh));
74
75 /* If checksum is bad mark all blocks and inodes use to prevent
76 * allocation, essentially implementing a per-group read-only flag. */
77 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
78 ext4_error(sb, __FUNCTION__, "Checksum bad for group %u\n",
79 block_group);
80 gdp->bg_free_blocks_count = 0;
81 gdp->bg_free_inodes_count = 0;
82 gdp->bg_itable_unused = 0;
83 memset(bh->b_data, 0xff, sb->s_blocksize);
84 return 0;
85 }
86
87 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
88 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
89 bh->b_data);
90
91 return EXT4_INODES_PER_GROUP(sb);
92}
46 93
47/* 94/*
48 * Read the inode allocation bitmap for a given block_group, reading 95 * Read the inode allocation bitmap for a given block_group, reading
@@ -59,8 +106,20 @@ read_inode_bitmap(struct super_block * sb, unsigned long block_group)
59 desc = ext4_get_group_desc(sb, block_group, NULL); 106 desc = ext4_get_group_desc(sb, block_group, NULL);
60 if (!desc) 107 if (!desc)
61 goto error_out; 108 goto error_out;
62 109 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
63 bh = sb_bread(sb, ext4_inode_bitmap(sb, desc)); 110 bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc));
111 if (!buffer_uptodate(bh)) {
112 lock_buffer(bh);
113 if (!buffer_uptodate(bh)) {
114 ext4_init_inode_bitmap(sb, bh, block_group,
115 desc);
116 set_buffer_uptodate(bh);
117 }
118 unlock_buffer(bh);
119 }
120 } else {
121 bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
122 }
64 if (!bh) 123 if (!bh)
65 ext4_error(sb, "read_inode_bitmap", 124 ext4_error(sb, "read_inode_bitmap",
66 "Cannot read inode bitmap - " 125 "Cannot read inode bitmap - "
@@ -169,6 +228,8 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
169 if (is_directory) 228 if (is_directory)
170 gdp->bg_used_dirs_count = cpu_to_le16( 229 gdp->bg_used_dirs_count = cpu_to_le16(
171 le16_to_cpu(gdp->bg_used_dirs_count) - 1); 230 le16_to_cpu(gdp->bg_used_dirs_count) - 1);
231 gdp->bg_checksum = ext4_group_desc_csum(sbi,
232 block_group, gdp);
172 spin_unlock(sb_bgl_lock(sbi, block_group)); 233 spin_unlock(sb_bgl_lock(sbi, block_group));
173 percpu_counter_inc(&sbi->s_freeinodes_counter); 234 percpu_counter_inc(&sbi->s_freeinodes_counter);
174 if (is_directory) 235 if (is_directory)
@@ -435,7 +496,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
435 struct ext4_sb_info *sbi; 496 struct ext4_sb_info *sbi;
436 int err = 0; 497 int err = 0;
437 struct inode *ret; 498 struct inode *ret;
438 int i; 499 int i, free = 0;
439 500
440 /* Cannot create files in a deleted directory */ 501 /* Cannot create files in a deleted directory */
441 if (!dir || !dir->i_nlink) 502 if (!dir || !dir->i_nlink)
@@ -517,11 +578,13 @@ repeat_in_this_group:
517 goto out; 578 goto out;
518 579
519got: 580got:
520 ino += group * EXT4_INODES_PER_GROUP(sb) + 1; 581 ino++;
521 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 582 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
522 ext4_error (sb, "ext4_new_inode", 583 ino > EXT4_INODES_PER_GROUP(sb)) {
523 "reserved inode or inode > inodes count - " 584 ext4_error(sb, __FUNCTION__,
524 "block_group = %d, inode=%lu", group, ino); 585 "reserved inode or inode > inodes count - "
586 "block_group = %d, inode=%lu", group,
587 ino + group * EXT4_INODES_PER_GROUP(sb));
525 err = -EIO; 588 err = -EIO;
526 goto fail; 589 goto fail;
527 } 590 }
@@ -529,13 +592,78 @@ got:
529 BUFFER_TRACE(bh2, "get_write_access"); 592 BUFFER_TRACE(bh2, "get_write_access");
530 err = ext4_journal_get_write_access(handle, bh2); 593 err = ext4_journal_get_write_access(handle, bh2);
531 if (err) goto fail; 594 if (err) goto fail;
595
596 /* We may have to initialize the block bitmap if it isn't already */
597 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
598 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
599 struct buffer_head *block_bh = read_block_bitmap(sb, group);
600
601 BUFFER_TRACE(block_bh, "get block bitmap access");
602 err = ext4_journal_get_write_access(handle, block_bh);
603 if (err) {
604 brelse(block_bh);
605 goto fail;
606 }
607
608 free = 0;
609 spin_lock(sb_bgl_lock(sbi, group));
610 /* recheck and clear flag under lock if we still need to */
611 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
612 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
613 free = ext4_free_blocks_after_init(sb, group, gdp);
614 gdp->bg_free_blocks_count = cpu_to_le16(free);
615 }
616 spin_unlock(sb_bgl_lock(sbi, group));
617
618 /* Don't need to dirty bitmap block if we didn't change it */
619 if (free) {
620 BUFFER_TRACE(block_bh, "dirty block bitmap");
621 err = ext4_journal_dirty_metadata(handle, block_bh);
622 }
623
624 brelse(block_bh);
625 if (err)
626 goto fail;
627 }
628
532 spin_lock(sb_bgl_lock(sbi, group)); 629 spin_lock(sb_bgl_lock(sbi, group));
630 /* If we didn't allocate from within the initialized part of the inode
631 * table then we need to initialize up to this inode. */
632 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
633 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
634 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
635
636 /* When marking the block group with
637 * ~EXT4_BG_INODE_UNINIT we don't want to depend
638 * on the value of bg_itable_unsed even though
639 * mke2fs could have initialized the same for us.
640 * Instead we calculated the value below
641 */
642
643 free = 0;
644 } else {
645 free = EXT4_INODES_PER_GROUP(sb) -
646 le16_to_cpu(gdp->bg_itable_unused);
647 }
648
649 /*
650 * Check the relative inode number against the last used
651 * relative inode number in this group. if it is greater
652 * we need to update the bg_itable_unused count
653 *
654 */
655 if (ino > free)
656 gdp->bg_itable_unused =
657 cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
658 }
659
533 gdp->bg_free_inodes_count = 660 gdp->bg_free_inodes_count =
534 cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); 661 cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
535 if (S_ISDIR(mode)) { 662 if (S_ISDIR(mode)) {
536 gdp->bg_used_dirs_count = 663 gdp->bg_used_dirs_count =
537 cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); 664 cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
538 } 665 }
666 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
539 spin_unlock(sb_bgl_lock(sbi, group)); 667 spin_unlock(sb_bgl_lock(sbi, group));
540 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); 668 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
541 err = ext4_journal_dirty_metadata(handle, bh2); 669 err = ext4_journal_dirty_metadata(handle, bh2);
@@ -557,7 +685,7 @@ got:
557 inode->i_gid = current->fsgid; 685 inode->i_gid = current->fsgid;
558 inode->i_mode = mode; 686 inode->i_mode = mode;
559 687
560 inode->i_ino = ino; 688 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
561 /* This is the optimal IO size (for stat), not the fs block size */ 689 /* This is the optimal IO size (for stat), not the fs block size */
562 inode->i_blocks = 0; 690 inode->i_blocks = 0;
563 inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = 691 inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =