aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/balloc.c
diff options
context:
space:
mode:
authorAndreas Dilger <adilger@clusterfs.com>2007-10-16 18:38:25 -0400
committerTheodore Ts'o <tytso@mit.edu>2007-10-17 18:50:00 -0400
commit717d50e4971b81b96c0199c91cdf0039a8cb181a (patch)
treea8d68edbc1f064c76cbfee206e093d2c86c80ba0 /fs/ext4/balloc.c
parent4074fe3736b1a43431dff870bf9055ac5dcf3f03 (diff)
Ext4: Uninitialized Block Groups
In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Diffstat (limited to 'fs/ext4/balloc.c')
-rw-r--r--fs/ext4/balloc.c112
1 files changed, 109 insertions, 3 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index b74bf4368441..5927687b3e79 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -20,6 +20,7 @@
20#include <linux/quotaops.h> 20#include <linux/quotaops.h>
21#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
22 22
23#include "group.h"
23/* 24/*
24 * balloc.c contains the blocks allocation and deallocation routines 25 * balloc.c contains the blocks allocation and deallocation routines
25 */ 26 */
@@ -42,6 +43,94 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
42 43
43} 44}
44 45
46/* Initializes an uninitialized block bitmap if given, and returns the
47 * number of blocks free in the group. */
48unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
49 int block_group, struct ext4_group_desc *gdp)
50{
51 unsigned long start;
52 int bit, bit_max;
53 unsigned free_blocks, group_blocks;
54 struct ext4_sb_info *sbi = EXT4_SB(sb);
55
56 if (bh) {
57 J_ASSERT_BH(bh, buffer_locked(bh));
58
59 /* If checksum is bad mark all blocks used to prevent allocation
60 * essentially implementing a per-group read-only flag. */
61 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
62 ext4_error(sb, __FUNCTION__,
63 "Checksum bad for group %u\n", block_group);
64 gdp->bg_free_blocks_count = 0;
65 gdp->bg_free_inodes_count = 0;
66 gdp->bg_itable_unused = 0;
67 memset(bh->b_data, 0xff, sb->s_blocksize);
68 return 0;
69 }
70 memset(bh->b_data, 0, sb->s_blocksize);
71 }
72
73 /* Check for superblock and gdt backups in this group */
74 bit_max = ext4_bg_has_super(sb, block_group);
75
76 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
77 block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
78 sbi->s_desc_per_block) {
79 if (bit_max) {
80 bit_max += ext4_bg_num_gdb(sb, block_group);
81 bit_max +=
82 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
83 }
84 } else { /* For META_BG_BLOCK_GROUPS */
85 int group_rel = (block_group -
86 le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
87 EXT4_DESC_PER_BLOCK(sb);
88 if (group_rel == 0 || group_rel == 1 ||
89 (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
90 bit_max += 1;
91 }
92
93 if (block_group == sbi->s_groups_count - 1) {
94 /*
95 * Even though mke2fs always initialize first and last group
96 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
97 * to make sure we calculate the right free blocks
98 */
99 group_blocks = ext4_blocks_count(sbi->s_es) -
100 le32_to_cpu(sbi->s_es->s_first_data_block) -
101 (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1));
102 } else {
103 group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
104 }
105
106 free_blocks = group_blocks - bit_max;
107
108 if (bh) {
109 for (bit = 0; bit < bit_max; bit++)
110 ext4_set_bit(bit, bh->b_data);
111
112 start = block_group * EXT4_BLOCKS_PER_GROUP(sb) +
113 le32_to_cpu(sbi->s_es->s_first_data_block);
114
115 /* Set bits for block and inode bitmaps, and inode table */
116 ext4_set_bit(ext4_block_bitmap(sb, gdp) - start, bh->b_data);
117 ext4_set_bit(ext4_inode_bitmap(sb, gdp) - start, bh->b_data);
118 for (bit = le32_to_cpu(gdp->bg_inode_table) - start,
119 bit_max = bit + sbi->s_itb_per_group; bit < bit_max; bit++)
120 ext4_set_bit(bit, bh->b_data);
121
122 /*
123 * Also if the number of blocks within the group is
124 * less than the blocksize * 8 ( which is the size
125 * of bitmap ), set rest of the block bitmap to 1
126 */
127 mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
128 }
129
130 return free_blocks - sbi->s_itb_per_group - 2;
131}
132
133
45/* 134/*
46 * The free blocks are managed by bitmaps. A file system contains several 135 * The free blocks are managed by bitmaps. A file system contains several
47 * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap 136 * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
@@ -119,7 +208,7 @@ block_in_use(ext4_fsblk_t block, struct super_block *sb, unsigned char *map)
119 * 208 *
120 * Return buffer_head on success or NULL in case of failure. 209 * Return buffer_head on success or NULL in case of failure.
121 */ 210 */
122static struct buffer_head * 211struct buffer_head *
123read_block_bitmap(struct super_block *sb, unsigned int block_group) 212read_block_bitmap(struct super_block *sb, unsigned int block_group)
124{ 213{
125 int i; 214 int i;
@@ -127,11 +216,24 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
127 struct buffer_head * bh = NULL; 216 struct buffer_head * bh = NULL;
128 ext4_fsblk_t bitmap_blk; 217 ext4_fsblk_t bitmap_blk;
129 218
130 desc = ext4_get_group_desc (sb, block_group, NULL); 219 desc = ext4_get_group_desc(sb, block_group, NULL);
131 if (!desc) 220 if (!desc)
132 return NULL; 221 return NULL;
133 bitmap_blk = ext4_block_bitmap(sb, desc); 222 bitmap_blk = ext4_block_bitmap(sb, desc);
134 bh = sb_bread(sb, bitmap_blk); 223 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
224 bh = sb_getblk(sb, bitmap_blk);
225 if (!buffer_uptodate(bh)) {
226 lock_buffer(bh);
227 if (!buffer_uptodate(bh)) {
228 ext4_init_block_bitmap(sb, bh, block_group,
229 desc);
230 set_buffer_uptodate(bh);
231 }
232 unlock_buffer(bh);
233 }
234 } else {
235 bh = sb_bread(sb, bitmap_blk);
236 }
135 if (!bh) 237 if (!bh)
136 ext4_error (sb, __FUNCTION__, 238 ext4_error (sb, __FUNCTION__,
137 "Cannot read block bitmap - " 239 "Cannot read block bitmap - "
@@ -627,6 +729,7 @@ do_more:
627 desc->bg_free_blocks_count = 729 desc->bg_free_blocks_count =
628 cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) + 730 cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
629 group_freed); 731 group_freed);
732 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
630 spin_unlock(sb_bgl_lock(sbi, block_group)); 733 spin_unlock(sb_bgl_lock(sbi, block_group));
631 percpu_counter_add(&sbi->s_freeblocks_counter, count); 734 percpu_counter_add(&sbi->s_freeblocks_counter, count);
632 735
@@ -1685,8 +1788,11 @@ allocated:
1685 ret_block, goal_hits, goal_attempts); 1788 ret_block, goal_hits, goal_attempts);
1686 1789
1687 spin_lock(sb_bgl_lock(sbi, group_no)); 1790 spin_lock(sb_bgl_lock(sbi, group_no));
1791 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1792 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
1688 gdp->bg_free_blocks_count = 1793 gdp->bg_free_blocks_count =
1689 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num); 1794 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
1795 gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
1690 spin_unlock(sb_bgl_lock(sbi, group_no)); 1796 spin_unlock(sb_bgl_lock(sbi, group_no));
1691 percpu_counter_sub(&sbi->s_freeblocks_counter, num); 1797 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1692 1798