aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndreas Dilger <adilger@clusterfs.com>2007-10-16 18:38:25 -0400
committerTheodore Ts'o <tytso@mit.edu>2007-10-17 18:50:00 -0400
commit717d50e4971b81b96c0199c91cdf0039a8cb181a (patch)
treea8d68edbc1f064c76cbfee206e093d2c86c80ba0
parent4074fe3736b1a43431dff870bf9055ac5dcf3f03 (diff)
Ext4: Uninitialized Block Groups
In pass1 of e2fsck, every inode table in the filesystem is scanned and checked, regardless of whether it is in use. This is the most time consuming part of the filesystem check. The uninitialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option: mke2fs /dev/ -O uninit_groups. A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesystem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users, with a performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to the group descriptor as a part of the uninitialized group patch. 
__le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip the list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure the group descriptor is not corrupt. We add a checksum to the group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/ext4/balloc.c112
-rw-r--r--fs/ext4/group.h27
-rw-r--r--fs/ext4/ialloc.c146
-rw-r--r--fs/ext4/resize.c21
-rw-r--r--fs/ext4/super.c47
-rw-r--r--include/linux/ext4_fs.h16
7 files changed, 335 insertions, 35 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index d8062745716a..e31f3691b151 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -140,6 +140,7 @@ config EXT4DEV_FS
140 tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)" 140 tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)"
141 depends on EXPERIMENTAL 141 depends on EXPERIMENTAL
142 select JBD2 142 select JBD2
143 select CRC16
143 help 144 help
144 Ext4dev is a predecessor filesystem of the next generation 145 Ext4dev is a predecessor filesystem of the next generation
145 extended fs ext4, based on ext3 filesystem code. It will be 146 extended fs ext4, based on ext3 filesystem code. It will be
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index b74bf4368441..5927687b3e79 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -20,6 +20,7 @@
20#include <linux/quotaops.h> 20#include <linux/quotaops.h>
21#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
22 22
23#include "group.h"
23/* 24/*
24 * balloc.c contains the blocks allocation and deallocation routines 25 * balloc.c contains the blocks allocation and deallocation routines
25 */ 26 */
@@ -42,6 +43,94 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
42 43
43} 44}
44 45
46/* Initializes an uninitialized block bitmap if given, and returns the
47 * number of blocks free in the group. */
48unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
49 int block_group, struct ext4_group_desc *gdp)
50{
51 unsigned long start;
52 int bit, bit_max;
53 unsigned free_blocks, group_blocks;
54 struct ext4_sb_info *sbi = EXT4_SB(sb);
55
56 if (bh) {
57 J_ASSERT_BH(bh, buffer_locked(bh));
58
59 /* If checksum is bad mark all blocks used to prevent allocation
60 * essentially implementing a per-group read-only flag. */
61 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
62 ext4_error(sb, __FUNCTION__,
63 "Checksum bad for group %u\n", block_group);
64 gdp->bg_free_blocks_count = 0;
65 gdp->bg_free_inodes_count = 0;
66 gdp->bg_itable_unused = 0;
67 memset(bh->b_data, 0xff, sb->s_blocksize);
68 return 0;
69 }
70 memset(bh->b_data, 0, sb->s_blocksize);
71 }
72
73 /* Check for superblock and gdt backups in this group */
74 bit_max = ext4_bg_has_super(sb, block_group);
75
76 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
77 block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
78 sbi->s_desc_per_block) {
79 if (bit_max) {
80 bit_max += ext4_bg_num_gdb(sb, block_group);
81 bit_max +=
82 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
83 }
84 } else { /* For META_BG_BLOCK_GROUPS */
85 int group_rel = (block_group -
86 le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
87 EXT4_DESC_PER_BLOCK(sb);
88 if (group_rel == 0 || group_rel == 1 ||
89 (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
90 bit_max += 1;
91 }
92
93 if (block_group == sbi->s_groups_count - 1) {
94 /*
95 * Even though mke2fs always initialize first and last group
96 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
97 * to make sure we calculate the right free blocks
98 */
99 group_blocks = ext4_blocks_count(sbi->s_es) -
100 le32_to_cpu(sbi->s_es->s_first_data_block) -
101 (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1));
102 } else {
103 group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
104 }
105
106 free_blocks = group_blocks - bit_max;
107
108 if (bh) {
109 for (bit = 0; bit < bit_max; bit++)
110 ext4_set_bit(bit, bh->b_data);
111
112 start = block_group * EXT4_BLOCKS_PER_GROUP(sb) +
113 le32_to_cpu(sbi->s_es->s_first_data_block);
114
115 /* Set bits for block and inode bitmaps, and inode table */
116 ext4_set_bit(ext4_block_bitmap(sb, gdp) - start, bh->b_data);
117 ext4_set_bit(ext4_inode_bitmap(sb, gdp) - start, bh->b_data);
118 for (bit = le32_to_cpu(gdp->bg_inode_table) - start,
119 bit_max = bit + sbi->s_itb_per_group; bit < bit_max; bit++)
120 ext4_set_bit(bit, bh->b_data);
121
122 /*
123 * Also if the number of blocks within the group is
124 * less than the blocksize * 8 ( which is the size
125 * of bitmap ), set rest of the block bitmap to 1
126 */
127 mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
128 }
129
130 return free_blocks - sbi->s_itb_per_group - 2;
131}
132
133
45/* 134/*
46 * The free blocks are managed by bitmaps. A file system contains several 135 * The free blocks are managed by bitmaps. A file system contains several
47 * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap 136 * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
@@ -119,7 +208,7 @@ block_in_use(ext4_fsblk_t block, struct super_block *sb, unsigned char *map)
119 * 208 *
120 * Return buffer_head on success or NULL in case of failure. 209 * Return buffer_head on success or NULL in case of failure.
121 */ 210 */
122static struct buffer_head * 211struct buffer_head *
123read_block_bitmap(struct super_block *sb, unsigned int block_group) 212read_block_bitmap(struct super_block *sb, unsigned int block_group)
124{ 213{
125 int i; 214 int i;
@@ -127,11 +216,24 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
127 struct buffer_head * bh = NULL; 216 struct buffer_head * bh = NULL;
128 ext4_fsblk_t bitmap_blk; 217 ext4_fsblk_t bitmap_blk;
129 218
130 desc = ext4_get_group_desc (sb, block_group, NULL); 219 desc = ext4_get_group_desc(sb, block_group, NULL);
131 if (!desc) 220 if (!desc)
132 return NULL; 221 return NULL;
133 bitmap_blk = ext4_block_bitmap(sb, desc); 222 bitmap_blk = ext4_block_bitmap(sb, desc);
134 bh = sb_bread(sb, bitmap_blk); 223 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
224 bh = sb_getblk(sb, bitmap_blk);
225 if (!buffer_uptodate(bh)) {
226 lock_buffer(bh);
227 if (!buffer_uptodate(bh)) {
228 ext4_init_block_bitmap(sb, bh, block_group,
229 desc);
230 set_buffer_uptodate(bh);
231 }
232 unlock_buffer(bh);
233 }
234 } else {
235 bh = sb_bread(sb, bitmap_blk);
236 }
135 if (!bh) 237 if (!bh)
136 ext4_error (sb, __FUNCTION__, 238 ext4_error (sb, __FUNCTION__,
137 "Cannot read block bitmap - " 239 "Cannot read block bitmap - "
@@ -627,6 +729,7 @@ do_more:
627 desc->bg_free_blocks_count = 729 desc->bg_free_blocks_count =
628 cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) + 730 cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
629 group_freed); 731 group_freed);
732 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
630 spin_unlock(sb_bgl_lock(sbi, block_group)); 733 spin_unlock(sb_bgl_lock(sbi, block_group));
631 percpu_counter_add(&sbi->s_freeblocks_counter, count); 734 percpu_counter_add(&sbi->s_freeblocks_counter, count);
632 735
@@ -1685,8 +1788,11 @@ allocated:
1685 ret_block, goal_hits, goal_attempts); 1788 ret_block, goal_hits, goal_attempts);
1686 1789
1687 spin_lock(sb_bgl_lock(sbi, group_no)); 1790 spin_lock(sb_bgl_lock(sbi, group_no));
1791 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1792 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
1688 gdp->bg_free_blocks_count = 1793 gdp->bg_free_blocks_count =
1689 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num); 1794 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
1795 gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
1690 spin_unlock(sb_bgl_lock(sbi, group_no)); 1796 spin_unlock(sb_bgl_lock(sbi, group_no));
1691 percpu_counter_sub(&sbi->s_freeblocks_counter, num); 1797 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1692 1798
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
new file mode 100644
index 000000000000..1577910bb58b
--- /dev/null
+++ b/fs/ext4/group.h
@@ -0,0 +1,27 @@
1/*
2 * linux/fs/ext4/group.h
3 *
4 * Copyright (C) 2007 Cluster File Systems, Inc
5 *
6 * Author: Andreas Dilger <adilger@clusterfs.com>
7 */
8
9#ifndef _LINUX_EXT4_GROUP_H
10#define _LINUX_EXT4_GROUP_H
11
12extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
13 struct ext4_group_desc *gdp);
14extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
15 struct ext4_group_desc *gdp);
16struct buffer_head *read_block_bitmap(struct super_block *sb,
17 unsigned int block_group);
18extern unsigned ext4_init_block_bitmap(struct super_block *sb,
19 struct buffer_head *bh, int group,
20 struct ext4_group_desc *desc);
21#define ext4_free_blocks_after_init(sb, group, desc) \
22 ext4_init_block_bitmap(sb, NULL, group, desc)
23extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
24 struct buffer_head *bh, int group,
25 struct ext4_group_desc *desc);
26extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
27#endif /* _LINUX_EXT4_GROUP_H */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 38e9a0a705df..c61f37fd3f05 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -28,6 +28,7 @@
28 28
29#include "xattr.h" 29#include "xattr.h"
30#include "acl.h" 30#include "acl.h"
31#include "group.h"
31 32
32/* 33/*
33 * ialloc.c contains the inodes allocation and deallocation routines 34 * ialloc.c contains the inodes allocation and deallocation routines
@@ -43,6 +44,52 @@
43 * the free blocks count in the block. 44 * the free blocks count in the block.
44 */ 45 */
45 46
47/*
48 * To avoid calling the atomic setbit hundreds or thousands of times, we only
49 * need to use it within a single byte (to ensure we get endianness right).
50 * We can use memset for the rest of the bitmap as there are no other users.
51 */
52void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
53{
54 int i;
55
56 if (start_bit >= end_bit)
57 return;
58
59 ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
60 for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
61 ext4_set_bit(i, bitmap);
62 if (i < end_bit)
63 memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
64}
65
66/* Initializes an uninitialized inode bitmap */
67unsigned ext4_init_inode_bitmap(struct super_block *sb,
68 struct buffer_head *bh, int block_group,
69 struct ext4_group_desc *gdp)
70{
71 struct ext4_sb_info *sbi = EXT4_SB(sb);
72
73 J_ASSERT_BH(bh, buffer_locked(bh));
74
75 /* If checksum is bad mark all blocks and inodes use to prevent
76 * allocation, essentially implementing a per-group read-only flag. */
77 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
78 ext4_error(sb, __FUNCTION__, "Checksum bad for group %u\n",
79 block_group);
80 gdp->bg_free_blocks_count = 0;
81 gdp->bg_free_inodes_count = 0;
82 gdp->bg_itable_unused = 0;
83 memset(bh->b_data, 0xff, sb->s_blocksize);
84 return 0;
85 }
86
87 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
88 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
89 bh->b_data);
90
91 return EXT4_INODES_PER_GROUP(sb);
92}
46 93
47/* 94/*
48 * Read the inode allocation bitmap for a given block_group, reading 95 * Read the inode allocation bitmap for a given block_group, reading
@@ -59,8 +106,20 @@ read_inode_bitmap(struct super_block * sb, unsigned long block_group)
59 desc = ext4_get_group_desc(sb, block_group, NULL); 106 desc = ext4_get_group_desc(sb, block_group, NULL);
60 if (!desc) 107 if (!desc)
61 goto error_out; 108 goto error_out;
62 109 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
63 bh = sb_bread(sb, ext4_inode_bitmap(sb, desc)); 110 bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc));
111 if (!buffer_uptodate(bh)) {
112 lock_buffer(bh);
113 if (!buffer_uptodate(bh)) {
114 ext4_init_inode_bitmap(sb, bh, block_group,
115 desc);
116 set_buffer_uptodate(bh);
117 }
118 unlock_buffer(bh);
119 }
120 } else {
121 bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
122 }
64 if (!bh) 123 if (!bh)
65 ext4_error(sb, "read_inode_bitmap", 124 ext4_error(sb, "read_inode_bitmap",
66 "Cannot read inode bitmap - " 125 "Cannot read inode bitmap - "
@@ -169,6 +228,8 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
169 if (is_directory) 228 if (is_directory)
170 gdp->bg_used_dirs_count = cpu_to_le16( 229 gdp->bg_used_dirs_count = cpu_to_le16(
171 le16_to_cpu(gdp->bg_used_dirs_count) - 1); 230 le16_to_cpu(gdp->bg_used_dirs_count) - 1);
231 gdp->bg_checksum = ext4_group_desc_csum(sbi,
232 block_group, gdp);
172 spin_unlock(sb_bgl_lock(sbi, block_group)); 233 spin_unlock(sb_bgl_lock(sbi, block_group));
173 percpu_counter_inc(&sbi->s_freeinodes_counter); 234 percpu_counter_inc(&sbi->s_freeinodes_counter);
174 if (is_directory) 235 if (is_directory)
@@ -435,7 +496,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
435 struct ext4_sb_info *sbi; 496 struct ext4_sb_info *sbi;
436 int err = 0; 497 int err = 0;
437 struct inode *ret; 498 struct inode *ret;
438 int i; 499 int i, free = 0;
439 500
440 /* Cannot create files in a deleted directory */ 501 /* Cannot create files in a deleted directory */
441 if (!dir || !dir->i_nlink) 502 if (!dir || !dir->i_nlink)
@@ -517,11 +578,13 @@ repeat_in_this_group:
517 goto out; 578 goto out;
518 579
519got: 580got:
520 ino += group * EXT4_INODES_PER_GROUP(sb) + 1; 581 ino++;
521 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 582 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
522 ext4_error (sb, "ext4_new_inode", 583 ino > EXT4_INODES_PER_GROUP(sb)) {
523 "reserved inode or inode > inodes count - " 584 ext4_error(sb, __FUNCTION__,
524 "block_group = %d, inode=%lu", group, ino); 585 "reserved inode or inode > inodes count - "
586 "block_group = %d, inode=%lu", group,
587 ino + group * EXT4_INODES_PER_GROUP(sb));
525 err = -EIO; 588 err = -EIO;
526 goto fail; 589 goto fail;
527 } 590 }
@@ -529,13 +592,78 @@ got:
529 BUFFER_TRACE(bh2, "get_write_access"); 592 BUFFER_TRACE(bh2, "get_write_access");
530 err = ext4_journal_get_write_access(handle, bh2); 593 err = ext4_journal_get_write_access(handle, bh2);
531 if (err) goto fail; 594 if (err) goto fail;
595
596 /* We may have to initialize the block bitmap if it isn't already */
597 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
598 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
599 struct buffer_head *block_bh = read_block_bitmap(sb, group);
600
601 BUFFER_TRACE(block_bh, "get block bitmap access");
602 err = ext4_journal_get_write_access(handle, block_bh);
603 if (err) {
604 brelse(block_bh);
605 goto fail;
606 }
607
608 free = 0;
609 spin_lock(sb_bgl_lock(sbi, group));
610 /* recheck and clear flag under lock if we still need to */
611 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
612 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
613 free = ext4_free_blocks_after_init(sb, group, gdp);
614 gdp->bg_free_blocks_count = cpu_to_le16(free);
615 }
616 spin_unlock(sb_bgl_lock(sbi, group));
617
618 /* Don't need to dirty bitmap block if we didn't change it */
619 if (free) {
620 BUFFER_TRACE(block_bh, "dirty block bitmap");
621 err = ext4_journal_dirty_metadata(handle, block_bh);
622 }
623
624 brelse(block_bh);
625 if (err)
626 goto fail;
627 }
628
532 spin_lock(sb_bgl_lock(sbi, group)); 629 spin_lock(sb_bgl_lock(sbi, group));
630 /* If we didn't allocate from within the initialized part of the inode
631 * table then we need to initialize up to this inode. */
632 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
633 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
634 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
635
636 /* When marking the block group with
637 * ~EXT4_BG_INODE_UNINIT we don't want to depend
638 * on the value of bg_itable_unsed even though
639 * mke2fs could have initialized the same for us.
640 * Instead we calculated the value below
641 */
642
643 free = 0;
644 } else {
645 free = EXT4_INODES_PER_GROUP(sb) -
646 le16_to_cpu(gdp->bg_itable_unused);
647 }
648
649 /*
650 * Check the relative inode number against the last used
651 * relative inode number in this group. if it is greater
652 * we need to update the bg_itable_unused count
653 *
654 */
655 if (ino > free)
656 gdp->bg_itable_unused =
657 cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
658 }
659
533 gdp->bg_free_inodes_count = 660 gdp->bg_free_inodes_count =
534 cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); 661 cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
535 if (S_ISDIR(mode)) { 662 if (S_ISDIR(mode)) {
536 gdp->bg_used_dirs_count = 663 gdp->bg_used_dirs_count =
537 cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); 664 cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
538 } 665 }
666 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
539 spin_unlock(sb_bgl_lock(sbi, group)); 667 spin_unlock(sb_bgl_lock(sbi, group));
540 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); 668 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
541 err = ext4_journal_dirty_metadata(handle, bh2); 669 err = ext4_journal_dirty_metadata(handle, bh2);
@@ -557,7 +685,7 @@ got:
557 inode->i_gid = current->fsgid; 685 inode->i_gid = current->fsgid;
558 inode->i_mode = mode; 686 inode->i_mode = mode;
559 687
560 inode->i_ino = ino; 688 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
561 /* This is the optimal IO size (for stat), not the fs block size */ 689 /* This is the optimal IO size (for stat), not the fs block size */
562 inode->i_blocks = 0; 690 inode->i_blocks = 0;
563 inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = 691 inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 472fc0d3e1c0..0a7e914c495a 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -16,6 +16,7 @@
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18 18
19#include "group.h"
19 20
20#define outside(b, first, last) ((b) < (first) || (b) >= (last)) 21#define outside(b, first, last) ((b) < (first) || (b) >= (last))
21#define inside(b, first, last) ((b) >= (first) && (b) < (last)) 22#define inside(b, first, last) ((b) >= (first) && (b) < (last))
@@ -140,25 +141,6 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
140} 141}
141 142
142/* 143/*
143 * To avoid calling the atomic setbit hundreds or thousands of times, we only
144 * need to use it within a single byte (to ensure we get endianness right).
145 * We can use memset for the rest of the bitmap as there are no other users.
146 */
147static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
148{
149 int i;
150
151 if (start_bit >= end_bit)
152 return;
153
154 ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
155 for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
156 ext4_set_bit(i, bitmap);
157 if (i < end_bit)
158 memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
159}
160
161/*
162 * Set up the block and inode bitmaps, and the inode table for the new group. 144 * Set up the block and inode bitmaps, and the inode table for the new group.
163 * This doesn't need to be part of the main transaction, since we are only 145 * This doesn't need to be part of the main transaction, since we are only
164 * changing blocks outside the actual filesystem. We still do journaling to 146 * changing blocks outside the actual filesystem. We still do journaling to
@@ -842,6 +824,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
842 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ 824 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
843 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); 825 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
844 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); 826 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
827 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
845 828
846 /* 829 /*
847 * Make the new blocks and inodes valid next. We do this before 830 * Make the new blocks and inodes valid next. We do this before
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e2bdf93693a6..dd4ff9c87358 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -37,12 +37,14 @@
37#include <linux/quotaops.h> 37#include <linux/quotaops.h>
38#include <linux/seq_file.h> 38#include <linux/seq_file.h>
39#include <linux/log2.h> 39#include <linux/log2.h>
40#include <linux/crc16.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42 43
43#include "xattr.h" 44#include "xattr.h"
44#include "acl.h" 45#include "acl.h"
45#include "namei.h" 46#include "namei.h"
47#include "group.h"
46 48
47static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 49static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
48 unsigned long journal_devnum); 50 unsigned long journal_devnum);
@@ -1308,6 +1310,43 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1308 return res; 1310 return res;
1309} 1311}
1310 1312
1313__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
1314 struct ext4_group_desc *gdp)
1315{
1316 __u16 crc = 0;
1317
1318 if (sbi->s_es->s_feature_ro_compat &
1319 cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
1320 int offset = offsetof(struct ext4_group_desc, bg_checksum);
1321 __le32 le_group = cpu_to_le32(block_group);
1322
1323 crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
1324 crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
1325 crc = crc16(crc, (__u8 *)gdp, offset);
1326 offset += sizeof(gdp->bg_checksum); /* skip checksum */
1327 /* for checksum of struct ext4_group_desc do the rest...*/
1328 if ((sbi->s_es->s_feature_incompat &
1329 cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
1330 offset < le16_to_cpu(sbi->s_es->s_desc_size))
1331 crc = crc16(crc, (__u8 *)gdp + offset,
1332 le16_to_cpu(sbi->s_es->s_desc_size) -
1333 offset);
1334 }
1335
1336 return cpu_to_le16(crc);
1337}
1338
1339int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
1340 struct ext4_group_desc *gdp)
1341{
1342 if ((sbi->s_es->s_feature_ro_compat &
1343 cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) &&
1344 (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp)))
1345 return 0;
1346
1347 return 1;
1348}
1349
1311/* Called at mount-time, super-block is locked */ 1350/* Called at mount-time, super-block is locked */
1312static int ext4_check_descriptors (struct super_block * sb) 1351static int ext4_check_descriptors (struct super_block * sb)
1313{ 1352{
@@ -1362,6 +1401,14 @@ static int ext4_check_descriptors (struct super_block * sb)
1362 i, inode_table); 1401 i, inode_table);
1363 return 0; 1402 return 0;
1364 } 1403 }
1404 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
1405 ext4_error(sb, __FUNCTION__,
1406 "Checksum for group %d failed (%u!=%u)\n", i,
1407 le16_to_cpu(ext4_group_desc_csum(sbi, i,
1408 gdp)),
1409 le16_to_cpu(gdp->bg_checksum));
1410 return 0;
1411 }
1365 first_block += EXT4_BLOCKS_PER_GROUP(sb); 1412 first_block += EXT4_BLOCKS_PER_GROUP(sb);
1366 gdp = (struct ext4_group_desc *) 1413 gdp = (struct ext4_group_desc *)
1367 ((__u8 *)gdp + EXT4_DESC_SIZE(sb)); 1414 ((__u8 *)gdp + EXT4_DESC_SIZE(sb));
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index 151738af6be2..b77b59fe7f56 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -105,19 +105,25 @@
105 */ 105 */
106struct ext4_group_desc 106struct ext4_group_desc
107{ 107{
108 __le32 bg_block_bitmap; /* Blocks bitmap block */ 108 __le32 bg_block_bitmap; /* Blocks bitmap block */
109 __le32 bg_inode_bitmap; /* Inodes bitmap block */ 109 __le32 bg_inode_bitmap; /* Inodes bitmap block */
110 __le32 bg_inode_table; /* Inodes table block */ 110 __le32 bg_inode_table; /* Inodes table block */
111 __le16 bg_free_blocks_count; /* Free blocks count */ 111 __le16 bg_free_blocks_count; /* Free blocks count */
112 __le16 bg_free_inodes_count; /* Free inodes count */ 112 __le16 bg_free_inodes_count; /* Free inodes count */
113 __le16 bg_used_dirs_count; /* Directories count */ 113 __le16 bg_used_dirs_count; /* Directories count */
114 __u16 bg_flags; 114 __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */
115 __u32 bg_reserved[3]; 115 __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */
116 __le16 bg_itable_unused; /* Unused inodes count */
117 __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */
116 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ 118 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */
117 __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ 119 __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */
118 __le32 bg_inode_table_hi; /* Inodes table block MSB */ 120 __le32 bg_inode_table_hi; /* Inodes table block MSB */
119}; 121};
120 122
123#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
124#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
125#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
126
121#ifdef __KERNEL__ 127#ifdef __KERNEL__
122#include <linux/ext4_fs_i.h> 128#include <linux/ext4_fs_i.h>
123#include <linux/ext4_fs_sb.h> 129#include <linux/ext4_fs_sb.h>
@@ -665,6 +671,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
665#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 671#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
666#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 672#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
667#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 673#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004
674#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010
668#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 675#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
669#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 676#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
670 677
@@ -684,6 +691,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
684 EXT4_FEATURE_INCOMPAT_64BIT) 691 EXT4_FEATURE_INCOMPAT_64BIT)
685#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ 692#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
686 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ 693 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
694 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
687 EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ 695 EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
688 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ 696 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
689 EXT4_FEATURE_RO_COMPAT_BTREE_DIR) 697 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)