about summary refs log tree commit diff stats
path: root/fs
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2008-01-29 06:43:38 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2008-01-29 06:43:38 -0500
commit 8cd226ca3f64f28c8123ebfaa6afe8dc8c18b174 (patch)
tree 6a668a8e899dca090ded0d3b8d6badda8f97d1b0 /fs
parent 6b11d8179d1c6e560edc02c40a53b65fde83bf3f (diff)
parent 4019191be7316ed4a39e1c1c2b623baa7dc6c843 (diff)
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (50 commits)
  jbd2: sparse pointer use of zero as null
  jbd2: Use round-jiffies() function for the "5 second" ext4/jbd2 wakeup
  jbd2: Mark jbd2 slabs as SLAB_TEMPORARY
  jbd2: add lockdep support
  ext4: Use the ext4_ext_actual_len() helper function
  ext4: fix uniniatilized extent splitting error
  ext4: Check for return value from sb_set_blocksize
  ext4: Add stripe= option to /proc/mounts
  ext4: Enable the multiblock allocator by default
  ext4: Add multi block allocator for ext4
  ext4: Add new functions for searching extent tree
  ext4: Add ext4_find_next_bit()
  ext4: fix up EXT4FS_DEBUG builds
  ext4: Fix ext4_show_options to show the correct mount options.
  ext4: Add EXT4_IOC_MIGRATE ioctl
  ext4: Add inode version support in ext4
  vfs: Add 64 bit i_version support
  ext4: Add the journal checksum feature
  jbd2: jbd2 stats through procfs
  ext4: Take read lock during overwrite case.
  ...
Diffstat (limited to 'fs')
-rw-r--r-- fs/Kconfig 1
-rw-r--r-- fs/afs/dir.c 9
-rw-r--r-- fs/afs/inode.c 3
-rw-r--r-- fs/buffer.c 44
-rw-r--r-- fs/ext2/super.c 32
-rw-r--r-- fs/ext3/super.c 32
-rw-r--r-- fs/ext4/Makefile 2
-rw-r--r-- fs/ext4/balloc.c 247
-rw-r--r-- fs/ext4/dir.c 14
-rw-r--r-- fs/ext4/extents.c 481
-rw-r--r-- fs/ext4/file.c 23
-rw-r--r-- fs/ext4/group.h 8
-rw-r--r-- fs/ext4/ialloc.c 141
-rw-r--r-- fs/ext4/inode.c 360
-rw-r--r-- fs/ext4/ioctl.c 7
-rw-r--r-- fs/ext4/mballoc.c 4552
-rw-r--r-- fs/ext4/migrate.c 560
-rw-r--r-- fs/ext4/namei.c 135
-rw-r--r-- fs/ext4/resize.c 28
-rw-r--r-- fs/ext4/super.c 379
-rw-r--r-- fs/ext4/xattr.c 4
-rw-r--r-- fs/inode.c 5
-rw-r--r-- fs/jbd2/checkpoint.c 22
-rw-r--r-- fs/jbd2/commit.c 255
-rw-r--r-- fs/jbd2/journal.c 368
-rw-r--r-- fs/jbd2/recovery.c 151
-rw-r--r-- fs/jbd2/revoke.c 6
-rw-r--r-- fs/jbd2/transaction.c 34
-rw-r--r-- fs/read_write.c 1
29 files changed, 7183 insertions, 721 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 9656139d2e99..219ec06a8c7e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -236,6 +236,7 @@ config JBD_DEBUG
236 236
237config JBD2 237config JBD2
238 tristate 238 tristate
239 select CRC32
239 help 240 help
240 This is a generic journaling layer for block devices that support 241 This is a generic journaling layer for block devices that support
241 both 32-bit and 64-bit block numbers. It is currently used by 242 both 32-bit and 64-bit block numbers. It is currently used by
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 33fe39ad4e03..0cc3597c1197 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -546,11 +546,11 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
546 dentry->d_op = &afs_fs_dentry_operations; 546 dentry->d_op = &afs_fs_dentry_operations;
547 547
548 d_add(dentry, inode); 548 d_add(dentry, inode);
549 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%lu }", 549 _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
550 fid.vnode, 550 fid.vnode,
551 fid.unique, 551 fid.unique,
552 dentry->d_inode->i_ino, 552 dentry->d_inode->i_ino,
553 dentry->d_inode->i_version); 553 (unsigned long long)dentry->d_inode->i_version);
554 554
555 return NULL; 555 return NULL;
556} 556}
@@ -630,9 +630,10 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
630 * been deleted and replaced, and the original vnode ID has 630 * been deleted and replaced, and the original vnode ID has
631 * been reused */ 631 * been reused */
632 if (fid.unique != vnode->fid.unique) { 632 if (fid.unique != vnode->fid.unique) {
633 _debug("%s: file deleted (uq %u -> %u I:%lu)", 633 _debug("%s: file deleted (uq %u -> %u I:%llu)",
634 dentry->d_name.name, fid.unique, 634 dentry->d_name.name, fid.unique,
635 vnode->fid.unique, dentry->d_inode->i_version); 635 vnode->fid.unique,
636 (unsigned long long)dentry->d_inode->i_version);
636 spin_lock(&vnode->lock); 637 spin_lock(&vnode->lock);
637 set_bit(AFS_VNODE_DELETED, &vnode->flags); 638 set_bit(AFS_VNODE_DELETED, &vnode->flags);
638 spin_unlock(&vnode->lock); 639 spin_unlock(&vnode->lock);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index d196840127c6..84750c8e9f95 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -301,7 +301,8 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
301 301
302 inode = dentry->d_inode; 302 inode = dentry->d_inode;
303 303
304 _enter("{ ino=%lu v=%lu }", inode->i_ino, inode->i_version); 304 _enter("{ ino=%lu v=%llu }", inode->i_ino,
305 (unsigned long long)inode->i_version);
305 306
306 generic_fillattr(inode, stat); 307 generic_fillattr(inode, stat);
307 return 0; 308 return 0;
diff --git a/fs/buffer.c b/fs/buffer.c
index 7249e014819e..456c9ab7705b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3213,6 +3213,50 @@ static int buffer_cpu_notify(struct notifier_block *self,
3213 return NOTIFY_OK; 3213 return NOTIFY_OK;
3214} 3214}
3215 3215
3216/**
3217 * bh_uptodate_or_lock: Test whether the buffer is uptodate
3218 * @bh: struct buffer_head
3219 *
3220 * Return true if the buffer is up-to-date and false,
3221 * with the buffer locked, if not.
3222 */
3223int bh_uptodate_or_lock(struct buffer_head *bh)
3224{
3225 if (!buffer_uptodate(bh)) {
3226 lock_buffer(bh);
3227 if (!buffer_uptodate(bh))
3228 return 0;
3229 unlock_buffer(bh);
3230 }
3231 return 1;
3232}
3233EXPORT_SYMBOL(bh_uptodate_or_lock);
3234
3235/**
3236 * bh_submit_read: Submit a locked buffer for reading
3237 * @bh: struct buffer_head
3238 *
3239 * Returns zero on success and -EIO on error.
3240 */
3241int bh_submit_read(struct buffer_head *bh)
3242{
3243 BUG_ON(!buffer_locked(bh));
3244
3245 if (buffer_uptodate(bh)) {
3246 unlock_buffer(bh);
3247 return 0;
3248 }
3249
3250 get_bh(bh);
3251 bh->b_end_io = end_buffer_read_sync;
3252 submit_bh(READ, bh);
3253 wait_on_buffer(bh);
3254 if (buffer_uptodate(bh))
3255 return 0;
3256 return -EIO;
3257}
3258EXPORT_SYMBOL(bh_submit_read);
3259
3216void __init buffer_init(void) 3260void __init buffer_init(void)
3217{ 3261{
3218 int nrpages; 3262 int nrpages;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 154e25f13d77..6abaf75163f0 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -680,11 +680,31 @@ static int ext2_check_descriptors (struct super_block * sb)
680static loff_t ext2_max_size(int bits) 680static loff_t ext2_max_size(int bits)
681{ 681{
682 loff_t res = EXT2_NDIR_BLOCKS; 682 loff_t res = EXT2_NDIR_BLOCKS;
683 /* This constant is calculated to be the largest file size for a 683 int meta_blocks;
684 * dense, 4k-blocksize file such that the total number of 684 loff_t upper_limit;
685
686 /* This is calculated to be the largest file size for a
687 * dense, file such that the total number of
685 * sectors in the file, including data and all indirect blocks, 688 * sectors in the file, including data and all indirect blocks,
686 * does not exceed 2^32. */ 689 * does not exceed 2^32 -1
687 const loff_t upper_limit = 0x1ff7fffd000LL; 690 * __u32 i_blocks representing the total number of
691 * 512 bytes blocks of the file
692 */
693 upper_limit = (1LL << 32) - 1;
694
695 /* total blocks in file system block size */
696 upper_limit >>= (bits - 9);
697
698
699 /* indirect blocks */
700 meta_blocks = 1;
701 /* double indirect blocks */
702 meta_blocks += 1 + (1LL << (bits-2));
703 /* tripple indirect blocks */
704 meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
705
706 upper_limit -= meta_blocks;
707 upper_limit <<= bits;
688 708
689 res += 1LL << (bits-2); 709 res += 1LL << (bits-2);
690 res += 1LL << (2*(bits-2)); 710 res += 1LL << (2*(bits-2));
@@ -692,6 +712,10 @@ static loff_t ext2_max_size(int bits)
692 res <<= bits; 712 res <<= bits;
693 if (res > upper_limit) 713 if (res > upper_limit)
694 res = upper_limit; 714 res = upper_limit;
715
716 if (res > MAX_LFS_FILESIZE)
717 res = MAX_LFS_FILESIZE;
718
695 return res; 719 return res;
696} 720}
697 721
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index cb14de1502c3..f3675cc630e9 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1436,11 +1436,31 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1436static loff_t ext3_max_size(int bits) 1436static loff_t ext3_max_size(int bits)
1437{ 1437{
1438 loff_t res = EXT3_NDIR_BLOCKS; 1438 loff_t res = EXT3_NDIR_BLOCKS;
1439 /* This constant is calculated to be the largest file size for a 1439 int meta_blocks;
1440 * dense, 4k-blocksize file such that the total number of 1440 loff_t upper_limit;
1441
1442 /* This is calculated to be the largest file size for a
1443 * dense, file such that the total number of
1441 * sectors in the file, including data and all indirect blocks, 1444 * sectors in the file, including data and all indirect blocks,
1442 * does not exceed 2^32. */ 1445 * does not exceed 2^32 -1
1443 const loff_t upper_limit = 0x1ff7fffd000LL; 1446 * __u32 i_blocks representing the total number of
1447 * 512 bytes blocks of the file
1448 */
1449 upper_limit = (1LL << 32) - 1;
1450
1451 /* total blocks in file system block size */
1452 upper_limit >>= (bits - 9);
1453
1454
1455 /* indirect blocks */
1456 meta_blocks = 1;
1457 /* double indirect blocks */
1458 meta_blocks += 1 + (1LL << (bits-2));
1459 /* tripple indirect blocks */
1460 meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
1461
1462 upper_limit -= meta_blocks;
1463 upper_limit <<= bits;
1444 1464
1445 res += 1LL << (bits-2); 1465 res += 1LL << (bits-2);
1446 res += 1LL << (2*(bits-2)); 1466 res += 1LL << (2*(bits-2));
@@ -1448,6 +1468,10 @@ static loff_t ext3_max_size(int bits)
1448 res <<= bits; 1468 res <<= bits;
1449 if (res > upper_limit) 1469 if (res > upper_limit)
1450 res = upper_limit; 1470 res = upper_limit;
1471
1472 if (res > MAX_LFS_FILESIZE)
1473 res = MAX_LFS_FILESIZE;
1474
1451 return res; 1475 return res;
1452} 1476}
1453 1477
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ae6e7e502ac9..ac6fa8ca0a2f 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
6 6
7ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ 7ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o 9 ext4_jbd2.o migrate.o mballoc.o
10 10
11ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 11ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
12ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o 12ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 71ee95e534fd..ac75ea953d83 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -29,7 +29,7 @@
29 * Calculate the block group number and offset, given a block number 29 * Calculate the block group number and offset, given a block number
30 */ 30 */
31void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 31void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
32 unsigned long *blockgrpp, ext4_grpblk_t *offsetp) 32 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
33{ 33{
34 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 34 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
35 ext4_grpblk_t offset; 35 ext4_grpblk_t offset;
@@ -46,7 +46,7 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
46/* Initializes an uninitialized block bitmap if given, and returns the 46/* Initializes an uninitialized block bitmap if given, and returns the
47 * number of blocks free in the group. */ 47 * number of blocks free in the group. */
48unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, 48unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
49 int block_group, struct ext4_group_desc *gdp) 49 ext4_group_t block_group, struct ext4_group_desc *gdp)
50{ 50{
51 unsigned long start; 51 unsigned long start;
52 int bit, bit_max; 52 int bit, bit_max;
@@ -60,7 +60,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
60 * essentially implementing a per-group read-only flag. */ 60 * essentially implementing a per-group read-only flag. */
61 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 61 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
62 ext4_error(sb, __FUNCTION__, 62 ext4_error(sb, __FUNCTION__,
63 "Checksum bad for group %u\n", block_group); 63 "Checksum bad for group %lu\n", block_group);
64 gdp->bg_free_blocks_count = 0; 64 gdp->bg_free_blocks_count = 0;
65 gdp->bg_free_inodes_count = 0; 65 gdp->bg_free_inodes_count = 0;
66 gdp->bg_itable_unused = 0; 66 gdp->bg_itable_unused = 0;
@@ -153,7 +153,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
153 * group descriptor 153 * group descriptor
154 */ 154 */
155struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 155struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
156 unsigned int block_group, 156 ext4_group_t block_group,
157 struct buffer_head ** bh) 157 struct buffer_head ** bh)
158{ 158{
159 unsigned long group_desc; 159 unsigned long group_desc;
@@ -164,7 +164,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
164 if (block_group >= sbi->s_groups_count) { 164 if (block_group >= sbi->s_groups_count) {
165 ext4_error (sb, "ext4_get_group_desc", 165 ext4_error (sb, "ext4_get_group_desc",
166 "block_group >= groups_count - " 166 "block_group >= groups_count - "
167 "block_group = %d, groups_count = %lu", 167 "block_group = %lu, groups_count = %lu",
168 block_group, sbi->s_groups_count); 168 block_group, sbi->s_groups_count);
169 169
170 return NULL; 170 return NULL;
@@ -176,7 +176,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
176 if (!sbi->s_group_desc[group_desc]) { 176 if (!sbi->s_group_desc[group_desc]) {
177 ext4_error (sb, "ext4_get_group_desc", 177 ext4_error (sb, "ext4_get_group_desc",
178 "Group descriptor not loaded - " 178 "Group descriptor not loaded - "
179 "block_group = %d, group_desc = %lu, desc = %lu", 179 "block_group = %lu, group_desc = %lu, desc = %lu",
180 block_group, group_desc, offset); 180 block_group, group_desc, offset);
181 return NULL; 181 return NULL;
182 } 182 }
@@ -189,18 +189,70 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
189 return desc; 189 return desc;
190} 190}
191 191
192static int ext4_valid_block_bitmap(struct super_block *sb,
193 struct ext4_group_desc *desc,
194 unsigned int block_group,
195 struct buffer_head *bh)
196{
197 ext4_grpblk_t offset;
198 ext4_grpblk_t next_zero_bit;
199 ext4_fsblk_t bitmap_blk;
200 ext4_fsblk_t group_first_block;
201
202 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
203 /* with FLEX_BG, the inode/block bitmaps and itable
204 * blocks may not be in the group at all
205 * so the bitmap validation will be skipped for those groups
206 * or it has to also read the block group where the bitmaps
207 * are located to verify they are set.
208 */
209 return 1;
210 }
211 group_first_block = ext4_group_first_block_no(sb, block_group);
212
213 /* check whether block bitmap block number is set */
214 bitmap_blk = ext4_block_bitmap(sb, desc);
215 offset = bitmap_blk - group_first_block;
216 if (!ext4_test_bit(offset, bh->b_data))
217 /* bad block bitmap */
218 goto err_out;
219
220 /* check whether the inode bitmap block number is set */
221 bitmap_blk = ext4_inode_bitmap(sb, desc);
222 offset = bitmap_blk - group_first_block;
223 if (!ext4_test_bit(offset, bh->b_data))
224 /* bad block bitmap */
225 goto err_out;
226
227 /* check whether the inode table block number is set */
228 bitmap_blk = ext4_inode_table(sb, desc);
229 offset = bitmap_blk - group_first_block;
230 next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
231 offset + EXT4_SB(sb)->s_itb_per_group,
232 offset);
233 if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group)
234 /* good bitmap for inode tables */
235 return 1;
236
237err_out:
238 ext4_error(sb, __FUNCTION__,
239 "Invalid block bitmap - "
240 "block_group = %d, block = %llu",
241 block_group, bitmap_blk);
242 return 0;
243}
192/** 244/**
193 * read_block_bitmap() 245 * read_block_bitmap()
194 * @sb: super block 246 * @sb: super block
195 * @block_group: given block group 247 * @block_group: given block group
196 * 248 *
197 * Read the bitmap for a given block_group, reading into the specified 249 * Read the bitmap for a given block_group,and validate the
198 * slot in the superblock's bitmap cache. 250 * bits for block/inode/inode tables are set in the bitmaps
199 * 251 *
200 * Return buffer_head on success or NULL in case of failure. 252 * Return buffer_head on success or NULL in case of failure.
201 */ 253 */
202struct buffer_head * 254struct buffer_head *
203read_block_bitmap(struct super_block *sb, unsigned int block_group) 255read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
204{ 256{
205 struct ext4_group_desc * desc; 257 struct ext4_group_desc * desc;
206 struct buffer_head * bh = NULL; 258 struct buffer_head * bh = NULL;
@@ -210,25 +262,36 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
210 if (!desc) 262 if (!desc)
211 return NULL; 263 return NULL;
212 bitmap_blk = ext4_block_bitmap(sb, desc); 264 bitmap_blk = ext4_block_bitmap(sb, desc);
265 bh = sb_getblk(sb, bitmap_blk);
266 if (unlikely(!bh)) {
267 ext4_error(sb, __FUNCTION__,
268 "Cannot read block bitmap - "
269 "block_group = %d, block_bitmap = %llu",
270 (int)block_group, (unsigned long long)bitmap_blk);
271 return NULL;
272 }
273 if (bh_uptodate_or_lock(bh))
274 return bh;
275
213 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 276 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
214 bh = sb_getblk(sb, bitmap_blk); 277 ext4_init_block_bitmap(sb, bh, block_group, desc);
215 if (!buffer_uptodate(bh)) { 278 set_buffer_uptodate(bh);
216 lock_buffer(bh); 279 unlock_buffer(bh);
217 if (!buffer_uptodate(bh)) { 280 return bh;
218 ext4_init_block_bitmap(sb, bh, block_group,
219 desc);
220 set_buffer_uptodate(bh);
221 }
222 unlock_buffer(bh);
223 }
224 } else {
225 bh = sb_bread(sb, bitmap_blk);
226 } 281 }
227 if (!bh) 282 if (bh_submit_read(bh) < 0) {
228 ext4_error (sb, __FUNCTION__, 283 put_bh(bh);
284 ext4_error(sb, __FUNCTION__,
229 "Cannot read block bitmap - " 285 "Cannot read block bitmap - "
230 "block_group = %d, block_bitmap = %llu", 286 "block_group = %d, block_bitmap = %llu",
231 block_group, bitmap_blk); 287 (int)block_group, (unsigned long long)bitmap_blk);
288 return NULL;
289 }
290 if (!ext4_valid_block_bitmap(sb, desc, block_group, bh)) {
291 put_bh(bh);
292 return NULL;
293 }
294
232 return bh; 295 return bh;
233} 296}
234/* 297/*
@@ -320,7 +383,7 @@ restart:
320 */ 383 */
321static int 384static int
322goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal, 385goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
323 unsigned int group, struct super_block * sb) 386 ext4_group_t group, struct super_block *sb)
324{ 387{
325 ext4_fsblk_t group_first_block, group_last_block; 388 ext4_fsblk_t group_first_block, group_last_block;
326 389
@@ -463,7 +526,7 @@ static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
463 * when setting the reservation window size through ioctl before the file 526 * when setting the reservation window size through ioctl before the file
464 * is open for write (needs block allocation). 527 * is open for write (needs block allocation).
465 * 528 *
466 * Needs truncate_mutex protection prior to call this function. 529 * Needs down_write(i_data_sem) protection prior to call this function.
467 */ 530 */
468void ext4_init_block_alloc_info(struct inode *inode) 531void ext4_init_block_alloc_info(struct inode *inode)
469{ 532{
@@ -514,6 +577,8 @@ void ext4_discard_reservation(struct inode *inode)
514 struct ext4_reserve_window_node *rsv; 577 struct ext4_reserve_window_node *rsv;
515 spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock; 578 spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
516 579
580 ext4_mb_discard_inode_preallocations(inode);
581
517 if (!block_i) 582 if (!block_i)
518 return; 583 return;
519 584
@@ -540,7 +605,7 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
540{ 605{
541 struct buffer_head *bitmap_bh = NULL; 606 struct buffer_head *bitmap_bh = NULL;
542 struct buffer_head *gd_bh; 607 struct buffer_head *gd_bh;
543 unsigned long block_group; 608 ext4_group_t block_group;
544 ext4_grpblk_t bit; 609 ext4_grpblk_t bit;
545 unsigned long i; 610 unsigned long i;
546 unsigned long overflow; 611 unsigned long overflow;
@@ -587,11 +652,13 @@ do_more:
587 in_range(ext4_inode_bitmap(sb, desc), block, count) || 652 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
588 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 653 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
589 in_range(block + count - 1, ext4_inode_table(sb, desc), 654 in_range(block + count - 1, ext4_inode_table(sb, desc),
590 sbi->s_itb_per_group)) 655 sbi->s_itb_per_group)) {
591 ext4_error (sb, "ext4_free_blocks", 656 ext4_error (sb, "ext4_free_blocks",
592 "Freeing blocks in system zones - " 657 "Freeing blocks in system zones - "
593 "Block = %llu, count = %lu", 658 "Block = %llu, count = %lu",
594 block, count); 659 block, count);
660 goto error_return;
661 }
595 662
596 /* 663 /*
597 * We are about to start releasing blocks in the bitmap, 664 * We are about to start releasing blocks in the bitmap,
@@ -720,19 +787,29 @@ error_return:
720 * @inode: inode 787 * @inode: inode
721 * @block: start physical block to free 788 * @block: start physical block to free
722 * @count: number of blocks to count 789 * @count: number of blocks to count
790 * @metadata: Are these metadata blocks
723 */ 791 */
724void ext4_free_blocks(handle_t *handle, struct inode *inode, 792void ext4_free_blocks(handle_t *handle, struct inode *inode,
725 ext4_fsblk_t block, unsigned long count) 793 ext4_fsblk_t block, unsigned long count,
794 int metadata)
726{ 795{
727 struct super_block * sb; 796 struct super_block * sb;
728 unsigned long dquot_freed_blocks; 797 unsigned long dquot_freed_blocks;
729 798
799 /* this isn't the right place to decide whether block is metadata
800 * inode.c/extents.c knows better, but for safety ... */
801 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
802 ext4_should_journal_data(inode))
803 metadata = 1;
804
730 sb = inode->i_sb; 805 sb = inode->i_sb;
731 if (!sb) { 806
732 printk ("ext4_free_blocks: nonexistent device"); 807 if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
733 return; 808 ext4_free_blocks_sb(handle, sb, block, count,
734 } 809 &dquot_freed_blocks);
735 ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); 810 else
811 ext4_mb_free_blocks(handle, inode, block, count,
812 metadata, &dquot_freed_blocks);
736 if (dquot_freed_blocks) 813 if (dquot_freed_blocks)
737 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); 814 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
738 return; 815 return;
@@ -920,9 +997,10 @@ claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
920 * ext4_journal_release_buffer(), else we'll run out of credits. 997 * ext4_journal_release_buffer(), else we'll run out of credits.
921 */ 998 */
922static ext4_grpblk_t 999static ext4_grpblk_t
923ext4_try_to_allocate(struct super_block *sb, handle_t *handle, int group, 1000ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
924 struct buffer_head *bitmap_bh, ext4_grpblk_t grp_goal, 1001 ext4_group_t group, struct buffer_head *bitmap_bh,
925 unsigned long *count, struct ext4_reserve_window *my_rsv) 1002 ext4_grpblk_t grp_goal, unsigned long *count,
1003 struct ext4_reserve_window *my_rsv)
926{ 1004{
927 ext4_fsblk_t group_first_block; 1005 ext4_fsblk_t group_first_block;
928 ext4_grpblk_t start, end; 1006 ext4_grpblk_t start, end;
@@ -1156,7 +1234,7 @@ static int find_next_reservable_window(
1156 */ 1234 */
1157static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv, 1235static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
1158 ext4_grpblk_t grp_goal, struct super_block *sb, 1236 ext4_grpblk_t grp_goal, struct super_block *sb,
1159 unsigned int group, struct buffer_head *bitmap_bh) 1237 ext4_group_t group, struct buffer_head *bitmap_bh)
1160{ 1238{
1161 struct ext4_reserve_window_node *search_head; 1239 struct ext4_reserve_window_node *search_head;
1162 ext4_fsblk_t group_first_block, group_end_block, start_block; 1240 ext4_fsblk_t group_first_block, group_end_block, start_block;
@@ -1354,7 +1432,7 @@ static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
1354 */ 1432 */
1355static ext4_grpblk_t 1433static ext4_grpblk_t
1356ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, 1434ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1357 unsigned int group, struct buffer_head *bitmap_bh, 1435 ext4_group_t group, struct buffer_head *bitmap_bh,
1358 ext4_grpblk_t grp_goal, 1436 ext4_grpblk_t grp_goal,
1359 struct ext4_reserve_window_node * my_rsv, 1437 struct ext4_reserve_window_node * my_rsv,
1360 unsigned long *count, int *errp) 1438 unsigned long *count, int *errp)
@@ -1510,7 +1588,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1510} 1588}
1511 1589
1512/** 1590/**
1513 * ext4_new_blocks() -- core block(s) allocation function 1591 * ext4_new_blocks_old() -- core block(s) allocation function
1514 * @handle: handle to this transaction 1592 * @handle: handle to this transaction
1515 * @inode: file inode 1593 * @inode: file inode
1516 * @goal: given target block(filesystem wide) 1594 * @goal: given target block(filesystem wide)
@@ -1523,17 +1601,17 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1523 * any specific goal block. 1601 * any specific goal block.
1524 * 1602 *
1525 */ 1603 */
1526ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, 1604ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
1527 ext4_fsblk_t goal, unsigned long *count, int *errp) 1605 ext4_fsblk_t goal, unsigned long *count, int *errp)
1528{ 1606{
1529 struct buffer_head *bitmap_bh = NULL; 1607 struct buffer_head *bitmap_bh = NULL;
1530 struct buffer_head *gdp_bh; 1608 struct buffer_head *gdp_bh;
1531 unsigned long group_no; 1609 ext4_group_t group_no;
1532 int goal_group; 1610 ext4_group_t goal_group;
1533 ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */ 1611 ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */
1534 ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ 1612 ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
1535 ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */ 1613 ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */
1536 int bgi; /* blockgroup iteration index */ 1614 ext4_group_t bgi; /* blockgroup iteration index */
1537 int fatal = 0, err; 1615 int fatal = 0, err;
1538 int performed_allocation = 0; 1616 int performed_allocation = 0;
1539 ext4_grpblk_t free_blocks; /* number of free blocks in a group */ 1617 ext4_grpblk_t free_blocks; /* number of free blocks in a group */
@@ -1544,10 +1622,7 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1544 struct ext4_reserve_window_node *my_rsv = NULL; 1622 struct ext4_reserve_window_node *my_rsv = NULL;
1545 struct ext4_block_alloc_info *block_i; 1623 struct ext4_block_alloc_info *block_i;
1546 unsigned short windowsz = 0; 1624 unsigned short windowsz = 0;
1547#ifdef EXT4FS_DEBUG 1625 ext4_group_t ngroups;
1548 static int goal_hits, goal_attempts;
1549#endif
1550 unsigned long ngroups;
1551 unsigned long num = *count; 1626 unsigned long num = *count;
1552 1627
1553 *errp = -ENOSPC; 1628 *errp = -ENOSPC;
@@ -1567,7 +1642,7 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1567 1642
1568 sbi = EXT4_SB(sb); 1643 sbi = EXT4_SB(sb);
1569 es = EXT4_SB(sb)->s_es; 1644 es = EXT4_SB(sb)->s_es;
1570 ext4_debug("goal=%lu.\n", goal); 1645 ext4_debug("goal=%llu.\n", goal);
1571 /* 1646 /*
1572 * Allocate a block from reservation only when 1647 * Allocate a block from reservation only when
1573 * filesystem is mounted with reservation(default,-o reservation), and 1648 * filesystem is mounted with reservation(default,-o reservation), and
@@ -1677,7 +1752,7 @@ retry_alloc:
1677 1752
1678allocated: 1753allocated:
1679 1754
1680 ext4_debug("using block group %d(%d)\n", 1755 ext4_debug("using block group %lu(%d)\n",
1681 group_no, gdp->bg_free_blocks_count); 1756 group_no, gdp->bg_free_blocks_count);
1682 1757
1683 BUFFER_TRACE(gdp_bh, "get_write_access"); 1758 BUFFER_TRACE(gdp_bh, "get_write_access");
@@ -1692,11 +1767,13 @@ allocated:
1692 in_range(ret_block, ext4_inode_table(sb, gdp), 1767 in_range(ret_block, ext4_inode_table(sb, gdp),
1693 EXT4_SB(sb)->s_itb_per_group) || 1768 EXT4_SB(sb)->s_itb_per_group) ||
1694 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp), 1769 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
1695 EXT4_SB(sb)->s_itb_per_group)) 1770 EXT4_SB(sb)->s_itb_per_group)) {
1696 ext4_error(sb, "ext4_new_block", 1771 ext4_error(sb, "ext4_new_block",
1697 "Allocating block in system zone - " 1772 "Allocating block in system zone - "
1698 "blocks from %llu, length %lu", 1773 "blocks from %llu, length %lu",
1699 ret_block, num); 1774 ret_block, num);
1775 goto out;
1776 }
1700 1777
1701 performed_allocation = 1; 1778 performed_allocation = 1;
1702 1779
@@ -1743,9 +1820,6 @@ allocated:
1743 * list of some description. We don't know in advance whether 1820 * list of some description. We don't know in advance whether
1744 * the caller wants to use it as metadata or data. 1821 * the caller wants to use it as metadata or data.
1745 */ 1822 */
1746 ext4_debug("allocating block %lu. Goal hits %d of %d.\n",
1747 ret_block, goal_hits, goal_attempts);
1748
1749 spin_lock(sb_bgl_lock(sbi, group_no)); 1823 spin_lock(sb_bgl_lock(sbi, group_no));
1750 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) 1824 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1751 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 1825 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
@@ -1787,13 +1861,46 @@ out:
1787} 1861}
1788 1862
1789ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, 1863ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
1790 ext4_fsblk_t goal, int *errp) 1864 ext4_fsblk_t goal, int *errp)
1791{ 1865{
1792 unsigned long count = 1; 1866 struct ext4_allocation_request ar;
1867 ext4_fsblk_t ret;
1793 1868
1794 return ext4_new_blocks(handle, inode, goal, &count, errp); 1869 if (!test_opt(inode->i_sb, MBALLOC)) {
1870 unsigned long count = 1;
1871 ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
1872 return ret;
1873 }
1874
1875 memset(&ar, 0, sizeof(ar));
1876 ar.inode = inode;
1877 ar.goal = goal;
1878 ar.len = 1;
1879 ret = ext4_mb_new_blocks(handle, &ar, errp);
1880 return ret;
1881}
1882
1883ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1884 ext4_fsblk_t goal, unsigned long *count, int *errp)
1885{
1886 struct ext4_allocation_request ar;
1887 ext4_fsblk_t ret;
1888
1889 if (!test_opt(inode->i_sb, MBALLOC)) {
1890 ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
1891 return ret;
1892 }
1893
1894 memset(&ar, 0, sizeof(ar));
1895 ar.inode = inode;
1896 ar.goal = goal;
1897 ar.len = *count;
1898 ret = ext4_mb_new_blocks(handle, &ar, errp);
1899 *count = ar.len;
1900 return ret;
1795} 1901}
1796 1902
1903
1797/** 1904/**
1798 * ext4_count_free_blocks() -- count filesystem free blocks 1905 * ext4_count_free_blocks() -- count filesystem free blocks
1799 * @sb: superblock 1906 * @sb: superblock
@@ -1804,8 +1911,8 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
1804{ 1911{
1805 ext4_fsblk_t desc_count; 1912 ext4_fsblk_t desc_count;
1806 struct ext4_group_desc *gdp; 1913 struct ext4_group_desc *gdp;
1807 int i; 1914 ext4_group_t i;
1808 unsigned long ngroups = EXT4_SB(sb)->s_groups_count; 1915 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
1809#ifdef EXT4FS_DEBUG 1916#ifdef EXT4FS_DEBUG
1810 struct ext4_super_block *es; 1917 struct ext4_super_block *es;
1811 ext4_fsblk_t bitmap_count; 1918 ext4_fsblk_t bitmap_count;
@@ -1829,14 +1936,14 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
1829 continue; 1936 continue;
1830 1937
1831 x = ext4_count_free(bitmap_bh, sb->s_blocksize); 1938 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
1832 printk("group %d: stored = %d, counted = %lu\n", 1939 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
1833 i, le16_to_cpu(gdp->bg_free_blocks_count), x); 1940 i, le16_to_cpu(gdp->bg_free_blocks_count), x);
1834 bitmap_count += x; 1941 bitmap_count += x;
1835 } 1942 }
1836 brelse(bitmap_bh); 1943 brelse(bitmap_bh);
1837 printk("ext4_count_free_blocks: stored = %llu" 1944 printk("ext4_count_free_blocks: stored = %llu"
1838 ", computed = %llu, %llu\n", 1945 ", computed = %llu, %llu\n",
1839 EXT4_FREE_BLOCKS_COUNT(es), 1946 ext4_free_blocks_count(es),
1840 desc_count, bitmap_count); 1947 desc_count, bitmap_count);
1841 return bitmap_count; 1948 return bitmap_count;
1842#else 1949#else
@@ -1853,7 +1960,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
1853#endif 1960#endif
1854} 1961}
1855 1962
1856static inline int test_root(int a, int b) 1963static inline int test_root(ext4_group_t a, int b)
1857{ 1964{
1858 int num = b; 1965 int num = b;
1859 1966
@@ -1862,7 +1969,7 @@ static inline int test_root(int a, int b)
1862 return num == a; 1969 return num == a;
1863} 1970}
1864 1971
1865static int ext4_group_sparse(int group) 1972static int ext4_group_sparse(ext4_group_t group)
1866{ 1973{
1867 if (group <= 1) 1974 if (group <= 1)
1868 return 1; 1975 return 1;
@@ -1880,7 +1987,7 @@ static int ext4_group_sparse(int group)
1880 * Return the number of blocks used by the superblock (primary or backup) 1987 * Return the number of blocks used by the superblock (primary or backup)
1881 * in this group. Currently this will be only 0 or 1. 1988 * in this group. Currently this will be only 0 or 1.
1882 */ 1989 */
1883int ext4_bg_has_super(struct super_block *sb, int group) 1990int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
1884{ 1991{
1885 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 1992 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1886 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && 1993 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
@@ -1889,18 +1996,20 @@ int ext4_bg_has_super(struct super_block *sb, int group)
1889 return 1; 1996 return 1;
1890} 1997}
1891 1998
1892static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, int group) 1999static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
2000 ext4_group_t group)
1893{ 2001{
1894 unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); 2002 unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
1895 unsigned long first = metagroup * EXT4_DESC_PER_BLOCK(sb); 2003 ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb);
1896 unsigned long last = first + EXT4_DESC_PER_BLOCK(sb) - 1; 2004 ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
1897 2005
1898 if (group == first || group == first + 1 || group == last) 2006 if (group == first || group == first + 1 || group == last)
1899 return 1; 2007 return 1;
1900 return 0; 2008 return 0;
1901} 2009}
1902 2010
1903static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group) 2011static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
2012 ext4_group_t group)
1904{ 2013{
1905 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 2014 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1906 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && 2015 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
@@ -1918,7 +2027,7 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group)
1918 * (primary or backup) in this group. In the future there may be a 2027 * (primary or backup) in this group. In the future there may be a
1919 * different number of descriptor blocks in each group. 2028 * different number of descriptor blocks in each group.
1920 */ 2029 */
1921unsigned long ext4_bg_num_gdb(struct super_block *sb, int group) 2030unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
1922{ 2031{
1923 unsigned long first_meta_bg = 2032 unsigned long first_meta_bg =
1924 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); 2033 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f612bef98315..33888bb58144 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
67 unsigned long offset) 67 unsigned long offset)
68{ 68{
69 const char * error_msg = NULL; 69 const char * error_msg = NULL;
70 const int rlen = le16_to_cpu(de->rec_len); 70 const int rlen = ext4_rec_len_from_disk(de->rec_len);
71 71
72 if (rlen < EXT4_DIR_REC_LEN(1)) 72 if (rlen < EXT4_DIR_REC_LEN(1))
73 error_msg = "rec_len is smaller than minimal"; 73 error_msg = "rec_len is smaller than minimal";
@@ -124,7 +124,7 @@ static int ext4_readdir(struct file * filp,
124 offset = filp->f_pos & (sb->s_blocksize - 1); 124 offset = filp->f_pos & (sb->s_blocksize - 1);
125 125
126 while (!error && !stored && filp->f_pos < inode->i_size) { 126 while (!error && !stored && filp->f_pos < inode->i_size) {
127 unsigned long blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 127 ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
128 struct buffer_head map_bh; 128 struct buffer_head map_bh;
129 struct buffer_head *bh = NULL; 129 struct buffer_head *bh = NULL;
130 130
@@ -172,10 +172,10 @@ revalidate:
172 * least that it is non-zero. A 172 * least that it is non-zero. A
173 * failure will be detected in the 173 * failure will be detected in the
174 * dirent test below. */ 174 * dirent test below. */
175 if (le16_to_cpu(de->rec_len) < 175 if (ext4_rec_len_from_disk(de->rec_len)
176 EXT4_DIR_REC_LEN(1)) 176 < EXT4_DIR_REC_LEN(1))
177 break; 177 break;
178 i += le16_to_cpu(de->rec_len); 178 i += ext4_rec_len_from_disk(de->rec_len);
179 } 179 }
180 offset = i; 180 offset = i;
181 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) 181 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -197,7 +197,7 @@ revalidate:
197 ret = stored; 197 ret = stored;
198 goto out; 198 goto out;
199 } 199 }
200 offset += le16_to_cpu(de->rec_len); 200 offset += ext4_rec_len_from_disk(de->rec_len);
201 if (le32_to_cpu(de->inode)) { 201 if (le32_to_cpu(de->inode)) {
202 /* We might block in the next section 202 /* We might block in the next section
203 * if the data destination is 203 * if the data destination is
@@ -219,7 +219,7 @@ revalidate:
219 goto revalidate; 219 goto revalidate;
220 stored ++; 220 stored ++;
221 } 221 }
222 filp->f_pos += le16_to_cpu(de->rec_len); 222 filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
223 } 223 }
224 offset = 0; 224 offset = 0;
225 brelse (bh); 225 brelse (bh);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 85287742f2ae..bc7081f1fbe8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -61,7 +61,7 @@ static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
61 * idx_pblock: 61 * idx_pblock:
62 * combine low and high parts of a leaf physical block number into ext4_fsblk_t 62 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
63 */ 63 */
64static ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix) 64ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
65{ 65{
66 ext4_fsblk_t block; 66 ext4_fsblk_t block;
67 67
@@ -75,7 +75,7 @@ static ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
75 * stores a large physical block number into an extent struct, 75 * stores a large physical block number into an extent struct,
76 * breaking it into parts 76 * breaking it into parts
77 */ 77 */
78static void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) 78void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
79{ 79{
80 ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); 80 ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
81 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); 81 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
@@ -144,7 +144,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
144 144
145static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, 145static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
146 struct ext4_ext_path *path, 146 struct ext4_ext_path *path,
147 ext4_fsblk_t block) 147 ext4_lblk_t block)
148{ 148{
149 struct ext4_inode_info *ei = EXT4_I(inode); 149 struct ext4_inode_info *ei = EXT4_I(inode);
150 ext4_fsblk_t bg_start; 150 ext4_fsblk_t bg_start;
@@ -367,13 +367,14 @@ static void ext4_ext_drop_refs(struct ext4_ext_path *path)
367 * the header must be checked before calling this 367 * the header must be checked before calling this
368 */ 368 */
369static void 369static void
370ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block) 370ext4_ext_binsearch_idx(struct inode *inode,
371 struct ext4_ext_path *path, ext4_lblk_t block)
371{ 372{
372 struct ext4_extent_header *eh = path->p_hdr; 373 struct ext4_extent_header *eh = path->p_hdr;
373 struct ext4_extent_idx *r, *l, *m; 374 struct ext4_extent_idx *r, *l, *m;
374 375
375 376
376 ext_debug("binsearch for %d(idx): ", block); 377 ext_debug("binsearch for %u(idx): ", block);
377 378
378 l = EXT_FIRST_INDEX(eh) + 1; 379 l = EXT_FIRST_INDEX(eh) + 1;
379 r = EXT_LAST_INDEX(eh); 380 r = EXT_LAST_INDEX(eh);
@@ -425,7 +426,8 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int bloc
425 * the header must be checked before calling this 426 * the header must be checked before calling this
426 */ 427 */
427static void 428static void
428ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block) 429ext4_ext_binsearch(struct inode *inode,
430 struct ext4_ext_path *path, ext4_lblk_t block)
429{ 431{
430 struct ext4_extent_header *eh = path->p_hdr; 432 struct ext4_extent_header *eh = path->p_hdr;
431 struct ext4_extent *r, *l, *m; 433 struct ext4_extent *r, *l, *m;
@@ -438,7 +440,7 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
438 return; 440 return;
439 } 441 }
440 442
441 ext_debug("binsearch for %d: ", block); 443 ext_debug("binsearch for %u: ", block);
442 444
443 l = EXT_FIRST_EXTENT(eh) + 1; 445 l = EXT_FIRST_EXTENT(eh) + 1;
444 r = EXT_LAST_EXTENT(eh); 446 r = EXT_LAST_EXTENT(eh);
@@ -494,7 +496,8 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
494} 496}
495 497
496struct ext4_ext_path * 498struct ext4_ext_path *
497ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path) 499ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
500 struct ext4_ext_path *path)
498{ 501{
499 struct ext4_extent_header *eh; 502 struct ext4_extent_header *eh;
500 struct buffer_head *bh; 503 struct buffer_head *bh;
@@ -763,7 +766,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
763 while (k--) { 766 while (k--) {
764 oldblock = newblock; 767 oldblock = newblock;
765 newblock = ablocks[--a]; 768 newblock = ablocks[--a];
766 bh = sb_getblk(inode->i_sb, (ext4_fsblk_t)newblock); 769 bh = sb_getblk(inode->i_sb, newblock);
767 if (!bh) { 770 if (!bh) {
768 err = -EIO; 771 err = -EIO;
769 goto cleanup; 772 goto cleanup;
@@ -783,9 +786,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
783 fidx->ei_block = border; 786 fidx->ei_block = border;
784 ext4_idx_store_pblock(fidx, oldblock); 787 ext4_idx_store_pblock(fidx, oldblock);
785 788
786 ext_debug("int.index at %d (block %llu): %lu -> %llu\n", i, 789 ext_debug("int.index at %d (block %llu): %u -> %llu\n",
787 newblock, (unsigned long) le32_to_cpu(border), 790 i, newblock, le32_to_cpu(border), oldblock);
788 oldblock);
789 /* copy indexes */ 791 /* copy indexes */
790 m = 0; 792 m = 0;
791 path[i].p_idx++; 793 path[i].p_idx++;
@@ -851,7 +853,7 @@ cleanup:
851 for (i = 0; i < depth; i++) { 853 for (i = 0; i < depth; i++) {
852 if (!ablocks[i]) 854 if (!ablocks[i])
853 continue; 855 continue;
854 ext4_free_blocks(handle, inode, ablocks[i], 1); 856 ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
855 } 857 }
856 } 858 }
857 kfree(ablocks); 859 kfree(ablocks);
@@ -979,8 +981,8 @@ repeat:
979 /* refill path */ 981 /* refill path */
980 ext4_ext_drop_refs(path); 982 ext4_ext_drop_refs(path);
981 path = ext4_ext_find_extent(inode, 983 path = ext4_ext_find_extent(inode,
982 le32_to_cpu(newext->ee_block), 984 (ext4_lblk_t)le32_to_cpu(newext->ee_block),
983 path); 985 path);
984 if (IS_ERR(path)) 986 if (IS_ERR(path))
985 err = PTR_ERR(path); 987 err = PTR_ERR(path);
986 } else { 988 } else {
@@ -992,8 +994,8 @@ repeat:
992 /* refill path */ 994 /* refill path */
993 ext4_ext_drop_refs(path); 995 ext4_ext_drop_refs(path);
994 path = ext4_ext_find_extent(inode, 996 path = ext4_ext_find_extent(inode,
995 le32_to_cpu(newext->ee_block), 997 (ext4_lblk_t)le32_to_cpu(newext->ee_block),
996 path); 998 path);
997 if (IS_ERR(path)) { 999 if (IS_ERR(path)) {
998 err = PTR_ERR(path); 1000 err = PTR_ERR(path);
999 goto out; 1001 goto out;
@@ -1015,13 +1017,157 @@ out:
1015} 1017}
1016 1018
1017/* 1019/*
1020 * search the closest allocated block to the left for *logical
1021 * and returns it at @logical + it's physical address at @phys
1022 * if *logical is the smallest allocated block, the function
1023 * returns 0 at @phys
1024 * return value contains 0 (success) or error code
1025 */
1026int
1027ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1028 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1029{
1030 struct ext4_extent_idx *ix;
1031 struct ext4_extent *ex;
1032 int depth, ee_len;
1033
1034 BUG_ON(path == NULL);
1035 depth = path->p_depth;
1036 *phys = 0;
1037
1038 if (depth == 0 && path->p_ext == NULL)
1039 return 0;
1040
1041 /* usually extent in the path covers blocks smaller
1042 * then *logical, but it can be that extent is the
1043 * first one in the file */
1044
1045 ex = path[depth].p_ext;
1046 ee_len = ext4_ext_get_actual_len(ex);
1047 if (*logical < le32_to_cpu(ex->ee_block)) {
1048 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1049 while (--depth >= 0) {
1050 ix = path[depth].p_idx;
1051 BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1052 }
1053 return 0;
1054 }
1055
1056 BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1057
1058 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1059 *phys = ext_pblock(ex) + ee_len - 1;
1060 return 0;
1061}
1062
1063/*
1064 * search the closest allocated block to the right for *logical
1065 * and returns it at @logical + it's physical address at @phys
1066 * if *logical is the smallest allocated block, the function
1067 * returns 0 at @phys
1068 * return value contains 0 (success) or error code
1069 */
1070int
1071ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1072 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1073{
1074 struct buffer_head *bh = NULL;
1075 struct ext4_extent_header *eh;
1076 struct ext4_extent_idx *ix;
1077 struct ext4_extent *ex;
1078 ext4_fsblk_t block;
1079 int depth, ee_len;
1080
1081 BUG_ON(path == NULL);
1082 depth = path->p_depth;
1083 *phys = 0;
1084
1085 if (depth == 0 && path->p_ext == NULL)
1086 return 0;
1087
1088 /* usually extent in the path covers blocks smaller
1089 * then *logical, but it can be that extent is the
1090 * first one in the file */
1091
1092 ex = path[depth].p_ext;
1093 ee_len = ext4_ext_get_actual_len(ex);
1094 if (*logical < le32_to_cpu(ex->ee_block)) {
1095 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1096 while (--depth >= 0) {
1097 ix = path[depth].p_idx;
1098 BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1099 }
1100 *logical = le32_to_cpu(ex->ee_block);
1101 *phys = ext_pblock(ex);
1102 return 0;
1103 }
1104
1105 BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1106
1107 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1108 /* next allocated block in this leaf */
1109 ex++;
1110 *logical = le32_to_cpu(ex->ee_block);
1111 *phys = ext_pblock(ex);
1112 return 0;
1113 }
1114
1115 /* go up and search for index to the right */
1116 while (--depth >= 0) {
1117 ix = path[depth].p_idx;
1118 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1119 break;
1120 }
1121
1122 if (depth < 0) {
1123 /* we've gone up to the root and
1124 * found no index to the right */
1125 return 0;
1126 }
1127
1128 /* we've found index to the right, let's
1129 * follow it and find the closest allocated
1130 * block to the right */
1131 ix++;
1132 block = idx_pblock(ix);
1133 while (++depth < path->p_depth) {
1134 bh = sb_bread(inode->i_sb, block);
1135 if (bh == NULL)
1136 return -EIO;
1137 eh = ext_block_hdr(bh);
1138 if (ext4_ext_check_header(inode, eh, depth)) {
1139 put_bh(bh);
1140 return -EIO;
1141 }
1142 ix = EXT_FIRST_INDEX(eh);
1143 block = idx_pblock(ix);
1144 put_bh(bh);
1145 }
1146
1147 bh = sb_bread(inode->i_sb, block);
1148 if (bh == NULL)
1149 return -EIO;
1150 eh = ext_block_hdr(bh);
1151 if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
1152 put_bh(bh);
1153 return -EIO;
1154 }
1155 ex = EXT_FIRST_EXTENT(eh);
1156 *logical = le32_to_cpu(ex->ee_block);
1157 *phys = ext_pblock(ex);
1158 put_bh(bh);
1159 return 0;
1160
1161}
1162
1163/*
1018 * ext4_ext_next_allocated_block: 1164 * ext4_ext_next_allocated_block:
1019 * returns allocated block in subsequent extent or EXT_MAX_BLOCK. 1165 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
1020 * NOTE: it considers block number from index entry as 1166 * NOTE: it considers block number from index entry as
1021 * allocated block. Thus, index entries have to be consistent 1167 * allocated block. Thus, index entries have to be consistent
1022 * with leaves. 1168 * with leaves.
1023 */ 1169 */
1024static unsigned long 1170static ext4_lblk_t
1025ext4_ext_next_allocated_block(struct ext4_ext_path *path) 1171ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1026{ 1172{
1027 int depth; 1173 int depth;
@@ -1054,7 +1200,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1054 * ext4_ext_next_leaf_block: 1200 * ext4_ext_next_leaf_block:
1055 * returns first allocated block from next leaf or EXT_MAX_BLOCK 1201 * returns first allocated block from next leaf or EXT_MAX_BLOCK
1056 */ 1202 */
1057static unsigned ext4_ext_next_leaf_block(struct inode *inode, 1203static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1058 struct ext4_ext_path *path) 1204 struct ext4_ext_path *path)
1059{ 1205{
1060 int depth; 1206 int depth;
@@ -1072,7 +1218,8 @@ static unsigned ext4_ext_next_leaf_block(struct inode *inode,
1072 while (depth >= 0) { 1218 while (depth >= 0) {
1073 if (path[depth].p_idx != 1219 if (path[depth].p_idx !=
1074 EXT_LAST_INDEX(path[depth].p_hdr)) 1220 EXT_LAST_INDEX(path[depth].p_hdr))
1075 return le32_to_cpu(path[depth].p_idx[1].ei_block); 1221 return (ext4_lblk_t)
1222 le32_to_cpu(path[depth].p_idx[1].ei_block);
1076 depth--; 1223 depth--;
1077 } 1224 }
1078 1225
@@ -1085,7 +1232,7 @@ static unsigned ext4_ext_next_leaf_block(struct inode *inode,
1085 * then we have to correct all indexes above. 1232 * then we have to correct all indexes above.
1086 * TODO: do we need to correct tree in all cases? 1233 * TODO: do we need to correct tree in all cases?
1087 */ 1234 */
1088int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, 1235static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1089 struct ext4_ext_path *path) 1236 struct ext4_ext_path *path)
1090{ 1237{
1091 struct ext4_extent_header *eh; 1238 struct ext4_extent_header *eh;
@@ -1171,7 +1318,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1171 if (ext1_ee_len + ext2_ee_len > max_len) 1318 if (ext1_ee_len + ext2_ee_len > max_len)
1172 return 0; 1319 return 0;
1173#ifdef AGGRESSIVE_TEST 1320#ifdef AGGRESSIVE_TEST
1174 if (le16_to_cpu(ex1->ee_len) >= 4) 1321 if (ext1_ee_len >= 4)
1175 return 0; 1322 return 0;
1176#endif 1323#endif
1177 1324
@@ -1239,7 +1386,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
1239 struct ext4_extent *newext, 1386 struct ext4_extent *newext,
1240 struct ext4_ext_path *path) 1387 struct ext4_ext_path *path)
1241{ 1388{
1242 unsigned long b1, b2; 1389 ext4_lblk_t b1, b2;
1243 unsigned int depth, len1; 1390 unsigned int depth, len1;
1244 unsigned int ret = 0; 1391 unsigned int ret = 0;
1245 1392
@@ -1260,7 +1407,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
1260 goto out; 1407 goto out;
1261 } 1408 }
1262 1409
1263 /* check for wrap through zero */ 1410 /* check for wrap through zero on extent logical start block*/
1264 if (b1 + len1 < b1) { 1411 if (b1 + len1 < b1) {
1265 len1 = EXT_MAX_BLOCK - b1; 1412 len1 = EXT_MAX_BLOCK - b1;
1266 newext->ee_len = cpu_to_le16(len1); 1413 newext->ee_len = cpu_to_le16(len1);
@@ -1290,7 +1437,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1290 struct ext4_extent *ex, *fex; 1437 struct ext4_extent *ex, *fex;
1291 struct ext4_extent *nearex; /* nearest extent */ 1438 struct ext4_extent *nearex; /* nearest extent */
1292 struct ext4_ext_path *npath = NULL; 1439 struct ext4_ext_path *npath = NULL;
1293 int depth, len, err, next; 1440 int depth, len, err;
1441 ext4_lblk_t next;
1294 unsigned uninitialized = 0; 1442 unsigned uninitialized = 0;
1295 1443
1296 BUG_ON(ext4_ext_get_actual_len(newext) == 0); 1444 BUG_ON(ext4_ext_get_actual_len(newext) == 0);
@@ -1435,114 +1583,8 @@ cleanup:
1435 return err; 1583 return err;
1436} 1584}
1437 1585
1438int ext4_ext_walk_space(struct inode *inode, unsigned long block,
1439 unsigned long num, ext_prepare_callback func,
1440 void *cbdata)
1441{
1442 struct ext4_ext_path *path = NULL;
1443 struct ext4_ext_cache cbex;
1444 struct ext4_extent *ex;
1445 unsigned long next, start = 0, end = 0;
1446 unsigned long last = block + num;
1447 int depth, exists, err = 0;
1448
1449 BUG_ON(func == NULL);
1450 BUG_ON(inode == NULL);
1451
1452 while (block < last && block != EXT_MAX_BLOCK) {
1453 num = last - block;
1454 /* find extent for this block */
1455 path = ext4_ext_find_extent(inode, block, path);
1456 if (IS_ERR(path)) {
1457 err = PTR_ERR(path);
1458 path = NULL;
1459 break;
1460 }
1461
1462 depth = ext_depth(inode);
1463 BUG_ON(path[depth].p_hdr == NULL);
1464 ex = path[depth].p_ext;
1465 next = ext4_ext_next_allocated_block(path);
1466
1467 exists = 0;
1468 if (!ex) {
1469 /* there is no extent yet, so try to allocate
1470 * all requested space */
1471 start = block;
1472 end = block + num;
1473 } else if (le32_to_cpu(ex->ee_block) > block) {
1474 /* need to allocate space before found extent */
1475 start = block;
1476 end = le32_to_cpu(ex->ee_block);
1477 if (block + num < end)
1478 end = block + num;
1479 } else if (block >= le32_to_cpu(ex->ee_block)
1480 + ext4_ext_get_actual_len(ex)) {
1481 /* need to allocate space after found extent */
1482 start = block;
1483 end = block + num;
1484 if (end >= next)
1485 end = next;
1486 } else if (block >= le32_to_cpu(ex->ee_block)) {
1487 /*
1488 * some part of requested space is covered
1489 * by found extent
1490 */
1491 start = block;
1492 end = le32_to_cpu(ex->ee_block)
1493 + ext4_ext_get_actual_len(ex);
1494 if (block + num < end)
1495 end = block + num;
1496 exists = 1;
1497 } else {
1498 BUG();
1499 }
1500 BUG_ON(end <= start);
1501
1502 if (!exists) {
1503 cbex.ec_block = start;
1504 cbex.ec_len = end - start;
1505 cbex.ec_start = 0;
1506 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1507 } else {
1508 cbex.ec_block = le32_to_cpu(ex->ee_block);
1509 cbex.ec_len = ext4_ext_get_actual_len(ex);
1510 cbex.ec_start = ext_pblock(ex);
1511 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1512 }
1513
1514 BUG_ON(cbex.ec_len == 0);
1515 err = func(inode, path, &cbex, cbdata);
1516 ext4_ext_drop_refs(path);
1517
1518 if (err < 0)
1519 break;
1520 if (err == EXT_REPEAT)
1521 continue;
1522 else if (err == EXT_BREAK) {
1523 err = 0;
1524 break;
1525 }
1526
1527 if (ext_depth(inode) != depth) {
1528 /* depth was changed. we have to realloc path */
1529 kfree(path);
1530 path = NULL;
1531 }
1532
1533 block = cbex.ec_block + cbex.ec_len;
1534 }
1535
1536 if (path) {
1537 ext4_ext_drop_refs(path);
1538 kfree(path);
1539 }
1540
1541 return err;
1542}
1543
1544static void 1586static void
1545ext4_ext_put_in_cache(struct inode *inode, __u32 block, 1587ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1546 __u32 len, ext4_fsblk_t start, int type) 1588 __u32 len, ext4_fsblk_t start, int type)
1547{ 1589{
1548 struct ext4_ext_cache *cex; 1590 struct ext4_ext_cache *cex;
@@ -1561,10 +1603,11 @@ ext4_ext_put_in_cache(struct inode *inode, __u32 block,
1561 */ 1603 */
1562static void 1604static void
1563ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, 1605ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1564 unsigned long block) 1606 ext4_lblk_t block)
1565{ 1607{
1566 int depth = ext_depth(inode); 1608 int depth = ext_depth(inode);
1567 unsigned long lblock, len; 1609 unsigned long len;
1610 ext4_lblk_t lblock;
1568 struct ext4_extent *ex; 1611 struct ext4_extent *ex;
1569 1612
1570 ex = path[depth].p_ext; 1613 ex = path[depth].p_ext;
@@ -1576,32 +1619,34 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1576 } else if (block < le32_to_cpu(ex->ee_block)) { 1619 } else if (block < le32_to_cpu(ex->ee_block)) {
1577 lblock = block; 1620 lblock = block;
1578 len = le32_to_cpu(ex->ee_block) - block; 1621 len = le32_to_cpu(ex->ee_block) - block;
1579 ext_debug("cache gap(before): %lu [%lu:%lu]", 1622 ext_debug("cache gap(before): %u [%u:%u]",
1580 (unsigned long) block, 1623 block,
1581 (unsigned long) le32_to_cpu(ex->ee_block), 1624 le32_to_cpu(ex->ee_block),
1582 (unsigned long) ext4_ext_get_actual_len(ex)); 1625 ext4_ext_get_actual_len(ex));
1583 } else if (block >= le32_to_cpu(ex->ee_block) 1626 } else if (block >= le32_to_cpu(ex->ee_block)
1584 + ext4_ext_get_actual_len(ex)) { 1627 + ext4_ext_get_actual_len(ex)) {
1628 ext4_lblk_t next;
1585 lblock = le32_to_cpu(ex->ee_block) 1629 lblock = le32_to_cpu(ex->ee_block)
1586 + ext4_ext_get_actual_len(ex); 1630 + ext4_ext_get_actual_len(ex);
1587 len = ext4_ext_next_allocated_block(path); 1631
1588 ext_debug("cache gap(after): [%lu:%lu] %lu", 1632 next = ext4_ext_next_allocated_block(path);
1589 (unsigned long) le32_to_cpu(ex->ee_block), 1633 ext_debug("cache gap(after): [%u:%u] %u",
1590 (unsigned long) ext4_ext_get_actual_len(ex), 1634 le32_to_cpu(ex->ee_block),
1591 (unsigned long) block); 1635 ext4_ext_get_actual_len(ex),
1592 BUG_ON(len == lblock); 1636 block);
1593 len = len - lblock; 1637 BUG_ON(next == lblock);
1638 len = next - lblock;
1594 } else { 1639 } else {
1595 lblock = len = 0; 1640 lblock = len = 0;
1596 BUG(); 1641 BUG();
1597 } 1642 }
1598 1643
1599 ext_debug(" -> %lu:%lu\n", (unsigned long) lblock, len); 1644 ext_debug(" -> %u:%lu\n", lblock, len);
1600 ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); 1645 ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
1601} 1646}
1602 1647
1603static int 1648static int
1604ext4_ext_in_cache(struct inode *inode, unsigned long block, 1649ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1605 struct ext4_extent *ex) 1650 struct ext4_extent *ex)
1606{ 1651{
1607 struct ext4_ext_cache *cex; 1652 struct ext4_ext_cache *cex;
@@ -1618,11 +1663,9 @@ ext4_ext_in_cache(struct inode *inode, unsigned long block,
1618 ex->ee_block = cpu_to_le32(cex->ec_block); 1663 ex->ee_block = cpu_to_le32(cex->ec_block);
1619 ext4_ext_store_pblock(ex, cex->ec_start); 1664 ext4_ext_store_pblock(ex, cex->ec_start);
1620 ex->ee_len = cpu_to_le16(cex->ec_len); 1665 ex->ee_len = cpu_to_le16(cex->ec_len);
1621 ext_debug("%lu cached by %lu:%lu:%llu\n", 1666 ext_debug("%u cached by %u:%u:%llu\n",
1622 (unsigned long) block, 1667 block,
1623 (unsigned long) cex->ec_block, 1668 cex->ec_block, cex->ec_len, cex->ec_start);
1624 (unsigned long) cex->ec_len,
1625 cex->ec_start);
1626 return cex->ec_type; 1669 return cex->ec_type;
1627 } 1670 }
1628 1671
@@ -1636,7 +1679,7 @@ ext4_ext_in_cache(struct inode *inode, unsigned long block,
1636 * It's used in truncate case only, thus all requests are for 1679 * It's used in truncate case only, thus all requests are for
1637 * last index in the block only. 1680 * last index in the block only.
1638 */ 1681 */
1639int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 1682static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1640 struct ext4_ext_path *path) 1683 struct ext4_ext_path *path)
1641{ 1684{
1642 struct buffer_head *bh; 1685 struct buffer_head *bh;
@@ -1657,7 +1700,7 @@ int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1657 ext_debug("index is empty, remove it, free block %llu\n", leaf); 1700 ext_debug("index is empty, remove it, free block %llu\n", leaf);
1658 bh = sb_find_get_block(inode->i_sb, leaf); 1701 bh = sb_find_get_block(inode->i_sb, leaf);
1659 ext4_forget(handle, 1, inode, bh, leaf); 1702 ext4_forget(handle, 1, inode, bh, leaf);
1660 ext4_free_blocks(handle, inode, leaf, 1); 1703 ext4_free_blocks(handle, inode, leaf, 1, 1);
1661 return err; 1704 return err;
1662} 1705}
1663 1706
@@ -1666,7 +1709,7 @@ int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1666 * This routine returns max. credits that the extent tree can consume. 1709 * This routine returns max. credits that the extent tree can consume.
1667 * It should be OK for low-performance paths like ->writepage() 1710 * It should be OK for low-performance paths like ->writepage()
1668 * To allow many writing processes to fit into a single transaction, 1711 * To allow many writing processes to fit into a single transaction,
1669 * the caller should calculate credits under truncate_mutex and 1712 * the caller should calculate credits under i_data_sem and
1670 * pass the actual path. 1713 * pass the actual path.
1671 */ 1714 */
1672int ext4_ext_calc_credits_for_insert(struct inode *inode, 1715int ext4_ext_calc_credits_for_insert(struct inode *inode,
@@ -1714,12 +1757,14 @@ int ext4_ext_calc_credits_for_insert(struct inode *inode,
1714 1757
1715static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 1758static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1716 struct ext4_extent *ex, 1759 struct ext4_extent *ex,
1717 unsigned long from, unsigned long to) 1760 ext4_lblk_t from, ext4_lblk_t to)
1718{ 1761{
1719 struct buffer_head *bh; 1762 struct buffer_head *bh;
1720 unsigned short ee_len = ext4_ext_get_actual_len(ex); 1763 unsigned short ee_len = ext4_ext_get_actual_len(ex);
1721 int i; 1764 int i, metadata = 0;
1722 1765
1766 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1767 metadata = 1;
1723#ifdef EXTENTS_STATS 1768#ifdef EXTENTS_STATS
1724 { 1769 {
1725 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1770 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1738,42 +1783,45 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1738 if (from >= le32_to_cpu(ex->ee_block) 1783 if (from >= le32_to_cpu(ex->ee_block)
1739 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 1784 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
1740 /* tail removal */ 1785 /* tail removal */
1741 unsigned long num; 1786 ext4_lblk_t num;
1742 ext4_fsblk_t start; 1787 ext4_fsblk_t start;
1788
1743 num = le32_to_cpu(ex->ee_block) + ee_len - from; 1789 num = le32_to_cpu(ex->ee_block) + ee_len - from;
1744 start = ext_pblock(ex) + ee_len - num; 1790 start = ext_pblock(ex) + ee_len - num;
1745 ext_debug("free last %lu blocks starting %llu\n", num, start); 1791 ext_debug("free last %u blocks starting %llu\n", num, start);
1746 for (i = 0; i < num; i++) { 1792 for (i = 0; i < num; i++) {
1747 bh = sb_find_get_block(inode->i_sb, start + i); 1793 bh = sb_find_get_block(inode->i_sb, start + i);
1748 ext4_forget(handle, 0, inode, bh, start + i); 1794 ext4_forget(handle, 0, inode, bh, start + i);
1749 } 1795 }
1750 ext4_free_blocks(handle, inode, start, num); 1796 ext4_free_blocks(handle, inode, start, num, metadata);
1751 } else if (from == le32_to_cpu(ex->ee_block) 1797 } else if (from == le32_to_cpu(ex->ee_block)
1752 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 1798 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
1753 printk("strange request: removal %lu-%lu from %u:%u\n", 1799 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
1754 from, to, le32_to_cpu(ex->ee_block), ee_len); 1800 from, to, le32_to_cpu(ex->ee_block), ee_len);
1755 } else { 1801 } else {
1756 printk("strange request: removal(2) %lu-%lu from %u:%u\n", 1802 printk(KERN_INFO "strange request: removal(2) "
1757 from, to, le32_to_cpu(ex->ee_block), ee_len); 1803 "%u-%u from %u:%u\n",
1804 from, to, le32_to_cpu(ex->ee_block), ee_len);
1758 } 1805 }
1759 return 0; 1806 return 0;
1760} 1807}
1761 1808
1762static int 1809static int
1763ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 1810ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1764 struct ext4_ext_path *path, unsigned long start) 1811 struct ext4_ext_path *path, ext4_lblk_t start)
1765{ 1812{
1766 int err = 0, correct_index = 0; 1813 int err = 0, correct_index = 0;
1767 int depth = ext_depth(inode), credits; 1814 int depth = ext_depth(inode), credits;
1768 struct ext4_extent_header *eh; 1815 struct ext4_extent_header *eh;
1769 unsigned a, b, block, num; 1816 ext4_lblk_t a, b, block;
1770 unsigned long ex_ee_block; 1817 unsigned num;
1818 ext4_lblk_t ex_ee_block;
1771 unsigned short ex_ee_len; 1819 unsigned short ex_ee_len;
1772 unsigned uninitialized = 0; 1820 unsigned uninitialized = 0;
1773 struct ext4_extent *ex; 1821 struct ext4_extent *ex;
1774 1822
1775 /* the header must be checked already in ext4_ext_remove_space() */ 1823 /* the header must be checked already in ext4_ext_remove_space() */
1776 ext_debug("truncate since %lu in leaf\n", start); 1824 ext_debug("truncate since %u in leaf\n", start);
1777 if (!path[depth].p_hdr) 1825 if (!path[depth].p_hdr)
1778 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 1826 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
1779 eh = path[depth].p_hdr; 1827 eh = path[depth].p_hdr;
@@ -1904,7 +1952,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
1904 return 1; 1952 return 1;
1905} 1953}
1906 1954
1907int ext4_ext_remove_space(struct inode *inode, unsigned long start) 1955static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
1908{ 1956{
1909 struct super_block *sb = inode->i_sb; 1957 struct super_block *sb = inode->i_sb;
1910 int depth = ext_depth(inode); 1958 int depth = ext_depth(inode);
@@ -1912,7 +1960,7 @@ int ext4_ext_remove_space(struct inode *inode, unsigned long start)
1912 handle_t *handle; 1960 handle_t *handle;
1913 int i = 0, err = 0; 1961 int i = 0, err = 0;
1914 1962
1915 ext_debug("truncate since %lu\n", start); 1963 ext_debug("truncate since %u\n", start);
1916 1964
1917 /* probably first extent we're gonna free will be last in block */ 1965 /* probably first extent we're gonna free will be last in block */
1918 handle = ext4_journal_start(inode, depth + 1); 1966 handle = ext4_journal_start(inode, depth + 1);
@@ -2094,17 +2142,19 @@ void ext4_ext_release(struct super_block *sb)
2094 * b> Splits in two extents: Write is happening at either end of the extent 2142 * b> Splits in two extents: Write is happening at either end of the extent
2095 * c> Splits in three extents: Somone is writing in middle of the extent 2143 * c> Splits in three extents: Somone is writing in middle of the extent
2096 */ 2144 */
2097int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, 2145static int ext4_ext_convert_to_initialized(handle_t *handle,
2098 struct ext4_ext_path *path, 2146 struct inode *inode,
2099 ext4_fsblk_t iblock, 2147 struct ext4_ext_path *path,
2100 unsigned long max_blocks) 2148 ext4_lblk_t iblock,
2149 unsigned long max_blocks)
2101{ 2150{
2102 struct ext4_extent *ex, newex; 2151 struct ext4_extent *ex, newex;
2103 struct ext4_extent *ex1 = NULL; 2152 struct ext4_extent *ex1 = NULL;
2104 struct ext4_extent *ex2 = NULL; 2153 struct ext4_extent *ex2 = NULL;
2105 struct ext4_extent *ex3 = NULL; 2154 struct ext4_extent *ex3 = NULL;
2106 struct ext4_extent_header *eh; 2155 struct ext4_extent_header *eh;
2107 unsigned int allocated, ee_block, ee_len, depth; 2156 ext4_lblk_t ee_block;
2157 unsigned int allocated, ee_len, depth;
2108 ext4_fsblk_t newblock; 2158 ext4_fsblk_t newblock;
2109 int err = 0; 2159 int err = 0;
2110 int ret = 0; 2160 int ret = 0;
@@ -2225,8 +2275,13 @@ out:
2225 return err ? err : allocated; 2275 return err ? err : allocated;
2226} 2276}
2227 2277
2278/*
2279 * Need to be called with
2280 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
2281 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
2282 */
2228int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 2283int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2229 ext4_fsblk_t iblock, 2284 ext4_lblk_t iblock,
2230 unsigned long max_blocks, struct buffer_head *bh_result, 2285 unsigned long max_blocks, struct buffer_head *bh_result,
2231 int create, int extend_disksize) 2286 int create, int extend_disksize)
2232{ 2287{
@@ -2236,11 +2291,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2236 ext4_fsblk_t goal, newblock; 2291 ext4_fsblk_t goal, newblock;
2237 int err = 0, depth, ret; 2292 int err = 0, depth, ret;
2238 unsigned long allocated = 0; 2293 unsigned long allocated = 0;
2294 struct ext4_allocation_request ar;
2239 2295
2240 __clear_bit(BH_New, &bh_result->b_state); 2296 __clear_bit(BH_New, &bh_result->b_state);
2241 ext_debug("blocks %d/%lu requested for inode %u\n", (int) iblock, 2297 ext_debug("blocks %u/%lu requested for inode %u\n",
2242 max_blocks, (unsigned) inode->i_ino); 2298 iblock, max_blocks, inode->i_ino);
2243 mutex_lock(&EXT4_I(inode)->truncate_mutex);
2244 2299
2245 /* check in cache */ 2300 /* check in cache */
2246 goal = ext4_ext_in_cache(inode, iblock, &newex); 2301 goal = ext4_ext_in_cache(inode, iblock, &newex);
@@ -2260,7 +2315,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2260 - le32_to_cpu(newex.ee_block) 2315 - le32_to_cpu(newex.ee_block)
2261 + ext_pblock(&newex); 2316 + ext_pblock(&newex);
2262 /* number of remaining blocks in the extent */ 2317 /* number of remaining blocks in the extent */
2263 allocated = le16_to_cpu(newex.ee_len) - 2318 allocated = ext4_ext_get_actual_len(&newex) -
2264 (iblock - le32_to_cpu(newex.ee_block)); 2319 (iblock - le32_to_cpu(newex.ee_block));
2265 goto out; 2320 goto out;
2266 } else { 2321 } else {
@@ -2288,7 +2343,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2288 2343
2289 ex = path[depth].p_ext; 2344 ex = path[depth].p_ext;
2290 if (ex) { 2345 if (ex) {
2291 unsigned long ee_block = le32_to_cpu(ex->ee_block); 2346 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
2292 ext4_fsblk_t ee_start = ext_pblock(ex); 2347 ext4_fsblk_t ee_start = ext_pblock(ex);
2293 unsigned short ee_len; 2348 unsigned short ee_len;
2294 2349
@@ -2302,7 +2357,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2302 newblock = iblock - ee_block + ee_start; 2357 newblock = iblock - ee_block + ee_start;
2303 /* number of remaining blocks in the extent */ 2358 /* number of remaining blocks in the extent */
2304 allocated = ee_len - (iblock - ee_block); 2359 allocated = ee_len - (iblock - ee_block);
2305 ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock, 2360 ext_debug("%u fit into %lu:%d -> %llu\n", iblock,
2306 ee_block, ee_len, newblock); 2361 ee_block, ee_len, newblock);
2307 2362
2308 /* Do not put uninitialized extent in the cache */ 2363 /* Do not put uninitialized extent in the cache */
@@ -2320,9 +2375,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2320 ret = ext4_ext_convert_to_initialized(handle, inode, 2375 ret = ext4_ext_convert_to_initialized(handle, inode,
2321 path, iblock, 2376 path, iblock,
2322 max_blocks); 2377 max_blocks);
2323 if (ret <= 0) 2378 if (ret <= 0) {
2379 err = ret;
2324 goto out2; 2380 goto out2;
2325 else 2381 } else
2326 allocated = ret; 2382 allocated = ret;
2327 goto outnew; 2383 goto outnew;
2328 } 2384 }
@@ -2347,8 +2403,15 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2347 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info)) 2403 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2348 ext4_init_block_alloc_info(inode); 2404 ext4_init_block_alloc_info(inode);
2349 2405
2350 /* allocate new block */ 2406 /* find neighbour allocated blocks */
2351 goal = ext4_ext_find_goal(inode, path, iblock); 2407 ar.lleft = iblock;
2408 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
2409 if (err)
2410 goto out2;
2411 ar.lright = iblock;
2412 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
2413 if (err)
2414 goto out2;
2352 2415
2353 /* 2416 /*
2354 * See if request is beyond maximum number of blocks we can have in 2417 * See if request is beyond maximum number of blocks we can have in
@@ -2368,10 +2431,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2368 newex.ee_len = cpu_to_le16(max_blocks); 2431 newex.ee_len = cpu_to_le16(max_blocks);
2369 err = ext4_ext_check_overlap(inode, &newex, path); 2432 err = ext4_ext_check_overlap(inode, &newex, path);
2370 if (err) 2433 if (err)
2371 allocated = le16_to_cpu(newex.ee_len); 2434 allocated = ext4_ext_get_actual_len(&newex);
2372 else 2435 else
2373 allocated = max_blocks; 2436 allocated = max_blocks;
2374 newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err); 2437
2438 /* allocate new block */
2439 ar.inode = inode;
2440 ar.goal = ext4_ext_find_goal(inode, path, iblock);
2441 ar.logical = iblock;
2442 ar.len = allocated;
2443 if (S_ISREG(inode->i_mode))
2444 ar.flags = EXT4_MB_HINT_DATA;
2445 else
2446 /* disable in-core preallocation for non-regular files */
2447 ar.flags = 0;
2448 newblock = ext4_mb_new_blocks(handle, &ar, &err);
2375 if (!newblock) 2449 if (!newblock)
2376 goto out2; 2450 goto out2;
2377 ext_debug("allocate new block: goal %llu, found %llu/%lu\n", 2451 ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
@@ -2379,14 +2453,17 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2379 2453
2380 /* try to insert new extent into found leaf and return */ 2454 /* try to insert new extent into found leaf and return */
2381 ext4_ext_store_pblock(&newex, newblock); 2455 ext4_ext_store_pblock(&newex, newblock);
2382 newex.ee_len = cpu_to_le16(allocated); 2456 newex.ee_len = cpu_to_le16(ar.len);
2383 if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ 2457 if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
2384 ext4_ext_mark_uninitialized(&newex); 2458 ext4_ext_mark_uninitialized(&newex);
2385 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2459 err = ext4_ext_insert_extent(handle, inode, path, &newex);
2386 if (err) { 2460 if (err) {
2387 /* free data blocks we just allocated */ 2461 /* free data blocks we just allocated */
2462 /* not a good idea to call discard here directly,
2463 * but otherwise we'd need to call it every free() */
2464 ext4_mb_discard_inode_preallocations(inode);
2388 ext4_free_blocks(handle, inode, ext_pblock(&newex), 2465 ext4_free_blocks(handle, inode, ext_pblock(&newex),
2389 le16_to_cpu(newex.ee_len)); 2466 ext4_ext_get_actual_len(&newex), 0);
2390 goto out2; 2467 goto out2;
2391 } 2468 }
2392 2469
@@ -2395,6 +2472,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2395 2472
2396 /* previous routine could use block we allocated */ 2473 /* previous routine could use block we allocated */
2397 newblock = ext_pblock(&newex); 2474 newblock = ext_pblock(&newex);
2475 allocated = ext4_ext_get_actual_len(&newex);
2398outnew: 2476outnew:
2399 __set_bit(BH_New, &bh_result->b_state); 2477 __set_bit(BH_New, &bh_result->b_state);
2400 2478
@@ -2414,8 +2492,6 @@ out2:
2414 ext4_ext_drop_refs(path); 2492 ext4_ext_drop_refs(path);
2415 kfree(path); 2493 kfree(path);
2416 } 2494 }
2417 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
2418
2419 return err ? err : allocated; 2495 return err ? err : allocated;
2420} 2496}
2421 2497
@@ -2423,7 +2499,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
2423{ 2499{
2424 struct address_space *mapping = inode->i_mapping; 2500 struct address_space *mapping = inode->i_mapping;
2425 struct super_block *sb = inode->i_sb; 2501 struct super_block *sb = inode->i_sb;
2426 unsigned long last_block; 2502 ext4_lblk_t last_block;
2427 handle_t *handle; 2503 handle_t *handle;
2428 int err = 0; 2504 int err = 0;
2429 2505
@@ -2445,9 +2521,11 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
2445 if (page) 2521 if (page)
2446 ext4_block_truncate_page(handle, page, mapping, inode->i_size); 2522 ext4_block_truncate_page(handle, page, mapping, inode->i_size);
2447 2523
2448 mutex_lock(&EXT4_I(inode)->truncate_mutex); 2524 down_write(&EXT4_I(inode)->i_data_sem);
2449 ext4_ext_invalidate_cache(inode); 2525 ext4_ext_invalidate_cache(inode);
2450 2526
2527 ext4_mb_discard_inode_preallocations(inode);
2528
2451 /* 2529 /*
2452 * TODO: optimization is possible here. 2530 * TODO: optimization is possible here.
2453 * Probably we need not scan at all, 2531 * Probably we need not scan at all,
@@ -2481,7 +2559,7 @@ out_stop:
2481 if (inode->i_nlink) 2559 if (inode->i_nlink)
2482 ext4_orphan_del(handle, inode); 2560 ext4_orphan_del(handle, inode);
2483 2561
2484 mutex_unlock(&EXT4_I(inode)->truncate_mutex); 2562 up_write(&EXT4_I(inode)->i_data_sem);
2485 ext4_journal_stop(handle); 2563 ext4_journal_stop(handle);
2486} 2564}
2487 2565
@@ -2516,7 +2594,8 @@ int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
2516long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 2594long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
2517{ 2595{
2518 handle_t *handle; 2596 handle_t *handle;
2519 ext4_fsblk_t block, max_blocks; 2597 ext4_lblk_t block;
2598 unsigned long max_blocks;
2520 ext4_fsblk_t nblocks = 0; 2599 ext4_fsblk_t nblocks = 0;
2521 int ret = 0; 2600 int ret = 0;
2522 int ret2 = 0; 2601 int ret2 = 0;
@@ -2544,6 +2623,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
2544 * modify 1 super block, 1 block bitmap and 1 group descriptor. 2623 * modify 1 super block, 1 block bitmap and 1 group descriptor.
2545 */ 2624 */
2546 credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; 2625 credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
2626 down_write((&EXT4_I(inode)->i_data_sem));
2547retry: 2627retry:
2548 while (ret >= 0 && ret < max_blocks) { 2628 while (ret >= 0 && ret < max_blocks) {
2549 block = block + ret; 2629 block = block + ret;
@@ -2557,12 +2637,12 @@ retry:
2557 ret = ext4_ext_get_blocks(handle, inode, block, 2637 ret = ext4_ext_get_blocks(handle, inode, block,
2558 max_blocks, &map_bh, 2638 max_blocks, &map_bh,
2559 EXT4_CREATE_UNINITIALIZED_EXT, 0); 2639 EXT4_CREATE_UNINITIALIZED_EXT, 0);
2560 WARN_ON(!ret); 2640 WARN_ON(ret <= 0);
2561 if (!ret) { 2641 if (ret <= 0) {
2562 ext4_error(inode->i_sb, "ext4_fallocate", 2642 ext4_error(inode->i_sb, "ext4_fallocate",
2563 "ext4_ext_get_blocks returned 0! inode#%lu" 2643 "ext4_ext_get_blocks returned error: "
2564 ", block=%llu, max_blocks=%llu", 2644 "inode#%lu, block=%u, max_blocks=%lu",
2565 inode->i_ino, block, max_blocks); 2645 inode->i_ino, block, max_blocks);
2566 ret = -EIO; 2646 ret = -EIO;
2567 ext4_mark_inode_dirty(handle, inode); 2647 ext4_mark_inode_dirty(handle, inode);
2568 ret2 = ext4_journal_stop(handle); 2648 ret2 = ext4_journal_stop(handle);
@@ -2600,6 +2680,7 @@ retry:
2600 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2680 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
2601 goto retry; 2681 goto retry;
2602 2682
2683 up_write((&EXT4_I(inode)->i_data_sem));
2603 /* 2684 /*
2604 * Time to update the file size. 2685 * Time to update the file size.
2605 * Update only when preallocation was requested beyond the file size. 2686 * Update only when preallocation was requested beyond the file size.
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1a81cd66d63b..ac35ec58db55 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -37,9 +37,9 @@ static int ext4_release_file (struct inode * inode, struct file * filp)
37 if ((filp->f_mode & FMODE_WRITE) && 37 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1)) 38 (atomic_read(&inode->i_writecount) == 1))
39 { 39 {
40 mutex_lock(&EXT4_I(inode)->truncate_mutex); 40 down_write(&EXT4_I(inode)->i_data_sem);
41 ext4_discard_reservation(inode); 41 ext4_discard_reservation(inode);
42 mutex_unlock(&EXT4_I(inode)->truncate_mutex); 42 up_write(&EXT4_I(inode)->i_data_sem);
43 } 43 }
44 if (is_dx(inode) && filp->private_data) 44 if (is_dx(inode) && filp->private_data)
45 ext4_htree_free_dir_info(filp->private_data); 45 ext4_htree_free_dir_info(filp->private_data);
@@ -56,8 +56,25 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
56 ssize_t ret; 56 ssize_t ret;
57 int err; 57 int err;
58 58
59 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 59 /*
60 * If we have encountered a bitmap-format file, the size limit
61 * is smaller than s_maxbytes, which is for extent-mapped files.
62 */
63
64 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
65 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
66 size_t length = iov_length(iov, nr_segs);
60 67
68 if (pos > sbi->s_bitmap_maxbytes)
69 return -EFBIG;
70
71 if (pos + length > sbi->s_bitmap_maxbytes) {
72 nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
73 sbi->s_bitmap_maxbytes - pos);
74 }
75 }
76
77 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
61 /* 78 /*
62 * Skip flushing if there was an error, or if nothing was written. 79 * Skip flushing if there was an error, or if nothing was written.
63 */ 80 */
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 1577910bb58b..7eb0604e7eea 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -14,14 +14,16 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
14extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, 14extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
15 struct ext4_group_desc *gdp); 15 struct ext4_group_desc *gdp);
16struct buffer_head *read_block_bitmap(struct super_block *sb, 16struct buffer_head *read_block_bitmap(struct super_block *sb,
17 unsigned int block_group); 17 ext4_group_t block_group);
18extern unsigned ext4_init_block_bitmap(struct super_block *sb, 18extern unsigned ext4_init_block_bitmap(struct super_block *sb,
19 struct buffer_head *bh, int group, 19 struct buffer_head *bh,
20 ext4_group_t group,
20 struct ext4_group_desc *desc); 21 struct ext4_group_desc *desc);
21#define ext4_free_blocks_after_init(sb, group, desc) \ 22#define ext4_free_blocks_after_init(sb, group, desc) \
22 ext4_init_block_bitmap(sb, NULL, group, desc) 23 ext4_init_block_bitmap(sb, NULL, group, desc)
23extern unsigned ext4_init_inode_bitmap(struct super_block *sb, 24extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
24 struct buffer_head *bh, int group, 25 struct buffer_head *bh,
26 ext4_group_t group,
25 struct ext4_group_desc *desc); 27 struct ext4_group_desc *desc);
26extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); 28extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
27#endif /* _LINUX_EXT4_GROUP_H */ 29#endif /* _LINUX_EXT4_GROUP_H */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c61f37fd3f05..575b5215c808 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -64,8 +64,8 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
64} 64}
65 65
66/* Initializes an uninitialized inode bitmap */ 66/* Initializes an uninitialized inode bitmap */
67unsigned ext4_init_inode_bitmap(struct super_block *sb, 67unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
68 struct buffer_head *bh, int block_group, 68 ext4_group_t block_group,
69 struct ext4_group_desc *gdp) 69 struct ext4_group_desc *gdp)
70{ 70{
71 struct ext4_sb_info *sbi = EXT4_SB(sb); 71 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -75,7 +75,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb,
75 /* If checksum is bad mark all blocks and inodes use to prevent 75 /* If checksum is bad mark all blocks and inodes use to prevent
76 * allocation, essentially implementing a per-group read-only flag. */ 76 * allocation, essentially implementing a per-group read-only flag. */
77 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 77 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
78 ext4_error(sb, __FUNCTION__, "Checksum bad for group %u\n", 78 ext4_error(sb, __FUNCTION__, "Checksum bad for group %lu\n",
79 block_group); 79 block_group);
80 gdp->bg_free_blocks_count = 0; 80 gdp->bg_free_blocks_count = 0;
81 gdp->bg_free_inodes_count = 0; 81 gdp->bg_free_inodes_count = 0;
@@ -98,7 +98,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb,
98 * Return buffer_head of bitmap on success or NULL. 98 * Return buffer_head of bitmap on success or NULL.
99 */ 99 */
100static struct buffer_head * 100static struct buffer_head *
101read_inode_bitmap(struct super_block * sb, unsigned long block_group) 101read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
102{ 102{
103 struct ext4_group_desc *desc; 103 struct ext4_group_desc *desc;
104 struct buffer_head *bh = NULL; 104 struct buffer_head *bh = NULL;
@@ -152,7 +152,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
152 unsigned long ino; 152 unsigned long ino;
153 struct buffer_head *bitmap_bh = NULL; 153 struct buffer_head *bitmap_bh = NULL;
154 struct buffer_head *bh2; 154 struct buffer_head *bh2;
155 unsigned long block_group; 155 ext4_group_t block_group;
156 unsigned long bit; 156 unsigned long bit;
157 struct ext4_group_desc * gdp; 157 struct ext4_group_desc * gdp;
158 struct ext4_super_block * es; 158 struct ext4_super_block * es;
@@ -260,12 +260,14 @@ error_return:
260 * For other inodes, search forward from the parent directory\'s block 260 * For other inodes, search forward from the parent directory\'s block
261 * group to find a free inode. 261 * group to find a free inode.
262 */ 262 */
263static int find_group_dir(struct super_block *sb, struct inode *parent) 263static int find_group_dir(struct super_block *sb, struct inode *parent,
264 ext4_group_t *best_group)
264{ 265{
265 int ngroups = EXT4_SB(sb)->s_groups_count; 266 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
266 unsigned int freei, avefreei; 267 unsigned int freei, avefreei;
267 struct ext4_group_desc *desc, *best_desc = NULL; 268 struct ext4_group_desc *desc, *best_desc = NULL;
268 int group, best_group = -1; 269 ext4_group_t group;
270 int ret = -1;
269 271
270 freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter); 272 freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
271 avefreei = freei / ngroups; 273 avefreei = freei / ngroups;
@@ -279,11 +281,12 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
279 if (!best_desc || 281 if (!best_desc ||
280 (le16_to_cpu(desc->bg_free_blocks_count) > 282 (le16_to_cpu(desc->bg_free_blocks_count) >
281 le16_to_cpu(best_desc->bg_free_blocks_count))) { 283 le16_to_cpu(best_desc->bg_free_blocks_count))) {
282 best_group = group; 284 *best_group = group;
283 best_desc = desc; 285 best_desc = desc;
286 ret = 0;
284 } 287 }
285 } 288 }
286 return best_group; 289 return ret;
287} 290}
288 291
289/* 292/*
@@ -314,12 +317,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
314#define INODE_COST 64 317#define INODE_COST 64
315#define BLOCK_COST 256 318#define BLOCK_COST 256
316 319
317static int find_group_orlov(struct super_block *sb, struct inode *parent) 320static int find_group_orlov(struct super_block *sb, struct inode *parent,
321 ext4_group_t *group)
318{ 322{
319 int parent_group = EXT4_I(parent)->i_block_group; 323 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
320 struct ext4_sb_info *sbi = EXT4_SB(sb); 324 struct ext4_sb_info *sbi = EXT4_SB(sb);
321 struct ext4_super_block *es = sbi->s_es; 325 struct ext4_super_block *es = sbi->s_es;
322 int ngroups = sbi->s_groups_count; 326 ext4_group_t ngroups = sbi->s_groups_count;
323 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 327 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
324 unsigned int freei, avefreei; 328 unsigned int freei, avefreei;
325 ext4_fsblk_t freeb, avefreeb; 329 ext4_fsblk_t freeb, avefreeb;
@@ -327,7 +331,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
327 unsigned int ndirs; 331 unsigned int ndirs;
328 int max_debt, max_dirs, min_inodes; 332 int max_debt, max_dirs, min_inodes;
329 ext4_grpblk_t min_blocks; 333 ext4_grpblk_t min_blocks;
330 int group = -1, i; 334 ext4_group_t i;
331 struct ext4_group_desc *desc; 335 struct ext4_group_desc *desc;
332 336
333 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); 337 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
@@ -340,13 +344,14 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
340 if ((parent == sb->s_root->d_inode) || 344 if ((parent == sb->s_root->d_inode) ||
341 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { 345 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
342 int best_ndir = inodes_per_group; 346 int best_ndir = inodes_per_group;
343 int best_group = -1; 347 ext4_group_t grp;
348 int ret = -1;
344 349
345 get_random_bytes(&group, sizeof(group)); 350 get_random_bytes(&grp, sizeof(grp));
346 parent_group = (unsigned)group % ngroups; 351 parent_group = (unsigned)grp % ngroups;
347 for (i = 0; i < ngroups; i++) { 352 for (i = 0; i < ngroups; i++) {
348 group = (parent_group + i) % ngroups; 353 grp = (parent_group + i) % ngroups;
349 desc = ext4_get_group_desc (sb, group, NULL); 354 desc = ext4_get_group_desc(sb, grp, NULL);
350 if (!desc || !desc->bg_free_inodes_count) 355 if (!desc || !desc->bg_free_inodes_count)
351 continue; 356 continue;
352 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) 357 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
@@ -355,11 +360,12 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
355 continue; 360 continue;
356 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) 361 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
357 continue; 362 continue;
358 best_group = group; 363 *group = grp;
364 ret = 0;
359 best_ndir = le16_to_cpu(desc->bg_used_dirs_count); 365 best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
360 } 366 }
361 if (best_group >= 0) 367 if (ret == 0)
362 return best_group; 368 return ret;
363 goto fallback; 369 goto fallback;
364 } 370 }
365 371
@@ -380,8 +386,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
380 max_debt = 1; 386 max_debt = 1;
381 387
382 for (i = 0; i < ngroups; i++) { 388 for (i = 0; i < ngroups; i++) {
383 group = (parent_group + i) % ngroups; 389 *group = (parent_group + i) % ngroups;
384 desc = ext4_get_group_desc (sb, group, NULL); 390 desc = ext4_get_group_desc(sb, *group, NULL);
385 if (!desc || !desc->bg_free_inodes_count) 391 if (!desc || !desc->bg_free_inodes_count)
386 continue; 392 continue;
387 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) 393 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
@@ -390,17 +396,16 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
390 continue; 396 continue;
391 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) 397 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
392 continue; 398 continue;
393 return group; 399 return 0;
394 } 400 }
395 401
396fallback: 402fallback:
397 for (i = 0; i < ngroups; i++) { 403 for (i = 0; i < ngroups; i++) {
398 group = (parent_group + i) % ngroups; 404 *group = (parent_group + i) % ngroups;
399 desc = ext4_get_group_desc (sb, group, NULL); 405 desc = ext4_get_group_desc(sb, *group, NULL);
400 if (!desc || !desc->bg_free_inodes_count) 406 if (desc && desc->bg_free_inodes_count &&
401 continue; 407 le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
402 if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) 408 return 0;
403 return group;
404 } 409 }
405 410
406 if (avefreei) { 411 if (avefreei) {
@@ -415,21 +420,22 @@ fallback:
415 return -1; 420 return -1;
416} 421}
417 422
418static int find_group_other(struct super_block *sb, struct inode *parent) 423static int find_group_other(struct super_block *sb, struct inode *parent,
424 ext4_group_t *group)
419{ 425{
420 int parent_group = EXT4_I(parent)->i_block_group; 426 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
421 int ngroups = EXT4_SB(sb)->s_groups_count; 427 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
422 struct ext4_group_desc *desc; 428 struct ext4_group_desc *desc;
423 int group, i; 429 ext4_group_t i;
424 430
425 /* 431 /*
426 * Try to place the inode in its parent directory 432 * Try to place the inode in its parent directory
427 */ 433 */
428 group = parent_group; 434 *group = parent_group;
429 desc = ext4_get_group_desc (sb, group, NULL); 435 desc = ext4_get_group_desc(sb, *group, NULL);
430 if (desc && le16_to_cpu(desc->bg_free_inodes_count) && 436 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
431 le16_to_cpu(desc->bg_free_blocks_count)) 437 le16_to_cpu(desc->bg_free_blocks_count))
432 return group; 438 return 0;
433 439
434 /* 440 /*
435 * We're going to place this inode in a different blockgroup from its 441 * We're going to place this inode in a different blockgroup from its
@@ -440,33 +446,33 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
440 * 446 *
441 * So add our directory's i_ino into the starting point for the hash. 447 * So add our directory's i_ino into the starting point for the hash.
442 */ 448 */
443 group = (group + parent->i_ino) % ngroups; 449 *group = (*group + parent->i_ino) % ngroups;
444 450
445 /* 451 /*
446 * Use a quadratic hash to find a group with a free inode and some free 452 * Use a quadratic hash to find a group with a free inode and some free
447 * blocks. 453 * blocks.
448 */ 454 */
449 for (i = 1; i < ngroups; i <<= 1) { 455 for (i = 1; i < ngroups; i <<= 1) {
450 group += i; 456 *group += i;
451 if (group >= ngroups) 457 if (*group >= ngroups)
452 group -= ngroups; 458 *group -= ngroups;
453 desc = ext4_get_group_desc (sb, group, NULL); 459 desc = ext4_get_group_desc(sb, *group, NULL);
454 if (desc && le16_to_cpu(desc->bg_free_inodes_count) && 460 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
455 le16_to_cpu(desc->bg_free_blocks_count)) 461 le16_to_cpu(desc->bg_free_blocks_count))
456 return group; 462 return 0;
457 } 463 }
458 464
459 /* 465 /*
460 * That failed: try linear search for a free inode, even if that group 466 * That failed: try linear search for a free inode, even if that group
461 * has no free blocks. 467 * has no free blocks.
462 */ 468 */
463 group = parent_group; 469 *group = parent_group;
464 for (i = 0; i < ngroups; i++) { 470 for (i = 0; i < ngroups; i++) {
465 if (++group >= ngroups) 471 if (++*group >= ngroups)
466 group = 0; 472 *group = 0;
467 desc = ext4_get_group_desc (sb, group, NULL); 473 desc = ext4_get_group_desc(sb, *group, NULL);
468 if (desc && le16_to_cpu(desc->bg_free_inodes_count)) 474 if (desc && le16_to_cpu(desc->bg_free_inodes_count))
469 return group; 475 return 0;
470 } 476 }
471 477
472 return -1; 478 return -1;
@@ -487,16 +493,17 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
487 struct super_block *sb; 493 struct super_block *sb;
488 struct buffer_head *bitmap_bh = NULL; 494 struct buffer_head *bitmap_bh = NULL;
489 struct buffer_head *bh2; 495 struct buffer_head *bh2;
490 int group; 496 ext4_group_t group = 0;
491 unsigned long ino = 0; 497 unsigned long ino = 0;
492 struct inode * inode; 498 struct inode * inode;
493 struct ext4_group_desc * gdp = NULL; 499 struct ext4_group_desc * gdp = NULL;
494 struct ext4_super_block * es; 500 struct ext4_super_block * es;
495 struct ext4_inode_info *ei; 501 struct ext4_inode_info *ei;
496 struct ext4_sb_info *sbi; 502 struct ext4_sb_info *sbi;
497 int err = 0; 503 int ret2, err = 0;
498 struct inode *ret; 504 struct inode *ret;
499 int i, free = 0; 505 ext4_group_t i;
506 int free = 0;
500 507
501 /* Cannot create files in a deleted directory */ 508 /* Cannot create files in a deleted directory */
502 if (!dir || !dir->i_nlink) 509 if (!dir || !dir->i_nlink)
@@ -512,14 +519,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
512 es = sbi->s_es; 519 es = sbi->s_es;
513 if (S_ISDIR(mode)) { 520 if (S_ISDIR(mode)) {
514 if (test_opt (sb, OLDALLOC)) 521 if (test_opt (sb, OLDALLOC))
515 group = find_group_dir(sb, dir); 522 ret2 = find_group_dir(sb, dir, &group);
516 else 523 else
517 group = find_group_orlov(sb, dir); 524 ret2 = find_group_orlov(sb, dir, &group);
518 } else 525 } else
519 group = find_group_other(sb, dir); 526 ret2 = find_group_other(sb, dir, &group);
520 527
521 err = -ENOSPC; 528 err = -ENOSPC;
522 if (group == -1) 529 if (ret2 == -1)
523 goto out; 530 goto out;
524 531
525 for (i = 0; i < sbi->s_groups_count; i++) { 532 for (i = 0; i < sbi->s_groups_count; i++) {
@@ -583,7 +590,7 @@ got:
583 ino > EXT4_INODES_PER_GROUP(sb)) { 590 ino > EXT4_INODES_PER_GROUP(sb)) {
584 ext4_error(sb, __FUNCTION__, 591 ext4_error(sb, __FUNCTION__,
585 "reserved inode or inode > inodes count - " 592 "reserved inode or inode > inodes count - "
586 "block_group = %d, inode=%lu", group, 593 "block_group = %lu, inode=%lu", group,
587 ino + group * EXT4_INODES_PER_GROUP(sb)); 594 ino + group * EXT4_INODES_PER_GROUP(sb));
588 err = -EIO; 595 err = -EIO;
589 goto fail; 596 goto fail;
@@ -702,7 +709,6 @@ got:
702 if (!S_ISDIR(mode)) 709 if (!S_ISDIR(mode))
703 ei->i_flags &= ~EXT4_DIRSYNC_FL; 710 ei->i_flags &= ~EXT4_DIRSYNC_FL;
704 ei->i_file_acl = 0; 711 ei->i_file_acl = 0;
705 ei->i_dir_acl = 0;
706 ei->i_dtime = 0; 712 ei->i_dtime = 0;
707 ei->i_block_alloc_info = NULL; 713 ei->i_block_alloc_info = NULL;
708 ei->i_block_group = group; 714 ei->i_block_group = group;
@@ -741,13 +747,10 @@ got:
741 if (test_opt(sb, EXTENTS)) { 747 if (test_opt(sb, EXTENTS)) {
742 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 748 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
743 ext4_ext_tree_init(handle, inode); 749 ext4_ext_tree_init(handle, inode);
744 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 750 err = ext4_update_incompat_feature(handle, sb,
745 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); 751 EXT4_FEATURE_INCOMPAT_EXTENTS);
746 if (err) goto fail; 752 if (err)
747 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS); 753 goto fail;
748 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "call ext4_journal_dirty_metadata");
749 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
750 }
751 } 754 }
752 755
753 ext4_debug("allocating inode %lu\n", inode->i_ino); 756 ext4_debug("allocating inode %lu\n", inode->i_ino);
@@ -777,7 +780,7 @@ fail_drop:
777struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) 780struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
778{ 781{
779 unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); 782 unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
780 unsigned long block_group; 783 ext4_group_t block_group;
781 int bit; 784 int bit;
782 struct buffer_head *bitmap_bh = NULL; 785 struct buffer_head *bitmap_bh = NULL;
783 struct inode *inode = NULL; 786 struct inode *inode = NULL;
@@ -833,7 +836,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
833{ 836{
834 unsigned long desc_count; 837 unsigned long desc_count;
835 struct ext4_group_desc *gdp; 838 struct ext4_group_desc *gdp;
836 int i; 839 ext4_group_t i;
837#ifdef EXT4FS_DEBUG 840#ifdef EXT4FS_DEBUG
838 struct ext4_super_block *es; 841 struct ext4_super_block *es;
839 unsigned long bitmap_count, x; 842 unsigned long bitmap_count, x;
@@ -854,7 +857,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
854 continue; 857 continue;
855 858
856 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); 859 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
857 printk("group %d: stored = %d, counted = %lu\n", 860 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
858 i, le16_to_cpu(gdp->bg_free_inodes_count), x); 861 i, le16_to_cpu(gdp->bg_free_inodes_count), x);
859 bitmap_count += x; 862 bitmap_count += x;
860 } 863 }
@@ -879,7 +882,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
879unsigned long ext4_count_dirs (struct super_block * sb) 882unsigned long ext4_count_dirs (struct super_block * sb)
880{ 883{
881 unsigned long count = 0; 884 unsigned long count = 0;
882 int i; 885 ext4_group_t i;
883 886
884 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 887 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
885 struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL); 888 struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5489703d9573..bb717cbb749c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -105,7 +105,7 @@ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
105 */ 105 */
106static unsigned long blocks_for_truncate(struct inode *inode) 106static unsigned long blocks_for_truncate(struct inode *inode)
107{ 107{
108 unsigned long needed; 108 ext4_lblk_t needed;
109 109
110 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); 110 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
111 111
@@ -243,13 +243,6 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
243 p->bh = bh; 243 p->bh = bh;
244} 244}
245 245
246static int verify_chain(Indirect *from, Indirect *to)
247{
248 while (from <= to && from->key == *from->p)
249 from++;
250 return (from > to);
251}
252
253/** 246/**
254 * ext4_block_to_path - parse the block number into array of offsets 247 * ext4_block_to_path - parse the block number into array of offsets
255 * @inode: inode in question (we are only interested in its superblock) 248 * @inode: inode in question (we are only interested in its superblock)
@@ -282,7 +275,8 @@ static int verify_chain(Indirect *from, Indirect *to)
282 */ 275 */
283 276
284static int ext4_block_to_path(struct inode *inode, 277static int ext4_block_to_path(struct inode *inode,
285 long i_block, int offsets[4], int *boundary) 278 ext4_lblk_t i_block,
279 ext4_lblk_t offsets[4], int *boundary)
286{ 280{
287 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); 281 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
288 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); 282 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
@@ -313,7 +307,10 @@ static int ext4_block_to_path(struct inode *inode,
313 offsets[n++] = i_block & (ptrs - 1); 307 offsets[n++] = i_block & (ptrs - 1);
314 final = ptrs; 308 final = ptrs;
315 } else { 309 } else {
316 ext4_warning(inode->i_sb, "ext4_block_to_path", "block > big"); 310 ext4_warning(inode->i_sb, "ext4_block_to_path",
311 "block %lu > max",
312 i_block + direct_blocks +
313 indirect_blocks + double_blocks);
317 } 314 }
318 if (boundary) 315 if (boundary)
319 *boundary = final - 1 - (i_block & (ptrs - 1)); 316 *boundary = final - 1 - (i_block & (ptrs - 1));
@@ -344,12 +341,14 @@ static int ext4_block_to_path(struct inode *inode,
344 * (pointer to last triple returned, *@err == 0) 341 * (pointer to last triple returned, *@err == 0)
345 * or when it gets an IO error reading an indirect block 342 * or when it gets an IO error reading an indirect block
346 * (ditto, *@err == -EIO) 343 * (ditto, *@err == -EIO)
347 * or when it notices that chain had been changed while it was reading
348 * (ditto, *@err == -EAGAIN)
349 * or when it reads all @depth-1 indirect blocks successfully and finds 344 * or when it reads all @depth-1 indirect blocks successfully and finds
350 * the whole chain, all way to the data (returns %NULL, *err == 0). 345 * the whole chain, all way to the data (returns %NULL, *err == 0).
346 *
347 * Need to be called with
348 * down_read(&EXT4_I(inode)->i_data_sem)
351 */ 349 */
352static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets, 350static Indirect *ext4_get_branch(struct inode *inode, int depth,
351 ext4_lblk_t *offsets,
353 Indirect chain[4], int *err) 352 Indirect chain[4], int *err)
354{ 353{
355 struct super_block *sb = inode->i_sb; 354 struct super_block *sb = inode->i_sb;
@@ -365,9 +364,6 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
365 bh = sb_bread(sb, le32_to_cpu(p->key)); 364 bh = sb_bread(sb, le32_to_cpu(p->key));
366 if (!bh) 365 if (!bh)
367 goto failure; 366 goto failure;
368 /* Reader: pointers */
369 if (!verify_chain(chain, p))
370 goto changed;
371 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); 367 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
372 /* Reader: end */ 368 /* Reader: end */
373 if (!p->key) 369 if (!p->key)
@@ -375,10 +371,6 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
375 } 371 }
376 return NULL; 372 return NULL;
377 373
378changed:
379 brelse(bh);
380 *err = -EAGAIN;
381 goto no_block;
382failure: 374failure:
383 *err = -EIO; 375 *err = -EIO;
384no_block: 376no_block:
@@ -445,7 +437,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
445 * stores it in *@goal and returns zero. 437 * stores it in *@goal and returns zero.
446 */ 438 */
447 439
448static ext4_fsblk_t ext4_find_goal(struct inode *inode, long block, 440static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
449 Indirect chain[4], Indirect *partial) 441 Indirect chain[4], Indirect *partial)
450{ 442{
451 struct ext4_block_alloc_info *block_i; 443 struct ext4_block_alloc_info *block_i;
@@ -559,7 +551,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
559 return ret; 551 return ret;
560failed_out: 552failed_out:
561 for (i = 0; i <index; i++) 553 for (i = 0; i <index; i++)
562 ext4_free_blocks(handle, inode, new_blocks[i], 1); 554 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
563 return ret; 555 return ret;
564} 556}
565 557
@@ -590,7 +582,7 @@ failed_out:
590 */ 582 */
591static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 583static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
592 int indirect_blks, int *blks, ext4_fsblk_t goal, 584 int indirect_blks, int *blks, ext4_fsblk_t goal,
593 int *offsets, Indirect *branch) 585 ext4_lblk_t *offsets, Indirect *branch)
594{ 586{
595 int blocksize = inode->i_sb->s_blocksize; 587 int blocksize = inode->i_sb->s_blocksize;
596 int i, n = 0; 588 int i, n = 0;
@@ -658,9 +650,9 @@ failed:
658 ext4_journal_forget(handle, branch[i].bh); 650 ext4_journal_forget(handle, branch[i].bh);
659 } 651 }
660 for (i = 0; i <indirect_blks; i++) 652 for (i = 0; i <indirect_blks; i++)
661 ext4_free_blocks(handle, inode, new_blocks[i], 1); 653 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
662 654
663 ext4_free_blocks(handle, inode, new_blocks[i], num); 655 ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
664 656
665 return err; 657 return err;
666} 658}
@@ -680,7 +672,7 @@ failed:
680 * chain to new block and return 0. 672 * chain to new block and return 0.
681 */ 673 */
682static int ext4_splice_branch(handle_t *handle, struct inode *inode, 674static int ext4_splice_branch(handle_t *handle, struct inode *inode,
683 long block, Indirect *where, int num, int blks) 675 ext4_lblk_t block, Indirect *where, int num, int blks)
684{ 676{
685 int i; 677 int i;
686 int err = 0; 678 int err = 0;
@@ -757,9 +749,10 @@ err_out:
757 for (i = 1; i <= num; i++) { 749 for (i = 1; i <= num; i++) {
758 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); 750 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
759 ext4_journal_forget(handle, where[i].bh); 751 ext4_journal_forget(handle, where[i].bh);
760 ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); 752 ext4_free_blocks(handle, inode,
753 le32_to_cpu(where[i-1].key), 1, 0);
761 } 754 }
762 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); 755 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
763 756
764 return err; 757 return err;
765} 758}
@@ -782,14 +775,19 @@ err_out:
782 * return > 0, # of blocks mapped or allocated. 775 * return > 0, # of blocks mapped or allocated.
783 * return = 0, if plain lookup failed. 776 * return = 0, if plain lookup failed.
784 * return < 0, error case. 777 * return < 0, error case.
778 *
779 *
780 * Need to be called with
781 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
782 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
785 */ 783 */
786int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 784int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
787 sector_t iblock, unsigned long maxblocks, 785 ext4_lblk_t iblock, unsigned long maxblocks,
788 struct buffer_head *bh_result, 786 struct buffer_head *bh_result,
789 int create, int extend_disksize) 787 int create, int extend_disksize)
790{ 788{
791 int err = -EIO; 789 int err = -EIO;
792 int offsets[4]; 790 ext4_lblk_t offsets[4];
793 Indirect chain[4]; 791 Indirect chain[4];
794 Indirect *partial; 792 Indirect *partial;
795 ext4_fsblk_t goal; 793 ext4_fsblk_t goal;
@@ -803,7 +801,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
803 801
804 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 802 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
805 J_ASSERT(handle != NULL || create == 0); 803 J_ASSERT(handle != NULL || create == 0);
806 depth = ext4_block_to_path(inode,iblock,offsets,&blocks_to_boundary); 804 depth = ext4_block_to_path(inode, iblock, offsets,
805 &blocks_to_boundary);
807 806
808 if (depth == 0) 807 if (depth == 0)
809 goto out; 808 goto out;
@@ -819,18 +818,6 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
819 while (count < maxblocks && count <= blocks_to_boundary) { 818 while (count < maxblocks && count <= blocks_to_boundary) {
820 ext4_fsblk_t blk; 819 ext4_fsblk_t blk;
821 820
822 if (!verify_chain(chain, partial)) {
823 /*
824 * Indirect block might be removed by
825 * truncate while we were reading it.
826 * Handling of that case: forget what we've
827 * got now. Flag the err as EAGAIN, so it
828 * will reread.
829 */
830 err = -EAGAIN;
831 count = 0;
832 break;
833 }
834 blk = le32_to_cpu(*(chain[depth-1].p + count)); 821 blk = le32_to_cpu(*(chain[depth-1].p + count));
835 822
836 if (blk == first_block + count) 823 if (blk == first_block + count)
@@ -838,44 +825,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
838 else 825 else
839 break; 826 break;
840 } 827 }
841 if (err != -EAGAIN) 828 goto got_it;
842 goto got_it;
843 } 829 }
844 830
845 /* Next simple case - plain lookup or failed read of indirect block */ 831 /* Next simple case - plain lookup or failed read of indirect block */
846 if (!create || err == -EIO) 832 if (!create || err == -EIO)
847 goto cleanup; 833 goto cleanup;
848 834
849 mutex_lock(&ei->truncate_mutex);
850
851 /*
852 * If the indirect block is missing while we are reading
853 * the chain(ext4_get_branch() returns -EAGAIN err), or
854 * if the chain has been changed after we grab the semaphore,
855 * (either because another process truncated this branch, or
856 * another get_block allocated this branch) re-grab the chain to see if
857 * the request block has been allocated or not.
858 *
859 * Since we already block the truncate/other get_block
860 * at this point, we will have the current copy of the chain when we
861 * splice the branch into the tree.
862 */
863 if (err == -EAGAIN || !verify_chain(chain, partial)) {
864 while (partial > chain) {
865 brelse(partial->bh);
866 partial--;
867 }
868 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
869 if (!partial) {
870 count++;
871 mutex_unlock(&ei->truncate_mutex);
872 if (err)
873 goto cleanup;
874 clear_buffer_new(bh_result);
875 goto got_it;
876 }
877 }
878
879 /* 835 /*
880 * Okay, we need to do block allocation. Lazily initialize the block 836 * Okay, we need to do block allocation. Lazily initialize the block
881 * allocation info here if necessary 837 * allocation info here if necessary
@@ -911,13 +867,12 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
911 err = ext4_splice_branch(handle, inode, iblock, 867 err = ext4_splice_branch(handle, inode, iblock,
912 partial, indirect_blks, count); 868 partial, indirect_blks, count);
913 /* 869 /*
914 * i_disksize growing is protected by truncate_mutex. Don't forget to 870 * i_disksize growing is protected by i_data_sem. Don't forget to
915 * protect it if you're about to implement concurrent 871 * protect it if you're about to implement concurrent
916 * ext4_get_block() -bzzz 872 * ext4_get_block() -bzzz
917 */ 873 */
918 if (!err && extend_disksize && inode->i_size > ei->i_disksize) 874 if (!err && extend_disksize && inode->i_size > ei->i_disksize)
919 ei->i_disksize = inode->i_size; 875 ei->i_disksize = inode->i_size;
920 mutex_unlock(&ei->truncate_mutex);
921 if (err) 876 if (err)
922 goto cleanup; 877 goto cleanup;
923 878
@@ -942,6 +897,47 @@ out:
942 897
943#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32) 898#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
944 899
900int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
901 unsigned long max_blocks, struct buffer_head *bh,
902 int create, int extend_disksize)
903{
904 int retval;
905 /*
906 * Try to see if we can get the block without requesting
907 * for new file system block.
908 */
909 down_read((&EXT4_I(inode)->i_data_sem));
910 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
911 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
912 bh, 0, 0);
913 } else {
914 retval = ext4_get_blocks_handle(handle,
915 inode, block, max_blocks, bh, 0, 0);
916 }
917 up_read((&EXT4_I(inode)->i_data_sem));
918 if (!create || (retval > 0))
919 return retval;
920
921 /*
922 * We need to allocate new blocks which will result
923 * in i_data update
924 */
925 down_write((&EXT4_I(inode)->i_data_sem));
926 /*
927 * We need to check for EXT4 here because migrate
928 * could have changed the inode type in between
929 */
930 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
931 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
932 bh, create, extend_disksize);
933 } else {
934 retval = ext4_get_blocks_handle(handle, inode, block,
935 max_blocks, bh, create, extend_disksize);
936 }
937 up_write((&EXT4_I(inode)->i_data_sem));
938 return retval;
939}
940
945static int ext4_get_block(struct inode *inode, sector_t iblock, 941static int ext4_get_block(struct inode *inode, sector_t iblock,
946 struct buffer_head *bh_result, int create) 942 struct buffer_head *bh_result, int create)
947{ 943{
@@ -996,7 +992,7 @@ get_block:
996 * `handle' can be NULL if create is zero 992 * `handle' can be NULL if create is zero
997 */ 993 */
998struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 994struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
999 long block, int create, int *errp) 995 ext4_lblk_t block, int create, int *errp)
1000{ 996{
1001 struct buffer_head dummy; 997 struct buffer_head dummy;
1002 int fatal = 0, err; 998 int fatal = 0, err;
@@ -1063,7 +1059,7 @@ err:
1063} 1059}
1064 1060
1065struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1061struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1066 int block, int create, int *err) 1062 ext4_lblk_t block, int create, int *err)
1067{ 1063{
1068 struct buffer_head * bh; 1064 struct buffer_head * bh;
1069 1065
@@ -1446,7 +1442,7 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1446 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... 1442 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1447 * 1443 *
1448 * Same applies to ext4_get_block(). We will deadlock on various things like 1444 * Same applies to ext4_get_block(). We will deadlock on various things like
1449 * lock_journal and i_truncate_mutex. 1445 * lock_journal and i_data_sem
1450 * 1446 *
1451 * Setting PF_MEMALLOC here doesn't work - too many internal memory 1447 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1452 * allocations fail. 1448 * allocations fail.
@@ -1828,7 +1824,8 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
1828{ 1824{
1829 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 1825 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
1830 unsigned offset = from & (PAGE_CACHE_SIZE-1); 1826 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1831 unsigned blocksize, iblock, length, pos; 1827 unsigned blocksize, length, pos;
1828 ext4_lblk_t iblock;
1832 struct inode *inode = mapping->host; 1829 struct inode *inode = mapping->host;
1833 struct buffer_head *bh; 1830 struct buffer_head *bh;
1834 int err = 0; 1831 int err = 0;
@@ -1964,7 +1961,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
1964 * (no partially truncated stuff there). */ 1961 * (no partially truncated stuff there). */
1965 1962
1966static Indirect *ext4_find_shared(struct inode *inode, int depth, 1963static Indirect *ext4_find_shared(struct inode *inode, int depth,
1967 int offsets[4], Indirect chain[4], __le32 *top) 1964 ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
1968{ 1965{
1969 Indirect *partial, *p; 1966 Indirect *partial, *p;
1970 int k, err; 1967 int k, err;
@@ -2048,15 +2045,15 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
2048 for (p = first; p < last; p++) { 2045 for (p = first; p < last; p++) {
2049 u32 nr = le32_to_cpu(*p); 2046 u32 nr = le32_to_cpu(*p);
2050 if (nr) { 2047 if (nr) {
2051 struct buffer_head *bh; 2048 struct buffer_head *tbh;
2052 2049
2053 *p = 0; 2050 *p = 0;
2054 bh = sb_find_get_block(inode->i_sb, nr); 2051 tbh = sb_find_get_block(inode->i_sb, nr);
2055 ext4_forget(handle, 0, inode, bh, nr); 2052 ext4_forget(handle, 0, inode, tbh, nr);
2056 } 2053 }
2057 } 2054 }
2058 2055
2059 ext4_free_blocks(handle, inode, block_to_free, count); 2056 ext4_free_blocks(handle, inode, block_to_free, count, 0);
2060} 2057}
2061 2058
2062/** 2059/**
@@ -2229,7 +2226,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
2229 ext4_journal_test_restart(handle, inode); 2226 ext4_journal_test_restart(handle, inode);
2230 } 2227 }
2231 2228
2232 ext4_free_blocks(handle, inode, nr, 1); 2229 ext4_free_blocks(handle, inode, nr, 1, 1);
2233 2230
2234 if (parent_bh) { 2231 if (parent_bh) {
2235 /* 2232 /*
@@ -2289,12 +2286,12 @@ void ext4_truncate(struct inode *inode)
2289 __le32 *i_data = ei->i_data; 2286 __le32 *i_data = ei->i_data;
2290 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 2287 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
2291 struct address_space *mapping = inode->i_mapping; 2288 struct address_space *mapping = inode->i_mapping;
2292 int offsets[4]; 2289 ext4_lblk_t offsets[4];
2293 Indirect chain[4]; 2290 Indirect chain[4];
2294 Indirect *partial; 2291 Indirect *partial;
2295 __le32 nr = 0; 2292 __le32 nr = 0;
2296 int n; 2293 int n;
2297 long last_block; 2294 ext4_lblk_t last_block;
2298 unsigned blocksize = inode->i_sb->s_blocksize; 2295 unsigned blocksize = inode->i_sb->s_blocksize;
2299 struct page *page; 2296 struct page *page;
2300 2297
@@ -2320,8 +2317,10 @@ void ext4_truncate(struct inode *inode)
2320 return; 2317 return;
2321 } 2318 }
2322 2319
2323 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 2320 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
2324 return ext4_ext_truncate(inode, page); 2321 ext4_ext_truncate(inode, page);
2322 return;
2323 }
2325 2324
2326 handle = start_transaction(inode); 2325 handle = start_transaction(inode);
2327 if (IS_ERR(handle)) { 2326 if (IS_ERR(handle)) {
@@ -2369,7 +2368,7 @@ void ext4_truncate(struct inode *inode)
2369 * From here we block out all ext4_get_block() callers who want to 2368 * From here we block out all ext4_get_block() callers who want to
2370 * modify the block allocation tree. 2369 * modify the block allocation tree.
2371 */ 2370 */
2372 mutex_lock(&ei->truncate_mutex); 2371 down_write(&ei->i_data_sem);
2373 2372
2374 if (n == 1) { /* direct blocks */ 2373 if (n == 1) { /* direct blocks */
2375 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 2374 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
@@ -2433,7 +2432,7 @@ do_indirects:
2433 2432
2434 ext4_discard_reservation(inode); 2433 ext4_discard_reservation(inode);
2435 2434
2436 mutex_unlock(&ei->truncate_mutex); 2435 up_write(&ei->i_data_sem);
2437 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 2436 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
2438 ext4_mark_inode_dirty(handle, inode); 2437 ext4_mark_inode_dirty(handle, inode);
2439 2438
@@ -2460,7 +2459,8 @@ out_stop:
2460static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, 2459static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
2461 unsigned long ino, struct ext4_iloc *iloc) 2460 unsigned long ino, struct ext4_iloc *iloc)
2462{ 2461{
2463 unsigned long desc, group_desc, block_group; 2462 unsigned long desc, group_desc;
2463 ext4_group_t block_group;
2464 unsigned long offset; 2464 unsigned long offset;
2465 ext4_fsblk_t block; 2465 ext4_fsblk_t block;
2466 struct buffer_head *bh; 2466 struct buffer_head *bh;
@@ -2547,7 +2547,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
2547 struct ext4_group_desc *desc; 2547 struct ext4_group_desc *desc;
2548 int inodes_per_buffer; 2548 int inodes_per_buffer;
2549 int inode_offset, i; 2549 int inode_offset, i;
2550 int block_group; 2550 ext4_group_t block_group;
2551 int start; 2551 int start;
2552 2552
2553 block_group = (inode->i_ino - 1) / 2553 block_group = (inode->i_ino - 1) /
@@ -2660,6 +2660,28 @@ void ext4_get_inode_flags(struct ext4_inode_info *ei)
2660 if (flags & S_DIRSYNC) 2660 if (flags & S_DIRSYNC)
2661 ei->i_flags |= EXT4_DIRSYNC_FL; 2661 ei->i_flags |= EXT4_DIRSYNC_FL;
2662} 2662}
2663static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
2664 struct ext4_inode_info *ei)
2665{
2666 blkcnt_t i_blocks ;
2667 struct inode *inode = &(ei->vfs_inode);
2668 struct super_block *sb = inode->i_sb;
2669
2670 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
2671 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
2672 /* we are using combined 48 bit field */
2673 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
2674 le32_to_cpu(raw_inode->i_blocks_lo);
2675 if (ei->i_flags & EXT4_HUGE_FILE_FL) {
2676 /* i_blocks represent file system block size */
2677 return i_blocks << (inode->i_blkbits - 9);
2678 } else {
2679 return i_blocks;
2680 }
2681 } else {
2682 return le32_to_cpu(raw_inode->i_blocks_lo);
2683 }
2684}
2663 2685
2664void ext4_read_inode(struct inode * inode) 2686void ext4_read_inode(struct inode * inode)
2665{ 2687{
@@ -2687,7 +2709,6 @@ void ext4_read_inode(struct inode * inode)
2687 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 2709 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2688 } 2710 }
2689 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 2711 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2690 inode->i_size = le32_to_cpu(raw_inode->i_size);
2691 2712
2692 ei->i_state = 0; 2713 ei->i_state = 0;
2693 ei->i_dir_start_lookup = 0; 2714 ei->i_dir_start_lookup = 0;
@@ -2709,19 +2730,15 @@ void ext4_read_inode(struct inode * inode)
2709 * recovery code: that's fine, we're about to complete 2730 * recovery code: that's fine, we're about to complete
2710 * the process of deleting those. */ 2731 * the process of deleting those. */
2711 } 2732 }
2712 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2713 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 2733 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2714 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); 2734 inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
2735 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
2715 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 2736 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
2716 cpu_to_le32(EXT4_OS_HURD)) 2737 cpu_to_le32(EXT4_OS_HURD)) {
2717 ei->i_file_acl |= 2738 ei->i_file_acl |=
2718 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 2739 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
2719 if (!S_ISREG(inode->i_mode)) {
2720 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2721 } else {
2722 inode->i_size |=
2723 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2724 } 2740 }
2741 inode->i_size = ext4_isize(raw_inode);
2725 ei->i_disksize = inode->i_size; 2742 ei->i_disksize = inode->i_size;
2726 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 2743 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2727 ei->i_block_group = iloc.block_group; 2744 ei->i_block_group = iloc.block_group;
@@ -2765,6 +2782,13 @@ void ext4_read_inode(struct inode * inode)
2765 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 2782 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
2766 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 2783 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
2767 2784
2785 inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
2786 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
2787 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
2788 inode->i_version |=
2789 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
2790 }
2791
2768 if (S_ISREG(inode->i_mode)) { 2792 if (S_ISREG(inode->i_mode)) {
2769 inode->i_op = &ext4_file_inode_operations; 2793 inode->i_op = &ext4_file_inode_operations;
2770 inode->i_fop = &ext4_file_operations; 2794 inode->i_fop = &ext4_file_operations;
@@ -2797,6 +2821,55 @@ bad_inode:
2797 return; 2821 return;
2798} 2822}
2799 2823
2824static int ext4_inode_blocks_set(handle_t *handle,
2825 struct ext4_inode *raw_inode,
2826 struct ext4_inode_info *ei)
2827{
2828 struct inode *inode = &(ei->vfs_inode);
2829 u64 i_blocks = inode->i_blocks;
2830 struct super_block *sb = inode->i_sb;
2831 int err = 0;
2832
2833 if (i_blocks <= ~0U) {
2834 /*
2835 * i_blocks can be represented in a 32 bit variable
2836 * as multiple of 512 bytes
2837 */
2838 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
2839 raw_inode->i_blocks_high = 0;
2840 ei->i_flags &= ~EXT4_HUGE_FILE_FL;
2841 } else if (i_blocks <= 0xffffffffffffULL) {
2842 /*
2843 * i_blocks can be represented in a 48 bit variable
2844 * as multiple of 512 bytes
2845 */
2846 err = ext4_update_rocompat_feature(handle, sb,
2847 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
2848 if (err)
2849 goto err_out;
2850 /* i_block is stored in the split 48 bit fields */
2851 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
2852 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
2853 ei->i_flags &= ~EXT4_HUGE_FILE_FL;
2854 } else {
2855 /*
2856 * i_blocks should be represented in a 48 bit variable
2857 * as multiple of file system block size
2858 */
2859 err = ext4_update_rocompat_feature(handle, sb,
2860 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
2861 if (err)
2862 goto err_out;
2863 ei->i_flags |= EXT4_HUGE_FILE_FL;
2864 /* i_block is stored in file system block size */
2865 i_blocks = i_blocks >> (inode->i_blkbits - 9);
2866 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
2867 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
2868 }
2869err_out:
2870 return err;
2871}
2872
2800/* 2873/*
2801 * Post the struct inode info into an on-disk inode location in the 2874 * Post the struct inode info into an on-disk inode location in the
2802 * buffer-cache. This gobbles the caller's reference to the 2875 * buffer-cache. This gobbles the caller's reference to the
@@ -2845,47 +2918,42 @@ static int ext4_do_update_inode(handle_t *handle,
2845 raw_inode->i_gid_high = 0; 2918 raw_inode->i_gid_high = 0;
2846 } 2919 }
2847 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 2920 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2848 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2849 2921
2850 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 2922 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
2851 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 2923 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
2852 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 2924 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
2853 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 2925 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
2854 2926
2855 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); 2927 if (ext4_inode_blocks_set(handle, raw_inode, ei))
2928 goto out_brelse;
2856 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 2929 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2857 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 2930 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2858 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 2931 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
2859 cpu_to_le32(EXT4_OS_HURD)) 2932 cpu_to_le32(EXT4_OS_HURD))
2860 raw_inode->i_file_acl_high = 2933 raw_inode->i_file_acl_high =
2861 cpu_to_le16(ei->i_file_acl >> 32); 2934 cpu_to_le16(ei->i_file_acl >> 32);
2862 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); 2935 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
2863 if (!S_ISREG(inode->i_mode)) { 2936 ext4_isize_set(raw_inode, ei->i_disksize);
2864 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); 2937 if (ei->i_disksize > 0x7fffffffULL) {
2865 } else { 2938 struct super_block *sb = inode->i_sb;
2866 raw_inode->i_size_high = 2939 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
2867 cpu_to_le32(ei->i_disksize >> 32); 2940 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
2868 if (ei->i_disksize > 0x7fffffffULL) { 2941 EXT4_SB(sb)->s_es->s_rev_level ==
2869 struct super_block *sb = inode->i_sb; 2942 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
2870 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 2943 /* If this is the first large file
2871 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || 2944 * created, add a flag to the superblock.
2872 EXT4_SB(sb)->s_es->s_rev_level == 2945 */
2873 cpu_to_le32(EXT4_GOOD_OLD_REV)) { 2946 err = ext4_journal_get_write_access(handle,
2874 /* If this is the first large file 2947 EXT4_SB(sb)->s_sbh);
2875 * created, add a flag to the superblock. 2948 if (err)
2876 */ 2949 goto out_brelse;
2877 err = ext4_journal_get_write_access(handle, 2950 ext4_update_dynamic_rev(sb);
2878 EXT4_SB(sb)->s_sbh); 2951 EXT4_SET_RO_COMPAT_FEATURE(sb,
2879 if (err)
2880 goto out_brelse;
2881 ext4_update_dynamic_rev(sb);
2882 EXT4_SET_RO_COMPAT_FEATURE(sb,
2883 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 2952 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
2884 sb->s_dirt = 1; 2953 sb->s_dirt = 1;
2885 handle->h_sync = 1; 2954 handle->h_sync = 1;
2886 err = ext4_journal_dirty_metadata(handle, 2955 err = ext4_journal_dirty_metadata(handle,
2887 EXT4_SB(sb)->s_sbh); 2956 EXT4_SB(sb)->s_sbh);
2888 }
2889 } 2957 }
2890 } 2958 }
2891 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 2959 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -2903,8 +2971,14 @@ static int ext4_do_update_inode(handle_t *handle,
2903 } else for (block = 0; block < EXT4_N_BLOCKS; block++) 2971 } else for (block = 0; block < EXT4_N_BLOCKS; block++)
2904 raw_inode->i_block[block] = ei->i_data[block]; 2972 raw_inode->i_block[block] = ei->i_data[block];
2905 2973
2906 if (ei->i_extra_isize) 2974 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
2975 if (ei->i_extra_isize) {
2976 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
2977 raw_inode->i_version_hi =
2978 cpu_to_le32(inode->i_version >> 32);
2907 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 2979 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2980 }
2981
2908 2982
2909 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 2983 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
2910 rc = ext4_journal_dirty_metadata(handle, bh); 2984 rc = ext4_journal_dirty_metadata(handle, bh);
@@ -3024,6 +3098,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
3024 ext4_journal_stop(handle); 3098 ext4_journal_stop(handle);
3025 } 3099 }
3026 3100
3101 if (attr->ia_valid & ATTR_SIZE) {
3102 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
3103 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3104
3105 if (attr->ia_size > sbi->s_bitmap_maxbytes) {
3106 error = -EFBIG;
3107 goto err_out;
3108 }
3109 }
3110 }
3111
3027 if (S_ISREG(inode->i_mode) && 3112 if (S_ISREG(inode->i_mode) &&
3028 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 3113 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
3029 handle_t *handle; 3114 handle_t *handle;
@@ -3120,6 +3205,9 @@ int ext4_mark_iloc_dirty(handle_t *handle,
3120{ 3205{
3121 int err = 0; 3206 int err = 0;
3122 3207
3208 if (test_opt(inode->i_sb, I_VERSION))
3209 inode_inc_iversion(inode);
3210
3123 /* the do_update_inode consumes one bh->b_count */ 3211 /* the do_update_inode consumes one bh->b_count */
3124 get_bh(iloc->bh); 3212 get_bh(iloc->bh);
3125 3213
@@ -3158,8 +3246,10 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
3158 * Expand an inode by new_extra_isize bytes. 3246 * Expand an inode by new_extra_isize bytes.
3159 * Returns 0 on success or negative error number on failure. 3247 * Returns 0 on success or negative error number on failure.
3160 */ 3248 */
3161int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, 3249static int ext4_expand_extra_isize(struct inode *inode,
3162 struct ext4_iloc iloc, handle_t *handle) 3250 unsigned int new_extra_isize,
3251 struct ext4_iloc iloc,
3252 handle_t *handle)
3163{ 3253{
3164 struct ext4_inode *raw_inode; 3254 struct ext4_inode *raw_inode;
3165 struct ext4_xattr_ibody_header *header; 3255 struct ext4_xattr_ibody_header *header;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index e7f894bdb420..2ed7c37f897e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -199,7 +199,7 @@ flags_err:
199 * need to allocate reservation structure for this inode 199 * need to allocate reservation structure for this inode
200 * before set the window size 200 * before set the window size
201 */ 201 */
202 mutex_lock(&ei->truncate_mutex); 202 down_write(&ei->i_data_sem);
203 if (!ei->i_block_alloc_info) 203 if (!ei->i_block_alloc_info)
204 ext4_init_block_alloc_info(inode); 204 ext4_init_block_alloc_info(inode);
205 205
@@ -207,7 +207,7 @@ flags_err:
207 struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; 207 struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
208 rsv->rsv_goal_size = rsv_window_size; 208 rsv->rsv_goal_size = rsv_window_size;
209 } 209 }
210 mutex_unlock(&ei->truncate_mutex); 210 up_write(&ei->i_data_sem);
211 return 0; 211 return 0;
212 } 212 }
213 case EXT4_IOC_GROUP_EXTEND: { 213 case EXT4_IOC_GROUP_EXTEND: {
@@ -254,6 +254,9 @@ flags_err:
254 return err; 254 return err;
255 } 255 }
256 256
257 case EXT4_IOC_MIGRATE:
258 return ext4_ext_migrate(inode, filp, cmd, arg);
259
257 default: 260 default:
258 return -ENOTTY; 261 return -ENOTTY;
259 } 262 }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
new file mode 100644
index 000000000000..76e5fedc0a0b
--- /dev/null
+++ b/fs/ext4/mballoc.c
@@ -0,0 +1,4552 @@
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19
20/*
21 * mballoc.c contains the multiblocks allocation routines
22 */
23
24#include <linux/time.h>
25#include <linux/fs.h>
26#include <linux/namei.h>
27#include <linux/ext4_jbd2.h>
28#include <linux/ext4_fs.h>
29#include <linux/quotaops.h>
30#include <linux/buffer_head.h>
31#include <linux/module.h>
32#include <linux/swap.h>
33#include <linux/proc_fs.h>
34#include <linux/pagemap.h>
35#include <linux/seq_file.h>
36#include <linux/version.h>
37#include "group.h"
38
39/*
40 * MUSTDO:
41 * - test ext4_ext_search_left() and ext4_ext_search_right()
42 * - search for metadata in few groups
43 *
44 * TODO v4:
45 * - normalization should take into account whether file is still open
46 * - discard preallocations if no free space left (policy?)
47 * - don't normalize tails
48 * - quota
49 * - reservation for superuser
50 *
51 * TODO v3:
52 * - bitmap read-ahead (proposed by Oleg Drokin aka green)
53 * - track min/max extents in each group for better group selection
54 * - mb_mark_used() may allocate chunk right after splitting buddy
55 * - tree of groups sorted by number of free blocks
56 * - error handling
57 */
58
59/*
60 * The allocation request involve request for multiple number of blocks
61 * near to the goal(block) value specified.
62 *
63 * During initialization phase of the allocator we decide to use the group
64 * preallocation or inode preallocation depending on the size of the file. The
65 * size of the file could be the resulting file size we would have after
66 * allocation or the current file size, whichever is larger. If the size is
67 * less than sbi->s_mb_stream_request we select the group
68 * preallocation. The default value of s_mb_stream_request is 16
69 * blocks. This can also be tuned via
70 * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms
71 * of number of blocks.
72 *
73 * The main motivation for having small file use group preallocation is to
74 * ensure that we have small file closer in the disk.
75 *
76 * First stage the allocator looks at the inode prealloc list
77 * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for
78 * this particular inode. The inode prealloc space is represented as:
79 *
80 * pa_lstart -> the logical start block for this prealloc space
81 * pa_pstart -> the physical start block for this prealloc space
82 * pa_len -> length for this prealloc space
83 * pa_free -> free space available in this prealloc space
84 *
85 * The inode preallocation space is used looking at the _logical_ start
86 * block. If only the logical file block falls within the range of prealloc
87 * space we will consume the particular prealloc space. This makes sure
88 * that we have contiguous physical blocks representing the file blocks
89 *
90 * The important thing to be noted in case of inode prealloc space is that
91 * we don't modify the values associated to inode prealloc space except
92 * pa_free.
93 *
94 * If we are not able to find blocks in the inode prealloc space and if we
95 * have the group allocation flag set then we look at the locality group
96 * prealloc space. These are per CPU prealloc lists represented as
97 *
98 * ext4_sb_info.s_locality_groups[smp_processor_id()]
99 *
100 * The reason for having a per cpu locality group is to reduce the contention
101 * between CPUs. It is possible to get scheduled at this point.
102 *
103 * The locality group prealloc space is used looking at whether we have
104 * enough free space (pa_free) within the prealloc space.
105 *
106 * If we can't allocate blocks via inode prealloc or/and locality group
107 * prealloc then we look at the buddy cache. The buddy cache is represented
108 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
109 * mapped to the buddy and bitmap information regarding different
110 * groups. The buddy information is attached to buddy cache inode so that
111 * we can access them through the page cache. The information regarding
112 * each group is loaded via ext4_mb_load_buddy. The information involve
113 * block bitmap and buddy information. The information are stored in the
114 * inode as:
115 *
116 * { page }
117 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
118 *
119 *
120 * one block each for bitmap and buddy information. So for each group we
121 * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
122 * blocksize) blocks. So it can have information regarding groups_per_page
123 * which is blocks_per_page/2
124 *
125 * The buddy cache inode is not stored on disk. The inode is thrown
126 * away when the filesystem is unmounted.
127 *
128 * We look for count number of blocks in the buddy cache. If we were able
129 * to locate that many free blocks we return with additional information
130 * regarding rest of the contiguous physical block available
131 *
132 * Before allocating blocks via buddy cache we normalize the request
133 * blocks. This ensures we ask for more blocks than we needed. The extra
134 * blocks that we get after allocation are added to the respective prealloc
135 * list. In case of inode preallocation we follow a list of heuristics
136 * based on file size. This can be found in ext4_mb_normalize_request. If
137 * we are doing a group prealloc we try to normalize the request to
138 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to
139 * 512 blocks. This can be tuned via
140 * /proc/fs/ext4/<partition>/group_prealloc. The value is represented in
141 * terms of number of blocks. If we have mounted the file system with -o
142 * stripe=<value> option the group prealloc request is normalized to the
143 * stripe value (sbi->s_stripe)
144 *
145 * The regular allocator(using the buddy cache) support few tunables.
146 *
147 * /proc/fs/ext4/<partition>/min_to_scan
148 * /proc/fs/ext4/<partition>/max_to_scan
149 * /proc/fs/ext4/<partition>/order2_req
150 *
151 * The regular allocator use buddy scan only if the request len is power of
152 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
153 * value of s_mb_order2_reqs can be tuned via
154 * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to
155 * stripe size (sbi->s_stripe), we try to search for contiguous block in
156 * stripe size. This should result in better allocation on RAID setup. If
157 * not we search in the specific group using bitmap for best extents. The
158 * tunables min_to_scan and max_to_scan control the behaviour here.
159 * min_to_scan indicates how long the mballoc __must__ look for a best
160 * extent and max_to_scan indicates how long the mballoc __can__ look for a
161 * best extent in the found extents. Searching for the blocks starts with
162 * the group specified as the goal value in allocation context via
163 * ac_g_ex. Each group is first checked based on the criteria whether it
164 * can be used for allocation. ext4_mb_good_group explains how the groups are
165 * checked.
166 *
167 * Both the prealloc space are getting populated as above. So for the first
168 * request we will hit the buddy cache which will result in this prealloc
169 * space getting filled. The prealloc space is then later used for the
170 * subsequent request.
171 */
172
173/*
174 * mballoc operates on the following data:
175 * - on-disk bitmap
176 * - in-core buddy (actually includes buddy and bitmap)
177 * - preallocation descriptors (PAs)
178 *
179 * there are two types of preallocations:
180 * - inode
181 * assigned to specific inode and can be used for this inode only.
182 * it describes part of inode's space preallocated to specific
183 * physical blocks. any block from that preallocated can be used
184 * independent. the descriptor just tracks number of blocks left
185 * unused. so, before taking some block from descriptor, one must
186 * make sure corresponded logical block isn't allocated yet. this
187 * also means that freeing any block within descriptor's range
188 * must discard all preallocated blocks.
189 * - locality group
190 * assigned to specific locality group which does not translate to
191 * permanent set of inodes: inode can join and leave group. space
192 * from this type of preallocation can be used for any inode. thus
193 * it's consumed from the beginning to the end.
194 *
195 * relation between them can be expressed as:
196 * in-core buddy = on-disk bitmap + preallocation descriptors
197 *
198 * this means blocks mballoc considers used are:
199 * - allocated blocks (persistent)
200 * - preallocated blocks (non-persistent)
201 *
202 * consistency in mballoc world means that at any time a block is either
203 * free or used in ALL structures. notice: "any time" should not be read
204 * literally -- time is discrete and delimited by locks.
205 *
206 * to keep it simple, we don't use block numbers, instead we count number of
207 * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
208 *
209 * all operations can be expressed as:
210 * - init buddy: buddy = on-disk + PAs
211 * - new PA: buddy += N; PA = N
212 * - use inode PA: on-disk += N; PA -= N
213 * - discard inode PA buddy -= on-disk - PA; PA = 0
214 * - use locality group PA on-disk += N; PA -= N
215 * - discard locality group PA buddy -= PA; PA = 0
216 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
217 * is used in real operation because we can't know actual used
218 * bits from PA, only from on-disk bitmap
219 *
220 * if we follow this strict logic, then all operations above should be atomic.
221 * given some of them can block, we'd have to use something like semaphores
222 * killing performance on high-end SMP hardware. let's try to relax it using
223 * the following knowledge:
224 * 1) if buddy is referenced, it's already initialized
225 * 2) while block is used in buddy and the buddy is referenced,
226 * nobody can re-allocate that block
227 * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
228 * bit set and PA claims same block, it's OK. IOW, one can set bit in
229 * on-disk bitmap if buddy has same bit set or/and PA covers corresponded
230 * block
231 *
232 * so, now we're building a concurrency table:
233 * - init buddy vs.
234 * - new PA
235 * blocks for PA are allocated in the buddy, buddy must be referenced
236 * until PA is linked to allocation group to avoid concurrent buddy init
237 * - use inode PA
238 * we need to make sure that either on-disk bitmap or PA has uptodate data
239 * given (3) we care that PA-=N operation doesn't interfere with init
240 * - discard inode PA
241 * the simplest way would be to have buddy initialized by the discard
242 * - use locality group PA
243 * again PA-=N must be serialized with init
244 * - discard locality group PA
245 * the simplest way would be to have buddy initialized by the discard
246 * - new PA vs.
247 * - use inode PA
248 * i_data_sem serializes them
249 * - discard inode PA
250 * discard process must wait until PA isn't used by another process
251 * - use locality group PA
252 * some mutex should serialize them
253 * - discard locality group PA
254 * discard process must wait until PA isn't used by another process
255 * - use inode PA
256 * - use inode PA
257 * i_data_sem or another mutex should serialize them
258 * - discard inode PA
259 * discard process must wait until PA isn't used by another process
260 * - use locality group PA
261 * nothing wrong here -- they're different PAs covering different blocks
262 * - discard locality group PA
263 * discard process must wait until PA isn't used by another process
264 *
265 * now we're ready to make few consequences:
266 * - PA is referenced and while it is no discard is possible
267 * - PA is referenced until block isn't marked in on-disk bitmap
268 * - PA changes only after on-disk bitmap
269 * - discard must not compete with init. either init is done before
270 * any discard or they're serialized somehow
271 * - buddy init as sum of on-disk bitmap and PAs is done atomically
272 *
273 * a special case when we've used PA to emptiness. no need to modify buddy
274 * in this case, but we should care about concurrent init
275 *
276 */
277
278 /*
279 * Logic in few words:
280 *
281 * - allocation:
282 * load group
283 * find blocks
284 * mark bits in on-disk bitmap
285 * release group
286 *
287 * - use preallocation:
288 * find proper PA (per-inode or group)
289 * load group
290 * mark bits in on-disk bitmap
291 * release group
292 * release PA
293 *
294 * - free:
295 * load group
296 * mark bits in on-disk bitmap
297 * release group
298 *
299 * - discard preallocations in group:
300 * mark PAs deleted
301 * move them onto local list
302 * load on-disk bitmap
303 * load group
304 * remove PA from object (inode or locality group)
305 * mark free blocks in-core
306 *
307 * - discard inode's preallocations:
308 */
309
310/*
311 * Locking rules
312 *
313 * Locks:
314 * - bitlock on a group (group)
315 * - object (inode/locality) (object)
316 * - per-pa lock (pa)
317 *
318 * Paths:
319 * - new pa
320 * object
321 * group
322 *
323 * - find and use pa:
324 * pa
325 *
326 * - release consumed pa:
327 * pa
328 * group
329 * object
330 *
331 * - generate in-core bitmap:
332 * group
333 * pa
334 *
335 * - discard all for given object (inode, locality group):
336 * object
337 * pa
338 * group
339 *
340 * - discard all for given group:
341 * group
342 * pa
343 * group
344 * object
345 *
346 */
347
348/*
349 * with AGGRESSIVE_CHECK allocator runs consistency checks over
350 * structures. these checks slow things down a lot
351 */
352#define AGGRESSIVE_CHECK__
353
354/*
355 * with DOUBLE_CHECK defined mballoc creates persistent in-core
356 * bitmaps, maintains and uses them to check for double allocations
357 */
358#define DOUBLE_CHECK__
359
360/*
361 */
362#define MB_DEBUG__
363#ifdef MB_DEBUG
364#define mb_debug(fmt, a...) printk(fmt, ##a)
365#else
366#define mb_debug(fmt, a...)
367#endif
368
369/*
370 * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
371 * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
372 */
373#define EXT4_MB_HISTORY
374#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
375#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
376#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
377#define EXT4_MB_HISTORY_FREE 8 /* free */
378
379#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
380 EXT4_MB_HISTORY_PREALLOC)
381
382/*
383 * How long mballoc can look for a best extent (in found extents)
384 */
385#define MB_DEFAULT_MAX_TO_SCAN 200
386
387/*
388 * How long mballoc must look for a best extent
389 */
390#define MB_DEFAULT_MIN_TO_SCAN 10
391
392/*
393 * How many groups mballoc will scan looking for the best chunk
394 */
395#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5
396
397/*
398 * with 'ext4_mb_stats' allocator will collect stats that will be
399 * shown at umount. The collecting costs though!
400 */
401#define MB_DEFAULT_STATS 1
402
403/*
404 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
405 * by the stream allocator, whose purpose is to pack requests
406 * as close to each other as possible to produce smooth I/O traffic.
407 * We use locality group prealloc space for stream requests.
408 * We can tune the same via /proc/fs/ext4/<partition>/stream_req
409 */
410#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */
411
412/*
413 * for which requests use 2^N search using buddies
414 */
415#define MB_DEFAULT_ORDER2_REQS 2
416
417/*
418 * default group prealloc size 512 blocks
419 */
420#define MB_DEFAULT_GROUP_PREALLOC 512
421
422static struct kmem_cache *ext4_pspace_cachep;
423
424#ifdef EXT4_BB_MAX_BLOCKS
425#undef EXT4_BB_MAX_BLOCKS
426#endif
427#define EXT4_BB_MAX_BLOCKS 30
428
429struct ext4_free_metadata {
430 ext4_group_t group;
431 unsigned short num;
432 ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
433 struct list_head list;
434};
435
436struct ext4_group_info {
437 unsigned long bb_state;
438 unsigned long bb_tid;
439 struct ext4_free_metadata *bb_md_cur;
440 unsigned short bb_first_free;
441 unsigned short bb_free;
442 unsigned short bb_fragments;
443 struct list_head bb_prealloc_list;
444#ifdef DOUBLE_CHECK
445 void *bb_bitmap;
446#endif
447 unsigned short bb_counters[];
448};
449
450#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
451#define EXT4_GROUP_INFO_LOCKED_BIT 1
452
453#define EXT4_MB_GRP_NEED_INIT(grp) \
454 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
455
456
457struct ext4_prealloc_space {
458 struct list_head pa_inode_list;
459 struct list_head pa_group_list;
460 union {
461 struct list_head pa_tmp_list;
462 struct rcu_head pa_rcu;
463 } u;
464 spinlock_t pa_lock;
465 atomic_t pa_count;
466 unsigned pa_deleted;
467 ext4_fsblk_t pa_pstart; /* phys. block */
468 ext4_lblk_t pa_lstart; /* log. block */
469 unsigned short pa_len; /* len of preallocated chunk */
470 unsigned short pa_free; /* how many blocks are free */
471 unsigned short pa_linear; /* consumed in one direction
472 * strictly, for grp prealloc */
473 spinlock_t *pa_obj_lock;
474 struct inode *pa_inode; /* hack, for history only */
475};
476
477
478struct ext4_free_extent {
479 ext4_lblk_t fe_logical;
480 ext4_grpblk_t fe_start;
481 ext4_group_t fe_group;
482 int fe_len;
483};
484
485/*
486 * Locality group:
487 * we try to group all related changes together
488 * so that writeback can flush/allocate them together as well
489 */
490struct ext4_locality_group {
491 /* for allocator */
492 struct mutex lg_mutex; /* to serialize allocates */
493 struct list_head lg_prealloc_list;/* list of preallocations */
494 spinlock_t lg_prealloc_lock;
495};
496
497struct ext4_allocation_context {
498 struct inode *ac_inode;
499 struct super_block *ac_sb;
500
501 /* original request */
502 struct ext4_free_extent ac_o_ex;
503
504 /* goal request (after normalization) */
505 struct ext4_free_extent ac_g_ex;
506
507 /* the best found extent */
508 struct ext4_free_extent ac_b_ex;
509
510 /* copy of the best found extent taken before preallocation efforts */
511 struct ext4_free_extent ac_f_ex;
512
513 /* number of iterations done. we have to track to limit searching */
514 unsigned long ac_ex_scanned;
515 __u16 ac_groups_scanned;
516 __u16 ac_found;
517 __u16 ac_tail;
518 __u16 ac_buddy;
519 __u16 ac_flags; /* allocation hints */
520 __u8 ac_status;
521 __u8 ac_criteria;
522 __u8 ac_repeats;
523 __u8 ac_2order; /* if request is to allocate 2^N blocks and
524 * N > 0, the field stores N, otherwise 0 */
525 __u8 ac_op; /* operation, for history only */
526 struct page *ac_bitmap_page;
527 struct page *ac_buddy_page;
528 struct ext4_prealloc_space *ac_pa;
529 struct ext4_locality_group *ac_lg;
530};
531
532#define AC_STATUS_CONTINUE 1
533#define AC_STATUS_FOUND 2
534#define AC_STATUS_BREAK 3
535
536struct ext4_mb_history {
537 struct ext4_free_extent orig; /* orig allocation */
538 struct ext4_free_extent goal; /* goal allocation */
539 struct ext4_free_extent result; /* result allocation */
540 unsigned pid;
541 unsigned ino;
542 __u16 found; /* how many extents have been found */
543 __u16 groups; /* how many groups have been scanned */
544 __u16 tail; /* what tail broke some buddy */
545 __u16 buddy; /* buddy the tail ^^^ broke */
546 __u16 flags;
547 __u8 cr:3; /* which phase the result extent was found at */
548 __u8 op:4;
549 __u8 merged:1;
550};
551
552struct ext4_buddy {
553 struct page *bd_buddy_page;
554 void *bd_buddy;
555 struct page *bd_bitmap_page;
556 void *bd_bitmap;
557 struct ext4_group_info *bd_info;
558 struct super_block *bd_sb;
559 __u16 bd_blkbits;
560 ext4_group_t bd_group;
561};
562#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
563#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
564
565#ifndef EXT4_MB_HISTORY
566static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
567{
568 return;
569}
570#else
571static void ext4_mb_store_history(struct ext4_allocation_context *ac);
572#endif
573
574#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
575
576static struct proc_dir_entry *proc_root_ext4;
577struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
578ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
579 ext4_fsblk_t goal, unsigned long *count, int *errp);
580
581static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
582 ext4_group_t group);
583static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
584static void ext4_mb_free_committed_blocks(struct super_block *);
585static void ext4_mb_return_to_preallocation(struct inode *inode,
586 struct ext4_buddy *e4b, sector_t block,
587 int count);
588static void ext4_mb_put_pa(struct ext4_allocation_context *,
589 struct super_block *, struct ext4_prealloc_space *pa);
590static int ext4_mb_init_per_dev_proc(struct super_block *sb);
591static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
592
593
594static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
595{
596 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
597
598 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
599}
600
601static inline void ext4_unlock_group(struct super_block *sb,
602 ext4_group_t group)
603{
604 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
605
606 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
607}
608
609static inline int ext4_is_group_locked(struct super_block *sb,
610 ext4_group_t group)
611{
612 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
613
614 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
615 &(grinfo->bb_state));
616}
617
618static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
619 struct ext4_free_extent *fex)
620{
621 ext4_fsblk_t block;
622
623 block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
624 + fex->fe_start
625 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
626 return block;
627}
628
629#if BITS_PER_LONG == 64
630#define mb_correct_addr_and_bit(bit, addr) \
631{ \
632 bit += ((unsigned long) addr & 7UL) << 3; \
633 addr = (void *) ((unsigned long) addr & ~7UL); \
634}
635#elif BITS_PER_LONG == 32
636#define mb_correct_addr_and_bit(bit, addr) \
637{ \
638 bit += ((unsigned long) addr & 3UL) << 3; \
639 addr = (void *) ((unsigned long) addr & ~3UL); \
640}
641#else
642#error "how many bits you are?!"
643#endif
644
/*
 * Test one bit of a bitmap that may start at any byte address.
 * ext4_test_bit on architectures like powerpc needs an unsigned-long
 * aligned address, so the pair is re-based first.
 */
static inline int mb_test_bit(int bit, void *addr)
{
	int nr = bit;
	void *base = addr;

	mb_correct_addr_and_bit(nr, base);
	return ext4_test_bit(nr, base);
}
654
/* Set one bit; the address is re-based to an aligned word first. */
static inline void mb_set_bit(int bit, void *addr)
{
	int nr = bit;
	void *base = addr;

	mb_correct_addr_and_bit(nr, base);
	ext4_set_bit(nr, base);
}
660
661static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
662{
663 mb_correct_addr_and_bit(bit, addr);
664 ext4_set_bit_atomic(lock, bit, addr);
665}
666
/* Clear one bit; the address is re-based to an aligned word first. */
static inline void mb_clear_bit(int bit, void *addr)
{
	int nr = bit;
	void *base = addr;

	mb_correct_addr_and_bit(nr, base);
	ext4_clear_bit(nr, base);
}
672
673static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
674{
675 mb_correct_addr_and_bit(bit, addr);
676 ext4_clear_bit_atomic(lock, bit, addr);
677}
678
679static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
680{
681 char *bb;
682
683 /* FIXME!! is this needed */
684 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
685 BUG_ON(max == NULL);
686
687 if (order > e4b->bd_blkbits + 1) {
688 *max = 0;
689 return NULL;
690 }
691
692 /* at order 0 we see each particular block */
693 *max = 1 << (e4b->bd_blkbits + 3);
694 if (order == 0)
695 return EXT4_MB_BITMAP(e4b);
696
697 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
698 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
699
700 return bb;
701}
702
703#ifdef DOUBLE_CHECK
704static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
705 int first, int count)
706{
707 int i;
708 struct super_block *sb = e4b->bd_sb;
709
710 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
711 return;
712 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
713 for (i = 0; i < count; i++) {
714 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
715 ext4_fsblk_t blocknr;
716 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
717 blocknr += first + i;
718 blocknr +=
719 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
720
721 ext4_error(sb, __FUNCTION__, "double-free of inode"
722 " %lu's block %llu(bit %u in group %lu)\n",
723 inode ? inode->i_ino : 0, blocknr,
724 first + i, e4b->bd_group);
725 }
726 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
727 }
728}
729
730static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
731{
732 int i;
733
734 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
735 return;
736 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
737 for (i = 0; i < count; i++) {
738 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
739 mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
740 }
741}
742
743static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
744{
745 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
746 unsigned char *b1, *b2;
747 int i;
748 b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
749 b2 = (unsigned char *) bitmap;
750 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
751 if (b1[i] != b2[i]) {
752 printk("corruption in group %lu at byte %u(%u):"
753 " %x in copy != %x on disk/prealloc\n",
754 e4b->bd_group, i, i * 8, b1[i], b2[i]);
755 BUG();
756 }
757 }
758 }
759}
760
761#else
/* Without DOUBLE_CHECK the shadow-bitmap helpers compile to nothing. */
static inline void mb_free_blocks_double(struct inode *inode,
				struct ext4_buddy *e4b, int first, int count)
{
}

static inline void mb_mark_used_double(struct ext4_buddy *e4b,
						int first, int count)
{
}

static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
}
776#endif
777
#ifdef AGGRESSIVE_CHECK

/*
 * Assertion used only by __mb_check_buddy(): reports the caller's
 * function/file/line (taken from the enclosing function's parameters)
 * together with the failed expression text, then BUG()s.
 */
#define MB_CHECK_ASSERT(assert)						\
do {									\
	if (!(assert)) {						\
		printk(KERN_EMERG					\
			"Assertion failure in %s() at %s:%d: \"%s\"\n",	\
			function, file, line, # assert);		\
		BUG();							\
	}								\
} while (0)

/*
 * Cross-check a group's buddy data against its bitmap, its counters and
 * its preallocation list.  Any inconsistency trips MB_CHECK_ASSERT.
 * The check is skipped unless the MBALLOC mount option is set, and is
 * actually performed only on every 100th call that gets that far.
 * Always returns 0.
 */
static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
				const char *function, int line)
{
	static int mb_check_counter;
	struct super_block *sb = e4b->bd_sb;
	struct ext4_group_info *grp;
	struct list_head *cur;
	void *buddy;
	void *buddy2;
	int order;
	int max;
	int max2;
	int i;
	int j;
	int k;
	int count;
	int fragments = 0;
	int fstart = -1;

	if (!test_opt(sb, MBALLOC))
		return 0;

	if (mb_check_counter++ % 100 != 0)
		return 0;

	/* walk the orders top-down, comparing each with the one below it */
	for (order = e4b->bd_blkbits + 1; order > 1; order--) {
		buddy = mb_find_buddy(e4b, order, &max);
		MB_CHECK_ASSERT(buddy);
		buddy2 = mb_find_buddy(e4b, order - 1, &max2);
		MB_CHECK_ASSERT(buddy2);
		MB_CHECK_ASSERT(buddy != buddy2);
		MB_CHECK_ASSERT(max * 2 == max2);

		count = 0;
		for (i = 0; i < max; i++) {

			if (mb_test_bit(i, buddy)) {
				/*
				 * chunk not free at this order: at least one
				 * of its two halves must be marked in buddy2
				 */
				if (!mb_test_bit(i << 1, buddy2)) {
					MB_CHECK_ASSERT(
						mb_test_bit((i<<1)+1, buddy2));
				} else if (!mb_test_bit((i << 1) + 1, buddy2)) {
					MB_CHECK_ASSERT(
						mb_test_bit(i << 1, buddy2));
				}
				continue;
			}

			/*
			 * chunk free at this order: both halves must be
			 * set (not separately free) in buddy2 ...
			 */
			MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
			MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));

			/* ... and every covered block clear in the bitmap */
			for (j = 0; j < (1 << order); j++) {
				k = (i * (1 << order)) + j;
				MB_CHECK_ASSERT(
					!mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
			}
			count++;
		}
		MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
	}

	/* walk the bitmap: count free runs and check used blocks */
	buddy = mb_find_buddy(e4b, 0, &max);
	for (i = 0; i < max; i++) {
		if (!mb_test_bit(i, buddy)) {
			MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
			if (fstart == -1) {
				fragments++;
				fstart = i;
			}
			continue;
		}
		fstart = -1;
		/* check used bits only */
		for (j = 0; j < e4b->bd_blkbits + 1; j++) {
			buddy2 = mb_find_buddy(e4b, j, &max2);
			k = i >> j;
			MB_CHECK_ASSERT(k < max2);
			MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
		}
	}
	MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
	MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);

	/* every preallocated block must be marked used in the bitmap */
	grp = ext4_get_group_info(sb, e4b->bd_group);
	buddy = mb_find_buddy(e4b, 0, &max);
	list_for_each(cur, &grp->bb_prealloc_list) {
		ext4_group_t groupnr;
		struct ext4_prealloc_space *pa;

		pa = list_entry(cur, struct ext4_prealloc_space, group_list);
		ext4_get_group_no_and_offset(sb, pa->pstart, &groupnr, &k);
		MB_CHECK_ASSERT(groupnr == e4b->bd_group);
		for (i = 0; i < pa->len; i++)
			MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
	}
	return 0;
}
#undef MB_CHECK_ASSERT
#define mb_check_buddy(e4b) __mb_check_buddy(e4b,	\
					__FILE__, __FUNCTION__, __LINE__)
#else
#define mb_check_buddy(e4b)
#endif
897
898/* FIXME!! need more doc */
899static void ext4_mb_mark_free_simple(struct super_block *sb,
900 void *buddy, unsigned first, int len,
901 struct ext4_group_info *grp)
902{
903 struct ext4_sb_info *sbi = EXT4_SB(sb);
904 unsigned short min;
905 unsigned short max;
906 unsigned short chunk;
907 unsigned short border;
908
909 BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb));
910
911 border = 2 << sb->s_blocksize_bits;
912
913 while (len > 0) {
914 /* find how many blocks can be covered since this position */
915 max = ffs(first | border) - 1;
916
917 /* find how many blocks of power 2 we need to mark */
918 min = fls(len) - 1;
919
920 if (max < min)
921 min = max;
922 chunk = 1 << min;
923
924 /* mark multiblock chunks only */
925 grp->bb_counters[min]++;
926 if (min > 0)
927 mb_clear_bit(first >> min,
928 buddy + sbi->s_mb_offsets[min]);
929
930 len -= chunk;
931 first += chunk;
932 }
933}
934
935static void ext4_mb_generate_buddy(struct super_block *sb,
936 void *buddy, void *bitmap, ext4_group_t group)
937{
938 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
939 unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
940 unsigned short i = 0;
941 unsigned short first;
942 unsigned short len;
943 unsigned free = 0;
944 unsigned fragments = 0;
945 unsigned long long period = get_cycles();
946
947 /* initialize buddy from bitmap which is aggregation
948 * of on-disk bitmap and preallocations */
949 i = ext4_find_next_zero_bit(bitmap, max, 0);
950 grp->bb_first_free = i;
951 while (i < max) {
952 fragments++;
953 first = i;
954 i = ext4_find_next_bit(bitmap, max, i);
955 len = i - first;
956 free += len;
957 if (len > 1)
958 ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
959 else
960 grp->bb_counters[0]++;
961 if (i < max)
962 i = ext4_find_next_zero_bit(bitmap, max, i);
963 }
964 grp->bb_fragments = fragments;
965
966 if (free != grp->bb_free) {
967 printk(KERN_DEBUG
968 "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
969 group, free, grp->bb_free);
970 grp->bb_free = free;
971 }
972
973 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
974
975 period = get_cycles() - period;
976 spin_lock(&EXT4_SB(sb)->s_bal_lock);
977 EXT4_SB(sb)->s_mb_buddies_generated++;
978 EXT4_SB(sb)->s_mb_generation_time += period;
979 spin_unlock(&EXT4_SB(sb)->s_bal_lock);
980}
981
982/* The buddy information is attached the buddy cache inode
983 * for convenience. The information regarding each group
984 * is loaded via ext4_mb_load_buddy. The information involve
985 * block bitmap and buddy information. The information are
986 * stored in the inode as
987 *
988 * { page }
989 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
990 *
991 *
992 * one block each for bitmap and buddy information.
993 * So for each group we take up 2 blocks. A page can
994 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
995 * So it can have information regarding groups_per_page which
996 * is blocks_per_page/2
997 */
998
999static int ext4_mb_init_cache(struct page *page, char *incore)
1000{
1001 int blocksize;
1002 int blocks_per_page;
1003 int groups_per_page;
1004 int err = 0;
1005 int i;
1006 ext4_group_t first_group;
1007 int first_block;
1008 struct super_block *sb;
1009 struct buffer_head *bhs;
1010 struct buffer_head **bh;
1011 struct inode *inode;
1012 char *data;
1013 char *bitmap;
1014
1015 mb_debug("init page %lu\n", page->index);
1016
1017 inode = page->mapping->host;
1018 sb = inode->i_sb;
1019 blocksize = 1 << inode->i_blkbits;
1020 blocks_per_page = PAGE_CACHE_SIZE / blocksize;
1021
1022 groups_per_page = blocks_per_page >> 1;
1023 if (groups_per_page == 0)
1024 groups_per_page = 1;
1025
1026 /* allocate buffer_heads to read bitmaps */
1027 if (groups_per_page > 1) {
1028 err = -ENOMEM;
1029 i = sizeof(struct buffer_head *) * groups_per_page;
1030 bh = kzalloc(i, GFP_NOFS);
1031 if (bh == NULL)
1032 goto out;
1033 } else
1034 bh = &bhs;
1035
1036 first_group = page->index * blocks_per_page / 2;
1037
1038 /* read all groups the page covers into the cache */
1039 for (i = 0; i < groups_per_page; i++) {
1040 struct ext4_group_desc *desc;
1041
1042 if (first_group + i >= EXT4_SB(sb)->s_groups_count)
1043 break;
1044
1045 err = -EIO;
1046 desc = ext4_get_group_desc(sb, first_group + i, NULL);
1047 if (desc == NULL)
1048 goto out;
1049
1050 err = -ENOMEM;
1051 bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
1052 if (bh[i] == NULL)
1053 goto out;
1054
1055 if (bh_uptodate_or_lock(bh[i]))
1056 continue;
1057
1058 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
1059 ext4_init_block_bitmap(sb, bh[i],
1060 first_group + i, desc);
1061 set_buffer_uptodate(bh[i]);
1062 unlock_buffer(bh[i]);
1063 continue;
1064 }
1065 get_bh(bh[i]);
1066 bh[i]->b_end_io = end_buffer_read_sync;
1067 submit_bh(READ, bh[i]);
1068 mb_debug("read bitmap for group %lu\n", first_group + i);
1069 }
1070
1071 /* wait for I/O completion */
1072 for (i = 0; i < groups_per_page && bh[i]; i++)
1073 wait_on_buffer(bh[i]);
1074
1075 err = -EIO;
1076 for (i = 0; i < groups_per_page && bh[i]; i++)
1077 if (!buffer_uptodate(bh[i]))
1078 goto out;
1079
1080 first_block = page->index * blocks_per_page;
1081 for (i = 0; i < blocks_per_page; i++) {
1082 int group;
1083 struct ext4_group_info *grinfo;
1084
1085 group = (first_block + i) >> 1;
1086 if (group >= EXT4_SB(sb)->s_groups_count)
1087 break;
1088
1089 /*
1090 * data carry information regarding this
1091 * particular group in the format specified
1092 * above
1093 *
1094 */
1095 data = page_address(page) + (i * blocksize);
1096 bitmap = bh[group - first_group]->b_data;
1097
1098 /*
1099 * We place the buddy block and bitmap block
1100 * close together
1101 */
1102 if ((first_block + i) & 1) {
1103 /* this is block of buddy */
1104 BUG_ON(incore == NULL);
1105 mb_debug("put buddy for group %u in page %lu/%x\n",
1106 group, page->index, i * blocksize);
1107 memset(data, 0xff, blocksize);
1108 grinfo = ext4_get_group_info(sb, group);
1109 grinfo->bb_fragments = 0;
1110 memset(grinfo->bb_counters, 0,
1111 sizeof(unsigned short)*(sb->s_blocksize_bits+2));
1112 /*
1113 * incore got set to the group block bitmap below
1114 */
1115 ext4_mb_generate_buddy(sb, data, incore, group);
1116 incore = NULL;
1117 } else {
1118 /* this is block of bitmap */
1119 BUG_ON(incore != NULL);
1120 mb_debug("put bitmap for group %u in page %lu/%x\n",
1121 group, page->index, i * blocksize);
1122
1123 /* see comments in ext4_mb_put_pa() */
1124 ext4_lock_group(sb, group);
1125 memcpy(data, bitmap, blocksize);
1126
1127 /* mark all preallocated blks used in in-core bitmap */
1128 ext4_mb_generate_from_pa(sb, data, group);
1129 ext4_unlock_group(sb, group);
1130
1131 /* set incore so that the buddy information can be
1132 * generated using this
1133 */
1134 incore = data;
1135 }
1136 }
1137 SetPageUptodate(page);
1138
1139out:
1140 if (bh) {
1141 for (i = 0; i < groups_per_page && bh[i]; i++)
1142 brelse(bh[i]);
1143 if (bh != &bhs)
1144 kfree(bh);
1145 }
1146 return err;
1147}
1148
1149static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1150 struct ext4_buddy *e4b)
1151{
1152 struct ext4_sb_info *sbi = EXT4_SB(sb);
1153 struct inode *inode = sbi->s_buddy_cache;
1154 int blocks_per_page;
1155 int block;
1156 int pnum;
1157 int poff;
1158 struct page *page;
1159
1160 mb_debug("load group %lu\n", group);
1161
1162 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1163
1164 e4b->bd_blkbits = sb->s_blocksize_bits;
1165 e4b->bd_info = ext4_get_group_info(sb, group);
1166 e4b->bd_sb = sb;
1167 e4b->bd_group = group;
1168 e4b->bd_buddy_page = NULL;
1169 e4b->bd_bitmap_page = NULL;
1170
1171 /*
1172 * the buddy cache inode stores the block bitmap
1173 * and buddy information in consecutive blocks.
1174 * So for each group we need two blocks.
1175 */
1176 block = group * 2;
1177 pnum = block / blocks_per_page;
1178 poff = block % blocks_per_page;
1179
1180 /* we could use find_or_create_page(), but it locks page
1181 * what we'd like to avoid in fast path ... */
1182 page = find_get_page(inode->i_mapping, pnum);
1183 if (page == NULL || !PageUptodate(page)) {
1184 if (page)
1185 page_cache_release(page);
1186 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1187 if (page) {
1188 BUG_ON(page->mapping != inode->i_mapping);
1189 if (!PageUptodate(page)) {
1190 ext4_mb_init_cache(page, NULL);
1191 mb_cmp_bitmaps(e4b, page_address(page) +
1192 (poff * sb->s_blocksize));
1193 }
1194 unlock_page(page);
1195 }
1196 }
1197 if (page == NULL || !PageUptodate(page))
1198 goto err;
1199 e4b->bd_bitmap_page = page;
1200 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1201 mark_page_accessed(page);
1202
1203 block++;
1204 pnum = block / blocks_per_page;
1205 poff = block % blocks_per_page;
1206
1207 page = find_get_page(inode->i_mapping, pnum);
1208 if (page == NULL || !PageUptodate(page)) {
1209 if (page)
1210 page_cache_release(page);
1211 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1212 if (page) {
1213 BUG_ON(page->mapping != inode->i_mapping);
1214 if (!PageUptodate(page))
1215 ext4_mb_init_cache(page, e4b->bd_bitmap);
1216
1217 unlock_page(page);
1218 }
1219 }
1220 if (page == NULL || !PageUptodate(page))
1221 goto err;
1222 e4b->bd_buddy_page = page;
1223 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1224 mark_page_accessed(page);
1225
1226 BUG_ON(e4b->bd_bitmap_page == NULL);
1227 BUG_ON(e4b->bd_buddy_page == NULL);
1228
1229 return 0;
1230
1231err:
1232 if (e4b->bd_bitmap_page)
1233 page_cache_release(e4b->bd_bitmap_page);
1234 if (e4b->bd_buddy_page)
1235 page_cache_release(e4b->bd_buddy_page);
1236 e4b->bd_buddy = NULL;
1237 e4b->bd_bitmap = NULL;
1238 return -EIO;
1239}
1240
1241static void ext4_mb_release_desc(struct ext4_buddy *e4b)
1242{
1243 if (e4b->bd_bitmap_page)
1244 page_cache_release(e4b->bd_bitmap_page);
1245 if (e4b->bd_buddy_page)
1246 page_cache_release(e4b->bd_buddy_page);
1247}
1248
1249
1250static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1251{
1252 int order = 1;
1253 void *bb;
1254
1255 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
1256 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1257
1258 bb = EXT4_MB_BUDDY(e4b);
1259 while (order <= e4b->bd_blkbits + 1) {
1260 block = block >> 1;
1261 if (!mb_test_bit(block, bb)) {
1262 /* this block is part of buddy of order 'order' */
1263 return order;
1264 }
1265 bb += 1 << (e4b->bd_blkbits - order);
1266 order++;
1267 }
1268 return 0;
1269}
1270
1271static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1272{
1273 __u32 *addr;
1274
1275 len = cur + len;
1276 while (cur < len) {
1277 if ((cur & 31) == 0 && (len - cur) >= 32) {
1278 /* fast path: clear whole word at once */
1279 addr = bm + (cur >> 3);
1280 *addr = 0;
1281 cur += 32;
1282 continue;
1283 }
1284 mb_clear_bit_atomic(lock, cur, bm);
1285 cur++;
1286 }
1287}
1288
1289static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1290{
1291 __u32 *addr;
1292
1293 len = cur + len;
1294 while (cur < len) {
1295 if ((cur & 31) == 0 && (len - cur) >= 32) {
1296 /* fast path: set whole word at once */
1297 addr = bm + (cur >> 3);
1298 *addr = 0xffffffff;
1299 cur += 32;
1300 continue;
1301 }
1302 mb_set_bit_atomic(lock, cur, bm);
1303 cur++;
1304 }
1305}
1306
1307static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1308 int first, int count)
1309{
1310 int block = 0;
1311 int max = 0;
1312 int order;
1313 void *buddy;
1314 void *buddy2;
1315 struct super_block *sb = e4b->bd_sb;
1316
1317 BUG_ON(first + count > (sb->s_blocksize << 3));
1318 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
1319 mb_check_buddy(e4b);
1320 mb_free_blocks_double(inode, e4b, first, count);
1321
1322 e4b->bd_info->bb_free += count;
1323 if (first < e4b->bd_info->bb_first_free)
1324 e4b->bd_info->bb_first_free = first;
1325
1326 /* let's maintain fragments counter */
1327 if (first != 0)
1328 block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
1329 if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
1330 max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
1331 if (block && max)
1332 e4b->bd_info->bb_fragments--;
1333 else if (!block && !max)
1334 e4b->bd_info->bb_fragments++;
1335
1336 /* let's maintain buddy itself */
1337 while (count-- > 0) {
1338 block = first++;
1339 order = 0;
1340
1341 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
1342 ext4_fsblk_t blocknr;
1343 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
1344 blocknr += block;
1345 blocknr +=
1346 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1347
1348 ext4_error(sb, __FUNCTION__, "double-free of inode"
1349 " %lu's block %llu(bit %u in group %lu)\n",
1350 inode ? inode->i_ino : 0, blocknr, block,
1351 e4b->bd_group);
1352 }
1353 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1354 e4b->bd_info->bb_counters[order]++;
1355
1356 /* start of the buddy */
1357 buddy = mb_find_buddy(e4b, order, &max);
1358
1359 do {
1360 block &= ~1UL;
1361 if (mb_test_bit(block, buddy) ||
1362 mb_test_bit(block + 1, buddy))
1363 break;
1364
1365 /* both the buddies are free, try to coalesce them */
1366 buddy2 = mb_find_buddy(e4b, order + 1, &max);
1367
1368 if (!buddy2)
1369 break;
1370
1371 if (order > 0) {
1372 /* for special purposes, we don't set
1373 * free bits in bitmap */
1374 mb_set_bit(block, buddy);
1375 mb_set_bit(block + 1, buddy);
1376 }
1377 e4b->bd_info->bb_counters[order]--;
1378 e4b->bd_info->bb_counters[order]--;
1379
1380 block = block >> 1;
1381 order++;
1382 e4b->bd_info->bb_counters[order]++;
1383
1384 mb_clear_bit(block, buddy2);
1385 buddy = buddy2;
1386 } while (1);
1387 }
1388 mb_check_buddy(e4b);
1389
1390 return 0;
1391}
1392
1393static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1394 int needed, struct ext4_free_extent *ex)
1395{
1396 int next = block;
1397 int max;
1398 int ord;
1399 void *buddy;
1400
1401 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
1402 BUG_ON(ex == NULL);
1403
1404 buddy = mb_find_buddy(e4b, order, &max);
1405 BUG_ON(buddy == NULL);
1406 BUG_ON(block >= max);
1407 if (mb_test_bit(block, buddy)) {
1408 ex->fe_len = 0;
1409 ex->fe_start = 0;
1410 ex->fe_group = 0;
1411 return 0;
1412 }
1413
1414 /* FIXME dorp order completely ? */
1415 if (likely(order == 0)) {
1416 /* find actual order */
1417 order = mb_find_order_for_block(e4b, block);
1418 block = block >> order;
1419 }
1420
1421 ex->fe_len = 1 << order;
1422 ex->fe_start = block << order;
1423 ex->fe_group = e4b->bd_group;
1424
1425 /* calc difference from given start */
1426 next = next - ex->fe_start;
1427 ex->fe_len -= next;
1428 ex->fe_start += next;
1429
1430 while (needed > ex->fe_len &&
1431 (buddy = mb_find_buddy(e4b, order, &max))) {
1432
1433 if (block + 1 >= max)
1434 break;
1435
1436 next = (block + 1) * (1 << order);
1437 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
1438 break;
1439
1440 ord = mb_find_order_for_block(e4b, next);
1441
1442 order = ord;
1443 block = next >> order;
1444 ex->fe_len += 1 << order;
1445 }
1446
1447 BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
1448 return ex->fe_len;
1449}
1450
1451static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1452{
1453 int ord;
1454 int mlen = 0;
1455 int max = 0;
1456 int cur;
1457 int start = ex->fe_start;
1458 int len = ex->fe_len;
1459 unsigned ret = 0;
1460 int len0 = len;
1461 void *buddy;
1462
1463 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1464 BUG_ON(e4b->bd_group != ex->fe_group);
1465 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
1466 mb_check_buddy(e4b);
1467 mb_mark_used_double(e4b, start, len);
1468
1469 e4b->bd_info->bb_free -= len;
1470 if (e4b->bd_info->bb_first_free == start)
1471 e4b->bd_info->bb_first_free += len;
1472
1473 /* let's maintain fragments counter */
1474 if (start != 0)
1475 mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
1476 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1477 max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
1478 if (mlen && max)
1479 e4b->bd_info->bb_fragments++;
1480 else if (!mlen && !max)
1481 e4b->bd_info->bb_fragments--;
1482
1483 /* let's maintain buddy itself */
1484 while (len) {
1485 ord = mb_find_order_for_block(e4b, start);
1486
1487 if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1488 /* the whole chunk may be allocated at once! */
1489 mlen = 1 << ord;
1490 buddy = mb_find_buddy(e4b, ord, &max);
1491 BUG_ON((start >> ord) >= max);
1492 mb_set_bit(start >> ord, buddy);
1493 e4b->bd_info->bb_counters[ord]--;
1494 start += mlen;
1495 len -= mlen;
1496 BUG_ON(len < 0);
1497 continue;
1498 }
1499
1500 /* store for history */
1501 if (ret == 0)
1502 ret = len | (ord << 16);
1503
1504 /* we have to split large buddy */
1505 BUG_ON(ord <= 0);
1506 buddy = mb_find_buddy(e4b, ord, &max);
1507 mb_set_bit(start >> ord, buddy);
1508 e4b->bd_info->bb_counters[ord]--;
1509
1510 ord--;
1511 cur = (start >> ord) & ~1U;
1512 buddy = mb_find_buddy(e4b, ord, &max);
1513 mb_clear_bit(cur, buddy);
1514 mb_clear_bit(cur + 1, buddy);
1515 e4b->bd_info->bb_counters[ord]++;
1516 e4b->bd_info->bb_counters[ord]++;
1517 }
1518
1519 mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group),
1520 EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1521 mb_check_buddy(e4b);
1522
1523 return ret;
1524}
1525
1526/*
1527 * Must be called under group lock!
1528 */
1529static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1530 struct ext4_buddy *e4b)
1531{
1532 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1533 int ret;
1534
1535 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1536 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1537
1538 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
1539 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
1540 ret = mb_mark_used(e4b, &ac->ac_b_ex);
1541
1542 /* preallocation can change ac_b_ex, thus we store actually
1543 * allocated blocks for history */
1544 ac->ac_f_ex = ac->ac_b_ex;
1545
1546 ac->ac_status = AC_STATUS_FOUND;
1547 ac->ac_tail = ret & 0xffff;
1548 ac->ac_buddy = ret >> 16;
1549
1550 /* XXXXXXX: SUCH A HORRIBLE **CK */
1551 /*FIXME!! Why ? */
1552 ac->ac_bitmap_page = e4b->bd_bitmap_page;
1553 get_page(ac->ac_bitmap_page);
1554 ac->ac_buddy_page = e4b->bd_buddy_page;
1555 get_page(ac->ac_buddy_page);
1556
1557 /* store last allocated for subsequent stream allocation */
1558 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
1559 spin_lock(&sbi->s_md_lock);
1560 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1561 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
1562 spin_unlock(&sbi->s_md_lock);
1563 }
1564}
1565
1566/*
1567 * regular allocator, for general purposes allocation
1568 */
1569
1570static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1571 struct ext4_buddy *e4b,
1572 int finish_group)
1573{
1574 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1575 struct ext4_free_extent *bex = &ac->ac_b_ex;
1576 struct ext4_free_extent *gex = &ac->ac_g_ex;
1577 struct ext4_free_extent ex;
1578 int max;
1579
1580 /*
1581 * We don't want to scan for a whole year
1582 */
1583 if (ac->ac_found > sbi->s_mb_max_to_scan &&
1584 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1585 ac->ac_status = AC_STATUS_BREAK;
1586 return;
1587 }
1588
1589 /*
1590 * Haven't found good chunk so far, let's continue
1591 */
1592 if (bex->fe_len < gex->fe_len)
1593 return;
1594
1595 if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
1596 && bex->fe_group == e4b->bd_group) {
1597 /* recheck chunk's availability - we don't know
1598 * when it was found (within this lock-unlock
1599 * period or not) */
1600 max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
1601 if (max >= gex->fe_len) {
1602 ext4_mb_use_best_found(ac, e4b);
1603 return;
1604 }
1605 }
1606}
1607
1608/*
1609 * The routine checks whether found extent is good enough. If it is,
1610 * then the extent gets marked used and flag is set to the context
1611 * to stop scanning. Otherwise, the extent is compared with the
1612 * previous found extent and if new one is better, then it's stored
1613 * in the context. Later, the best found extent will be used, if
1614 * mballoc can't find good enough extent.
1615 *
1616 * FIXME: real allocation policy is to be designed yet!
1617 */
1618static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1619 struct ext4_free_extent *ex,
1620 struct ext4_buddy *e4b)
1621{
1622 struct ext4_free_extent *bex = &ac->ac_b_ex;
1623 struct ext4_free_extent *gex = &ac->ac_g_ex;
1624
1625 BUG_ON(ex->fe_len <= 0);
1626 BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1627 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1628 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1629
1630 ac->ac_found++;
1631
1632 /*
1633 * The special case - take what you catch first
1634 */
1635 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1636 *bex = *ex;
1637 ext4_mb_use_best_found(ac, e4b);
1638 return;
1639 }
1640
1641 /*
1642 * Let's check whether the chuck is good enough
1643 */
1644 if (ex->fe_len == gex->fe_len) {
1645 *bex = *ex;
1646 ext4_mb_use_best_found(ac, e4b);
1647 return;
1648 }
1649
1650 /*
1651 * If this is first found extent, just store it in the context
1652 */
1653 if (bex->fe_len == 0) {
1654 *bex = *ex;
1655 return;
1656 }
1657
1658 /*
1659 * If new found extent is better, store it in the context
1660 */
1661 if (bex->fe_len < gex->fe_len) {
1662 /* if the request isn't satisfied, any found extent
1663 * larger than previous best one is better */
1664 if (ex->fe_len > bex->fe_len)
1665 *bex = *ex;
1666 } else if (ex->fe_len > gex->fe_len) {
1667 /* if the request is satisfied, then we try to find
1668 * an extent that still satisfy the request, but is
1669 * smaller than previous one */
1670 if (ex->fe_len < bex->fe_len)
1671 *bex = *ex;
1672 }
1673
1674 ext4_mb_check_limits(ac, e4b, 0);
1675}
1676
1677static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1678 struct ext4_buddy *e4b)
1679{
1680 struct ext4_free_extent ex = ac->ac_b_ex;
1681 ext4_group_t group = ex.fe_group;
1682 int max;
1683 int err;
1684
1685 BUG_ON(ex.fe_len <= 0);
1686 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1687 if (err)
1688 return err;
1689
1690 ext4_lock_group(ac->ac_sb, group);
1691 max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
1692
1693 if (max > 0) {
1694 ac->ac_b_ex = ex;
1695 ext4_mb_use_best_found(ac, e4b);
1696 }
1697
1698 ext4_unlock_group(ac->ac_sb, group);
1699 ext4_mb_release_desc(e4b);
1700
1701 return 0;
1702}
1703
1704static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1705 struct ext4_buddy *e4b)
1706{
1707 ext4_group_t group = ac->ac_g_ex.fe_group;
1708 int max;
1709 int err;
1710 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1711 struct ext4_super_block *es = sbi->s_es;
1712 struct ext4_free_extent ex;
1713
1714 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1715 return 0;
1716
1717 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1718 if (err)
1719 return err;
1720
1721 ext4_lock_group(ac->ac_sb, group);
1722 max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
1723 ac->ac_g_ex.fe_len, &ex);
1724
1725 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1726 ext4_fsblk_t start;
1727
1728 start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) +
1729 ex.fe_start + le32_to_cpu(es->s_first_data_block);
1730 /* use do_div to get remainder (would be 64-bit modulo) */
1731 if (do_div(start, sbi->s_stripe) == 0) {
1732 ac->ac_found++;
1733 ac->ac_b_ex = ex;
1734 ext4_mb_use_best_found(ac, e4b);
1735 }
1736 } else if (max >= ac->ac_g_ex.fe_len) {
1737 BUG_ON(ex.fe_len <= 0);
1738 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1739 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1740 ac->ac_found++;
1741 ac->ac_b_ex = ex;
1742 ext4_mb_use_best_found(ac, e4b);
1743 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
1744 /* Sometimes, caller may want to merge even small
1745 * number of blocks to an existing extent */
1746 BUG_ON(ex.fe_len <= 0);
1747 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1748 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1749 ac->ac_found++;
1750 ac->ac_b_ex = ex;
1751 ext4_mb_use_best_found(ac, e4b);
1752 }
1753 ext4_unlock_group(ac->ac_sb, group);
1754 ext4_mb_release_desc(e4b);
1755
1756 return 0;
1757}
1758
1759/*
1760 * The routine scans buddy structures (not bitmap!) from given order
1761 * to max order and tries to find big enough chunk to satisfy the req
1762 */
1763static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1764 struct ext4_buddy *e4b)
1765{
1766 struct super_block *sb = ac->ac_sb;
1767 struct ext4_group_info *grp = e4b->bd_info;
1768 void *buddy;
1769 int i;
1770 int k;
1771 int max;
1772
1773 BUG_ON(ac->ac_2order <= 0);
1774 for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1775 if (grp->bb_counters[i] == 0)
1776 continue;
1777
1778 buddy = mb_find_buddy(e4b, i, &max);
1779 BUG_ON(buddy == NULL);
1780
1781 k = ext4_find_next_zero_bit(buddy, max, 0);
1782 BUG_ON(k >= max);
1783
1784 ac->ac_found++;
1785
1786 ac->ac_b_ex.fe_len = 1 << i;
1787 ac->ac_b_ex.fe_start = k << i;
1788 ac->ac_b_ex.fe_group = e4b->bd_group;
1789
1790 ext4_mb_use_best_found(ac, e4b);
1791
1792 BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1793
1794 if (EXT4_SB(sb)->s_mb_stats)
1795 atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
1796
1797 break;
1798 }
1799}
1800
1801/*
1802 * The routine scans the group and measures all found extents.
1803 * In order to optimize scanning, caller must pass number of
1804 * free blocks in the group, so the routine can know upper limit.
1805 */
1806static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1807 struct ext4_buddy *e4b)
1808{
1809 struct super_block *sb = ac->ac_sb;
1810 void *bitmap = EXT4_MB_BITMAP(e4b);
1811 struct ext4_free_extent ex;
1812 int i;
1813 int free;
1814
1815 free = e4b->bd_info->bb_free;
1816 BUG_ON(free <= 0);
1817
1818 i = e4b->bd_info->bb_first_free;
1819
1820 while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1821 i = ext4_find_next_zero_bit(bitmap,
1822 EXT4_BLOCKS_PER_GROUP(sb), i);
1823 if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
1824 BUG_ON(free != 0);
1825 break;
1826 }
1827
1828 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
1829 BUG_ON(ex.fe_len <= 0);
1830 BUG_ON(free < ex.fe_len);
1831
1832 ext4_mb_measure_extent(ac, &ex, e4b);
1833
1834 i += ex.fe_len;
1835 free -= ex.fe_len;
1836 }
1837
1838 ext4_mb_check_limits(ac, e4b, 1);
1839}
1840
1841/*
1842 * This is a special case for storages like raid5
1843 * we try to find stripe-aligned chunks for stripe-size requests
1844 * XXX should do so at least for multiples of stripe size as well
1845 */
1846static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1847 struct ext4_buddy *e4b)
1848{
1849 struct super_block *sb = ac->ac_sb;
1850 struct ext4_sb_info *sbi = EXT4_SB(sb);
1851 void *bitmap = EXT4_MB_BITMAP(e4b);
1852 struct ext4_free_extent ex;
1853 ext4_fsblk_t first_group_block;
1854 ext4_fsblk_t a;
1855 ext4_grpblk_t i;
1856 int max;
1857
1858 BUG_ON(sbi->s_stripe == 0);
1859
1860 /* find first stripe-aligned block in group */
1861 first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb)
1862 + le32_to_cpu(sbi->s_es->s_first_data_block);
1863 a = first_group_block + sbi->s_stripe - 1;
1864 do_div(a, sbi->s_stripe);
1865 i = (a * sbi->s_stripe) - first_group_block;
1866
1867 while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
1868 if (!mb_test_bit(i, bitmap)) {
1869 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
1870 if (max >= sbi->s_stripe) {
1871 ac->ac_found++;
1872 ac->ac_b_ex = ex;
1873 ext4_mb_use_best_found(ac, e4b);
1874 break;
1875 }
1876 }
1877 i += sbi->s_stripe;
1878 }
1879}
1880
1881static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1882 ext4_group_t group, int cr)
1883{
1884 unsigned free, fragments;
1885 unsigned i, bits;
1886 struct ext4_group_desc *desc;
1887 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1888
1889 BUG_ON(cr < 0 || cr >= 4);
1890 BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
1891
1892 free = grp->bb_free;
1893 fragments = grp->bb_fragments;
1894 if (free == 0)
1895 return 0;
1896 if (fragments == 0)
1897 return 0;
1898
1899 switch (cr) {
1900 case 0:
1901 BUG_ON(ac->ac_2order == 0);
1902 /* If this group is uninitialized, skip it initially */
1903 desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
1904 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1905 return 0;
1906
1907 bits = ac->ac_sb->s_blocksize_bits + 1;
1908 for (i = ac->ac_2order; i <= bits; i++)
1909 if (grp->bb_counters[i] > 0)
1910 return 1;
1911 break;
1912 case 1:
1913 if ((free / fragments) >= ac->ac_g_ex.fe_len)
1914 return 1;
1915 break;
1916 case 2:
1917 if (free >= ac->ac_g_ex.fe_len)
1918 return 1;
1919 break;
1920 case 3:
1921 return 1;
1922 default:
1923 BUG();
1924 }
1925
1926 return 0;
1927}
1928
1929static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1930{
1931 ext4_group_t group;
1932 ext4_group_t i;
1933 int cr;
1934 int err = 0;
1935 int bsbits;
1936 struct ext4_sb_info *sbi;
1937 struct super_block *sb;
1938 struct ext4_buddy e4b;
1939 loff_t size, isize;
1940
1941 sb = ac->ac_sb;
1942 sbi = EXT4_SB(sb);
1943 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1944
1945 /* first, try the goal */
1946 err = ext4_mb_find_by_goal(ac, &e4b);
1947 if (err || ac->ac_status == AC_STATUS_FOUND)
1948 goto out;
1949
1950 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
1951 goto out;
1952
1953 /*
1954 * ac->ac2_order is set only if the fe_len is a power of 2
1955 * if ac2_order is set we also set criteria to 0 so that we
1956 * try exact allocation using buddy.
1957 */
1958 i = fls(ac->ac_g_ex.fe_len);
1959 ac->ac_2order = 0;
1960 /*
1961 * We search using buddy data only if the order of the request
1962 * is greater than equal to the sbi_s_mb_order2_reqs
1963 * You can tune it via /proc/fs/ext4/<partition>/order2_req
1964 */
1965 if (i >= sbi->s_mb_order2_reqs) {
1966 /*
1967 * This should tell if fe_len is exactly power of 2
1968 */
1969 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
1970 ac->ac_2order = i - 1;
1971 }
1972
1973 bsbits = ac->ac_sb->s_blocksize_bits;
1974 /* if stream allocation is enabled, use global goal */
1975 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
1976 isize = i_size_read(ac->ac_inode) >> bsbits;
1977 if (size < isize)
1978 size = isize;
1979
1980 if (size < sbi->s_mb_stream_request &&
1981 (ac->ac_flags & EXT4_MB_HINT_DATA)) {
1982 /* TBD: may be hot point */
1983 spin_lock(&sbi->s_md_lock);
1984 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
1985 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
1986 spin_unlock(&sbi->s_md_lock);
1987 }
1988
1989 /* searching for the right group start from the goal value specified */
1990 group = ac->ac_g_ex.fe_group;
1991
1992 /* Let's just scan groups to find more-less suitable blocks */
1993 cr = ac->ac_2order ? 0 : 1;
1994 /*
1995 * cr == 0 try to get exact allocation,
1996 * cr == 3 try to get anything
1997 */
1998repeat:
1999 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2000 ac->ac_criteria = cr;
2001 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
2002 struct ext4_group_info *grp;
2003 struct ext4_group_desc *desc;
2004
2005 if (group == EXT4_SB(sb)->s_groups_count)
2006 group = 0;
2007
2008 /* quick check to skip empty groups */
2009 grp = ext4_get_group_info(ac->ac_sb, group);
2010 if (grp->bb_free == 0)
2011 continue;
2012
2013 /*
2014 * if the group is already init we check whether it is
2015 * a good group and if not we don't load the buddy
2016 */
2017 if (EXT4_MB_GRP_NEED_INIT(grp)) {
2018 /*
2019 * we need full data about the group
2020 * to make a good selection
2021 */
2022 err = ext4_mb_load_buddy(sb, group, &e4b);
2023 if (err)
2024 goto out;
2025 ext4_mb_release_desc(&e4b);
2026 }
2027
2028 /*
2029 * If the particular group doesn't satisfy our
2030 * criteria we continue with the next group
2031 */
2032 if (!ext4_mb_good_group(ac, group, cr))
2033 continue;
2034
2035 err = ext4_mb_load_buddy(sb, group, &e4b);
2036 if (err)
2037 goto out;
2038
2039 ext4_lock_group(sb, group);
2040 if (!ext4_mb_good_group(ac, group, cr)) {
2041 /* someone did allocation from this group */
2042 ext4_unlock_group(sb, group);
2043 ext4_mb_release_desc(&e4b);
2044 continue;
2045 }
2046
2047 ac->ac_groups_scanned++;
2048 desc = ext4_get_group_desc(sb, group, NULL);
2049 if (cr == 0 || (desc->bg_flags &
2050 cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
2051 ac->ac_2order != 0))
2052 ext4_mb_simple_scan_group(ac, &e4b);
2053 else if (cr == 1 &&
2054 ac->ac_g_ex.fe_len == sbi->s_stripe)
2055 ext4_mb_scan_aligned(ac, &e4b);
2056 else
2057 ext4_mb_complex_scan_group(ac, &e4b);
2058
2059 ext4_unlock_group(sb, group);
2060 ext4_mb_release_desc(&e4b);
2061
2062 if (ac->ac_status != AC_STATUS_CONTINUE)
2063 break;
2064 }
2065 }
2066
2067 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2068 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2069 /*
2070 * We've been searching too long. Let's try to allocate
2071 * the best chunk we've found so far
2072 */
2073
2074 ext4_mb_try_best_found(ac, &e4b);
2075 if (ac->ac_status != AC_STATUS_FOUND) {
2076 /*
2077 * Someone more lucky has already allocated it.
2078 * The only thing we can do is just take first
2079 * found block(s)
2080 printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
2081 */
2082 ac->ac_b_ex.fe_group = 0;
2083 ac->ac_b_ex.fe_start = 0;
2084 ac->ac_b_ex.fe_len = 0;
2085 ac->ac_status = AC_STATUS_CONTINUE;
2086 ac->ac_flags |= EXT4_MB_HINT_FIRST;
2087 cr = 3;
2088 atomic_inc(&sbi->s_mb_lost_chunks);
2089 goto repeat;
2090 }
2091 }
2092out:
2093 return err;
2094}
2095
2096#ifdef EXT4_MB_HISTORY
/* Per-open state of the mb_history proc file. */
struct ext4_mb_proc_session {
	/* private copy of the history array taken at open time */
	struct ext4_mb_history *history;
	/* superblock the history belongs to */
	struct super_block *sb;
	/* index in @history where iteration begins */
	int start;
	/* number of entries in @history */
	int max;
};
2103
2104static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
2105 struct ext4_mb_history *hs,
2106 int first)
2107{
2108 if (hs == s->history + s->max)
2109 hs = s->history;
2110 if (!first && hs == s->history + s->start)
2111 return NULL;
2112 while (hs->orig.fe_len == 0) {
2113 hs++;
2114 if (hs == s->history + s->max)
2115 hs = s->history;
2116 if (hs == s->history + s->start)
2117 return NULL;
2118 }
2119 return hs;
2120}
2121
2122static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
2123{
2124 struct ext4_mb_proc_session *s = seq->private;
2125 struct ext4_mb_history *hs;
2126 int l = *pos;
2127
2128 if (l == 0)
2129 return SEQ_START_TOKEN;
2130 hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2131 if (!hs)
2132 return NULL;
2133 while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
2134 return hs;
2135}
2136
2137static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
2138 loff_t *pos)
2139{
2140 struct ext4_mb_proc_session *s = seq->private;
2141 struct ext4_mb_history *hs = v;
2142
2143 ++*pos;
2144 if (v == SEQ_START_TOKEN)
2145 return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2146 else
2147 return ext4_mb_history_skip_empty(s, ++hs, 0);
2148}
2149
2150static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2151{
2152 char buf[25], buf2[25], buf3[25], *fmt;
2153 struct ext4_mb_history *hs = v;
2154
2155 if (v == SEQ_START_TOKEN) {
2156 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2157 "%-5s %-2s %-5s %-5s %-5s %-6s\n",
2158 "pid", "inode", "original", "goal", "result", "found",
2159 "grps", "cr", "flags", "merge", "tail", "broken");
2160 return 0;
2161 }
2162
2163 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2164 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2165 "%-5u %-5s %-5u %-6u\n";
2166 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
2167 hs->result.fe_start, hs->result.fe_len,
2168 hs->result.fe_logical);
2169 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
2170 hs->orig.fe_start, hs->orig.fe_len,
2171 hs->orig.fe_logical);
2172 sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group,
2173 hs->goal.fe_start, hs->goal.fe_len,
2174 hs->goal.fe_logical);
2175 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
2176 hs->found, hs->groups, hs->cr, hs->flags,
2177 hs->merged ? "M" : "", hs->tail,
2178 hs->buddy ? 1 << hs->buddy : 0);
2179 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
2180 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
2181 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
2182 hs->result.fe_start, hs->result.fe_len,
2183 hs->result.fe_logical);
2184 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
2185 hs->orig.fe_start, hs->orig.fe_len,
2186 hs->orig.fe_logical);
2187 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
2188 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
2189 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
2190 hs->result.fe_start, hs->result.fe_len);
2191 seq_printf(seq, "%-5u %-8u %-23s discard\n",
2192 hs->pid, hs->ino, buf2);
2193 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
2194 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
2195 hs->result.fe_start, hs->result.fe_len);
2196 seq_printf(seq, "%-5u %-8u %-23s free\n",
2197 hs->pid, hs->ino, buf2);
2198 }
2199 return 0;
2200}
2201
/* seq_file ->stop: the history iterator holds nothing that needs release. */
static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
{
	/* intentionally empty */
}
2205
2206static struct seq_operations ext4_mb_seq_history_ops = {
2207 .start = ext4_mb_seq_history_start,
2208 .next = ext4_mb_seq_history_next,
2209 .stop = ext4_mb_seq_history_stop,
2210 .show = ext4_mb_seq_history_show,
2211};
2212
2213static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
2214{
2215 struct super_block *sb = PDE(inode)->data;
2216 struct ext4_sb_info *sbi = EXT4_SB(sb);
2217 struct ext4_mb_proc_session *s;
2218 int rc;
2219 int size;
2220
2221 s = kmalloc(sizeof(*s), GFP_KERNEL);
2222 if (s == NULL)
2223 return -ENOMEM;
2224 s->sb = sb;
2225 size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
2226 s->history = kmalloc(size, GFP_KERNEL);
2227 if (s->history == NULL) {
2228 kfree(s);
2229 return -ENOMEM;
2230 }
2231
2232 spin_lock(&sbi->s_mb_history_lock);
2233 memcpy(s->history, sbi->s_mb_history, size);
2234 s->max = sbi->s_mb_history_max;
2235 s->start = sbi->s_mb_history_cur % s->max;
2236 spin_unlock(&sbi->s_mb_history_lock);
2237
2238 rc = seq_open(file, &ext4_mb_seq_history_ops);
2239 if (rc == 0) {
2240 struct seq_file *m = (struct seq_file *)file->private_data;
2241 m->private = s;
2242 } else {
2243 kfree(s->history);
2244 kfree(s);
2245 }
2246 return rc;
2247
2248}
2249
2250static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
2251{
2252 struct seq_file *seq = (struct seq_file *)file->private_data;
2253 struct ext4_mb_proc_session *s = seq->private;
2254 kfree(s->history);
2255 kfree(s);
2256 return seq_release(inode, file);
2257}
2258
2259static ssize_t ext4_mb_seq_history_write(struct file *file,
2260 const char __user *buffer,
2261 size_t count, loff_t *ppos)
2262{
2263 struct seq_file *seq = (struct seq_file *)file->private_data;
2264 struct ext4_mb_proc_session *s = seq->private;
2265 struct super_block *sb = s->sb;
2266 char str[32];
2267 int value;
2268
2269 if (count >= sizeof(str)) {
2270 printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
2271 "mb_history", (int)sizeof(str));
2272 return -EOVERFLOW;
2273 }
2274
2275 if (copy_from_user(str, buffer, count))
2276 return -EFAULT;
2277
2278 value = simple_strtol(str, NULL, 0);
2279 if (value < 0)
2280 return -ERANGE;
2281 EXT4_SB(sb)->s_mb_history_filter = value;
2282
2283 return count;
2284}
2285
2286static struct file_operations ext4_mb_seq_history_fops = {
2287 .owner = THIS_MODULE,
2288 .open = ext4_mb_seq_history_open,
2289 .read = seq_read,
2290 .write = ext4_mb_seq_history_write,
2291 .llseek = seq_lseek,
2292 .release = ext4_mb_seq_history_release,
2293};
2294
2295static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2296{
2297 struct super_block *sb = seq->private;
2298 struct ext4_sb_info *sbi = EXT4_SB(sb);
2299 ext4_group_t group;
2300
2301 if (*pos < 0 || *pos >= sbi->s_groups_count)
2302 return NULL;
2303
2304 group = *pos + 1;
2305 return (void *) group;
2306}
2307
2308static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2309{
2310 struct super_block *sb = seq->private;
2311 struct ext4_sb_info *sbi = EXT4_SB(sb);
2312 ext4_group_t group;
2313
2314 ++*pos;
2315 if (*pos < 0 || *pos >= sbi->s_groups_count)
2316 return NULL;
2317 group = *pos + 1;
2318 return (void *) group;;
2319}
2320
2321static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2322{
2323 struct super_block *sb = seq->private;
2324 long group = (long) v;
2325 int i;
2326 int err;
2327 struct ext4_buddy e4b;
2328 struct sg {
2329 struct ext4_group_info info;
2330 unsigned short counters[16];
2331 } sg;
2332
2333 group--;
2334 if (group == 0)
2335 seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
2336 "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
2337 "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
2338 "group", "free", "frags", "first",
2339 "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
2340 "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
2341
2342 i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2343 sizeof(struct ext4_group_info);
2344 err = ext4_mb_load_buddy(sb, group, &e4b);
2345 if (err) {
2346 seq_printf(seq, "#%-5lu: I/O error\n", group);
2347 return 0;
2348 }
2349 ext4_lock_group(sb, group);
2350 memcpy(&sg, ext4_get_group_info(sb, group), i);
2351 ext4_unlock_group(sb, group);
2352 ext4_mb_release_desc(&e4b);
2353
2354 seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
2355 sg.info.bb_fragments, sg.info.bb_first_free);
2356 for (i = 0; i <= 13; i++)
2357 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
2358 sg.info.bb_counters[i] : 0);
2359 seq_printf(seq, " ]\n");
2360
2361 return 0;
2362}
2363
/* seq_file ->stop: the groups iterator holds nothing that needs release. */
static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
{
	/* intentionally empty */
}
2367
2368static struct seq_operations ext4_mb_seq_groups_ops = {
2369 .start = ext4_mb_seq_groups_start,
2370 .next = ext4_mb_seq_groups_next,
2371 .stop = ext4_mb_seq_groups_stop,
2372 .show = ext4_mb_seq_groups_show,
2373};
2374
2375static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2376{
2377 struct super_block *sb = PDE(inode)->data;
2378 int rc;
2379
2380 rc = seq_open(file, &ext4_mb_seq_groups_ops);
2381 if (rc == 0) {
2382 struct seq_file *m = (struct seq_file *)file->private_data;
2383 m->private = sb;
2384 }
2385 return rc;
2386
2387}
2388
2389static struct file_operations ext4_mb_seq_groups_fops = {
2390 .owner = THIS_MODULE,
2391 .open = ext4_mb_seq_groups_open,
2392 .read = seq_read,
2393 .llseek = seq_lseek,
2394 .release = seq_release,
2395};
2396
2397static void ext4_mb_history_release(struct super_block *sb)
2398{
2399 struct ext4_sb_info *sbi = EXT4_SB(sb);
2400
2401 remove_proc_entry("mb_groups", sbi->s_mb_proc);
2402 remove_proc_entry("mb_history", sbi->s_mb_proc);
2403
2404 kfree(sbi->s_mb_history);
2405}
2406
2407static void ext4_mb_history_init(struct super_block *sb)
2408{
2409 struct ext4_sb_info *sbi = EXT4_SB(sb);
2410 int i;
2411
2412 if (sbi->s_mb_proc != NULL) {
2413 struct proc_dir_entry *p;
2414 p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc);
2415 if (p) {
2416 p->proc_fops = &ext4_mb_seq_history_fops;
2417 p->data = sb;
2418 }
2419 p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc);
2420 if (p) {
2421 p->proc_fops = &ext4_mb_seq_groups_fops;
2422 p->data = sb;
2423 }
2424 }
2425
2426 sbi->s_mb_history_max = 1000;
2427 sbi->s_mb_history_cur = 0;
2428 spin_lock_init(&sbi->s_mb_history_lock);
2429 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2430 sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
2431 if (likely(sbi->s_mb_history != NULL))
2432 memset(sbi->s_mb_history, 0, i);
2433 /* if we can't allocate history, then we simple won't use it */
2434}
2435
2436static void ext4_mb_store_history(struct ext4_allocation_context *ac)
2437{
2438 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2439 struct ext4_mb_history h;
2440
2441 if (unlikely(sbi->s_mb_history == NULL))
2442 return;
2443
2444 if (!(ac->ac_op & sbi->s_mb_history_filter))
2445 return;
2446
2447 h.op = ac->ac_op;
2448 h.pid = current->pid;
2449 h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
2450 h.orig = ac->ac_o_ex;
2451 h.result = ac->ac_b_ex;
2452 h.flags = ac->ac_flags;
2453 h.found = ac->ac_found;
2454 h.groups = ac->ac_groups_scanned;
2455 h.cr = ac->ac_criteria;
2456 h.tail = ac->ac_tail;
2457 h.buddy = ac->ac_buddy;
2458 h.merged = 0;
2459 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
2460 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
2461 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
2462 h.merged = 1;
2463 h.goal = ac->ac_g_ex;
2464 h.result = ac->ac_f_ex;
2465 }
2466
2467 spin_lock(&sbi->s_mb_history_lock);
2468 memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
2469 if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
2470 sbi->s_mb_history_cur = 0;
2471 spin_unlock(&sbi->s_mb_history_lock);
2472}
2473
2474#else
/* EXT4_MB_HISTORY is off: the history setup/teardown hooks expand to nothing */
#define ext4_mb_history_release(sb)
#define ext4_mb_history_init(sb)
2477#endif
2478
2479static int ext4_mb_init_backend(struct super_block *sb)
2480{
2481 ext4_group_t i;
2482 int j, len, metalen;
2483 struct ext4_sb_info *sbi = EXT4_SB(sb);
2484 int num_meta_group_infos =
2485 (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
2486 EXT4_DESC_PER_BLOCK_BITS(sb);
2487 struct ext4_group_info **meta_group_info;
2488
2489 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2490 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2491 * So a two level scheme suffices for now. */
2492 sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
2493 num_meta_group_infos, GFP_KERNEL);
2494 if (sbi->s_group_info == NULL) {
2495 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2496 return -ENOMEM;
2497 }
2498 sbi->s_buddy_cache = new_inode(sb);
2499 if (sbi->s_buddy_cache == NULL) {
2500 printk(KERN_ERR "EXT4-fs: can't get new inode\n");
2501 goto err_freesgi;
2502 }
2503 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2504
2505 metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
2506 for (i = 0; i < num_meta_group_infos; i++) {
2507 if ((i + 1) == num_meta_group_infos)
2508 metalen = sizeof(*meta_group_info) *
2509 (sbi->s_groups_count -
2510 (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
2511 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2512 if (meta_group_info == NULL) {
2513 printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2514 "buddy group\n");
2515 goto err_freemeta;
2516 }
2517 sbi->s_group_info[i] = meta_group_info;
2518 }
2519
2520 /*
2521 * calculate needed size. if change bb_counters size,
2522 * don't forget about ext4_mb_generate_buddy()
2523 */
2524 len = sizeof(struct ext4_group_info);
2525 len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
2526 for (i = 0; i < sbi->s_groups_count; i++) {
2527 struct ext4_group_desc *desc;
2528
2529 meta_group_info =
2530 sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2531 j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
2532
2533 meta_group_info[j] = kzalloc(len, GFP_KERNEL);
2534 if (meta_group_info[j] == NULL) {
2535 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2536 i--;
2537 goto err_freebuddy;
2538 }
2539 desc = ext4_get_group_desc(sb, i, NULL);
2540 if (desc == NULL) {
2541 printk(KERN_ERR
2542 "EXT4-fs: can't read descriptor %lu\n", i);
2543 goto err_freebuddy;
2544 }
2545 memset(meta_group_info[j], 0, len);
2546 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2547 &(meta_group_info[j]->bb_state));
2548
2549 /*
2550 * initialize bb_free to be able to skip
2551 * empty groups without initialization
2552 */
2553 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2554 meta_group_info[j]->bb_free =
2555 ext4_free_blocks_after_init(sb, i, desc);
2556 } else {
2557 meta_group_info[j]->bb_free =
2558 le16_to_cpu(desc->bg_free_blocks_count);
2559 }
2560
2561 INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
2562
2563#ifdef DOUBLE_CHECK
2564 {
2565 struct buffer_head *bh;
2566 meta_group_info[j]->bb_bitmap =
2567 kmalloc(sb->s_blocksize, GFP_KERNEL);
2568 BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
2569 bh = read_block_bitmap(sb, i);
2570 BUG_ON(bh == NULL);
2571 memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
2572 sb->s_blocksize);
2573 put_bh(bh);
2574 }
2575#endif
2576
2577 }
2578
2579 return 0;
2580
2581err_freebuddy:
2582 while (i >= 0) {
2583 kfree(ext4_get_group_info(sb, i));
2584 i--;
2585 }
2586 i = num_meta_group_infos;
2587err_freemeta:
2588 while (--i >= 0)
2589 kfree(sbi->s_group_info[i]);
2590 iput(sbi->s_buddy_cache);
2591err_freesgi:
2592 kfree(sbi->s_group_info);
2593 return -ENOMEM;
2594}
2595
2596int ext4_mb_init(struct super_block *sb, int needs_recovery)
2597{
2598 struct ext4_sb_info *sbi = EXT4_SB(sb);
2599 unsigned i;
2600 unsigned offset;
2601 unsigned max;
2602
2603 if (!test_opt(sb, MBALLOC))
2604 return 0;
2605
2606 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
2607
2608 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2609 if (sbi->s_mb_offsets == NULL) {
2610 clear_opt(sbi->s_mount_opt, MBALLOC);
2611 return -ENOMEM;
2612 }
2613 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2614 if (sbi->s_mb_maxs == NULL) {
2615 clear_opt(sbi->s_mount_opt, MBALLOC);
2616 kfree(sbi->s_mb_maxs);
2617 return -ENOMEM;
2618 }
2619
2620 /* order 0 is regular bitmap */
2621 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2622 sbi->s_mb_offsets[0] = 0;
2623
2624 i = 1;
2625 offset = 0;
2626 max = sb->s_blocksize << 2;
2627 do {
2628 sbi->s_mb_offsets[i] = offset;
2629 sbi->s_mb_maxs[i] = max;
2630 offset += 1 << (sb->s_blocksize_bits - i);
2631 max = max >> 1;
2632 i++;
2633 } while (i <= sb->s_blocksize_bits + 1);
2634
2635 /* init file for buddy data */
2636 i = ext4_mb_init_backend(sb);
2637 if (i) {
2638 clear_opt(sbi->s_mount_opt, MBALLOC);
2639 kfree(sbi->s_mb_offsets);
2640 kfree(sbi->s_mb_maxs);
2641 return i;
2642 }
2643
2644 spin_lock_init(&sbi->s_md_lock);
2645 INIT_LIST_HEAD(&sbi->s_active_transaction);
2646 INIT_LIST_HEAD(&sbi->s_closed_transaction);
2647 INIT_LIST_HEAD(&sbi->s_committed_transaction);
2648 spin_lock_init(&sbi->s_bal_lock);
2649
2650 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2651 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2652 sbi->s_mb_stats = MB_DEFAULT_STATS;
2653 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2654 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2655 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2656 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2657
2658 i = sizeof(struct ext4_locality_group) * NR_CPUS;
2659 sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
2660 if (sbi->s_locality_groups == NULL) {
2661 clear_opt(sbi->s_mount_opt, MBALLOC);
2662 kfree(sbi->s_mb_offsets);
2663 kfree(sbi->s_mb_maxs);
2664 return -ENOMEM;
2665 }
2666 for (i = 0; i < NR_CPUS; i++) {
2667 struct ext4_locality_group *lg;
2668 lg = &sbi->s_locality_groups[i];
2669 mutex_init(&lg->lg_mutex);
2670 INIT_LIST_HEAD(&lg->lg_prealloc_list);
2671 spin_lock_init(&lg->lg_prealloc_lock);
2672 }
2673
2674 ext4_mb_init_per_dev_proc(sb);
2675 ext4_mb_history_init(sb);
2676
2677 printk("EXT4-fs: mballoc enabled\n");
2678 return 0;
2679}
2680
2681/* need to called with ext4 group lock (ext4_lock_group) */
2682static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2683{
2684 struct ext4_prealloc_space *pa;
2685 struct list_head *cur, *tmp;
2686 int count = 0;
2687
2688 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
2689 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2690 list_del(&pa->pa_group_list);
2691 count++;
2692 kfree(pa);
2693 }
2694 if (count)
2695 mb_debug("mballoc: %u PAs left\n", count);
2696
2697}
2698
2699int ext4_mb_release(struct super_block *sb)
2700{
2701 ext4_group_t i;
2702 int num_meta_group_infos;
2703 struct ext4_group_info *grinfo;
2704 struct ext4_sb_info *sbi = EXT4_SB(sb);
2705
2706 if (!test_opt(sb, MBALLOC))
2707 return 0;
2708
2709 /* release freed, non-committed blocks */
2710 spin_lock(&sbi->s_md_lock);
2711 list_splice_init(&sbi->s_closed_transaction,
2712 &sbi->s_committed_transaction);
2713 list_splice_init(&sbi->s_active_transaction,
2714 &sbi->s_committed_transaction);
2715 spin_unlock(&sbi->s_md_lock);
2716 ext4_mb_free_committed_blocks(sb);
2717
2718 if (sbi->s_group_info) {
2719 for (i = 0; i < sbi->s_groups_count; i++) {
2720 grinfo = ext4_get_group_info(sb, i);
2721#ifdef DOUBLE_CHECK
2722 kfree(grinfo->bb_bitmap);
2723#endif
2724 ext4_lock_group(sb, i);
2725 ext4_mb_cleanup_pa(grinfo);
2726 ext4_unlock_group(sb, i);
2727 kfree(grinfo);
2728 }
2729 num_meta_group_infos = (sbi->s_groups_count +
2730 EXT4_DESC_PER_BLOCK(sb) - 1) >>
2731 EXT4_DESC_PER_BLOCK_BITS(sb);
2732 for (i = 0; i < num_meta_group_infos; i++)
2733 kfree(sbi->s_group_info[i]);
2734 kfree(sbi->s_group_info);
2735 }
2736 kfree(sbi->s_mb_offsets);
2737 kfree(sbi->s_mb_maxs);
2738 if (sbi->s_buddy_cache)
2739 iput(sbi->s_buddy_cache);
2740 if (sbi->s_mb_stats) {
2741 printk(KERN_INFO
2742 "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n",
2743 atomic_read(&sbi->s_bal_allocated),
2744 atomic_read(&sbi->s_bal_reqs),
2745 atomic_read(&sbi->s_bal_success));
2746 printk(KERN_INFO
2747 "EXT4-fs: mballoc: %u extents scanned, %u goal hits, "
2748 "%u 2^N hits, %u breaks, %u lost\n",
2749 atomic_read(&sbi->s_bal_ex_scanned),
2750 atomic_read(&sbi->s_bal_goals),
2751 atomic_read(&sbi->s_bal_2orders),
2752 atomic_read(&sbi->s_bal_breaks),
2753 atomic_read(&sbi->s_mb_lost_chunks));
2754 printk(KERN_INFO
2755 "EXT4-fs: mballoc: %lu generated and it took %Lu\n",
2756 sbi->s_mb_buddies_generated++,
2757 sbi->s_mb_generation_time);
2758 printk(KERN_INFO
2759 "EXT4-fs: mballoc: %u preallocated, %u discarded\n",
2760 atomic_read(&sbi->s_mb_preallocated),
2761 atomic_read(&sbi->s_mb_discarded));
2762 }
2763
2764 kfree(sbi->s_locality_groups);
2765
2766 ext4_mb_history_release(sb);
2767 ext4_mb_destroy_per_dev_proc(sb);
2768
2769 return 0;
2770}
2771
2772static void ext4_mb_free_committed_blocks(struct super_block *sb)
2773{
2774 struct ext4_sb_info *sbi = EXT4_SB(sb);
2775 int err;
2776 int i;
2777 int count = 0;
2778 int count2 = 0;
2779 struct ext4_free_metadata *md;
2780 struct ext4_buddy e4b;
2781
2782 if (list_empty(&sbi->s_committed_transaction))
2783 return;
2784
2785 /* there is committed blocks to be freed yet */
2786 do {
2787 /* get next array of blocks */
2788 md = NULL;
2789 spin_lock(&sbi->s_md_lock);
2790 if (!list_empty(&sbi->s_committed_transaction)) {
2791 md = list_entry(sbi->s_committed_transaction.next,
2792 struct ext4_free_metadata, list);
2793 list_del(&md->list);
2794 }
2795 spin_unlock(&sbi->s_md_lock);
2796
2797 if (md == NULL)
2798 break;
2799
2800 mb_debug("gonna free %u blocks in group %lu (0x%p):",
2801 md->num, md->group, md);
2802
2803 err = ext4_mb_load_buddy(sb, md->group, &e4b);
2804 /* we expect to find existing buddy because it's pinned */
2805 BUG_ON(err != 0);
2806
2807 /* there are blocks to put in buddy to make them really free */
2808 count += md->num;
2809 count2++;
2810 ext4_lock_group(sb, md->group);
2811 for (i = 0; i < md->num; i++) {
2812 mb_debug(" %u", md->blocks[i]);
2813 err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
2814 BUG_ON(err != 0);
2815 }
2816 mb_debug("\n");
2817 ext4_unlock_group(sb, md->group);
2818
2819 /* balance refcounts from ext4_mb_free_metadata() */
2820 page_cache_release(e4b.bd_buddy_page);
2821 page_cache_release(e4b.bd_bitmap_page);
2822
2823 kfree(md);
2824 ext4_mb_release_desc(&e4b);
2825
2826 } while (md);
2827
2828 mb_debug("freed %u blocks in %u structures\n", count, count2);
2829}
2830
/* names of the ext4 proc directory and of the mballoc tunables below it */
#define EXT4_ROOT			"ext4"
#define EXT4_MB_STATS_NAME		"stats"
#define EXT4_MB_MAX_TO_SCAN_NAME	"max_to_scan"
#define EXT4_MB_MIN_TO_SCAN_NAME	"min_to_scan"
#define EXT4_MB_ORDER2_REQ		"order2_req"
#define EXT4_MB_STREAM_REQ		"stream_req"
#define EXT4_MB_GROUP_PREALLOC		"group_prealloc"
2838
2839
2840
/*
 * MB_PROC_VALUE_READ(name) - generate ext4_mb_read_<name>(), a procfs read
 * handler that prints sbi->s_mb_<name> followed by a newline.  Data is only
 * produced for a read at offset 0; any other offset returns 0.  *eof is
 * always set.
 */
#define MB_PROC_VALUE_READ(name)				\
static int ext4_mb_read_##name(char *page, char **start,	\
		off_t off, int count, int *eof, void *data)	\
{								\
	struct ext4_sb_info *sbi = data;			\
								\
	*eof = 1;						\
	if (off)						\
		return 0;					\
	*start = page;						\
	return sprintf(page, "%ld\n", sbi->s_mb_##name);	\
}
2854
/*
 * MB_PROC_VALUE_WRITE(name) - generate ext4_mb_write_<name>(), a procfs
 * write handler that parses a number from user space and stores it in
 * sbi->s_mb_<name>.
 *
 * Returns cnt on success, -EINVAL if the input does not fit the local
 * buffer, -EFAULT if the user copy fails and -ERANGE for values <= 0.
 */
#define MB_PROC_VALUE_WRITE(name)				\
static int ext4_mb_write_##name(struct file *file,		\
		const char __user *buf, unsigned long cnt, void *data)	\
{								\
	struct ext4_sb_info *sbi = data;			\
	char str[32];						\
	long value;						\
								\
	/* keep one byte free for the terminating NUL */	\
	if (cnt >= sizeof(str))					\
		return -EINVAL;					\
	if (copy_from_user(str, buf, cnt))			\
		return -EFAULT;					\
	/* user data is not NUL-terminated; terminate it so the	\
	 * parser never reads uninitialised stack bytes */	\
	str[cnt] = '\0';					\
	value = simple_strtol(str, NULL, 0);			\
	if (value <= 0)						\
		return -ERANGE;					\
	sbi->s_mb_##name = value;				\
	return cnt;						\
}
2872
/*
 * Instantiate the ext4_mb_read_<name>()/ext4_mb_write_<name>() handler
 * pair for each sbi->s_mb_<name> tunable.
 */
MB_PROC_VALUE_READ(stats);
MB_PROC_VALUE_WRITE(stats);
MB_PROC_VALUE_READ(max_to_scan);
MB_PROC_VALUE_WRITE(max_to_scan);
MB_PROC_VALUE_READ(min_to_scan);
MB_PROC_VALUE_WRITE(min_to_scan);
MB_PROC_VALUE_READ(order2_reqs);
MB_PROC_VALUE_WRITE(order2_reqs);
MB_PROC_VALUE_READ(stream_request);
MB_PROC_VALUE_WRITE(stream_request);
MB_PROC_VALUE_READ(group_prealloc);
MB_PROC_VALUE_WRITE(group_prealloc);
2885
/*
 * MB_PROC_HANDLER(name, var) - create the proc file @name below
 * sbi->s_mb_proc and wire it to ext4_mb_read_<var>/ext4_mb_write_<var>.
 *
 * The expansion relies on the caller providing the locals "proc", "mode"
 * and "sbi" and a label "err_out", which is jumped to when the entry
 * cannot be created.
 */
#define MB_PROC_HANDLER(name, var)					\
do {									\
	proc = create_proc_entry(name, mode, sbi->s_mb_proc);		\
	if (proc == NULL) {						\
		printk(KERN_ERR "EXT4-fs: can't create %s\n", name);	\
		goto err_out;						\
	}								\
	proc->data = sbi;						\
	proc->read_proc  = ext4_mb_read_##var ;				\
	proc->write_proc = ext4_mb_write_##var;				\
} while (0)
2897
2898static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2899{
2900 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
2901 struct ext4_sb_info *sbi = EXT4_SB(sb);
2902 struct proc_dir_entry *proc;
2903 char devname[64];
2904
2905 snprintf(devname, sizeof(devname) - 1, "%s",
2906 bdevname(sb->s_bdev, devname));
2907 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
2908
2909 MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
2910 MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
2911 MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
2912 MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
2913 MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
2914 MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
2915
2916 return 0;
2917
2918err_out:
2919 printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
2920 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
2921 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
2922 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
2923 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
2924 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
2925 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2926 remove_proc_entry(devname, proc_root_ext4);
2927 sbi->s_mb_proc = NULL;
2928
2929 return -ENOMEM;
2930}
2931
2932static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
2933{
2934 struct ext4_sb_info *sbi = EXT4_SB(sb);
2935 char devname[64];
2936
2937 if (sbi->s_mb_proc == NULL)
2938 return -EINVAL;
2939
2940 snprintf(devname, sizeof(devname) - 1, "%s",
2941 bdevname(sb->s_bdev, devname));
2942 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
2943 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
2944 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
2945 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
2946 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
2947 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2948 remove_proc_entry(devname, proc_root_ext4);
2949
2950 return 0;
2951}
2952
2953int __init init_ext4_mballoc(void)
2954{
2955 ext4_pspace_cachep =
2956 kmem_cache_create("ext4_prealloc_space",
2957 sizeof(struct ext4_prealloc_space),
2958 0, SLAB_RECLAIM_ACCOUNT, NULL);
2959 if (ext4_pspace_cachep == NULL)
2960 return -ENOMEM;
2961
2962#ifdef CONFIG_PROC_FS
2963 proc_root_ext4 = proc_mkdir(EXT4_ROOT, proc_root_fs);
2964 if (proc_root_ext4 == NULL)
2965 printk(KERN_ERR "EXT4-fs: Unable to create %s\n", EXT4_ROOT);
2966#endif
2967
2968 return 0;
2969}
2970
2971void exit_ext4_mballoc(void)
2972{
2973 /* XXX: synchronize_rcu(); */
2974 kmem_cache_destroy(ext4_pspace_cachep);
2975#ifdef CONFIG_PROC_FS
2976 remove_proc_entry(EXT4_ROOT, proc_root_fs);
2977#endif
2978}
2979
2980
2981/*
2982 * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps
2983 * Returns 0 if success or error code
2984 */
2985static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2986 handle_t *handle)
2987{
2988 struct buffer_head *bitmap_bh = NULL;
2989 struct ext4_super_block *es;
2990 struct ext4_group_desc *gdp;
2991 struct buffer_head *gdp_bh;
2992 struct ext4_sb_info *sbi;
2993 struct super_block *sb;
2994 ext4_fsblk_t block;
2995 int err;
2996
2997 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
2998 BUG_ON(ac->ac_b_ex.fe_len <= 0);
2999
3000 sb = ac->ac_sb;
3001 sbi = EXT4_SB(sb);
3002 es = sbi->s_es;
3003
3004 ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
3005 gdp->bg_free_blocks_count);
3006
3007 err = -EIO;
3008 bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
3009 if (!bitmap_bh)
3010 goto out_err;
3011
3012 err = ext4_journal_get_write_access(handle, bitmap_bh);
3013 if (err)
3014 goto out_err;
3015
3016 err = -EIO;
3017 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
3018 if (!gdp)
3019 goto out_err;
3020
3021 err = ext4_journal_get_write_access(handle, gdp_bh);
3022 if (err)
3023 goto out_err;
3024
3025 block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb)
3026 + ac->ac_b_ex.fe_start
3027 + le32_to_cpu(es->s_first_data_block);
3028
3029 if (block == ext4_block_bitmap(sb, gdp) ||
3030 block == ext4_inode_bitmap(sb, gdp) ||
3031 in_range(block, ext4_inode_table(sb, gdp),
3032 EXT4_SB(sb)->s_itb_per_group)) {
3033
3034 ext4_error(sb, __FUNCTION__,
3035 "Allocating block in system zone - block = %llu",
3036 block);
3037 }
3038#ifdef AGGRESSIVE_CHECK
3039 {
3040 int i;
3041 for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
3042 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
3043 bitmap_bh->b_data));
3044 }
3045 }
3046#endif
3047 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
3048 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
3049
3050 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
3051 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
3052 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3053 gdp->bg_free_blocks_count =
3054 cpu_to_le16(ext4_free_blocks_after_init(sb,
3055 ac->ac_b_ex.fe_group,
3056 gdp));
3057 }
3058 gdp->bg_free_blocks_count =
3059 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
3060 - ac->ac_b_ex.fe_len);
3061 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
3062 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
3063 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
3064
3065 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
3066 if (err)
3067 goto out_err;
3068 err = ext4_journal_dirty_metadata(handle, gdp_bh);
3069
3070out_err:
3071 sb->s_dirt = 1;
3072 put_bh(bitmap_bh);
3073 return err;
3074}
3075
3076/*
3077 * here we normalize request for locality group
3078 * Group request are normalized to s_strip size if we set the same via mount
3079 * option. If not we set it to s_mb_group_prealloc which can be configured via
3080 * /proc/fs/ext4/<partition>/group_prealloc
3081 *
3082 * XXX: should we try to preallocate more than the group has now?
3083 */
3084static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3085{
3086 struct super_block *sb = ac->ac_sb;
3087 struct ext4_locality_group *lg = ac->ac_lg;
3088
3089 BUG_ON(lg == NULL);
3090 if (EXT4_SB(sb)->s_stripe)
3091 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
3092 else
3093 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3094 mb_debug("#%u: goal %lu blocks for locality group\n",
3095 current->pid, ac->ac_g_ex.fe_len);
3096}
3097
3098/*
3099 * Normalization means making request better in terms of
3100 * size and alignment
3101 */
3102static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3103 struct ext4_allocation_request *ar)
3104{
3105 int bsbits, max;
3106 ext4_lblk_t end;
3107 struct list_head *cur;
3108 loff_t size, orig_size, start_off;
3109 ext4_lblk_t start, orig_start;
3110 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3111
3112 /* do normalize only data requests, metadata requests
3113 do not need preallocation */
3114 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3115 return;
3116
3117 /* sometime caller may want exact blocks */
3118 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3119 return;
3120
3121 /* caller may indicate that preallocation isn't
3122 * required (it's a tail, for example) */
3123 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
3124 return;
3125
3126 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
3127 ext4_mb_normalize_group_request(ac);
3128 return ;
3129 }
3130
3131 bsbits = ac->ac_sb->s_blocksize_bits;
3132
3133 /* first, let's learn actual file size
3134 * given current request is allocated */
3135 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
3136 size = size << bsbits;
3137 if (size < i_size_read(ac->ac_inode))
3138 size = i_size_read(ac->ac_inode);
3139
3140 /* max available blocks in a free group */
3141 max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 -
3142 EXT4_SB(ac->ac_sb)->s_itb_per_group;
3143
3144#define NRL_CHECK_SIZE(req, size, max,bits) \
3145 (req <= (size) || max <= ((size) >> bits))
3146
3147 /* first, try to predict filesize */
3148 /* XXX: should this table be tunable? */
3149 start_off = 0;
3150 if (size <= 16 * 1024) {
3151 size = 16 * 1024;
3152 } else if (size <= 32 * 1024) {
3153 size = 32 * 1024;
3154 } else if (size <= 64 * 1024) {
3155 size = 64 * 1024;
3156 } else if (size <= 128 * 1024) {
3157 size = 128 * 1024;
3158 } else if (size <= 256 * 1024) {
3159 size = 256 * 1024;
3160 } else if (size <= 512 * 1024) {
3161 size = 512 * 1024;
3162 } else if (size <= 1024 * 1024) {
3163 size = 1024 * 1024;
3164 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) {
3165 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3166 (20 - bsbits)) << 20;
3167 size = 1024 * 1024;
3168 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) {
3169 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3170 (22 - bsbits)) << 22;
3171 size = 4 * 1024 * 1024;
3172 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
3173 (8<<20)>>bsbits, max, bsbits)) {
3174 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3175 (23 - bsbits)) << 23;
3176 size = 8 * 1024 * 1024;
3177 } else {
3178 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
3179 size = ac->ac_o_ex.fe_len << bsbits;
3180 }
3181 orig_size = size = size >> bsbits;
3182 orig_start = start = start_off >> bsbits;
3183
3184 /* don't cover already allocated blocks in selected range */
3185 if (ar->pleft && start <= ar->lleft) {
3186 size -= ar->lleft + 1 - start;
3187 start = ar->lleft + 1;
3188 }
3189 if (ar->pright && start + size - 1 >= ar->lright)
3190 size -= start + size - ar->lright;
3191
3192 end = start + size;
3193
3194 /* check we don't cross already preallocated blocks */
3195 rcu_read_lock();
3196 list_for_each_rcu(cur, &ei->i_prealloc_list) {
3197 struct ext4_prealloc_space *pa;
3198 unsigned long pa_end;
3199
3200 pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
3201
3202 if (pa->pa_deleted)
3203 continue;
3204 spin_lock(&pa->pa_lock);
3205 if (pa->pa_deleted) {
3206 spin_unlock(&pa->pa_lock);
3207 continue;
3208 }
3209
3210 pa_end = pa->pa_lstart + pa->pa_len;
3211
3212 /* PA must not overlap original request */
3213 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3214 ac->ac_o_ex.fe_logical < pa->pa_lstart));
3215
3216 /* skip PA normalized request doesn't overlap with */
3217 if (pa->pa_lstart >= end) {
3218 spin_unlock(&pa->pa_lock);
3219 continue;
3220 }
3221 if (pa_end <= start) {
3222 spin_unlock(&pa->pa_lock);
3223 continue;
3224 }
3225 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3226
3227 if (pa_end <= ac->ac_o_ex.fe_logical) {
3228 BUG_ON(pa_end < start);
3229 start = pa_end;
3230 }
3231
3232 if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3233 BUG_ON(pa->pa_lstart > end);
3234 end = pa->pa_lstart;
3235 }
3236 spin_unlock(&pa->pa_lock);
3237 }
3238 rcu_read_unlock();
3239 size = end - start;
3240
3241 /* XXX: extra loop to check we really don't overlap preallocations */
3242 rcu_read_lock();
3243 list_for_each_rcu(cur, &ei->i_prealloc_list) {
3244 struct ext4_prealloc_space *pa;
3245 unsigned long pa_end;
3246 pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
3247 spin_lock(&pa->pa_lock);
3248 if (pa->pa_deleted == 0) {
3249 pa_end = pa->pa_lstart + pa->pa_len;
3250 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3251 }
3252 spin_unlock(&pa->pa_lock);
3253 }
3254 rcu_read_unlock();
3255
3256 if (start + size <= ac->ac_o_ex.fe_logical &&
3257 start > ac->ac_o_ex.fe_logical) {
3258 printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n",
3259 (unsigned long) start, (unsigned long) size,
3260 (unsigned long) ac->ac_o_ex.fe_logical);
3261 }
3262 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3263 start > ac->ac_o_ex.fe_logical);
3264 BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3265
3266 /* now prepare goal request */
3267
3268 /* XXX: is it better to align blocks WRT to logical
3269 * placement or satisfy big request as is */
3270 ac->ac_g_ex.fe_logical = start;
3271 ac->ac_g_ex.fe_len = size;
3272
3273 /* define goal start in order to merge */
3274 if (ar->pright && (ar->lright == (start + size))) {
3275 /* merge to the right */
3276 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3277 &ac->ac_f_ex.fe_group,
3278 &ac->ac_f_ex.fe_start);
3279 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3280 }
3281 if (ar->pleft && (ar->lleft + 1 == start)) {
3282 /* merge to the left */
3283 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3284 &ac->ac_f_ex.fe_group,
3285 &ac->ac_f_ex.fe_start);
3286 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3287 }
3288
3289 mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
3290 (unsigned) orig_size, (unsigned) start);
3291}
3292
3293static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3294{
3295 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3296
3297 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3298 atomic_inc(&sbi->s_bal_reqs);
3299 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
3300 if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
3301 atomic_inc(&sbi->s_bal_success);
3302 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3303 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3304 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3305 atomic_inc(&sbi->s_bal_goals);
3306 if (ac->ac_found > sbi->s_mb_max_to_scan)
3307 atomic_inc(&sbi->s_bal_breaks);
3308 }
3309
3310 ext4_mb_store_history(ac);
3311}
3312
3313/*
3314 * use blocks preallocated to inode
3315 */
3316static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3317 struct ext4_prealloc_space *pa)
3318{
3319 ext4_fsblk_t start;
3320 ext4_fsblk_t end;
3321 int len;
3322
3323 /* found preallocated blocks, use them */
3324 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3325 end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len);
3326 len = end - start;
3327 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3328 &ac->ac_b_ex.fe_start);
3329 ac->ac_b_ex.fe_len = len;
3330 ac->ac_status = AC_STATUS_FOUND;
3331 ac->ac_pa = pa;
3332
3333 BUG_ON(start < pa->pa_pstart);
3334 BUG_ON(start + len > pa->pa_pstart + pa->pa_len);
3335 BUG_ON(pa->pa_free < len);
3336 pa->pa_free -= len;
3337
3338 mb_debug("use %llu/%lu from inode pa %p\n", start, len, pa);
3339}
3340
3341/*
3342 * use blocks preallocated to locality group
3343 */
3344static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3345 struct ext4_prealloc_space *pa)
3346{
3347 unsigned len = ac->ac_o_ex.fe_len;
3348
3349 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3350 &ac->ac_b_ex.fe_group,
3351 &ac->ac_b_ex.fe_start);
3352 ac->ac_b_ex.fe_len = len;
3353 ac->ac_status = AC_STATUS_FOUND;
3354 ac->ac_pa = pa;
3355
3356 /* we don't correct pa_pstart or pa_plen here to avoid
3357 * possible race when tte group is being loaded concurrently
3358 * instead we correct pa later, after blocks are marked
3359 * in on-disk bitmap -- see ext4_mb_release_context() */
3360 /*
3361 * FIXME!! but the other CPUs can look at this particular
3362 * pa and think that it have enought free blocks if we
3363 * don't update pa_free here right ?
3364 */
3365 mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3366}
3367
3368/*
3369 * search goal blocks in preallocated space
3370 */
3371static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3372{
3373 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3374 struct ext4_locality_group *lg;
3375 struct ext4_prealloc_space *pa;
3376 struct list_head *cur;
3377
3378 /* only data can be preallocated */
3379 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3380 return 0;
3381
3382 /* first, try per-file preallocation */
3383 rcu_read_lock();
3384 list_for_each_rcu(cur, &ei->i_prealloc_list) {
3385 pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
3386
3387 /* all fields in this condition don't change,
3388 * so we can skip locking for them */
3389 if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3390 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
3391 continue;
3392
3393 /* found preallocated blocks, use them */
3394 spin_lock(&pa->pa_lock);
3395 if (pa->pa_deleted == 0 && pa->pa_free) {
3396 atomic_inc(&pa->pa_count);
3397 ext4_mb_use_inode_pa(ac, pa);
3398 spin_unlock(&pa->pa_lock);
3399 ac->ac_criteria = 10;
3400 rcu_read_unlock();
3401 return 1;
3402 }
3403 spin_unlock(&pa->pa_lock);
3404 }
3405 rcu_read_unlock();
3406
3407 /* can we use group allocation? */
3408 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3409 return 0;
3410
3411 /* inode may have no locality group for some reason */
3412 lg = ac->ac_lg;
3413 if (lg == NULL)
3414 return 0;
3415
3416 rcu_read_lock();
3417 list_for_each_rcu(cur, &lg->lg_prealloc_list) {
3418 pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
3419 spin_lock(&pa->pa_lock);
3420 if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
3421 atomic_inc(&pa->pa_count);
3422 ext4_mb_use_group_pa(ac, pa);
3423 spin_unlock(&pa->pa_lock);
3424 ac->ac_criteria = 20;
3425 rcu_read_unlock();
3426 return 1;
3427 }
3428 spin_unlock(&pa->pa_lock);
3429 }
3430 rcu_read_unlock();
3431
3432 return 0;
3433}
3434
3435/*
3436 * the function goes through all preallocation in this group and marks them
3437 * used in in-core bitmap. buddy must be generated from this bitmap
3438 * Need to be called with ext4 group lock (ext4_lock_group)
3439 */
3440static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3441 ext4_group_t group)
3442{
3443 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3444 struct ext4_prealloc_space *pa;
3445 struct list_head *cur;
3446 ext4_group_t groupnr;
3447 ext4_grpblk_t start;
3448 int preallocated = 0;
3449 int count = 0;
3450 int len;
3451
3452 /* all form of preallocation discards first load group,
3453 * so the only competing code is preallocation use.
3454 * we don't need any locking here
3455 * notice we do NOT ignore preallocations with pa_deleted
3456 * otherwise we could leave used blocks available for
3457 * allocation in buddy when concurrent ext4_mb_put_pa()
3458 * is dropping preallocation
3459 */
3460 list_for_each(cur, &grp->bb_prealloc_list) {
3461 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3462 spin_lock(&pa->pa_lock);
3463 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3464 &groupnr, &start);
3465 len = pa->pa_len;
3466 spin_unlock(&pa->pa_lock);
3467 if (unlikely(len == 0))
3468 continue;
3469 BUG_ON(groupnr != group);
3470 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
3471 bitmap, start, len);
3472 preallocated += len;
3473 count++;
3474 }
3475 mb_debug("prellocated %u for group %lu\n", preallocated, group);
3476}
3477
3478static void ext4_mb_pa_callback(struct rcu_head *head)
3479{
3480 struct ext4_prealloc_space *pa;
3481 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
3482 kmem_cache_free(ext4_pspace_cachep, pa);
3483}
3484
3485/*
3486 * drops a reference to preallocated space descriptor
3487 * if this was the last reference and the space is consumed
3488 */
3489static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3490 struct super_block *sb, struct ext4_prealloc_space *pa)
3491{
3492 unsigned long grp;
3493
3494 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
3495 return;
3496
3497 /* in this short window concurrent discard can set pa_deleted */
3498 spin_lock(&pa->pa_lock);
3499 if (pa->pa_deleted == 1) {
3500 spin_unlock(&pa->pa_lock);
3501 return;
3502 }
3503
3504 pa->pa_deleted = 1;
3505 spin_unlock(&pa->pa_lock);
3506
3507 /* -1 is to protect from crossing allocation group */
3508 ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL);
3509
3510 /*
3511 * possible race:
3512 *
3513 * P1 (buddy init) P2 (regular allocation)
3514 * find block B in PA
3515 * copy on-disk bitmap to buddy
3516 * mark B in on-disk bitmap
3517 * drop PA from group
3518 * mark all PAs in buddy
3519 *
3520 * thus, P1 initializes buddy with B available. to prevent this
3521 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
3522 * against that pair
3523 */
3524 ext4_lock_group(sb, grp);
3525 list_del(&pa->pa_group_list);
3526 ext4_unlock_group(sb, grp);
3527
3528 spin_lock(pa->pa_obj_lock);
3529 list_del_rcu(&pa->pa_inode_list);
3530 spin_unlock(pa->pa_obj_lock);
3531
3532 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3533}
3534
3535/*
3536 * creates new preallocated space for given inode
3537 */
3538static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3539{
3540 struct super_block *sb = ac->ac_sb;
3541 struct ext4_prealloc_space *pa;
3542 struct ext4_group_info *grp;
3543 struct ext4_inode_info *ei;
3544
3545 /* preallocate only when found space is larger then requested */
3546 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3547 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3548 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3549
3550 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3551 if (pa == NULL)
3552 return -ENOMEM;
3553
3554 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3555 int winl;
3556 int wins;
3557 int win;
3558 int offs;
3559
3560 /* we can't allocate as much as normalizer wants.
3561 * so, found space must get proper lstart
3562 * to cover original request */
3563 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
3564 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
3565
3566 /* we're limited by original request in that
3567 * logical block must be covered any way
3568 * winl is window we can move our chunk within */
3569 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3570
3571 /* also, we should cover whole original request */
3572 wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
3573
3574 /* the smallest one defines real window */
3575 win = min(winl, wins);
3576
3577 offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
3578 if (offs && offs < win)
3579 win = offs;
3580
3581 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
3582 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3583 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3584 }
3585
3586 /* preallocation can change ac_b_ex, thus we store actually
3587 * allocated blocks for history */
3588 ac->ac_f_ex = ac->ac_b_ex;
3589
3590 pa->pa_lstart = ac->ac_b_ex.fe_logical;
3591 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3592 pa->pa_len = ac->ac_b_ex.fe_len;
3593 pa->pa_free = pa->pa_len;
3594 atomic_set(&pa->pa_count, 1);
3595 spin_lock_init(&pa->pa_lock);
3596 pa->pa_deleted = 0;
3597 pa->pa_linear = 0;
3598
3599 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3600 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3601
3602 ext4_mb_use_inode_pa(ac, pa);
3603 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3604
3605 ei = EXT4_I(ac->ac_inode);
3606 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3607
3608 pa->pa_obj_lock = &ei->i_prealloc_lock;
3609 pa->pa_inode = ac->ac_inode;
3610
3611 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3612 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3613 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3614
3615 spin_lock(pa->pa_obj_lock);
3616 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
3617 spin_unlock(pa->pa_obj_lock);
3618
3619 return 0;
3620}
3621
3622/*
3623 * creates new preallocated space for locality group inodes belongs to
3624 */
3625static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3626{
3627 struct super_block *sb = ac->ac_sb;
3628 struct ext4_locality_group *lg;
3629 struct ext4_prealloc_space *pa;
3630 struct ext4_group_info *grp;
3631
3632 /* preallocate only when found space is larger then requested */
3633 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3634 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3635 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3636
3637 BUG_ON(ext4_pspace_cachep == NULL);
3638 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3639 if (pa == NULL)
3640 return -ENOMEM;
3641
3642 /* preallocation can change ac_b_ex, thus we store actually
3643 * allocated blocks for history */
3644 ac->ac_f_ex = ac->ac_b_ex;
3645
3646 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3647 pa->pa_lstart = pa->pa_pstart;
3648 pa->pa_len = ac->ac_b_ex.fe_len;
3649 pa->pa_free = pa->pa_len;
3650 atomic_set(&pa->pa_count, 1);
3651 spin_lock_init(&pa->pa_lock);
3652 pa->pa_deleted = 0;
3653 pa->pa_linear = 1;
3654
3655 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3656 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3657
3658 ext4_mb_use_group_pa(ac, pa);
3659 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3660
3661 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3662 lg = ac->ac_lg;
3663 BUG_ON(lg == NULL);
3664
3665 pa->pa_obj_lock = &lg->lg_prealloc_lock;
3666 pa->pa_inode = NULL;
3667
3668 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3669 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3670 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3671
3672 spin_lock(pa->pa_obj_lock);
3673 list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
3674 spin_unlock(pa->pa_obj_lock);
3675
3676 return 0;
3677}
3678
3679static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3680{
3681 int err;
3682
3683 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3684 err = ext4_mb_new_group_pa(ac);
3685 else
3686 err = ext4_mb_new_inode_pa(ac);
3687 return err;
3688}
3689
3690/*
3691 * finds all unused blocks in on-disk bitmap, frees them in
3692 * in-core bitmap and buddy.
3693 * @pa must be unlinked from inode and group lists, so that
3694 * nobody else can find/use it.
3695 * the caller MUST hold group/inode locks.
3696 * TODO: optimize the case when there are no in-core structures yet
3697 */
3698static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
3699 struct buffer_head *bitmap_bh,
3700 struct ext4_prealloc_space *pa)
3701{
3702 struct ext4_allocation_context ac;
3703 struct super_block *sb = e4b->bd_sb;
3704 struct ext4_sb_info *sbi = EXT4_SB(sb);
3705 unsigned long end;
3706 unsigned long next;
3707 ext4_group_t group;
3708 ext4_grpblk_t bit;
3709 sector_t start;
3710 int err = 0;
3711 int free = 0;
3712
3713 BUG_ON(pa->pa_deleted == 0);
3714 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3715 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3716 end = bit + pa->pa_len;
3717
3718 ac.ac_sb = sb;
3719 ac.ac_inode = pa->pa_inode;
3720 ac.ac_op = EXT4_MB_HISTORY_DISCARD;
3721
3722 while (bit < end) {
3723 bit = ext4_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3724 if (bit >= end)
3725 break;
3726 next = ext4_find_next_bit(bitmap_bh->b_data, end, bit);
3727 if (next > end)
3728 next = end;
3729 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
3730 le32_to_cpu(sbi->s_es->s_first_data_block);
3731 mb_debug(" free preallocated %u/%u in group %u\n",
3732 (unsigned) start, (unsigned) next - bit,
3733 (unsigned) group);
3734 free += next - bit;
3735
3736 ac.ac_b_ex.fe_group = group;
3737 ac.ac_b_ex.fe_start = bit;
3738 ac.ac_b_ex.fe_len = next - bit;
3739 ac.ac_b_ex.fe_logical = 0;
3740 ext4_mb_store_history(&ac);
3741
3742 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3743 bit = next + 1;
3744 }
3745 if (free != pa->pa_free) {
3746 printk(KERN_ERR "pa %p: logic %lu, phys. %lu, len %lu\n",
3747 pa, (unsigned long) pa->pa_lstart,
3748 (unsigned long) pa->pa_pstart,
3749 (unsigned long) pa->pa_len);
3750 printk(KERN_ERR "free %u, pa_free %u\n", free, pa->pa_free);
3751 }
3752 BUG_ON(free != pa->pa_free);
3753 atomic_add(free, &sbi->s_mb_discarded);
3754
3755 return err;
3756}
3757
3758static int ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3759 struct ext4_prealloc_space *pa)
3760{
3761 struct ext4_allocation_context ac;
3762 struct super_block *sb = e4b->bd_sb;
3763 ext4_group_t group;
3764 ext4_grpblk_t bit;
3765
3766 ac.ac_op = EXT4_MB_HISTORY_DISCARD;
3767
3768 BUG_ON(pa->pa_deleted == 0);
3769 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3770 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3771 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3772 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3773
3774 ac.ac_sb = sb;
3775 ac.ac_inode = NULL;
3776 ac.ac_b_ex.fe_group = group;
3777 ac.ac_b_ex.fe_start = bit;
3778 ac.ac_b_ex.fe_len = pa->pa_len;
3779 ac.ac_b_ex.fe_logical = 0;
3780 ext4_mb_store_history(&ac);
3781
3782 return 0;
3783}
3784
3785/*
3786 * releases all preallocations in given group
3787 *
3788 * first, we need to decide discard policy:
3789 * - when do we discard
3790 * 1) ENOSPC
3791 * - how many do we discard
3792 * 1) how many requested
3793 */
3794static int ext4_mb_discard_group_preallocations(struct super_block *sb,
3795 ext4_group_t group, int needed)
3796{
3797 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3798 struct buffer_head *bitmap_bh = NULL;
3799 struct ext4_prealloc_space *pa, *tmp;
3800 struct list_head list;
3801 struct ext4_buddy e4b;
3802 int err;
3803 int busy = 0;
3804 int free = 0;
3805
3806 mb_debug("discard preallocation for group %lu\n", group);
3807
3808 if (list_empty(&grp->bb_prealloc_list))
3809 return 0;
3810
3811 bitmap_bh = read_block_bitmap(sb, group);
3812 if (bitmap_bh == NULL) {
3813 /* error handling here */
3814 ext4_mb_release_desc(&e4b);
3815 BUG_ON(bitmap_bh == NULL);
3816 }
3817
3818 err = ext4_mb_load_buddy(sb, group, &e4b);
3819 BUG_ON(err != 0); /* error handling here */
3820
3821 if (needed == 0)
3822 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
3823
3824 grp = ext4_get_group_info(sb, group);
3825 INIT_LIST_HEAD(&list);
3826
3827repeat:
3828 ext4_lock_group(sb, group);
3829 list_for_each_entry_safe(pa, tmp,
3830 &grp->bb_prealloc_list, pa_group_list) {
3831 spin_lock(&pa->pa_lock);
3832 if (atomic_read(&pa->pa_count)) {
3833 spin_unlock(&pa->pa_lock);
3834 busy = 1;
3835 continue;
3836 }
3837 if (pa->pa_deleted) {
3838 spin_unlock(&pa->pa_lock);
3839 continue;
3840 }
3841
3842 /* seems this one can be freed ... */
3843 pa->pa_deleted = 1;
3844
3845 /* we can trust pa_free ... */
3846 free += pa->pa_free;
3847
3848 spin_unlock(&pa->pa_lock);
3849
3850 list_del(&pa->pa_group_list);
3851 list_add(&pa->u.pa_tmp_list, &list);
3852 }
3853
3854 /* if we still need more blocks and some PAs were used, try again */
3855 if (free < needed && busy) {
3856 busy = 0;
3857 ext4_unlock_group(sb, group);
3858 /*
3859 * Yield the CPU here so that we don't get soft lockup
3860 * in non preempt case.
3861 */
3862 yield();
3863 goto repeat;
3864 }
3865
3866 /* found anything to free? */
3867 if (list_empty(&list)) {
3868 BUG_ON(free != 0);
3869 goto out;
3870 }
3871
3872 /* now free all selected PAs */
3873 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3874
3875 /* remove from object (inode or locality group) */
3876 spin_lock(pa->pa_obj_lock);
3877 list_del_rcu(&pa->pa_inode_list);
3878 spin_unlock(pa->pa_obj_lock);
3879
3880 if (pa->pa_linear)
3881 ext4_mb_release_group_pa(&e4b, pa);
3882 else
3883 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3884
3885 list_del(&pa->u.pa_tmp_list);
3886 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3887 }
3888
3889out:
3890 ext4_unlock_group(sb, group);
3891 ext4_mb_release_desc(&e4b);
3892 put_bh(bitmap_bh);
3893 return free;
3894}
3895
3896/*
3897 * releases all non-used preallocated blocks for given inode
3898 *
3899 * It's important to discard preallocations under i_data_sem
3900 * We don't want another block to be served from the prealloc
3901 * space when we are discarding the inode prealloc space.
3902 *
3903 * FIXME!! Make sure it is valid at all the call sites
3904 */
3905void ext4_mb_discard_inode_preallocations(struct inode *inode)
3906{
3907 struct ext4_inode_info *ei = EXT4_I(inode);
3908 struct super_block *sb = inode->i_sb;
3909 struct buffer_head *bitmap_bh = NULL;
3910 struct ext4_prealloc_space *pa, *tmp;
3911 ext4_group_t group = 0;
3912 struct list_head list;
3913 struct ext4_buddy e4b;
3914 int err;
3915
3916 if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) {
3917 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3918 return;
3919 }
3920
3921 mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
3922
3923 INIT_LIST_HEAD(&list);
3924
3925repeat:
3926 /* first, collect all pa's in the inode */
3927 spin_lock(&ei->i_prealloc_lock);
3928 while (!list_empty(&ei->i_prealloc_list)) {
3929 pa = list_entry(ei->i_prealloc_list.next,
3930 struct ext4_prealloc_space, pa_inode_list);
3931 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
3932 spin_lock(&pa->pa_lock);
3933 if (atomic_read(&pa->pa_count)) {
3934 /* this shouldn't happen often - nobody should
3935 * use preallocation while we're discarding it */
3936 spin_unlock(&pa->pa_lock);
3937 spin_unlock(&ei->i_prealloc_lock);
3938 printk(KERN_ERR "uh-oh! used pa while discarding\n");
3939 WARN_ON(1);
3940 schedule_timeout_uninterruptible(HZ);
3941 goto repeat;
3942
3943 }
3944 if (pa->pa_deleted == 0) {
3945 pa->pa_deleted = 1;
3946 spin_unlock(&pa->pa_lock);
3947 list_del_rcu(&pa->pa_inode_list);
3948 list_add(&pa->u.pa_tmp_list, &list);
3949 continue;
3950 }
3951
3952 /* someone is deleting pa right now */
3953 spin_unlock(&pa->pa_lock);
3954 spin_unlock(&ei->i_prealloc_lock);
3955
3956 /* we have to wait here because pa_deleted
3957 * doesn't mean pa is already unlinked from
3958 * the list. as we might be called from
3959 * ->clear_inode() the inode will get freed
3960 * and concurrent thread which is unlinking
3961 * pa from inode's list may access already
3962 * freed memory, bad-bad-bad */
3963
3964 /* XXX: if this happens too often, we can
3965 * add a flag to force wait only in case
3966 * of ->clear_inode(), but not in case of
3967 * regular truncate */
3968 schedule_timeout_uninterruptible(HZ);
3969 goto repeat;
3970 }
3971 spin_unlock(&ei->i_prealloc_lock);
3972
3973 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3974 BUG_ON(pa->pa_linear != 0);
3975 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
3976
3977 err = ext4_mb_load_buddy(sb, group, &e4b);
3978 BUG_ON(err != 0); /* error handling here */
3979
3980 bitmap_bh = read_block_bitmap(sb, group);
3981 if (bitmap_bh == NULL) {
3982 /* error handling here */
3983 ext4_mb_release_desc(&e4b);
3984 BUG_ON(bitmap_bh == NULL);
3985 }
3986
3987 ext4_lock_group(sb, group);
3988 list_del(&pa->pa_group_list);
3989 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3990 ext4_unlock_group(sb, group);
3991
3992 ext4_mb_release_desc(&e4b);
3993 put_bh(bitmap_bh);
3994
3995 list_del(&pa->u.pa_tmp_list);
3996 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3997 }
3998}
3999
4000/*
4001 * finds all preallocated spaces and return blocks being freed to them
4002 * if preallocated space becomes full (no block is used from the space)
4003 * then the function frees space in buddy
4004 * XXX: at the moment, truncate (which is the only way to free blocks)
4005 * discards all preallocations
4006 */
4007static void ext4_mb_return_to_preallocation(struct inode *inode,
4008 struct ext4_buddy *e4b,
4009 sector_t block, int count)
4010{
4011 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
4012}
4013#ifdef MB_DEBUG
4014static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4015{
4016 struct super_block *sb = ac->ac_sb;
4017 ext4_group_t i;
4018
4019 printk(KERN_ERR "EXT4-fs: Can't allocate:"
4020 " Allocation context details:\n");
4021 printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
4022 ac->ac_status, ac->ac_flags);
4023 printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, "
4024 "best %lu/%lu/%lu@%lu cr %d\n",
4025 (unsigned long)ac->ac_o_ex.fe_group,
4026 (unsigned long)ac->ac_o_ex.fe_start,
4027 (unsigned long)ac->ac_o_ex.fe_len,
4028 (unsigned long)ac->ac_o_ex.fe_logical,
4029 (unsigned long)ac->ac_g_ex.fe_group,
4030 (unsigned long)ac->ac_g_ex.fe_start,
4031 (unsigned long)ac->ac_g_ex.fe_len,
4032 (unsigned long)ac->ac_g_ex.fe_logical,
4033 (unsigned long)ac->ac_b_ex.fe_group,
4034 (unsigned long)ac->ac_b_ex.fe_start,
4035 (unsigned long)ac->ac_b_ex.fe_len,
4036 (unsigned long)ac->ac_b_ex.fe_logical,
4037 (int)ac->ac_criteria);
4038 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
4039 ac->ac_found);
4040 printk(KERN_ERR "EXT4-fs: groups: \n");
4041 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
4042 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4043 struct ext4_prealloc_space *pa;
4044 ext4_grpblk_t start;
4045 struct list_head *cur;
4046 ext4_lock_group(sb, i);
4047 list_for_each(cur, &grp->bb_prealloc_list) {
4048 pa = list_entry(cur, struct ext4_prealloc_space,
4049 pa_group_list);
4050 spin_lock(&pa->pa_lock);
4051 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4052 NULL, &start);
4053 spin_unlock(&pa->pa_lock);
4054 printk(KERN_ERR "PA:%lu:%d:%u \n", i,
4055 start, pa->pa_len);
4056 }
4057 ext4_lock_group(sb, i);
4058
4059 if (grp->bb_free == 0)
4060 continue;
4061 printk(KERN_ERR "%lu: %d/%d \n",
4062 i, grp->bb_free, grp->bb_fragments);
4063 }
4064 printk(KERN_ERR "\n");
4065}
4066#else
/* Without MB_DEBUG the allocation-context dump compiles to nothing. */
static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
}
4071#endif
4072
4073/*
4074 * We use locality group preallocation for small size file. The size of the
4075 * file is determined by the current size or the resulting size after
4076 * allocation which ever is larger
4077 *
4078 * One can tune this size via /proc/fs/ext4/<partition>/stream_req
4079 */
4080static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4081{
4082 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4083 int bsbits = ac->ac_sb->s_blocksize_bits;
4084 loff_t size, isize;
4085
4086 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4087 return;
4088
4089 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4090 isize = i_size_read(ac->ac_inode) >> bsbits;
4091 size = max(size, isize);
4092
4093 /* don't use group allocation for large files */
4094 if (size >= sbi->s_mb_stream_request)
4095 return;
4096
4097 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4098 return;
4099
4100 BUG_ON(ac->ac_lg != NULL);
4101 /*
4102 * locality group prealloc space are per cpu. The reason for having
4103 * per cpu locality group is to reduce the contention between block
4104 * request from multiple CPUs.
4105 */
4106 ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
4107 put_cpu();
4108
4109 /* we're going to use group allocation */
4110 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
4111
4112 /* serialize all allocations in the group */
4113 mutex_lock(&ac->ac_lg->lg_mutex);
4114}
4115
4116static int ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4117 struct ext4_allocation_request *ar)
4118{
4119 struct super_block *sb = ar->inode->i_sb;
4120 struct ext4_sb_info *sbi = EXT4_SB(sb);
4121 struct ext4_super_block *es = sbi->s_es;
4122 ext4_group_t group;
4123 unsigned long len;
4124 unsigned long goal;
4125 ext4_grpblk_t block;
4126
4127 /* we can't allocate > group size */
4128 len = ar->len;
4129
4130 /* just a dirty hack to filter too big requests */
4131 if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
4132 len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
4133
4134 /* start searching from the goal */
4135 goal = ar->goal;
4136 if (goal < le32_to_cpu(es->s_first_data_block) ||
4137 goal >= ext4_blocks_count(es))
4138 goal = le32_to_cpu(es->s_first_data_block);
4139 ext4_get_group_no_and_offset(sb, goal, &group, &block);
4140
4141 /* set up allocation goals */
4142 ac->ac_b_ex.fe_logical = ar->logical;
4143 ac->ac_b_ex.fe_group = 0;
4144 ac->ac_b_ex.fe_start = 0;
4145 ac->ac_b_ex.fe_len = 0;
4146 ac->ac_status = AC_STATUS_CONTINUE;
4147 ac->ac_groups_scanned = 0;
4148 ac->ac_ex_scanned = 0;
4149 ac->ac_found = 0;
4150 ac->ac_sb = sb;
4151 ac->ac_inode = ar->inode;
4152 ac->ac_o_ex.fe_logical = ar->logical;
4153 ac->ac_o_ex.fe_group = group;
4154 ac->ac_o_ex.fe_start = block;
4155 ac->ac_o_ex.fe_len = len;
4156 ac->ac_g_ex.fe_logical = ar->logical;
4157 ac->ac_g_ex.fe_group = group;
4158 ac->ac_g_ex.fe_start = block;
4159 ac->ac_g_ex.fe_len = len;
4160 ac->ac_f_ex.fe_len = 0;
4161 ac->ac_flags = ar->flags;
4162 ac->ac_2order = 0;
4163 ac->ac_criteria = 0;
4164 ac->ac_pa = NULL;
4165 ac->ac_bitmap_page = NULL;
4166 ac->ac_buddy_page = NULL;
4167 ac->ac_lg = NULL;
4168
4169 /* we have to define context: we'll we work with a file or
4170 * locality group. this is a policy, actually */
4171 ext4_mb_group_or_file(ac);
4172
4173 mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4174 "left: %u/%u, right %u/%u to %swritable\n",
4175 (unsigned) ar->len, (unsigned) ar->logical,
4176 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4177 (unsigned) ar->lleft, (unsigned) ar->pleft,
4178 (unsigned) ar->lright, (unsigned) ar->pright,
4179 atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4180 return 0;
4181
4182}
4183
4184/*
4185 * release all resource we used in allocation
4186 */
4187static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4188{
4189 if (ac->ac_pa) {
4190 if (ac->ac_pa->pa_linear) {
4191 /* see comment in ext4_mb_use_group_pa() */
4192 spin_lock(&ac->ac_pa->pa_lock);
4193 ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
4194 ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
4195 ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
4196 ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
4197 spin_unlock(&ac->ac_pa->pa_lock);
4198 }
4199 ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
4200 }
4201 if (ac->ac_bitmap_page)
4202 page_cache_release(ac->ac_bitmap_page);
4203 if (ac->ac_buddy_page)
4204 page_cache_release(ac->ac_buddy_page);
4205 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4206 mutex_unlock(&ac->ac_lg->lg_mutex);
4207 ext4_mb_collect_stats(ac);
4208 return 0;
4209}
4210
4211static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4212{
4213 ext4_group_t i;
4214 int ret;
4215 int freed = 0;
4216
4217 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
4218 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4219 freed += ret;
4220 needed -= ret;
4221 }
4222
4223 return freed;
4224}
4225
4226/*
4227 * Main entry point into mballoc to allocate blocks
4228 * it tries to use preallocation first, then falls back
4229 * to usual allocation
4230 */
4231ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4232 struct ext4_allocation_request *ar, int *errp)
4233{
4234 struct ext4_allocation_context ac;
4235 struct ext4_sb_info *sbi;
4236 struct super_block *sb;
4237 ext4_fsblk_t block = 0;
4238 int freed;
4239 int inquota;
4240
4241 sb = ar->inode->i_sb;
4242 sbi = EXT4_SB(sb);
4243
4244 if (!test_opt(sb, MBALLOC)) {
4245 block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
4246 &(ar->len), errp);
4247 return block;
4248 }
4249
4250 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
4251 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4252 ar->len--;
4253 }
4254 if (ar->len == 0) {
4255 *errp = -EDQUOT;
4256 return 0;
4257 }
4258 inquota = ar->len;
4259
4260 ext4_mb_poll_new_transaction(sb, handle);
4261
4262 *errp = ext4_mb_initialize_context(&ac, ar);
4263 if (*errp) {
4264 ar->len = 0;
4265 goto out;
4266 }
4267
4268 ac.ac_op = EXT4_MB_HISTORY_PREALLOC;
4269 if (!ext4_mb_use_preallocated(&ac)) {
4270
4271 ac.ac_op = EXT4_MB_HISTORY_ALLOC;
4272 ext4_mb_normalize_request(&ac, ar);
4273
4274repeat:
4275 /* allocate space in core */
4276 ext4_mb_regular_allocator(&ac);
4277
4278 /* as we've just preallocated more space than
4279 * user requested orinally, we store allocated
4280 * space in a special descriptor */
4281 if (ac.ac_status == AC_STATUS_FOUND &&
4282 ac.ac_o_ex.fe_len < ac.ac_b_ex.fe_len)
4283 ext4_mb_new_preallocation(&ac);
4284 }
4285
4286 if (likely(ac.ac_status == AC_STATUS_FOUND)) {
4287 ext4_mb_mark_diskspace_used(&ac, handle);
4288 *errp = 0;
4289 block = ext4_grp_offs_to_block(sb, &ac.ac_b_ex);
4290 ar->len = ac.ac_b_ex.fe_len;
4291 } else {
4292 freed = ext4_mb_discard_preallocations(sb, ac.ac_o_ex.fe_len);
4293 if (freed)
4294 goto repeat;
4295 *errp = -ENOSPC;
4296 ac.ac_b_ex.fe_len = 0;
4297 ar->len = 0;
4298 ext4_mb_show_ac(&ac);
4299 }
4300
4301 ext4_mb_release_context(&ac);
4302
4303out:
4304 if (ar->len < inquota)
4305 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
4306
4307 return block;
4308}
4309static void ext4_mb_poll_new_transaction(struct super_block *sb,
4310 handle_t *handle)
4311{
4312 struct ext4_sb_info *sbi = EXT4_SB(sb);
4313
4314 if (sbi->s_last_transaction == handle->h_transaction->t_tid)
4315 return;
4316
4317 /* new transaction! time to close last one and free blocks for
4318 * committed transaction. we know that only transaction can be
4319 * active, so previos transaction can be being logged and we
4320 * know that transaction before previous is known to be already
4321 * logged. this means that now we may free blocks freed in all
4322 * transactions before previous one. hope I'm clear enough ... */
4323
4324 spin_lock(&sbi->s_md_lock);
4325 if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
4326 mb_debug("new transaction %lu, old %lu\n",
4327 (unsigned long) handle->h_transaction->t_tid,
4328 (unsigned long) sbi->s_last_transaction);
4329 list_splice_init(&sbi->s_closed_transaction,
4330 &sbi->s_committed_transaction);
4331 list_splice_init(&sbi->s_active_transaction,
4332 &sbi->s_closed_transaction);
4333 sbi->s_last_transaction = handle->h_transaction->t_tid;
4334 }
4335 spin_unlock(&sbi->s_md_lock);
4336
4337 ext4_mb_free_committed_blocks(sb);
4338}
4339
4340static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4341 ext4_group_t group, ext4_grpblk_t block, int count)
4342{
4343 struct ext4_group_info *db = e4b->bd_info;
4344 struct super_block *sb = e4b->bd_sb;
4345 struct ext4_sb_info *sbi = EXT4_SB(sb);
4346 struct ext4_free_metadata *md;
4347 int i;
4348
4349 BUG_ON(e4b->bd_bitmap_page == NULL);
4350 BUG_ON(e4b->bd_buddy_page == NULL);
4351
4352 ext4_lock_group(sb, group);
4353 for (i = 0; i < count; i++) {
4354 md = db->bb_md_cur;
4355 if (md && db->bb_tid != handle->h_transaction->t_tid) {
4356 db->bb_md_cur = NULL;
4357 md = NULL;
4358 }
4359
4360 if (md == NULL) {
4361 ext4_unlock_group(sb, group);
4362 md = kmalloc(sizeof(*md), GFP_NOFS);
4363 if (md == NULL)
4364 return -ENOMEM;
4365 md->num = 0;
4366 md->group = group;
4367
4368 ext4_lock_group(sb, group);
4369 if (db->bb_md_cur == NULL) {
4370 spin_lock(&sbi->s_md_lock);
4371 list_add(&md->list, &sbi->s_active_transaction);
4372 spin_unlock(&sbi->s_md_lock);
4373 /* protect buddy cache from being freed,
4374 * otherwise we'll refresh it from
4375 * on-disk bitmap and lose not-yet-available
4376 * blocks */
4377 page_cache_get(e4b->bd_buddy_page);
4378 page_cache_get(e4b->bd_bitmap_page);
4379 db->bb_md_cur = md;
4380 db->bb_tid = handle->h_transaction->t_tid;
4381 mb_debug("new md 0x%p for group %lu\n",
4382 md, md->group);
4383 } else {
4384 kfree(md);
4385 md = db->bb_md_cur;
4386 }
4387 }
4388
4389 BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
4390 md->blocks[md->num] = block + i;
4391 md->num++;
4392 if (md->num == EXT4_BB_MAX_BLOCKS) {
4393 /* no more space, put full container on a sb's list */
4394 db->bb_md_cur = NULL;
4395 }
4396 }
4397 ext4_unlock_group(sb, group);
4398 return 0;
4399}
4400
4401/*
4402 * Main entry point into mballoc to free blocks
4403 */
4404void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4405 unsigned long block, unsigned long count,
4406 int metadata, unsigned long *freed)
4407{
4408 struct buffer_head *bitmap_bh = 0;
4409 struct super_block *sb = inode->i_sb;
4410 struct ext4_allocation_context ac;
4411 struct ext4_group_desc *gdp;
4412 struct ext4_super_block *es;
4413 unsigned long overflow;
4414 ext4_grpblk_t bit;
4415 struct buffer_head *gd_bh;
4416 ext4_group_t block_group;
4417 struct ext4_sb_info *sbi;
4418 struct ext4_buddy e4b;
4419 int err = 0;
4420 int ret;
4421
4422 *freed = 0;
4423
4424 ext4_mb_poll_new_transaction(sb, handle);
4425
4426 sbi = EXT4_SB(sb);
4427 es = EXT4_SB(sb)->s_es;
4428 if (block < le32_to_cpu(es->s_first_data_block) ||
4429 block + count < block ||
4430 block + count > ext4_blocks_count(es)) {
4431 ext4_error(sb, __FUNCTION__,
4432 "Freeing blocks not in datazone - "
4433 "block = %lu, count = %lu", block, count);
4434 goto error_return;
4435 }
4436
4437 ext4_debug("freeing block %lu\n", block);
4438
4439 ac.ac_op = EXT4_MB_HISTORY_FREE;
4440 ac.ac_inode = inode;
4441 ac.ac_sb = sb;
4442
4443do_more:
4444 overflow = 0;
4445 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4446
4447 /*
4448 * Check to see if we are freeing blocks across a group
4449 * boundary.
4450 */
4451 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4452 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
4453 count -= overflow;
4454 }
4455 bitmap_bh = read_block_bitmap(sb, block_group);
4456 if (!bitmap_bh)
4457 goto error_return;
4458 gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
4459 if (!gdp)
4460 goto error_return;
4461
4462 if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4463 in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4464 in_range(block, ext4_inode_table(sb, gdp),
4465 EXT4_SB(sb)->s_itb_per_group) ||
4466 in_range(block + count - 1, ext4_inode_table(sb, gdp),
4467 EXT4_SB(sb)->s_itb_per_group)) {
4468
4469 ext4_error(sb, __FUNCTION__,
4470 "Freeing blocks in system zone - "
4471 "Block = %lu, count = %lu", block, count);
4472 }
4473
4474 BUFFER_TRACE(bitmap_bh, "getting write access");
4475 err = ext4_journal_get_write_access(handle, bitmap_bh);
4476 if (err)
4477 goto error_return;
4478
4479 /*
4480 * We are about to modify some metadata. Call the journal APIs
4481 * to unshare ->b_data if a currently-committing transaction is
4482 * using it
4483 */
4484 BUFFER_TRACE(gd_bh, "get_write_access");
4485 err = ext4_journal_get_write_access(handle, gd_bh);
4486 if (err)
4487 goto error_return;
4488
4489 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4490 if (err)
4491 goto error_return;
4492
4493#ifdef AGGRESSIVE_CHECK
4494 {
4495 int i;
4496 for (i = 0; i < count; i++)
4497 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4498 }
4499#endif
4500 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4501 bit, count);
4502
4503 /* We dirtied the bitmap block */
4504 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4505 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
4506
4507 ac.ac_b_ex.fe_group = block_group;
4508 ac.ac_b_ex.fe_start = bit;
4509 ac.ac_b_ex.fe_len = count;
4510 ext4_mb_store_history(&ac);
4511
4512 if (metadata) {
4513 /* blocks being freed are metadata. these blocks shouldn't
4514 * be used until this transaction is committed */
4515 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
4516 } else {
4517 ext4_lock_group(sb, block_group);
4518 err = mb_free_blocks(inode, &e4b, bit, count);
4519 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4520 ext4_unlock_group(sb, block_group);
4521 BUG_ON(err != 0);
4522 }
4523
4524 spin_lock(sb_bgl_lock(sbi, block_group));
4525 gdp->bg_free_blocks_count =
4526 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
4527 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4528 spin_unlock(sb_bgl_lock(sbi, block_group));
4529 percpu_counter_add(&sbi->s_freeblocks_counter, count);
4530
4531 ext4_mb_release_desc(&e4b);
4532
4533 *freed += count;
4534
4535 /* And the group descriptor block */
4536 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4537 ret = ext4_journal_dirty_metadata(handle, gd_bh);
4538 if (!err)
4539 err = ret;
4540
4541 if (overflow && !err) {
4542 block += count;
4543 count = overflow;
4544 put_bh(bitmap_bh);
4545 goto do_more;
4546 }
4547 sb->s_dirt = 1;
4548error_return:
4549 brelse(bitmap_bh);
4550 ext4_std_error(sb, err);
4551 return;
4552}
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
new file mode 100644
index 000000000000..3ebc2332f52e
--- /dev/null
+++ b/fs/ext4/migrate.c
@@ -0,0 +1,560 @@
1/*
2 * Copyright IBM Corporation, 2007
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/ext4_jbd2.h>
17#include <linux/ext4_fs_extents.h>
18
19/*
20 * The contiguous blocks details which can be
21 * represented by a single extent
22 */
23struct list_blocks_struct {
24 ext4_lblk_t first_block, last_block;
25 ext4_fsblk_t first_pblock, last_pblock;
26};
27
28static int finish_range(handle_t *handle, struct inode *inode,
29 struct list_blocks_struct *lb)
30
31{
32 int retval = 0, needed;
33 struct ext4_extent newext;
34 struct ext4_ext_path *path;
35 if (lb->first_pblock == 0)
36 return 0;
37
38 /* Add the extent to temp inode*/
39 newext.ee_block = cpu_to_le32(lb->first_block);
40 newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
41 ext4_ext_store_pblock(&newext, lb->first_pblock);
42 path = ext4_ext_find_extent(inode, lb->first_block, NULL);
43
44 if (IS_ERR(path)) {
45 retval = PTR_ERR(path);
46 goto err_out;
47 }
48
49 /*
50 * Calculate the credit needed to inserting this extent
51 * Since we are doing this in loop we may accumalate extra
52 * credit. But below we try to not accumalate too much
53 * of them by restarting the journal.
54 */
55 needed = ext4_ext_calc_credits_for_insert(inode, path);
56
57 /*
58 * Make sure the credit we accumalated is not really high
59 */
60 if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) {
61 retval = ext4_journal_restart(handle, needed);
62 if (retval)
63 goto err_out;
64 }
65 if (needed) {
66 retval = ext4_journal_extend(handle, needed);
67 if (retval != 0) {
68 /*
69 * IF not able to extend the journal restart the journal
70 */
71 retval = ext4_journal_restart(handle, needed);
72 if (retval)
73 goto err_out;
74 }
75 }
76 retval = ext4_ext_insert_extent(handle, inode, path, &newext);
77err_out:
78 lb->first_pblock = 0;
79 return retval;
80}
81
82static int update_extent_range(handle_t *handle, struct inode *inode,
83 ext4_fsblk_t pblock, ext4_lblk_t blk_num,
84 struct list_blocks_struct *lb)
85{
86 int retval;
87 /*
88 * See if we can add on to the existing range (if it exists)
89 */
90 if (lb->first_pblock &&
91 (lb->last_pblock+1 == pblock) &&
92 (lb->last_block+1 == blk_num)) {
93 lb->last_pblock = pblock;
94 lb->last_block = blk_num;
95 return 0;
96 }
97 /*
98 * Start a new range.
99 */
100 retval = finish_range(handle, inode, lb);
101 lb->first_pblock = lb->last_pblock = pblock;
102 lb->first_block = lb->last_block = blk_num;
103
104 return retval;
105}
106
107static int update_ind_extent_range(handle_t *handle, struct inode *inode,
108 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
109 struct list_blocks_struct *lb)
110{
111 struct buffer_head *bh;
112 __le32 *i_data;
113 int i, retval = 0;
114 ext4_lblk_t blk_count = *blk_nump;
115 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
116
117 if (!pblock) {
118 /* Only update the file block number */
119 *blk_nump += max_entries;
120 return 0;
121 }
122
123 bh = sb_bread(inode->i_sb, pblock);
124 if (!bh)
125 return -EIO;
126
127 i_data = (__le32 *)bh->b_data;
128 for (i = 0; i < max_entries; i++, blk_count++) {
129 if (i_data[i]) {
130 retval = update_extent_range(handle, inode,
131 le32_to_cpu(i_data[i]),
132 blk_count, lb);
133 if (retval)
134 break;
135 }
136 }
137
138 /* Update the file block number */
139 *blk_nump = blk_count;
140 put_bh(bh);
141 return retval;
142
143}
144
145static int update_dind_extent_range(handle_t *handle, struct inode *inode,
146 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
147 struct list_blocks_struct *lb)
148{
149 struct buffer_head *bh;
150 __le32 *i_data;
151 int i, retval = 0;
152 ext4_lblk_t blk_count = *blk_nump;
153 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
154
155 if (!pblock) {
156 /* Only update the file block number */
157 *blk_nump += max_entries * max_entries;
158 return 0;
159 }
160 bh = sb_bread(inode->i_sb, pblock);
161 if (!bh)
162 return -EIO;
163
164 i_data = (__le32 *)bh->b_data;
165 for (i = 0; i < max_entries; i++) {
166 if (i_data[i]) {
167 retval = update_ind_extent_range(handle, inode,
168 le32_to_cpu(i_data[i]),
169 &blk_count, lb);
170 if (retval)
171 break;
172 } else {
173 /* Only update the file block number */
174 blk_count += max_entries;
175 }
176 }
177
178 /* Update the file block number */
179 *blk_nump = blk_count;
180 put_bh(bh);
181 return retval;
182
183}
184
185static int update_tind_extent_range(handle_t *handle, struct inode *inode,
186 ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
187 struct list_blocks_struct *lb)
188{
189 struct buffer_head *bh;
190 __le32 *i_data;
191 int i, retval = 0;
192 ext4_lblk_t blk_count = *blk_nump;
193 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
194
195 if (!pblock) {
196 /* Only update the file block number */
197 *blk_nump += max_entries * max_entries * max_entries;
198 return 0;
199 }
200 bh = sb_bread(inode->i_sb, pblock);
201 if (!bh)
202 return -EIO;
203
204 i_data = (__le32 *)bh->b_data;
205 for (i = 0; i < max_entries; i++) {
206 if (i_data[i]) {
207 retval = update_dind_extent_range(handle, inode,
208 le32_to_cpu(i_data[i]),
209 &blk_count, lb);
210 if (retval)
211 break;
212 } else
213 /* Only update the file block number */
214 blk_count += max_entries * max_entries;
215 }
216 /* Update the file block number */
217 *blk_nump = blk_count;
218 put_bh(bh);
219 return retval;
220
221}
222
223static int free_dind_blocks(handle_t *handle,
224 struct inode *inode, __le32 i_data)
225{
226 int i;
227 __le32 *tmp_idata;
228 struct buffer_head *bh;
229 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
230
231 bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
232 if (!bh)
233 return -EIO;
234
235 tmp_idata = (__le32 *)bh->b_data;
236 for (i = 0; i < max_entries; i++) {
237 if (tmp_idata[i])
238 ext4_free_blocks(handle, inode,
239 le32_to_cpu(tmp_idata[i]), 1, 1);
240 }
241 put_bh(bh);
242 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
243 return 0;
244}
245
246static int free_tind_blocks(handle_t *handle,
247 struct inode *inode, __le32 i_data)
248{
249 int i, retval = 0;
250 __le32 *tmp_idata;
251 struct buffer_head *bh;
252 unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
253
254 bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
255 if (!bh)
256 return -EIO;
257
258 tmp_idata = (__le32 *)bh->b_data;
259 for (i = 0; i < max_entries; i++) {
260 if (tmp_idata[i]) {
261 retval = free_dind_blocks(handle,
262 inode, tmp_idata[i]);
263 if (retval) {
264 put_bh(bh);
265 return retval;
266 }
267 }
268 }
269 put_bh(bh);
270 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
271 return 0;
272}
273
274static int free_ind_block(handle_t *handle, struct inode *inode)
275{
276 int retval;
277 struct ext4_inode_info *ei = EXT4_I(inode);
278
279 if (ei->i_data[EXT4_IND_BLOCK])
280 ext4_free_blocks(handle, inode,
281 le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1, 1);
282
283 if (ei->i_data[EXT4_DIND_BLOCK]) {
284 retval = free_dind_blocks(handle, inode,
285 ei->i_data[EXT4_DIND_BLOCK]);
286 if (retval)
287 return retval;
288 }
289
290 if (ei->i_data[EXT4_TIND_BLOCK]) {
291 retval = free_tind_blocks(handle, inode,
292 ei->i_data[EXT4_TIND_BLOCK]);
293 if (retval)
294 return retval;
295 }
296 return 0;
297}
298
299static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
300 struct inode *tmp_inode, int retval)
301{
302 struct ext4_inode_info *ei = EXT4_I(inode);
303 struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode);
304
305 retval = free_ind_block(handle, inode);
306 if (retval)
307 goto err_out;
308
309 /*
310 * One credit accounted for writing the
311 * i_data field of the original inode
312 */
313 retval = ext4_journal_extend(handle, 1);
314 if (retval != 0) {
315 retval = ext4_journal_restart(handle, 1);
316 if (retval)
317 goto err_out;
318 }
319
320 /*
321 * We have the extent map build with the tmp inode.
322 * Now copy the i_data across
323 */
324 ei->i_flags |= EXT4_EXTENTS_FL;
325 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
326
327 /*
328 * Update i_blocks with the new blocks that got
329 * allocated while adding extents for extent index
330 * blocks.
331 *
332 * While converting to extents we need not
333 * update the orignal inode i_blocks for extent blocks
334 * via quota APIs. The quota update happened via tmp_inode already.
335 */
336 spin_lock(&inode->i_lock);
337 inode->i_blocks += tmp_inode->i_blocks;
338 spin_unlock(&inode->i_lock);
339
340 ext4_mark_inode_dirty(handle, inode);
341err_out:
342 return retval;
343}
344
345static int free_ext_idx(handle_t *handle, struct inode *inode,
346 struct ext4_extent_idx *ix)
347{
348 int i, retval = 0;
349 ext4_fsblk_t block;
350 struct buffer_head *bh;
351 struct ext4_extent_header *eh;
352
353 block = idx_pblock(ix);
354 bh = sb_bread(inode->i_sb, block);
355 if (!bh)
356 return -EIO;
357
358 eh = (struct ext4_extent_header *)bh->b_data;
359 if (eh->eh_depth != 0) {
360 ix = EXT_FIRST_INDEX(eh);
361 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
362 retval = free_ext_idx(handle, inode, ix);
363 if (retval)
364 break;
365 }
366 }
367 put_bh(bh);
368 ext4_free_blocks(handle, inode, block, 1, 1);
369 return retval;
370}
371
372/*
373 * Free the extent meta data blocks only
374 */
375static int free_ext_block(handle_t *handle, struct inode *inode)
376{
377 int i, retval = 0;
378 struct ext4_inode_info *ei = EXT4_I(inode);
379 struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data;
380 struct ext4_extent_idx *ix;
381 if (eh->eh_depth == 0)
382 /*
383 * No extra blocks allocated for extent meta data
384 */
385 return 0;
386 ix = EXT_FIRST_INDEX(eh);
387 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
388 retval = free_ext_idx(handle, inode, ix);
389 if (retval)
390 return retval;
391 }
392 return retval;
393
394}
395
396int ext4_ext_migrate(struct inode *inode, struct file *filp,
397 unsigned int cmd, unsigned long arg)
398{
399 handle_t *handle;
400 int retval = 0, i;
401 __le32 *i_data;
402 ext4_lblk_t blk_count = 0;
403 struct ext4_inode_info *ei;
404 struct inode *tmp_inode = NULL;
405 struct list_blocks_struct lb;
406 unsigned long max_entries;
407
408 if (!test_opt(inode->i_sb, EXTENTS))
409 /*
410 * if mounted with noextents we don't allow the migrate
411 */
412 return -EINVAL;
413
414 if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
415 return -EINVAL;
416
417 down_write(&EXT4_I(inode)->i_data_sem);
418 handle = ext4_journal_start(inode,
419 EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
420 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
421 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
422 + 1);
423 if (IS_ERR(handle)) {
424 retval = PTR_ERR(handle);
425 goto err_out;
426 }
427 tmp_inode = ext4_new_inode(handle,
428 inode->i_sb->s_root->d_inode,
429 S_IFREG);
430 if (IS_ERR(tmp_inode)) {
431 retval = -ENOMEM;
432 ext4_journal_stop(handle);
433 tmp_inode = NULL;
434 goto err_out;
435 }
436 i_size_write(tmp_inode, i_size_read(inode));
437 /*
438 * We don't want the inode to be reclaimed
439 * if we got interrupted in between. We have
440 * this tmp inode carrying reference to the
441 * data blocks of the original file. We set
442 * the i_nlink to zero at the last stage after
443 * switching the original file to extent format
444 */
445 tmp_inode->i_nlink = 1;
446
447 ext4_ext_tree_init(handle, tmp_inode);
448 ext4_orphan_add(handle, tmp_inode);
449 ext4_journal_stop(handle);
450
451 ei = EXT4_I(inode);
452 i_data = ei->i_data;
453 memset(&lb, 0, sizeof(lb));
454
455 /* 32 bit block address 4 bytes */
456 max_entries = inode->i_sb->s_blocksize >> 2;
457
458 /*
459 * start with one credit accounted for
460 * superblock modification.
461 *
462 * For the tmp_inode we already have commited the
463 * trascation that created the inode. Later as and
464 * when we add extents we extent the journal
465 */
466 handle = ext4_journal_start(inode, 1);
467 for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) {
468 if (i_data[i]) {
469 retval = update_extent_range(handle, tmp_inode,
470 le32_to_cpu(i_data[i]),
471 blk_count, &lb);
472 if (retval)
473 goto err_out;
474 }
475 }
476 if (i_data[EXT4_IND_BLOCK]) {
477 retval = update_ind_extent_range(handle, tmp_inode,
478 le32_to_cpu(i_data[EXT4_IND_BLOCK]),
479 &blk_count, &lb);
480 if (retval)
481 goto err_out;
482 } else
483 blk_count += max_entries;
484 if (i_data[EXT4_DIND_BLOCK]) {
485 retval = update_dind_extent_range(handle, tmp_inode,
486 le32_to_cpu(i_data[EXT4_DIND_BLOCK]),
487 &blk_count, &lb);
488 if (retval)
489 goto err_out;
490 } else
491 blk_count += max_entries * max_entries;
492 if (i_data[EXT4_TIND_BLOCK]) {
493 retval = update_tind_extent_range(handle, tmp_inode,
494 le32_to_cpu(i_data[EXT4_TIND_BLOCK]),
495 &blk_count, &lb);
496 if (retval)
497 goto err_out;
498 }
499 /*
500 * Build the last extent
501 */
502 retval = finish_range(handle, tmp_inode, &lb);
503err_out:
504 /*
505 * We are either freeing extent information or indirect
506 * blocks. During this we touch superblock, group descriptor
507 * and block bitmap. Later we mark the tmp_inode dirty
508 * via ext4_ext_tree_init. So allocate a credit of 4
509 * We may update quota (user and group).
510 *
511 * FIXME!! we may be touching bitmaps in different block groups.
512 */
513 if (ext4_journal_extend(handle,
514 4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)) != 0)
515 ext4_journal_restart(handle,
516 4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
517 if (retval)
518 /*
519 * Failure case delete the extent information with the
520 * tmp_inode
521 */
522 free_ext_block(handle, tmp_inode);
523 else
524 retval = ext4_ext_swap_inode_data(handle, inode,
525 tmp_inode, retval);
526
527 /*
528 * Mark the tmp_inode as of size zero
529 */
530 i_size_write(tmp_inode, 0);
531
532 /*
533 * set the i_blocks count to zero
534 * so that the ext4_delete_inode does the
535 * right job
536 *
537 * We don't need to take the i_lock because
538 * the inode is not visible to user space.
539 */
540 tmp_inode->i_blocks = 0;
541
542 /* Reset the extent details */
543 ext4_ext_tree_init(handle, tmp_inode);
544
545 /*
546 * Set the i_nlink to zero so that
547 * generic_drop_inode really deletes the
548 * inode
549 */
550 tmp_inode->i_nlink = 0;
551
552 ext4_journal_stop(handle);
553
554 up_write(&EXT4_I(inode)->i_data_sem);
555
556 if (tmp_inode)
557 iput(tmp_inode);
558
559 return retval;
560}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 94ee6f315dc1..67b6d8a1ceff 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -51,7 +51,7 @@
51 51
52static struct buffer_head *ext4_append(handle_t *handle, 52static struct buffer_head *ext4_append(handle_t *handle,
53 struct inode *inode, 53 struct inode *inode,
54 u32 *block, int *err) 54 ext4_lblk_t *block, int *err)
55{ 55{
56 struct buffer_head *bh; 56 struct buffer_head *bh;
57 57
@@ -144,8 +144,8 @@ struct dx_map_entry
144 u16 size; 144 u16 size;
145}; 145};
146 146
147static inline unsigned dx_get_block (struct dx_entry *entry); 147static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
148static void dx_set_block (struct dx_entry *entry, unsigned value); 148static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
149static inline unsigned dx_get_hash (struct dx_entry *entry); 149static inline unsigned dx_get_hash (struct dx_entry *entry);
150static void dx_set_hash (struct dx_entry *entry, unsigned value); 150static void dx_set_hash (struct dx_entry *entry, unsigned value);
151static unsigned dx_get_count (struct dx_entry *entries); 151static unsigned dx_get_count (struct dx_entry *entries);
@@ -166,7 +166,8 @@ static void dx_sort_map(struct dx_map_entry *map, unsigned count);
166static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to, 166static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
167 struct dx_map_entry *offsets, int count); 167 struct dx_map_entry *offsets, int count);
168static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size); 168static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
169static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); 169static void dx_insert_block(struct dx_frame *frame,
170 u32 hash, ext4_lblk_t block);
170static int ext4_htree_next_block(struct inode *dir, __u32 hash, 171static int ext4_htree_next_block(struct inode *dir, __u32 hash,
171 struct dx_frame *frame, 172 struct dx_frame *frame,
172 struct dx_frame *frames, 173 struct dx_frame *frames,
@@ -181,12 +182,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
181 * Mask them off for now. 182 * Mask them off for now.
182 */ 183 */
183 184
184static inline unsigned dx_get_block (struct dx_entry *entry) 185static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
185{ 186{
186 return le32_to_cpu(entry->block) & 0x00ffffff; 187 return le32_to_cpu(entry->block) & 0x00ffffff;
187} 188}
188 189
189static inline void dx_set_block (struct dx_entry *entry, unsigned value) 190static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
190{ 191{
191 entry->block = cpu_to_le32(value); 192 entry->block = cpu_to_le32(value);
192} 193}
@@ -243,8 +244,8 @@ static void dx_show_index (char * label, struct dx_entry *entries)
243 int i, n = dx_get_count (entries); 244 int i, n = dx_get_count (entries);
244 printk("%s index ", label); 245 printk("%s index ", label);
245 for (i = 0; i < n; i++) { 246 for (i = 0; i < n; i++) {
246 printk("%x->%u ", i? dx_get_hash(entries + i) : 247 printk("%x->%lu ", i? dx_get_hash(entries + i) :
247 0, dx_get_block(entries + i)); 248 0, (unsigned long)dx_get_block(entries + i));
248 } 249 }
249 printk("\n"); 250 printk("\n");
250} 251}
@@ -280,7 +281,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
280 space += EXT4_DIR_REC_LEN(de->name_len); 281 space += EXT4_DIR_REC_LEN(de->name_len);
281 names++; 282 names++;
282 } 283 }
283 de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); 284 de = ext4_next_entry(de);
284 } 285 }
285 printk("(%i)\n", names); 286 printk("(%i)\n", names);
286 return (struct stats) { names, space, 1 }; 287 return (struct stats) { names, space, 1 };
@@ -297,7 +298,8 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
297 printk("%i indexed blocks...\n", count); 298 printk("%i indexed blocks...\n", count);
298 for (i = 0; i < count; i++, entries++) 299 for (i = 0; i < count; i++, entries++)
299 { 300 {
300 u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; 301 ext4_lblk_t block = dx_get_block(entries);
302 ext4_lblk_t hash = i ? dx_get_hash(entries): 0;
301 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; 303 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
302 struct stats stats; 304 struct stats stats;
303 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); 305 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
@@ -551,7 +553,8 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
551 */ 553 */
552static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p) 554static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
553{ 555{
554 return (struct ext4_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len)); 556 return (struct ext4_dir_entry_2 *)((char *)p +
557 ext4_rec_len_from_disk(p->rec_len));
555} 558}
556 559
557/* 560/*
@@ -560,7 +563,7 @@ static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *
560 * into the tree. If there is an error it is returned in err. 563 * into the tree. If there is an error it is returned in err.
561 */ 564 */
562static int htree_dirblock_to_tree(struct file *dir_file, 565static int htree_dirblock_to_tree(struct file *dir_file,
563 struct inode *dir, int block, 566 struct inode *dir, ext4_lblk_t block,
564 struct dx_hash_info *hinfo, 567 struct dx_hash_info *hinfo,
565 __u32 start_hash, __u32 start_minor_hash) 568 __u32 start_hash, __u32 start_minor_hash)
566{ 569{
@@ -568,7 +571,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
568 struct ext4_dir_entry_2 *de, *top; 571 struct ext4_dir_entry_2 *de, *top;
569 int err, count = 0; 572 int err, count = 0;
570 573
571 dxtrace(printk("In htree dirblock_to_tree: block %d\n", block)); 574 dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
575 (unsigned long)block));
572 if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) 576 if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
573 return err; 577 return err;
574 578
@@ -620,9 +624,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
620 struct ext4_dir_entry_2 *de; 624 struct ext4_dir_entry_2 *de;
621 struct dx_frame frames[2], *frame; 625 struct dx_frame frames[2], *frame;
622 struct inode *dir; 626 struct inode *dir;
623 int block, err; 627 ext4_lblk_t block;
624 int count = 0; 628 int count = 0;
625 int ret; 629 int ret, err;
626 __u32 hashval; 630 __u32 hashval;
627 631
628 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, 632 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
@@ -720,7 +724,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
720 cond_resched(); 724 cond_resched();
721 } 725 }
722 /* XXX: do we need to check rec_len == 0 case? -Chris */ 726 /* XXX: do we need to check rec_len == 0 case? -Chris */
723 de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); 727 de = ext4_next_entry(de);
724 } 728 }
725 return count; 729 return count;
726} 730}
@@ -752,7 +756,7 @@ static void dx_sort_map (struct dx_map_entry *map, unsigned count)
752 } while(more); 756 } while(more);
753} 757}
754 758
755static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) 759static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
756{ 760{
757 struct dx_entry *entries = frame->entries; 761 struct dx_entry *entries = frame->entries;
758 struct dx_entry *old = frame->at, *new = old + 1; 762 struct dx_entry *old = frame->at, *new = old + 1;
@@ -820,7 +824,7 @@ static inline int search_dirblock(struct buffer_head * bh,
820 return 1; 824 return 1;
821 } 825 }
822 /* prevent looping on a bad block */ 826 /* prevent looping on a bad block */
823 de_len = le16_to_cpu(de->rec_len); 827 de_len = ext4_rec_len_from_disk(de->rec_len);
824 if (de_len <= 0) 828 if (de_len <= 0)
825 return -1; 829 return -1;
826 offset += de_len; 830 offset += de_len;
@@ -847,23 +851,20 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
847 struct super_block * sb; 851 struct super_block * sb;
848 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 852 struct buffer_head * bh_use[NAMEI_RA_SIZE];
849 struct buffer_head * bh, *ret = NULL; 853 struct buffer_head * bh, *ret = NULL;
850 unsigned long start, block, b; 854 ext4_lblk_t start, block, b;
851 int ra_max = 0; /* Number of bh's in the readahead 855 int ra_max = 0; /* Number of bh's in the readahead
852 buffer, bh_use[] */ 856 buffer, bh_use[] */
853 int ra_ptr = 0; /* Current index into readahead 857 int ra_ptr = 0; /* Current index into readahead
854 buffer */ 858 buffer */
855 int num = 0; 859 int num = 0;
856 int nblocks, i, err; 860 ext4_lblk_t nblocks;
861 int i, err;
857 struct inode *dir = dentry->d_parent->d_inode; 862 struct inode *dir = dentry->d_parent->d_inode;
858 int namelen; 863 int namelen;
859 const u8 *name;
860 unsigned blocksize;
861 864
862 *res_dir = NULL; 865 *res_dir = NULL;
863 sb = dir->i_sb; 866 sb = dir->i_sb;
864 blocksize = sb->s_blocksize;
865 namelen = dentry->d_name.len; 867 namelen = dentry->d_name.len;
866 name = dentry->d_name.name;
867 if (namelen > EXT4_NAME_LEN) 868 if (namelen > EXT4_NAME_LEN)
868 return NULL; 869 return NULL;
869 if (is_dx(dir)) { 870 if (is_dx(dir)) {
@@ -914,7 +915,8 @@ restart:
914 if (!buffer_uptodate(bh)) { 915 if (!buffer_uptodate(bh)) {
915 /* read error, skip block & hope for the best */ 916 /* read error, skip block & hope for the best */
916 ext4_error(sb, __FUNCTION__, "reading directory #%lu " 917 ext4_error(sb, __FUNCTION__, "reading directory #%lu "
917 "offset %lu", dir->i_ino, block); 918 "offset %lu", dir->i_ino,
919 (unsigned long)block);
918 brelse(bh); 920 brelse(bh);
919 goto next; 921 goto next;
920 } 922 }
@@ -961,7 +963,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
961 struct dx_frame frames[2], *frame; 963 struct dx_frame frames[2], *frame;
962 struct ext4_dir_entry_2 *de, *top; 964 struct ext4_dir_entry_2 *de, *top;
963 struct buffer_head *bh; 965 struct buffer_head *bh;
964 unsigned long block; 966 ext4_lblk_t block;
965 int retval; 967 int retval;
966 int namelen = dentry->d_name.len; 968 int namelen = dentry->d_name.len;
967 const u8 *name = dentry->d_name.name; 969 const u8 *name = dentry->d_name.name;
@@ -1128,7 +1130,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1128 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1130 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1129 memcpy (to, de, rec_len); 1131 memcpy (to, de, rec_len);
1130 ((struct ext4_dir_entry_2 *) to)->rec_len = 1132 ((struct ext4_dir_entry_2 *) to)->rec_len =
1131 cpu_to_le16(rec_len); 1133 ext4_rec_len_to_disk(rec_len);
1132 de->inode = 0; 1134 de->inode = 0;
1133 map++; 1135 map++;
1134 to += rec_len; 1136 to += rec_len;
@@ -1147,13 +1149,12 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
1147 1149
1148 prev = to = de; 1150 prev = to = de;
1149 while ((char*)de < base + size) { 1151 while ((char*)de < base + size) {
1150 next = (struct ext4_dir_entry_2 *) ((char *) de + 1152 next = ext4_next_entry(de);
1151 le16_to_cpu(de->rec_len));
1152 if (de->inode && de->name_len) { 1153 if (de->inode && de->name_len) {
1153 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1154 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1154 if (de > to) 1155 if (de > to)
1155 memmove(to, de, rec_len); 1156 memmove(to, de, rec_len);
1156 to->rec_len = cpu_to_le16(rec_len); 1157 to->rec_len = ext4_rec_len_to_disk(rec_len);
1157 prev = to; 1158 prev = to;
1158 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); 1159 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
1159 } 1160 }
@@ -1174,7 +1175,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1174 unsigned blocksize = dir->i_sb->s_blocksize; 1175 unsigned blocksize = dir->i_sb->s_blocksize;
1175 unsigned count, continued; 1176 unsigned count, continued;
1176 struct buffer_head *bh2; 1177 struct buffer_head *bh2;
1177 u32 newblock; 1178 ext4_lblk_t newblock;
1178 u32 hash2; 1179 u32 hash2;
1179 struct dx_map_entry *map; 1180 struct dx_map_entry *map;
1180 char *data1 = (*bh)->b_data, *data2; 1181 char *data1 = (*bh)->b_data, *data2;
@@ -1221,14 +1222,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1221 split = count - move; 1222 split = count - move;
1222 hash2 = map[split].hash; 1223 hash2 = map[split].hash;
1223 continued = hash2 == map[split - 1].hash; 1224 continued = hash2 == map[split - 1].hash;
1224 dxtrace(printk("Split block %i at %x, %i/%i\n", 1225 dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
1225 dx_get_block(frame->at), hash2, split, count-split)); 1226 (unsigned long)dx_get_block(frame->at),
1227 hash2, split, count-split));
1226 1228
1227 /* Fancy dance to stay within two buffers */ 1229 /* Fancy dance to stay within two buffers */
1228 de2 = dx_move_dirents(data1, data2, map + split, count - split); 1230 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1229 de = dx_pack_dirents(data1,blocksize); 1231 de = dx_pack_dirents(data1,blocksize);
1230 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); 1232 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
1231 de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); 1233 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
1232 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1234 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1233 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); 1235 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1234 1236
@@ -1297,7 +1299,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1297 return -EEXIST; 1299 return -EEXIST;
1298 } 1300 }
1299 nlen = EXT4_DIR_REC_LEN(de->name_len); 1301 nlen = EXT4_DIR_REC_LEN(de->name_len);
1300 rlen = le16_to_cpu(de->rec_len); 1302 rlen = ext4_rec_len_from_disk(de->rec_len);
1301 if ((de->inode? rlen - nlen: rlen) >= reclen) 1303 if ((de->inode? rlen - nlen: rlen) >= reclen)
1302 break; 1304 break;
1303 de = (struct ext4_dir_entry_2 *)((char *)de + rlen); 1305 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1316,11 +1318,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1316 1318
1317 /* By now the buffer is marked for journaling */ 1319 /* By now the buffer is marked for journaling */
1318 nlen = EXT4_DIR_REC_LEN(de->name_len); 1320 nlen = EXT4_DIR_REC_LEN(de->name_len);
1319 rlen = le16_to_cpu(de->rec_len); 1321 rlen = ext4_rec_len_from_disk(de->rec_len);
1320 if (de->inode) { 1322 if (de->inode) {
1321 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); 1323 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1322 de1->rec_len = cpu_to_le16(rlen - nlen); 1324 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen);
1323 de->rec_len = cpu_to_le16(nlen); 1325 de->rec_len = ext4_rec_len_to_disk(nlen);
1324 de = de1; 1326 de = de1;
1325 } 1327 }
1326 de->file_type = EXT4_FT_UNKNOWN; 1328 de->file_type = EXT4_FT_UNKNOWN;
@@ -1374,7 +1376,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1374 int retval; 1376 int retval;
1375 unsigned blocksize; 1377 unsigned blocksize;
1376 struct dx_hash_info hinfo; 1378 struct dx_hash_info hinfo;
1377 u32 block; 1379 ext4_lblk_t block;
1378 struct fake_dirent *fde; 1380 struct fake_dirent *fde;
1379 1381
1380 blocksize = dir->i_sb->s_blocksize; 1382 blocksize = dir->i_sb->s_blocksize;
@@ -1397,17 +1399,18 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1397 1399
1398 /* The 0th block becomes the root, move the dirents out */ 1400 /* The 0th block becomes the root, move the dirents out */
1399 fde = &root->dotdot; 1401 fde = &root->dotdot;
1400 de = (struct ext4_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len)); 1402 de = (struct ext4_dir_entry_2 *)((char *)fde +
1403 ext4_rec_len_from_disk(fde->rec_len));
1401 len = ((char *) root) + blocksize - (char *) de; 1404 len = ((char *) root) + blocksize - (char *) de;
1402 memcpy (data1, de, len); 1405 memcpy (data1, de, len);
1403 de = (struct ext4_dir_entry_2 *) data1; 1406 de = (struct ext4_dir_entry_2 *) data1;
1404 top = data1 + len; 1407 top = data1 + len;
1405 while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top) 1408 while ((char *)(de2 = ext4_next_entry(de)) < top)
1406 de = de2; 1409 de = de2;
1407 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); 1410 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
1408 /* Initialize the root; the dot dirents already exist */ 1411 /* Initialize the root; the dot dirents already exist */
1409 de = (struct ext4_dir_entry_2 *) (&root->dotdot); 1412 de = (struct ext4_dir_entry_2 *) (&root->dotdot);
1410 de->rec_len = cpu_to_le16(blocksize - EXT4_DIR_REC_LEN(2)); 1413 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2));
1411 memset (&root->info, 0, sizeof(root->info)); 1414 memset (&root->info, 0, sizeof(root->info));
1412 root->info.info_length = sizeof(root->info); 1415 root->info.info_length = sizeof(root->info);
1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 1416 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1454,7 +1457,7 @@ static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
1454 int retval; 1457 int retval;
1455 int dx_fallback=0; 1458 int dx_fallback=0;
1456 unsigned blocksize; 1459 unsigned blocksize;
1457 u32 block, blocks; 1460 ext4_lblk_t block, blocks;
1458 1461
1459 sb = dir->i_sb; 1462 sb = dir->i_sb;
1460 blocksize = sb->s_blocksize; 1463 blocksize = sb->s_blocksize;
@@ -1487,7 +1490,7 @@ static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
1487 return retval; 1490 return retval;
1488 de = (struct ext4_dir_entry_2 *) bh->b_data; 1491 de = (struct ext4_dir_entry_2 *) bh->b_data;
1489 de->inode = 0; 1492 de->inode = 0;
1490 de->rec_len = cpu_to_le16(blocksize); 1493 de->rec_len = ext4_rec_len_to_disk(blocksize);
1491 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1494 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1492} 1495}
1493 1496
@@ -1531,7 +1534,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1531 dx_get_count(entries), dx_get_limit(entries))); 1534 dx_get_count(entries), dx_get_limit(entries)));
1532 /* Need to split index? */ 1535 /* Need to split index? */
1533 if (dx_get_count(entries) == dx_get_limit(entries)) { 1536 if (dx_get_count(entries) == dx_get_limit(entries)) {
1534 u32 newblock; 1537 ext4_lblk_t newblock;
1535 unsigned icount = dx_get_count(entries); 1538 unsigned icount = dx_get_count(entries);
1536 int levels = frame - frames; 1539 int levels = frame - frames;
1537 struct dx_entry *entries2; 1540 struct dx_entry *entries2;
@@ -1550,7 +1553,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1550 goto cleanup; 1553 goto cleanup;
1551 node2 = (struct dx_node *)(bh2->b_data); 1554 node2 = (struct dx_node *)(bh2->b_data);
1552 entries2 = node2->entries; 1555 entries2 = node2->entries;
1553 node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); 1556 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize);
1554 node2->fake.inode = 0; 1557 node2->fake.inode = 0;
1555 BUFFER_TRACE(frame->bh, "get_write_access"); 1558 BUFFER_TRACE(frame->bh, "get_write_access");
1556 err = ext4_journal_get_write_access(handle, frame->bh); 1559 err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1648,9 +1651,9 @@ static int ext4_delete_entry (handle_t *handle,
1648 BUFFER_TRACE(bh, "get_write_access"); 1651 BUFFER_TRACE(bh, "get_write_access");
1649 ext4_journal_get_write_access(handle, bh); 1652 ext4_journal_get_write_access(handle, bh);
1650 if (pde) 1653 if (pde)
1651 pde->rec_len = 1654 pde->rec_len = ext4_rec_len_to_disk(
1652 cpu_to_le16(le16_to_cpu(pde->rec_len) + 1655 ext4_rec_len_from_disk(pde->rec_len) +
1653 le16_to_cpu(de->rec_len)); 1656 ext4_rec_len_from_disk(de->rec_len));
1654 else 1657 else
1655 de->inode = 0; 1658 de->inode = 0;
1656 dir->i_version++; 1659 dir->i_version++;
@@ -1658,10 +1661,9 @@ static int ext4_delete_entry (handle_t *handle,
1658 ext4_journal_dirty_metadata(handle, bh); 1661 ext4_journal_dirty_metadata(handle, bh);
1659 return 0; 1662 return 0;
1660 } 1663 }
1661 i += le16_to_cpu(de->rec_len); 1664 i += ext4_rec_len_from_disk(de->rec_len);
1662 pde = de; 1665 pde = de;
1663 de = (struct ext4_dir_entry_2 *) 1666 de = ext4_next_entry(de);
1664 ((char *) de + le16_to_cpu(de->rec_len));
1665 } 1667 }
1666 return -ENOENT; 1668 return -ENOENT;
1667} 1669}
@@ -1824,13 +1826,13 @@ retry:
1824 de = (struct ext4_dir_entry_2 *) dir_block->b_data; 1826 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1825 de->inode = cpu_to_le32(inode->i_ino); 1827 de->inode = cpu_to_le32(inode->i_ino);
1826 de->name_len = 1; 1828 de->name_len = 1;
1827 de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de->name_len)); 1829 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
1828 strcpy (de->name, "."); 1830 strcpy (de->name, ".");
1829 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1831 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1830 de = (struct ext4_dir_entry_2 *) 1832 de = ext4_next_entry(de);
1831 ((char *) de + le16_to_cpu(de->rec_len));
1832 de->inode = cpu_to_le32(dir->i_ino); 1833 de->inode = cpu_to_le32(dir->i_ino);
1833 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT4_DIR_REC_LEN(1)); 1834 de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
1835 EXT4_DIR_REC_LEN(1));
1834 de->name_len = 2; 1836 de->name_len = 2;
1835 strcpy (de->name, ".."); 1837 strcpy (de->name, "..");
1836 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1838 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1882,8 +1884,7 @@ static int empty_dir (struct inode * inode)
1882 return 1; 1884 return 1;
1883 } 1885 }
1884 de = (struct ext4_dir_entry_2 *) bh->b_data; 1886 de = (struct ext4_dir_entry_2 *) bh->b_data;
1885 de1 = (struct ext4_dir_entry_2 *) 1887 de1 = ext4_next_entry(de);
1886 ((char *) de + le16_to_cpu(de->rec_len));
1887 if (le32_to_cpu(de->inode) != inode->i_ino || 1888 if (le32_to_cpu(de->inode) != inode->i_ino ||
1888 !le32_to_cpu(de1->inode) || 1889 !le32_to_cpu(de1->inode) ||
1889 strcmp (".", de->name) || 1890 strcmp (".", de->name) ||
@@ -1894,9 +1895,9 @@ static int empty_dir (struct inode * inode)
1894 brelse (bh); 1895 brelse (bh);
1895 return 1; 1896 return 1;
1896 } 1897 }
1897 offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); 1898 offset = ext4_rec_len_from_disk(de->rec_len) +
1898 de = (struct ext4_dir_entry_2 *) 1899 ext4_rec_len_from_disk(de1->rec_len);
1899 ((char *) de1 + le16_to_cpu(de1->rec_len)); 1900 de = ext4_next_entry(de1);
1900 while (offset < inode->i_size ) { 1901 while (offset < inode->i_size ) {
1901 if (!bh || 1902 if (!bh ||
1902 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1903 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1925,9 +1926,8 @@ static int empty_dir (struct inode * inode)
1925 brelse (bh); 1926 brelse (bh);
1926 return 0; 1927 return 0;
1927 } 1928 }
1928 offset += le16_to_cpu(de->rec_len); 1929 offset += ext4_rec_len_from_disk(de->rec_len);
1929 de = (struct ext4_dir_entry_2 *) 1930 de = ext4_next_entry(de);
1930 ((char *) de + le16_to_cpu(de->rec_len));
1931 } 1931 }
1932 brelse (bh); 1932 brelse (bh);
1933 return 1; 1933 return 1;
@@ -2282,8 +2282,7 @@ retry:
2282} 2282}
2283 2283
2284#define PARENT_INO(buffer) \ 2284#define PARENT_INO(buffer) \
2285 ((struct ext4_dir_entry_2 *) ((char *) buffer + \ 2285 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode)
2286 le16_to_cpu(((struct ext4_dir_entry_2 *) buffer)->rec_len)))->inode
2287 2286
2288/* 2287/*
2289 * Anybody can rename anything with this: the permission checks are left to the 2288 * Anybody can rename anything with this: the permission checks are left to the
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bd8a52bb3999..4fbba60816f4 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -28,7 +28,7 @@ static int verify_group_input(struct super_block *sb,
28 struct ext4_super_block *es = sbi->s_es; 28 struct ext4_super_block *es = sbi->s_es;
29 ext4_fsblk_t start = ext4_blocks_count(es); 29 ext4_fsblk_t start = ext4_blocks_count(es);
30 ext4_fsblk_t end = start + input->blocks_count; 30 ext4_fsblk_t end = start + input->blocks_count;
31 unsigned group = input->group; 31 ext4_group_t group = input->group;
32 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; 32 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
33 unsigned overhead = ext4_bg_has_super(sb, group) ? 33 unsigned overhead = ext4_bg_has_super(sb, group) ?
34 (1 + ext4_bg_num_gdb(sb, group) + 34 (1 + ext4_bg_num_gdb(sb, group) +
@@ -206,7 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb,
206 } 206 }
207 207
208 if (ext4_bg_has_super(sb, input->group)) { 208 if (ext4_bg_has_super(sb, input->group)) {
209 ext4_debug("mark backup superblock %#04lx (+0)\n", start); 209 ext4_debug("mark backup superblock %#04llx (+0)\n", start);
210 ext4_set_bit(0, bh->b_data); 210 ext4_set_bit(0, bh->b_data);
211 } 211 }
212 212
@@ -215,7 +215,7 @@ static int setup_new_group_blocks(struct super_block *sb,
215 i < gdblocks; i++, block++, bit++) { 215 i < gdblocks; i++, block++, bit++) {
216 struct buffer_head *gdb; 216 struct buffer_head *gdb;
217 217
218 ext4_debug("update backup group %#04lx (+%d)\n", block, bit); 218 ext4_debug("update backup group %#04llx (+%d)\n", block, bit);
219 219
220 if ((err = extend_or_restart_transaction(handle, 1, bh))) 220 if ((err = extend_or_restart_transaction(handle, 1, bh)))
221 goto exit_bh; 221 goto exit_bh;
@@ -243,7 +243,7 @@ static int setup_new_group_blocks(struct super_block *sb,
243 i < reserved_gdb; i++, block++, bit++) { 243 i < reserved_gdb; i++, block++, bit++) {
244 struct buffer_head *gdb; 244 struct buffer_head *gdb;
245 245
246 ext4_debug("clear reserved block %#04lx (+%d)\n", block, bit); 246 ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit);
247 247
248 if ((err = extend_or_restart_transaction(handle, 1, bh))) 248 if ((err = extend_or_restart_transaction(handle, 1, bh)))
249 goto exit_bh; 249 goto exit_bh;
@@ -256,10 +256,10 @@ static int setup_new_group_blocks(struct super_block *sb,
256 ext4_set_bit(bit, bh->b_data); 256 ext4_set_bit(bit, bh->b_data);
257 brelse(gdb); 257 brelse(gdb);
258 } 258 }
259 ext4_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap, 259 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
260 input->block_bitmap - start); 260 input->block_bitmap - start);
261 ext4_set_bit(input->block_bitmap - start, bh->b_data); 261 ext4_set_bit(input->block_bitmap - start, bh->b_data);
262 ext4_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap, 262 ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap,
263 input->inode_bitmap - start); 263 input->inode_bitmap - start);
264 ext4_set_bit(input->inode_bitmap - start, bh->b_data); 264 ext4_set_bit(input->inode_bitmap - start, bh->b_data);
265 265
@@ -268,7 +268,7 @@ static int setup_new_group_blocks(struct super_block *sb,
268 i < sbi->s_itb_per_group; i++, bit++, block++) { 268 i < sbi->s_itb_per_group; i++, bit++, block++) {
269 struct buffer_head *it; 269 struct buffer_head *it;
270 270
271 ext4_debug("clear inode block %#04lx (+%d)\n", block, bit); 271 ext4_debug("clear inode block %#04llx (+%d)\n", block, bit);
272 272
273 if ((err = extend_or_restart_transaction(handle, 1, bh))) 273 if ((err = extend_or_restart_transaction(handle, 1, bh)))
274 goto exit_bh; 274 goto exit_bh;
@@ -291,7 +291,7 @@ static int setup_new_group_blocks(struct super_block *sb,
291 brelse(bh); 291 brelse(bh);
292 292
293 /* Mark unused entries in inode bitmap used */ 293 /* Mark unused entries in inode bitmap used */
294 ext4_debug("clear inode bitmap %#04x (+%ld)\n", 294 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
295 input->inode_bitmap, input->inode_bitmap - start); 295 input->inode_bitmap, input->inode_bitmap - start);
296 if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { 296 if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
297 err = PTR_ERR(bh); 297 err = PTR_ERR(bh);
@@ -357,7 +357,7 @@ static int verify_reserved_gdb(struct super_block *sb,
357 struct buffer_head *primary) 357 struct buffer_head *primary)
358{ 358{
359 const ext4_fsblk_t blk = primary->b_blocknr; 359 const ext4_fsblk_t blk = primary->b_blocknr;
360 const unsigned long end = EXT4_SB(sb)->s_groups_count; 360 const ext4_group_t end = EXT4_SB(sb)->s_groups_count;
361 unsigned three = 1; 361 unsigned three = 1;
362 unsigned five = 5; 362 unsigned five = 5;
363 unsigned seven = 7; 363 unsigned seven = 7;
@@ -656,12 +656,12 @@ static void update_backups(struct super_block *sb,
656 int blk_off, char *data, int size) 656 int blk_off, char *data, int size)
657{ 657{
658 struct ext4_sb_info *sbi = EXT4_SB(sb); 658 struct ext4_sb_info *sbi = EXT4_SB(sb);
659 const unsigned long last = sbi->s_groups_count; 659 const ext4_group_t last = sbi->s_groups_count;
660 const int bpg = EXT4_BLOCKS_PER_GROUP(sb); 660 const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
661 unsigned three = 1; 661 unsigned three = 1;
662 unsigned five = 5; 662 unsigned five = 5;
663 unsigned seven = 7; 663 unsigned seven = 7;
664 unsigned group; 664 ext4_group_t group;
665 int rest = sb->s_blocksize - size; 665 int rest = sb->s_blocksize - size;
666 handle_t *handle; 666 handle_t *handle;
667 int err = 0, err2; 667 int err = 0, err2;
@@ -716,7 +716,7 @@ static void update_backups(struct super_block *sb,
716exit_err: 716exit_err:
717 if (err) { 717 if (err) {
718 ext4_warning(sb, __FUNCTION__, 718 ext4_warning(sb, __FUNCTION__,
719 "can't update backup for group %d (err %d), " 719 "can't update backup for group %lu (err %d), "
720 "forcing fsck on next reboot", group, err); 720 "forcing fsck on next reboot", group, err);
721 sbi->s_mount_state &= ~EXT4_VALID_FS; 721 sbi->s_mount_state &= ~EXT4_VALID_FS;
722 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 722 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -952,7 +952,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
952 ext4_fsblk_t n_blocks_count) 952 ext4_fsblk_t n_blocks_count)
953{ 953{
954 ext4_fsblk_t o_blocks_count; 954 ext4_fsblk_t o_blocks_count;
955 unsigned long o_groups_count; 955 ext4_group_t o_groups_count;
956 ext4_grpblk_t last; 956 ext4_grpblk_t last;
957 ext4_grpblk_t add; 957 ext4_grpblk_t add;
958 struct buffer_head * bh; 958 struct buffer_head * bh;
@@ -1054,7 +1054,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1054 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 1054 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
1055 sb->s_dirt = 1; 1055 sb->s_dirt = 1;
1056 unlock_super(sb); 1056 unlock_super(sb);
1057 ext4_debug("freeing blocks %lu through %llu\n", o_blocks_count, 1057 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1058 o_blocks_count + add); 1058 o_blocks_count + add);
1059 ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1059 ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
1060 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, 1060 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1ca0f546c466..055a0cd0168e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -373,6 +373,66 @@ void ext4_update_dynamic_rev(struct super_block *sb)
373 */ 373 */
374} 374}
375 375
376int ext4_update_compat_feature(handle_t *handle,
377 struct super_block *sb, __u32 compat)
378{
379 int err = 0;
380 if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
381 err = ext4_journal_get_write_access(handle,
382 EXT4_SB(sb)->s_sbh);
383 if (err)
384 return err;
385 EXT4_SET_COMPAT_FEATURE(sb, compat);
386 sb->s_dirt = 1;
387 handle->h_sync = 1;
388 BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
389 "call ext4_journal_dirty_met adata");
390 err = ext4_journal_dirty_metadata(handle,
391 EXT4_SB(sb)->s_sbh);
392 }
393 return err;
394}
395
396int ext4_update_rocompat_feature(handle_t *handle,
397 struct super_block *sb, __u32 rocompat)
398{
399 int err = 0;
400 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
401 err = ext4_journal_get_write_access(handle,
402 EXT4_SB(sb)->s_sbh);
403 if (err)
404 return err;
405 EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
406 sb->s_dirt = 1;
407 handle->h_sync = 1;
408 BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
409 "call ext4_journal_dirty_met adata");
410 err = ext4_journal_dirty_metadata(handle,
411 EXT4_SB(sb)->s_sbh);
412 }
413 return err;
414}
415
416int ext4_update_incompat_feature(handle_t *handle,
417 struct super_block *sb, __u32 incompat)
418{
419 int err = 0;
420 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
421 err = ext4_journal_get_write_access(handle,
422 EXT4_SB(sb)->s_sbh);
423 if (err)
424 return err;
425 EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
426 sb->s_dirt = 1;
427 handle->h_sync = 1;
428 BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
429 "call ext4_journal_dirty_met adata");
430 err = ext4_journal_dirty_metadata(handle,
431 EXT4_SB(sb)->s_sbh);
432 }
433 return err;
434}
435
376/* 436/*
377 * Open the external journal device 437 * Open the external journal device
378 */ 438 */
@@ -443,6 +503,7 @@ static void ext4_put_super (struct super_block * sb)
443 struct ext4_super_block *es = sbi->s_es; 503 struct ext4_super_block *es = sbi->s_es;
444 int i; 504 int i;
445 505
506 ext4_mb_release(sb);
446 ext4_ext_release(sb); 507 ext4_ext_release(sb);
447 ext4_xattr_put_super(sb); 508 ext4_xattr_put_super(sb);
448 jbd2_journal_destroy(sbi->s_journal); 509 jbd2_journal_destroy(sbi->s_journal);
@@ -509,6 +570,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
509 ei->i_block_alloc_info = NULL; 570 ei->i_block_alloc_info = NULL;
510 ei->vfs_inode.i_version = 1; 571 ei->vfs_inode.i_version = 1;
511 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 572 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
573 INIT_LIST_HEAD(&ei->i_prealloc_list);
574 spin_lock_init(&ei->i_prealloc_lock);
512 return &ei->vfs_inode; 575 return &ei->vfs_inode;
513} 576}
514 577
@@ -533,7 +596,7 @@ static void init_once(struct kmem_cache *cachep, void *foo)
533#ifdef CONFIG_EXT4DEV_FS_XATTR 596#ifdef CONFIG_EXT4DEV_FS_XATTR
534 init_rwsem(&ei->xattr_sem); 597 init_rwsem(&ei->xattr_sem);
535#endif 598#endif
536 mutex_init(&ei->truncate_mutex); 599 init_rwsem(&ei->i_data_sem);
537 inode_init_once(&ei->vfs_inode); 600 inode_init_once(&ei->vfs_inode);
538} 601}
539 602
@@ -605,18 +668,20 @@ static inline void ext4_show_quota_options(struct seq_file *seq, struct super_bl
605 */ 668 */
606static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) 669static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
607{ 670{
671 int def_errors;
672 unsigned long def_mount_opts;
608 struct super_block *sb = vfs->mnt_sb; 673 struct super_block *sb = vfs->mnt_sb;
609 struct ext4_sb_info *sbi = EXT4_SB(sb); 674 struct ext4_sb_info *sbi = EXT4_SB(sb);
610 struct ext4_super_block *es = sbi->s_es; 675 struct ext4_super_block *es = sbi->s_es;
611 unsigned long def_mount_opts;
612 676
613 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 677 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
678 def_errors = le16_to_cpu(es->s_errors);
614 679
615 if (sbi->s_sb_block != 1) 680 if (sbi->s_sb_block != 1)
616 seq_printf(seq, ",sb=%llu", sbi->s_sb_block); 681 seq_printf(seq, ",sb=%llu", sbi->s_sb_block);
617 if (test_opt(sb, MINIX_DF)) 682 if (test_opt(sb, MINIX_DF))
618 seq_puts(seq, ",minixdf"); 683 seq_puts(seq, ",minixdf");
619 if (test_opt(sb, GRPID)) 684 if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS))
620 seq_puts(seq, ",grpid"); 685 seq_puts(seq, ",grpid");
621 if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS)) 686 if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS))
622 seq_puts(seq, ",nogrpid"); 687 seq_puts(seq, ",nogrpid");
@@ -628,34 +693,33 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
628 le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) { 693 le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) {
629 seq_printf(seq, ",resgid=%u", sbi->s_resgid); 694 seq_printf(seq, ",resgid=%u", sbi->s_resgid);
630 } 695 }
631 if (test_opt(sb, ERRORS_CONT)) { 696 if (test_opt(sb, ERRORS_RO)) {
632 int def_errors = le16_to_cpu(es->s_errors);
633
634 if (def_errors == EXT4_ERRORS_PANIC || 697 if (def_errors == EXT4_ERRORS_PANIC ||
635 def_errors == EXT4_ERRORS_RO) { 698 def_errors == EXT4_ERRORS_CONTINUE) {
636 seq_puts(seq, ",errors=continue"); 699 seq_puts(seq, ",errors=remount-ro");
637 } 700 }
638 } 701 }
639 if (test_opt(sb, ERRORS_RO)) 702 if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
640 seq_puts(seq, ",errors=remount-ro"); 703 seq_puts(seq, ",errors=continue");
641 if (test_opt(sb, ERRORS_PANIC)) 704 if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
642 seq_puts(seq, ",errors=panic"); 705 seq_puts(seq, ",errors=panic");
643 if (test_opt(sb, NO_UID32)) 706 if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16))
644 seq_puts(seq, ",nouid32"); 707 seq_puts(seq, ",nouid32");
645 if (test_opt(sb, DEBUG)) 708 if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
646 seq_puts(seq, ",debug"); 709 seq_puts(seq, ",debug");
647 if (test_opt(sb, OLDALLOC)) 710 if (test_opt(sb, OLDALLOC))
648 seq_puts(seq, ",oldalloc"); 711 seq_puts(seq, ",oldalloc");
649#ifdef CONFIG_EXT4_FS_XATTR 712#ifdef CONFIG_EXT4DEV_FS_XATTR
650 if (test_opt(sb, XATTR_USER)) 713 if (test_opt(sb, XATTR_USER) &&
714 !(def_mount_opts & EXT4_DEFM_XATTR_USER))
651 seq_puts(seq, ",user_xattr"); 715 seq_puts(seq, ",user_xattr");
652 if (!test_opt(sb, XATTR_USER) && 716 if (!test_opt(sb, XATTR_USER) &&
653 (def_mount_opts & EXT4_DEFM_XATTR_USER)) { 717 (def_mount_opts & EXT4_DEFM_XATTR_USER)) {
654 seq_puts(seq, ",nouser_xattr"); 718 seq_puts(seq, ",nouser_xattr");
655 } 719 }
656#endif 720#endif
657#ifdef CONFIG_EXT4_FS_POSIX_ACL 721#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
658 if (test_opt(sb, POSIX_ACL)) 722 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
659 seq_puts(seq, ",acl"); 723 seq_puts(seq, ",acl");
660 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) 724 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
661 seq_puts(seq, ",noacl"); 725 seq_puts(seq, ",noacl");
@@ -672,7 +736,17 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
672 seq_puts(seq, ",nobh"); 736 seq_puts(seq, ",nobh");
673 if (!test_opt(sb, EXTENTS)) 737 if (!test_opt(sb, EXTENTS))
674 seq_puts(seq, ",noextents"); 738 seq_puts(seq, ",noextents");
739 if (!test_opt(sb, MBALLOC))
740 seq_puts(seq, ",nomballoc");
741 if (test_opt(sb, I_VERSION))
742 seq_puts(seq, ",i_version");
675 743
744 if (sbi->s_stripe)
745 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
746 /*
747 * journal mode gets enabled in different ways
748 * So just print the value even if we didn't specify it
749 */
676 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 750 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
677 seq_puts(seq, ",data=journal"); 751 seq_puts(seq, ",data=journal");
678 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 752 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
@@ -681,7 +755,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
681 seq_puts(seq, ",data=writeback"); 755 seq_puts(seq, ",data=writeback");
682 756
683 ext4_show_quota_options(seq, sb); 757 ext4_show_quota_options(seq, sb);
684
685 return 0; 758 return 0;
686} 759}
687 760
@@ -809,11 +882,13 @@ enum {
809 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 882 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
810 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 883 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
811 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 884 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
885 Opt_journal_checksum, Opt_journal_async_commit,
812 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 886 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
813 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 887 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
814 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 888 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
815 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 889 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
816 Opt_grpquota, Opt_extents, Opt_noextents, 890 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
891 Opt_mballoc, Opt_nomballoc, Opt_stripe,
817}; 892};
818 893
819static match_table_t tokens = { 894static match_table_t tokens = {
@@ -848,6 +923,8 @@ static match_table_t tokens = {
848 {Opt_journal_update, "journal=update"}, 923 {Opt_journal_update, "journal=update"},
849 {Opt_journal_inum, "journal=%u"}, 924 {Opt_journal_inum, "journal=%u"},
850 {Opt_journal_dev, "journal_dev=%u"}, 925 {Opt_journal_dev, "journal_dev=%u"},
926 {Opt_journal_checksum, "journal_checksum"},
927 {Opt_journal_async_commit, "journal_async_commit"},
851 {Opt_abort, "abort"}, 928 {Opt_abort, "abort"},
852 {Opt_data_journal, "data=journal"}, 929 {Opt_data_journal, "data=journal"},
853 {Opt_data_ordered, "data=ordered"}, 930 {Opt_data_ordered, "data=ordered"},
@@ -865,6 +942,10 @@ static match_table_t tokens = {
865 {Opt_barrier, "barrier=%u"}, 942 {Opt_barrier, "barrier=%u"},
866 {Opt_extents, "extents"}, 943 {Opt_extents, "extents"},
867 {Opt_noextents, "noextents"}, 944 {Opt_noextents, "noextents"},
945 {Opt_i_version, "i_version"},
946 {Opt_mballoc, "mballoc"},
947 {Opt_nomballoc, "nomballoc"},
948 {Opt_stripe, "stripe=%u"},
868 {Opt_err, NULL}, 949 {Opt_err, NULL},
869 {Opt_resize, "resize"}, 950 {Opt_resize, "resize"},
870}; 951};
@@ -1035,6 +1116,13 @@ static int parse_options (char *options, struct super_block *sb,
1035 return 0; 1116 return 0;
1036 *journal_devnum = option; 1117 *journal_devnum = option;
1037 break; 1118 break;
1119 case Opt_journal_checksum:
1120 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1121 break;
1122 case Opt_journal_async_commit:
1123 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
1124 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1125 break;
1038 case Opt_noload: 1126 case Opt_noload:
1039 set_opt (sbi->s_mount_opt, NOLOAD); 1127 set_opt (sbi->s_mount_opt, NOLOAD);
1040 break; 1128 break;
@@ -1203,6 +1291,23 @@ clear_qf_name:
1203 case Opt_noextents: 1291 case Opt_noextents:
1204 clear_opt (sbi->s_mount_opt, EXTENTS); 1292 clear_opt (sbi->s_mount_opt, EXTENTS);
1205 break; 1293 break;
1294 case Opt_i_version:
1295 set_opt(sbi->s_mount_opt, I_VERSION);
1296 sb->s_flags |= MS_I_VERSION;
1297 break;
1298 case Opt_mballoc:
1299 set_opt(sbi->s_mount_opt, MBALLOC);
1300 break;
1301 case Opt_nomballoc:
1302 clear_opt(sbi->s_mount_opt, MBALLOC);
1303 break;
1304 case Opt_stripe:
1305 if (match_int(&args[0], &option))
1306 return 0;
1307 if (option < 0)
1308 return 0;
1309 sbi->s_stripe = option;
1310 break;
1206 default: 1311 default:
1207 printk (KERN_ERR 1312 printk (KERN_ERR
1208 "EXT4-fs: Unrecognized mount option \"%s\" " 1313 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1364,7 +1469,7 @@ static int ext4_check_descriptors (struct super_block * sb)
1364 struct ext4_group_desc * gdp = NULL; 1469 struct ext4_group_desc * gdp = NULL;
1365 int desc_block = 0; 1470 int desc_block = 0;
1366 int flexbg_flag = 0; 1471 int flexbg_flag = 0;
1367 int i; 1472 ext4_group_t i;
1368 1473
1369 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 1474 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
1370 flexbg_flag = 1; 1475 flexbg_flag = 1;
@@ -1386,7 +1491,7 @@ static int ext4_check_descriptors (struct super_block * sb)
1386 if (block_bitmap < first_block || block_bitmap > last_block) 1491 if (block_bitmap < first_block || block_bitmap > last_block)
1387 { 1492 {
1388 ext4_error (sb, "ext4_check_descriptors", 1493 ext4_error (sb, "ext4_check_descriptors",
1389 "Block bitmap for group %d" 1494 "Block bitmap for group %lu"
1390 " not in group (block %llu)!", 1495 " not in group (block %llu)!",
1391 i, block_bitmap); 1496 i, block_bitmap);
1392 return 0; 1497 return 0;
@@ -1395,7 +1500,7 @@ static int ext4_check_descriptors (struct super_block * sb)
1395 if (inode_bitmap < first_block || inode_bitmap > last_block) 1500 if (inode_bitmap < first_block || inode_bitmap > last_block)
1396 { 1501 {
1397 ext4_error (sb, "ext4_check_descriptors", 1502 ext4_error (sb, "ext4_check_descriptors",
1398 "Inode bitmap for group %d" 1503 "Inode bitmap for group %lu"
1399 " not in group (block %llu)!", 1504 " not in group (block %llu)!",
1400 i, inode_bitmap); 1505 i, inode_bitmap);
1401 return 0; 1506 return 0;
@@ -1405,17 +1510,16 @@ static int ext4_check_descriptors (struct super_block * sb)
1405 inode_table + sbi->s_itb_per_group - 1 > last_block) 1510 inode_table + sbi->s_itb_per_group - 1 > last_block)
1406 { 1511 {
1407 ext4_error (sb, "ext4_check_descriptors", 1512 ext4_error (sb, "ext4_check_descriptors",
1408 "Inode table for group %d" 1513 "Inode table for group %lu"
1409 " not in group (block %llu)!", 1514 " not in group (block %llu)!",
1410 i, inode_table); 1515 i, inode_table);
1411 return 0; 1516 return 0;
1412 } 1517 }
1413 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { 1518 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
1414 ext4_error(sb, __FUNCTION__, 1519 ext4_error(sb, __FUNCTION__,
1415 "Checksum for group %d failed (%u!=%u)\n", i, 1520 "Checksum for group %lu failed (%u!=%u)\n",
1416 le16_to_cpu(ext4_group_desc_csum(sbi, i, 1521 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
1417 gdp)), 1522 gdp)), le16_to_cpu(gdp->bg_checksum));
1418 le16_to_cpu(gdp->bg_checksum));
1419 return 0; 1523 return 0;
1420 } 1524 }
1421 if (!flexbg_flag) 1525 if (!flexbg_flag)
@@ -1429,7 +1533,6 @@ static int ext4_check_descriptors (struct super_block * sb)
1429 return 1; 1533 return 1;
1430} 1534}
1431 1535
1432
1433/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at 1536/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
1434 * the superblock) which were deleted from all directories, but held open by 1537 * the superblock) which were deleted from all directories, but held open by
1435 * a process at the time of a crash. We walk the list and try to delete these 1538 * a process at the time of a crash. We walk the list and try to delete these
@@ -1542,20 +1645,95 @@ static void ext4_orphan_cleanup (struct super_block * sb,
1542#endif 1645#endif
1543 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 1646 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
1544} 1647}
1648/*
1649 * Maximal extent format file size.
1650 * Resulting logical blkno at s_maxbytes must fit in our on-disk
1651 * extent format containers, within a sector_t, and within i_blocks
1652 * in the vfs. ext4 inode has 48 bits of i_block in fsblock units,
1653 * so that won't be a limiting factor.
1654 *
1655 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
1656 */
1657static loff_t ext4_max_size(int blkbits)
1658{
1659 loff_t res;
1660 loff_t upper_limit = MAX_LFS_FILESIZE;
1661
1662 /* small i_blocks in vfs inode? */
1663 if (sizeof(blkcnt_t) < sizeof(u64)) {
1664 /*
1665 * CONFIG_LSF not being enabled implies the inode
1666 * i_blocks represents total blocks in 512-byte units
1667 * 32 == size of vfs inode i_blocks * 8
1668 */
1669 upper_limit = (1LL << 32) - 1;
1670
1671 /* total blocks in file system block size */
1672 upper_limit >>= (blkbits - 9);
1673 upper_limit <<= blkbits;
1674 }
1675
1676 /* 32-bit extent-start container, ee_block */
1677 res = 1LL << 32;
1678 res <<= blkbits;
1679 res -= 1;
1680
1681 /* Sanity check against vm- & vfs- imposed limits */
1682 if (res > upper_limit)
1683 res = upper_limit;
1684
1685 return res;
1686}
1545 1687
1546/* 1688/*
1547 * Maximal file size. There is a direct, and {,double-,triple-}indirect 1689 * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect
1548 * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. 1690 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
1549 * We need to be 1 filesystem block less than the 2^32 sector limit. 1691 * We need to be 1 filesystem block less than the 2^48 sector limit.
1550 */ 1692 */
1551static loff_t ext4_max_size(int bits) 1693static loff_t ext4_max_bitmap_size(int bits)
1552{ 1694{
1553 loff_t res = EXT4_NDIR_BLOCKS; 1695 loff_t res = EXT4_NDIR_BLOCKS;
1554 /* This constant is calculated to be the largest file size for a 1696 int meta_blocks;
1555 * dense, 4k-blocksize file such that the total number of 1697 loff_t upper_limit;
1698 /* This is calculated to be the largest file size for a
1699 * dense, bitmapped file such that the total number of
1556 * sectors in the file, including data and all indirect blocks, 1700 * sectors in the file, including data and all indirect blocks,
1557 * does not exceed 2^32. */ 1701 * does not exceed 2^48 - 1
1558 const loff_t upper_limit = 0x1ff7fffd000LL; 1702 * __u32 i_blocks_lo and __u16 i_blocks_high representing the
1703 * total number of 512 bytes blocks of the file
1704 */
1705
1706 if (sizeof(blkcnt_t) < sizeof(u64)) {
1707 /*
1709 * CONFIG_LSF not being enabled implies the inode
1710 * i_blocks represents total blocks in 512-byte units
1710 * 32 == size of vfs inode i_blocks * 8
1711 */
1712 upper_limit = (1LL << 32) - 1;
1713
1714 /* total blocks in file system block size */
1715 upper_limit >>= (bits - 9);
1716
1717 } else {
1718 /*
1719 * We use 48 bit ext4_inode i_blocks
1720 * With EXT4_HUGE_FILE_FL set the i_blocks
1721 * represent total number of blocks in
1722 * file system block size
1723 */
1724 upper_limit = (1LL << 48) - 1;
1725
1726 }
1727
1728 /* indirect blocks */
1729 meta_blocks = 1;
1730 /* double indirect blocks */
1731 meta_blocks += 1 + (1LL << (bits-2));
1732 /* tripple indirect blocks */
1733 meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
1734
1735 upper_limit -= meta_blocks;
1736 upper_limit <<= bits;
1559 1737
1560 res += 1LL << (bits-2); 1738 res += 1LL << (bits-2);
1561 res += 1LL << (2*(bits-2)); 1739 res += 1LL << (2*(bits-2));
@@ -1563,6 +1741,10 @@ static loff_t ext4_max_size(int bits)
1563 res <<= bits; 1741 res <<= bits;
1564 if (res > upper_limit) 1742 if (res > upper_limit)
1565 res = upper_limit; 1743 res = upper_limit;
1744
1745 if (res > MAX_LFS_FILESIZE)
1746 res = MAX_LFS_FILESIZE;
1747
1566 return res; 1748 return res;
1567} 1749}
1568 1750
@@ -1570,7 +1752,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
1570 ext4_fsblk_t logical_sb_block, int nr) 1752 ext4_fsblk_t logical_sb_block, int nr)
1571{ 1753{
1572 struct ext4_sb_info *sbi = EXT4_SB(sb); 1754 struct ext4_sb_info *sbi = EXT4_SB(sb);
1573 unsigned long bg, first_meta_bg; 1755 ext4_group_t bg, first_meta_bg;
1574 int has_super = 0; 1756 int has_super = 0;
1575 1757
1576 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); 1758 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
@@ -1584,8 +1766,39 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
1584 return (has_super + ext4_group_first_block_no(sb, bg)); 1766 return (has_super + ext4_group_first_block_no(sb, bg));
1585} 1767}
1586 1768
1769/**
1770 * ext4_get_stripe_size: Get the stripe size.
1771 * @sbi: In memory super block info
1772 *
1773 * If we have specified it via mount option, then
1774 * use the mount option value. If the value specified at mount time is
1775 * greater than the blocks per group use the super block value.
1776 * If the super block value is greater than blocks per group return 0.
1777 * Allocator needs it be less than blocks per group.
1778 *
1779 */
1780static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
1781{
1782 unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
1783 unsigned long stripe_width =
1784 le32_to_cpu(sbi->s_es->s_raid_stripe_width);
1785
1786 if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
1787 return sbi->s_stripe;
1788
1789 if (stripe_width <= sbi->s_blocks_per_group)
1790 return stripe_width;
1791
1792 if (stride <= sbi->s_blocks_per_group)
1793 return stride;
1794
1795 return 0;
1796}
1587 1797
1588static int ext4_fill_super (struct super_block *sb, void *data, int silent) 1798static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1799 __releases(kernel_sem)
1800 __acquires(kernel_sem)
1801
1589{ 1802{
1590 struct buffer_head * bh; 1803 struct buffer_head * bh;
1591 struct ext4_super_block *es = NULL; 1804 struct ext4_super_block *es = NULL;
@@ -1599,7 +1812,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1599 unsigned long def_mount_opts; 1812 unsigned long def_mount_opts;
1600 struct inode *root; 1813 struct inode *root;
1601 int blocksize; 1814 int blocksize;
1602 int hblock;
1603 int db_count; 1815 int db_count;
1604 int i; 1816 int i;
1605 int needs_recovery; 1817 int needs_recovery;
@@ -1624,6 +1836,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1624 goto out_fail; 1836 goto out_fail;
1625 } 1837 }
1626 1838
1839 if (!sb_set_blocksize(sb, blocksize)) {
1840 printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
1841 goto out_fail;
1842 }
1843
1627 /* 1844 /*
1628 * The ext4 superblock will not be buffer aligned for other than 1kB 1845 * The ext4 superblock will not be buffer aligned for other than 1kB
1629 * block sizes. We need to calculate the offset from buffer start. 1846 * block sizes. We need to calculate the offset from buffer start.
@@ -1674,10 +1891,10 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1674 1891
1675 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 1892 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
1676 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 1893 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
1677 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_RO) 1894 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
1678 set_opt(sbi->s_mount_opt, ERRORS_RO);
1679 else
1680 set_opt(sbi->s_mount_opt, ERRORS_CONT); 1895 set_opt(sbi->s_mount_opt, ERRORS_CONT);
1896 else
1897 set_opt(sbi->s_mount_opt, ERRORS_RO);
1681 1898
1682 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 1899 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1683 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 1900 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -1689,6 +1906,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1689 * User -o noextents to turn it off 1906 * User -o noextents to turn it off
1690 */ 1907 */
1691 set_opt(sbi->s_mount_opt, EXTENTS); 1908 set_opt(sbi->s_mount_opt, EXTENTS);
1909 /*
1910 * turn on mballoc feature by default in ext4 filesystem
1911 * User -o nomballoc to turn it off
1912 */
1913 set_opt(sbi->s_mount_opt, MBALLOC);
1692 1914
1693 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, 1915 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1694 NULL, 0)) 1916 NULL, 0))
@@ -1723,6 +1945,19 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1723 sb->s_id, le32_to_cpu(features)); 1945 sb->s_id, le32_to_cpu(features));
1724 goto failed_mount; 1946 goto failed_mount;
1725 } 1947 }
1948 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
1949 /*
1950 * Large file size enabled file system can only be
1951 * mounted if kernel is built with CONFIG_LSF
1952 */
1953 if (sizeof(root->i_blocks) < sizeof(u64) &&
1954 !(sb->s_flags & MS_RDONLY)) {
1955 printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
1956 "files cannot be mounted read-write "
1957 "without CONFIG_LSF.\n", sb->s_id);
1958 goto failed_mount;
1959 }
1960 }
1726 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 1961 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
1727 1962
1728 if (blocksize < EXT4_MIN_BLOCK_SIZE || 1963 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -1733,20 +1968,16 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1733 goto failed_mount; 1968 goto failed_mount;
1734 } 1969 }
1735 1970
1736 hblock = bdev_hardsect_size(sb->s_bdev);
1737 if (sb->s_blocksize != blocksize) { 1971 if (sb->s_blocksize != blocksize) {
1738 /* 1972
1739 * Make sure the blocksize for the filesystem is larger 1973 /* Validate the filesystem blocksize */
1740 * than the hardware sectorsize for the machine. 1974 if (!sb_set_blocksize(sb, blocksize)) {
1741 */ 1975 printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
1742 if (blocksize < hblock) { 1976 blocksize);
1743 printk(KERN_ERR "EXT4-fs: blocksize %d too small for "
1744 "device blocksize %d.\n", blocksize, hblock);
1745 goto failed_mount; 1977 goto failed_mount;
1746 } 1978 }
1747 1979
1748 brelse (bh); 1980 brelse (bh);
1749 sb_set_blocksize(sb, blocksize);
1750 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; 1981 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
1751 offset = do_div(logical_sb_block, blocksize); 1982 offset = do_div(logical_sb_block, blocksize);
1752 bh = sb_bread(sb, logical_sb_block); 1983 bh = sb_bread(sb, logical_sb_block);
@@ -1764,6 +1995,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1764 } 1995 }
1765 } 1996 }
1766 1997
1998 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
1767 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits); 1999 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
1768 2000
1769 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { 2001 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
@@ -1838,6 +2070,17 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1838 2070
1839 if (EXT4_BLOCKS_PER_GROUP(sb) == 0) 2071 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
1840 goto cantfind_ext4; 2072 goto cantfind_ext4;
2073
2074 /* ensure blocks_count calculation below doesn't sign-extend */
2075 if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
2076 le32_to_cpu(es->s_first_data_block) + 1) {
2077 printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
2078 "first data block %u, blocks per group %lu\n",
2079 ext4_blocks_count(es),
2080 le32_to_cpu(es->s_first_data_block),
2081 EXT4_BLOCKS_PER_GROUP(sb));
2082 goto failed_mount;
2083 }
1841 blocks_count = (ext4_blocks_count(es) - 2084 blocks_count = (ext4_blocks_count(es) -
1842 le32_to_cpu(es->s_first_data_block) + 2085 le32_to_cpu(es->s_first_data_block) +
1843 EXT4_BLOCKS_PER_GROUP(sb) - 1); 2086 EXT4_BLOCKS_PER_GROUP(sb) - 1);
@@ -1900,6 +2143,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1900 sbi->s_rsv_window_head.rsv_goal_size = 0; 2143 sbi->s_rsv_window_head.rsv_goal_size = 0;
1901 ext4_rsv_window_add(sb, &sbi->s_rsv_window_head); 2144 ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
1902 2145
2146 sbi->s_stripe = ext4_get_stripe_size(sbi);
2147
1903 /* 2148 /*
1904 * set up enough so that it can read an inode 2149 * set up enough so that it can read an inode
1905 */ 2150 */
@@ -1944,6 +2189,21 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1944 goto failed_mount4; 2189 goto failed_mount4;
1945 } 2190 }
1946 2191
2192 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
2193 jbd2_journal_set_features(sbi->s_journal,
2194 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2195 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2196 } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
2197 jbd2_journal_set_features(sbi->s_journal,
2198 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2199 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
2200 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2201 } else {
2202 jbd2_journal_clear_features(sbi->s_journal,
2203 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2204 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2205 }
2206
1947 /* We have now updated the journal if required, so we can 2207 /* We have now updated the journal if required, so we can
1948 * validate the data journaling mode. */ 2208 * validate the data journaling mode. */
1949 switch (test_opt(sb, DATA_FLAGS)) { 2209 switch (test_opt(sb, DATA_FLAGS)) {
@@ -2044,6 +2304,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
2044 "writeback"); 2304 "writeback");
2045 2305
2046 ext4_ext_init(sb); 2306 ext4_ext_init(sb);
2307 ext4_mb_init(sb, needs_recovery);
2047 2308
2048 lock_kernel(); 2309 lock_kernel();
2049 return 0; 2310 return 0;
@@ -2673,7 +2934,7 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
2673 if (test_opt(sb, MINIX_DF)) { 2934 if (test_opt(sb, MINIX_DF)) {
2674 sbi->s_overhead_last = 0; 2935 sbi->s_overhead_last = 0;
2675 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { 2936 } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
2676 unsigned long ngroups = sbi->s_groups_count, i; 2937 ext4_group_t ngroups = sbi->s_groups_count, i;
2677 ext4_fsblk_t overhead = 0; 2938 ext4_fsblk_t overhead = 0;
2678 smp_rmb(); 2939 smp_rmb();
2679 2940
@@ -2909,7 +3170,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
2909 size_t len, loff_t off) 3170 size_t len, loff_t off)
2910{ 3171{
2911 struct inode *inode = sb_dqopt(sb)->files[type]; 3172 struct inode *inode = sb_dqopt(sb)->files[type];
2912 sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 3173 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
2913 int err = 0; 3174 int err = 0;
2914 int offset = off & (sb->s_blocksize - 1); 3175 int offset = off & (sb->s_blocksize - 1);
2915 int tocopy; 3176 int tocopy;
@@ -2947,7 +3208,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
2947 const char *data, size_t len, loff_t off) 3208 const char *data, size_t len, loff_t off)
2948{ 3209{
2949 struct inode *inode = sb_dqopt(sb)->files[type]; 3210 struct inode *inode = sb_dqopt(sb)->files[type];
2950 sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 3211 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
2951 int err = 0; 3212 int err = 0;
2952 int offset = off & (sb->s_blocksize - 1); 3213 int offset = off & (sb->s_blocksize - 1);
2953 int tocopy; 3214 int tocopy;
@@ -3002,7 +3263,6 @@ out:
3002 i_size_write(inode, off+len-towrite); 3263 i_size_write(inode, off+len-towrite);
3003 EXT4_I(inode)->i_disksize = inode->i_size; 3264 EXT4_I(inode)->i_disksize = inode->i_size;
3004 } 3265 }
3005 inode->i_version++;
3006 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 3266 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
3007 ext4_mark_inode_dirty(handle, inode); 3267 ext4_mark_inode_dirty(handle, inode);
3008 mutex_unlock(&inode->i_mutex); 3268 mutex_unlock(&inode->i_mutex);
@@ -3027,9 +3287,15 @@ static struct file_system_type ext4dev_fs_type = {
3027 3287
3028static int __init init_ext4_fs(void) 3288static int __init init_ext4_fs(void)
3029{ 3289{
3030 int err = init_ext4_xattr(); 3290 int err;
3291
3292 err = init_ext4_mballoc();
3031 if (err) 3293 if (err)
3032 return err; 3294 return err;
3295
3296 err = init_ext4_xattr();
3297 if (err)
3298 goto out2;
3033 err = init_inodecache(); 3299 err = init_inodecache();
3034 if (err) 3300 if (err)
3035 goto out1; 3301 goto out1;
@@ -3041,6 +3307,8 @@ out:
3041 destroy_inodecache(); 3307 destroy_inodecache();
3042out1: 3308out1:
3043 exit_ext4_xattr(); 3309 exit_ext4_xattr();
3310out2:
3311 exit_ext4_mballoc();
3044 return err; 3312 return err;
3045} 3313}
3046 3314
@@ -3049,6 +3317,7 @@ static void __exit exit_ext4_fs(void)
3049 unregister_filesystem(&ext4dev_fs_type); 3317 unregister_filesystem(&ext4dev_fs_type);
3050 destroy_inodecache(); 3318 destroy_inodecache();
3051 exit_ext4_xattr(); 3319 exit_ext4_xattr();
3320 exit_ext4_mballoc();
3052} 3321}
3053 3322
3054MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3323MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 86387302c2a9..d7962139c010 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -480,7 +480,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
480 ea_bdebug(bh, "refcount now=0; freeing"); 480 ea_bdebug(bh, "refcount now=0; freeing");
481 if (ce) 481 if (ce)
482 mb_cache_entry_free(ce); 482 mb_cache_entry_free(ce);
483 ext4_free_blocks(handle, inode, bh->b_blocknr, 1); 483 ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
484 get_bh(bh); 484 get_bh(bh);
485 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 485 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
486 } else { 486 } else {
@@ -821,7 +821,7 @@ inserted:
821 new_bh = sb_getblk(sb, block); 821 new_bh = sb_getblk(sb, block);
822 if (!new_bh) { 822 if (!new_bh) {
823getblk_failed: 823getblk_failed:
824 ext4_free_blocks(handle, inode, block, 1); 824 ext4_free_blocks(handle, inode, block, 1, 1);
825 error = -EIO; 825 error = -EIO;
826 goto cleanup; 826 goto cleanup;
827 } 827 }
diff --git a/fs/inode.c b/fs/inode.c
index ed35383d0b6c..276ffd6b6fdd 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1276,6 +1276,11 @@ void file_update_time(struct file *file)
1276 sync_it = 1; 1276 sync_it = 1;
1277 } 1277 }
1278 1278
1279 if (IS_I_VERSION(inode)) {
1280 inode_inc_iversion(inode);
1281 sync_it = 1;
1282 }
1283
1279 if (sync_it) 1284 if (sync_it)
1280 mark_inode_dirty_sync(inode); 1285 mark_inode_dirty_sync(inode);
1281} 1286}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 3fccde7ba008..1b7f282c1ae9 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -232,7 +232,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
233 */ 233 */
234static int __process_buffer(journal_t *journal, struct journal_head *jh, 234static int __process_buffer(journal_t *journal, struct journal_head *jh,
235 struct buffer_head **bhs, int *batch_count) 235 struct buffer_head **bhs, int *batch_count,
236 transaction_t *transaction)
236{ 237{
237 struct buffer_head *bh = jh2bh(jh); 238 struct buffer_head *bh = jh2bh(jh);
238 int ret = 0; 239 int ret = 0;
@@ -250,6 +251,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
250 transaction_t *t = jh->b_transaction; 251 transaction_t *t = jh->b_transaction;
251 tid_t tid = t->t_tid; 252 tid_t tid = t->t_tid;
252 253
254 transaction->t_chp_stats.cs_forced_to_close++;
253 spin_unlock(&journal->j_list_lock); 255 spin_unlock(&journal->j_list_lock);
254 jbd_unlock_bh_state(bh); 256 jbd_unlock_bh_state(bh);
255 jbd2_log_start_commit(journal, tid); 257 jbd2_log_start_commit(journal, tid);
@@ -279,6 +281,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
279 bhs[*batch_count] = bh; 281 bhs[*batch_count] = bh;
280 __buffer_relink_io(jh); 282 __buffer_relink_io(jh);
281 jbd_unlock_bh_state(bh); 283 jbd_unlock_bh_state(bh);
284 transaction->t_chp_stats.cs_written++;
282 (*batch_count)++; 285 (*batch_count)++;
283 if (*batch_count == NR_BATCH) { 286 if (*batch_count == NR_BATCH) {
284 spin_unlock(&journal->j_list_lock); 287 spin_unlock(&journal->j_list_lock);
@@ -322,6 +325,8 @@ int jbd2_log_do_checkpoint(journal_t *journal)
322 if (!journal->j_checkpoint_transactions) 325 if (!journal->j_checkpoint_transactions)
323 goto out; 326 goto out;
324 transaction = journal->j_checkpoint_transactions; 327 transaction = journal->j_checkpoint_transactions;
328 if (transaction->t_chp_stats.cs_chp_time == 0)
329 transaction->t_chp_stats.cs_chp_time = jiffies;
325 this_tid = transaction->t_tid; 330 this_tid = transaction->t_tid;
326restart: 331restart:
327 /* 332 /*
@@ -346,7 +351,8 @@ restart:
346 retry = 1; 351 retry = 1;
347 break; 352 break;
348 } 353 }
349 retry = __process_buffer(journal, jh, bhs,&batch_count); 354 retry = __process_buffer(journal, jh, bhs, &batch_count,
355 transaction);
350 if (!retry && lock_need_resched(&journal->j_list_lock)){ 356 if (!retry && lock_need_resched(&journal->j_list_lock)){
351 spin_unlock(&journal->j_list_lock); 357 spin_unlock(&journal->j_list_lock);
352 retry = 1; 358 retry = 1;
@@ -602,15 +608,15 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
602 608
603 /* 609 /*
604 * There is one special case to worry about: if we have just pulled the 610 * There is one special case to worry about: if we have just pulled the
605 * buffer off a committing transaction's forget list, then even if the 611 * buffer off a running or committing transaction's checkpoing list,
606 * checkpoint list is empty, the transaction obviously cannot be 612 * then even if the checkpoint list is empty, the transaction obviously
607 * dropped! 613 * cannot be dropped!
608 * 614 *
609 * The locking here around j_committing_transaction is a bit sleazy. 615 * The locking here around t_state is a bit sleazy.
610 * See the comment at the end of jbd2_journal_commit_transaction(). 616 * See the comment at the end of jbd2_journal_commit_transaction().
611 */ 617 */
612 if (transaction == journal->j_committing_transaction) { 618 if (transaction->t_state != T_FINISHED) {
613 JBUFFER_TRACE(jh, "belongs to committing transaction"); 619 JBUFFER_TRACE(jh, "belongs to running/committing transaction");
614 goto out; 620 goto out;
615 } 621 }
616 622
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6986f334c643..da8d0eb3b7b9 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -20,6 +20,8 @@
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/jiffies.h>
24#include <linux/crc32.h>
23 25
24/* 26/*
25 * Default IO end handler for temporary BJ_IO buffer_heads. 27 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -92,19 +94,23 @@ static int inverted_lock(journal_t *journal, struct buffer_head *bh)
92 return 1; 94 return 1;
93} 95}
94 96
95/* Done it all: now write the commit record. We should have 97/*
98 * Done it all: now submit the commit record. We should have
96 * cleaned up our previous buffers by now, so if we are in abort 99 * cleaned up our previous buffers by now, so if we are in abort
97 * mode we can now just skip the rest of the journal write 100 * mode we can now just skip the rest of the journal write
98 * entirely. 101 * entirely.
99 * 102 *
100 * Returns 1 if the journal needs to be aborted or 0 on success 103 * Returns 1 if the journal needs to be aborted or 0 on success
101 */ 104 */
102static int journal_write_commit_record(journal_t *journal, 105static int journal_submit_commit_record(journal_t *journal,
103 transaction_t *commit_transaction) 106 transaction_t *commit_transaction,
107 struct buffer_head **cbh,
108 __u32 crc32_sum)
104{ 109{
105 struct journal_head *descriptor; 110 struct journal_head *descriptor;
111 struct commit_header *tmp;
106 struct buffer_head *bh; 112 struct buffer_head *bh;
107 int i, ret; 113 int ret;
108 int barrier_done = 0; 114 int barrier_done = 0;
109 115
110 if (is_journal_aborted(journal)) 116 if (is_journal_aborted(journal))
@@ -116,21 +122,33 @@ static int journal_write_commit_record(journal_t *journal,
116 122
117 bh = jh2bh(descriptor); 123 bh = jh2bh(descriptor);
118 124
119 /* AKPM: buglet - add `i' to tmp! */ 125 tmp = (struct commit_header *)bh->b_data;
120 for (i = 0; i < bh->b_size; i += 512) { 126 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
121 journal_header_t *tmp = (journal_header_t*)bh->b_data; 127 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
122 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 128 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
123 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); 129
124 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); 130 if (JBD2_HAS_COMPAT_FEATURE(journal,
131 JBD2_FEATURE_COMPAT_CHECKSUM)) {
132 tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
133 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
134 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
125 } 135 }
126 136
127 JBUFFER_TRACE(descriptor, "write commit block"); 137 JBUFFER_TRACE(descriptor, "submit commit block");
138 lock_buffer(bh);
139
128 set_buffer_dirty(bh); 140 set_buffer_dirty(bh);
129 if (journal->j_flags & JBD2_BARRIER) { 141 set_buffer_uptodate(bh);
142 bh->b_end_io = journal_end_buffer_io_sync;
143
144 if (journal->j_flags & JBD2_BARRIER &&
145 !JBD2_HAS_COMPAT_FEATURE(journal,
146 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
130 set_buffer_ordered(bh); 147 set_buffer_ordered(bh);
131 barrier_done = 1; 148 barrier_done = 1;
132 } 149 }
133 ret = sync_dirty_buffer(bh); 150 ret = submit_bh(WRITE, bh);
151
134 /* is it possible for another commit to fail at roughly 152 /* is it possible for another commit to fail at roughly
135 * the same time as this one? If so, we don't want to 153 * the same time as this one? If so, we don't want to
136 * trust the barrier flag in the super, but instead want 154 * trust the barrier flag in the super, but instead want
@@ -151,14 +169,72 @@ static int journal_write_commit_record(journal_t *journal,
151 clear_buffer_ordered(bh); 169 clear_buffer_ordered(bh);
152 set_buffer_uptodate(bh); 170 set_buffer_uptodate(bh);
153 set_buffer_dirty(bh); 171 set_buffer_dirty(bh);
154 ret = sync_dirty_buffer(bh); 172 ret = submit_bh(WRITE, bh);
155 } 173 }
156 put_bh(bh); /* One for getblk() */ 174 *cbh = bh;
157 jbd2_journal_put_journal_head(descriptor); 175 return ret;
176}
177
178/*
179 * This function along with journal_submit_commit_record
180 * allows to write the commit record asynchronously.
181 */
182static int journal_wait_on_commit_record(struct buffer_head *bh)
183{
184 int ret = 0;
185
186 clear_buffer_dirty(bh);
187 wait_on_buffer(bh);
188
189 if (unlikely(!buffer_uptodate(bh)))
190 ret = -EIO;
191 put_bh(bh); /* One for getblk() */
192 jbd2_journal_put_journal_head(bh2jh(bh));
158 193
159 return (ret == -EIO); 194 return ret;
160} 195}
161 196
197/*
198 * Wait for all submitted IO to complete.
199 */
200static int journal_wait_on_locked_list(journal_t *journal,
201 transaction_t *commit_transaction)
202{
203 int ret = 0;
204 struct journal_head *jh;
205
206 while (commit_transaction->t_locked_list) {
207 struct buffer_head *bh;
208
209 jh = commit_transaction->t_locked_list->b_tprev;
210 bh = jh2bh(jh);
211 get_bh(bh);
212 if (buffer_locked(bh)) {
213 spin_unlock(&journal->j_list_lock);
214 wait_on_buffer(bh);
215 if (unlikely(!buffer_uptodate(bh)))
216 ret = -EIO;
217 spin_lock(&journal->j_list_lock);
218 }
219 if (!inverted_lock(journal, bh)) {
220 put_bh(bh);
221 spin_lock(&journal->j_list_lock);
222 continue;
223 }
224 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
225 __jbd2_journal_unfile_buffer(jh);
226 jbd_unlock_bh_state(bh);
227 jbd2_journal_remove_journal_head(bh);
228 put_bh(bh);
229 } else {
230 jbd_unlock_bh_state(bh);
231 }
232 put_bh(bh);
233 cond_resched_lock(&journal->j_list_lock);
234 }
235 return ret;
236 }
237
162static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) 238static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
163{ 239{
164 int i; 240 int i;
@@ -274,7 +350,21 @@ write_out_data:
274 journal_do_submit_data(wbuf, bufs); 350 journal_do_submit_data(wbuf, bufs);
275} 351}
276 352
277static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag, 353static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
354{
355 struct page *page = bh->b_page;
356 char *addr;
357 __u32 checksum;
358
359 addr = kmap_atomic(page, KM_USER0);
360 checksum = crc32_be(crc32_sum,
361 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
362 kunmap_atomic(addr, KM_USER0);
363
364 return checksum;
365}
366
367static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
278 unsigned long long block) 368 unsigned long long block)
279{ 369{
280 tag->t_blocknr = cpu_to_be32(block & (u32)~0); 370 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
@@ -290,6 +380,7 @@ static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
290 */ 380 */
291void jbd2_journal_commit_transaction(journal_t *journal) 381void jbd2_journal_commit_transaction(journal_t *journal)
292{ 382{
383 struct transaction_stats_s stats;
293 transaction_t *commit_transaction; 384 transaction_t *commit_transaction;
294 struct journal_head *jh, *new_jh, *descriptor; 385 struct journal_head *jh, *new_jh, *descriptor;
295 struct buffer_head **wbuf = journal->j_wbuf; 386 struct buffer_head **wbuf = journal->j_wbuf;
@@ -305,6 +396,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
305 int tag_flag; 396 int tag_flag;
306 int i; 397 int i;
307 int tag_bytes = journal_tag_bytes(journal); 398 int tag_bytes = journal_tag_bytes(journal);
399 struct buffer_head *cbh = NULL; /* For transactional checksums */
400 __u32 crc32_sum = ~0;
308 401
309 /* 402 /*
310 * First job: lock down the current transaction and wait for 403 * First job: lock down the current transaction and wait for
@@ -337,6 +430,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
337 spin_lock(&journal->j_state_lock); 430 spin_lock(&journal->j_state_lock);
338 commit_transaction->t_state = T_LOCKED; 431 commit_transaction->t_state = T_LOCKED;
339 432
433 stats.u.run.rs_wait = commit_transaction->t_max_wait;
434 stats.u.run.rs_locked = jiffies;
435 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
436 stats.u.run.rs_locked);
437
340 spin_lock(&commit_transaction->t_handle_lock); 438 spin_lock(&commit_transaction->t_handle_lock);
341 while (commit_transaction->t_updates) { 439 while (commit_transaction->t_updates) {
342 DEFINE_WAIT(wait); 440 DEFINE_WAIT(wait);
@@ -407,6 +505,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
407 */ 505 */
408 jbd2_journal_switch_revoke_table(journal); 506 jbd2_journal_switch_revoke_table(journal);
409 507
508 stats.u.run.rs_flushing = jiffies;
509 stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
510 stats.u.run.rs_flushing);
511
410 commit_transaction->t_state = T_FLUSH; 512 commit_transaction->t_state = T_FLUSH;
411 journal->j_committing_transaction = commit_transaction; 513 journal->j_committing_transaction = commit_transaction;
412 journal->j_running_transaction = NULL; 514 journal->j_running_transaction = NULL;
@@ -440,38 +542,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
440 journal_submit_data_buffers(journal, commit_transaction); 542 journal_submit_data_buffers(journal, commit_transaction);
441 543
442 /* 544 /*
443 * Wait for all previously submitted IO to complete. 545 * Wait for all previously submitted IO to complete if commit
546 * record is to be written synchronously.
444 */ 547 */
445 spin_lock(&journal->j_list_lock); 548 spin_lock(&journal->j_list_lock);
446 while (commit_transaction->t_locked_list) { 549 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
447 struct buffer_head *bh; 550 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
551 err = journal_wait_on_locked_list(journal,
552 commit_transaction);
448 553
449 jh = commit_transaction->t_locked_list->b_tprev;
450 bh = jh2bh(jh);
451 get_bh(bh);
452 if (buffer_locked(bh)) {
453 spin_unlock(&journal->j_list_lock);
454 wait_on_buffer(bh);
455 if (unlikely(!buffer_uptodate(bh)))
456 err = -EIO;
457 spin_lock(&journal->j_list_lock);
458 }
459 if (!inverted_lock(journal, bh)) {
460 put_bh(bh);
461 spin_lock(&journal->j_list_lock);
462 continue;
463 }
464 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
465 __jbd2_journal_unfile_buffer(jh);
466 jbd_unlock_bh_state(bh);
467 jbd2_journal_remove_journal_head(bh);
468 put_bh(bh);
469 } else {
470 jbd_unlock_bh_state(bh);
471 }
472 put_bh(bh);
473 cond_resched_lock(&journal->j_list_lock);
474 }
475 spin_unlock(&journal->j_list_lock); 554 spin_unlock(&journal->j_list_lock);
476 555
477 if (err) 556 if (err)
@@ -498,6 +577,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
498 */ 577 */
499 commit_transaction->t_state = T_COMMIT; 578 commit_transaction->t_state = T_COMMIT;
500 579
580 stats.u.run.rs_logging = jiffies;
581 stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
582 stats.u.run.rs_logging);
583 stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
584 stats.u.run.rs_blocks_logged = 0;
585
501 descriptor = NULL; 586 descriptor = NULL;
502 bufs = 0; 587 bufs = 0;
503 while (commit_transaction->t_buffers) { 588 while (commit_transaction->t_buffers) {
@@ -639,6 +724,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
639start_journal_io: 724start_journal_io:
640 for (i = 0; i < bufs; i++) { 725 for (i = 0; i < bufs; i++) {
641 struct buffer_head *bh = wbuf[i]; 726 struct buffer_head *bh = wbuf[i];
727 /*
728 * Compute checksum.
729 */
730 if (JBD2_HAS_COMPAT_FEATURE(journal,
731 JBD2_FEATURE_COMPAT_CHECKSUM)) {
732 crc32_sum =
733 jbd2_checksum_data(crc32_sum, bh);
734 }
735
642 lock_buffer(bh); 736 lock_buffer(bh);
643 clear_buffer_dirty(bh); 737 clear_buffer_dirty(bh);
644 set_buffer_uptodate(bh); 738 set_buffer_uptodate(bh);
@@ -646,6 +740,7 @@ start_journal_io:
646 submit_bh(WRITE, bh); 740 submit_bh(WRITE, bh);
647 } 741 }
648 cond_resched(); 742 cond_resched();
743 stats.u.run.rs_blocks_logged += bufs;
649 744
650 /* Force a new descriptor to be generated next 745 /* Force a new descriptor to be generated next
651 time round the loop. */ 746 time round the loop. */
@@ -654,6 +749,23 @@ start_journal_io:
654 } 749 }
655 } 750 }
656 751
752 /* Done it all: now write the commit record asynchronously. */
753
754 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
755 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
756 err = journal_submit_commit_record(journal, commit_transaction,
757 &cbh, crc32_sum);
758 if (err)
759 __jbd2_journal_abort_hard(journal);
760
761 spin_lock(&journal->j_list_lock);
762 err = journal_wait_on_locked_list(journal,
763 commit_transaction);
764 spin_unlock(&journal->j_list_lock);
765 if (err)
766 __jbd2_journal_abort_hard(journal);
767 }
768
657 /* Lo and behold: we have just managed to send a transaction to 769 /* Lo and behold: we have just managed to send a transaction to
658 the log. Before we can commit it, wait for the IO so far to 770 the log. Before we can commit it, wait for the IO so far to
659 complete. Control buffers being written are on the 771 complete. Control buffers being written are on the
@@ -753,8 +865,14 @@ wait_for_iobuf:
753 865
754 jbd_debug(3, "JBD: commit phase 6\n"); 866 jbd_debug(3, "JBD: commit phase 6\n");
755 867
756 if (journal_write_commit_record(journal, commit_transaction)) 868 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
757 err = -EIO; 869 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
870 err = journal_submit_commit_record(journal, commit_transaction,
871 &cbh, crc32_sum);
872 if (err)
873 __jbd2_journal_abort_hard(journal);
874 }
875 err = journal_wait_on_commit_record(cbh);
758 876
759 if (err) 877 if (err)
760 jbd2_journal_abort(journal, err); 878 jbd2_journal_abort(journal, err);
@@ -816,6 +934,7 @@ restart_loop:
816 cp_transaction = jh->b_cp_transaction; 934 cp_transaction = jh->b_cp_transaction;
817 if (cp_transaction) { 935 if (cp_transaction) {
818 JBUFFER_TRACE(jh, "remove from old cp transaction"); 936 JBUFFER_TRACE(jh, "remove from old cp transaction");
937 cp_transaction->t_chp_stats.cs_dropped++;
819 __jbd2_journal_remove_checkpoint(jh); 938 __jbd2_journal_remove_checkpoint(jh);
820 } 939 }
821 940
@@ -867,10 +986,10 @@ restart_loop:
867 } 986 }
868 spin_unlock(&journal->j_list_lock); 987 spin_unlock(&journal->j_list_lock);
869 /* 988 /*
870 * This is a bit sleazy. We borrow j_list_lock to protect 989 * This is a bit sleazy. We use j_list_lock to protect transition
871 * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint. 990 * of a transaction into T_FINISHED state and calling
872 * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but 991 * __jbd2_journal_drop_transaction(). Otherwise we could race with
873 * it's a bit hassle to hold that across __jbd2_journal_remove_checkpoint 992 * other checkpointing code processing the transaction...
874 */ 993 */
875 spin_lock(&journal->j_state_lock); 994 spin_lock(&journal->j_state_lock);
876 spin_lock(&journal->j_list_lock); 995 spin_lock(&journal->j_list_lock);
@@ -890,6 +1009,36 @@ restart_loop:
890 1009
891 J_ASSERT(commit_transaction->t_state == T_COMMIT); 1010 J_ASSERT(commit_transaction->t_state == T_COMMIT);
892 1011
1012 commit_transaction->t_start = jiffies;
1013 stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
1014 commit_transaction->t_start);
1015
1016 /*
1017 * File the transaction for history
1018 */
1019 stats.ts_type = JBD2_STATS_RUN;
1020 stats.ts_tid = commit_transaction->t_tid;
1021 stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
1022 spin_lock(&journal->j_history_lock);
1023 memcpy(journal->j_history + journal->j_history_cur, &stats,
1024 sizeof(stats));
1025 if (++journal->j_history_cur == journal->j_history_max)
1026 journal->j_history_cur = 0;
1027
1028 /*
1029 * Calculate overall stats
1030 */
1031 journal->j_stats.ts_tid++;
1032 journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
1033 journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
1034 journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
1035 journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
1036 journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
1037 journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
1038 journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
1039 journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
1040 spin_unlock(&journal->j_history_lock);
1041
893 commit_transaction->t_state = T_FINISHED; 1042 commit_transaction->t_state = T_FINISHED;
894 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1043 J_ASSERT(commit_transaction == journal->j_committing_transaction);
895 journal->j_commit_sequence = commit_transaction->t_tid; 1044 journal->j_commit_sequence = commit_transaction->t_tid;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 6ddc5531587c..96ba846992e9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -36,6 +36,7 @@
36#include <linux/poison.h> 36#include <linux/poison.h>
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/seq_file.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41#include <asm/page.h> 42#include <asm/page.h>
@@ -640,6 +641,312 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
640 return jbd2_journal_add_journal_head(bh); 641 return jbd2_journal_add_journal_head(bh);
641} 642}
642 643
644struct jbd2_stats_proc_session {
645 journal_t *journal;
646 struct transaction_stats_s *stats;
647 int start;
648 int max;
649};
650
651static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
652 struct transaction_stats_s *ts,
653 int first)
654{
655 if (ts == s->stats + s->max)
656 ts = s->stats;
657 if (!first && ts == s->stats + s->start)
658 return NULL;
659 while (ts->ts_type == 0) {
660 ts++;
661 if (ts == s->stats + s->max)
662 ts = s->stats;
663 if (ts == s->stats + s->start)
664 return NULL;
665 }
666 return ts;
667
668}
669
670static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
671{
672 struct jbd2_stats_proc_session *s = seq->private;
673 struct transaction_stats_s *ts;
674 int l = *pos;
675
676 if (l == 0)
677 return SEQ_START_TOKEN;
678 ts = jbd2_history_skip_empty(s, s->stats + s->start, 1);
679 if (!ts)
680 return NULL;
681 l--;
682 while (l) {
683 ts = jbd2_history_skip_empty(s, ++ts, 0);
684 if (!ts)
685 break;
686 l--;
687 }
688 return ts;
689}
690
691static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
692{
693 struct jbd2_stats_proc_session *s = seq->private;
694 struct transaction_stats_s *ts = v;
695
696 ++*pos;
697 if (v == SEQ_START_TOKEN)
698 return jbd2_history_skip_empty(s, s->stats + s->start, 1);
699 else
700 return jbd2_history_skip_empty(s, ++ts, 0);
701}
702
703static int jbd2_seq_history_show(struct seq_file *seq, void *v)
704{
705 struct transaction_stats_s *ts = v;
706 if (v == SEQ_START_TOKEN) {
707 seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
708 "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
709 "wait", "run", "lock", "flush", "log", "hndls",
710 "block", "inlog", "ctime", "write", "drop",
711 "close");
712 return 0;
713 }
714 if (ts->ts_type == JBD2_STATS_RUN)
715 seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u "
716 "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
717 jiffies_to_msecs(ts->u.run.rs_wait),
718 jiffies_to_msecs(ts->u.run.rs_running),
719 jiffies_to_msecs(ts->u.run.rs_locked),
720 jiffies_to_msecs(ts->u.run.rs_flushing),
721 jiffies_to_msecs(ts->u.run.rs_logging),
722 ts->u.run.rs_handle_count,
723 ts->u.run.rs_blocks,
724 ts->u.run.rs_blocks_logged);
725 else if (ts->ts_type == JBD2_STATS_CHECKPOINT)
726 seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n",
727 "C", ts->ts_tid, " ",
728 jiffies_to_msecs(ts->u.chp.cs_chp_time),
729 ts->u.chp.cs_written, ts->u.chp.cs_dropped,
730 ts->u.chp.cs_forced_to_close);
731 else
732 J_ASSERT(0);
733 return 0;
734}
735
/*
 * seq_file ->stop for the "history" file.  The iterator only walks the
 * session's private copy and holds nothing across calls, so there is
 * nothing to release here.
 */
static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
{
}
739
740static struct seq_operations jbd2_seq_history_ops = {
741 .start = jbd2_seq_history_start,
742 .next = jbd2_seq_history_next,
743 .stop = jbd2_seq_history_stop,
744 .show = jbd2_seq_history_show,
745};
746
747static int jbd2_seq_history_open(struct inode *inode, struct file *file)
748{
749 journal_t *journal = PDE(inode)->data;
750 struct jbd2_stats_proc_session *s;
751 int rc, size;
752
753 s = kmalloc(sizeof(*s), GFP_KERNEL);
754 if (s == NULL)
755 return -ENOMEM;
756 size = sizeof(struct transaction_stats_s) * journal->j_history_max;
757 s->stats = kmalloc(size, GFP_KERNEL);
758 if (s->stats == NULL) {
759 kfree(s);
760 return -ENOMEM;
761 }
762 spin_lock(&journal->j_history_lock);
763 memcpy(s->stats, journal->j_history, size);
764 s->max = journal->j_history_max;
765 s->start = journal->j_history_cur % s->max;
766 spin_unlock(&journal->j_history_lock);
767
768 rc = seq_open(file, &jbd2_seq_history_ops);
769 if (rc == 0) {
770 struct seq_file *m = file->private_data;
771 m->private = s;
772 } else {
773 kfree(s->stats);
774 kfree(s);
775 }
776 return rc;
777
778}
779
780static int jbd2_seq_history_release(struct inode *inode, struct file *file)
781{
782 struct seq_file *seq = file->private_data;
783 struct jbd2_stats_proc_session *s = seq->private;
784
785 kfree(s->stats);
786 kfree(s);
787 return seq_release(inode, file);
788}
789
790static struct file_operations jbd2_seq_history_fops = {
791 .owner = THIS_MODULE,
792 .open = jbd2_seq_history_open,
793 .read = seq_read,
794 .llseek = seq_lseek,
795 .release = jbd2_seq_history_release,
796};
797
798static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
799{
800 return *pos ? NULL : SEQ_START_TOKEN;
801}
802
803static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
804{
805 return NULL;
806}
807
808static int jbd2_seq_info_show(struct seq_file *seq, void *v)
809{
810 struct jbd2_stats_proc_session *s = seq->private;
811
812 if (v != SEQ_START_TOKEN)
813 return 0;
814 seq_printf(seq, "%lu transaction, each upto %u blocks\n",
815 s->stats->ts_tid,
816 s->journal->j_max_transaction_buffers);
817 if (s->stats->ts_tid == 0)
818 return 0;
819 seq_printf(seq, "average: \n %ums waiting for transaction\n",
820 jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid));
821 seq_printf(seq, " %ums running transaction\n",
822 jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid));
823 seq_printf(seq, " %ums transaction was being locked\n",
824 jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid));
825 seq_printf(seq, " %ums flushing data (in ordered mode)\n",
826 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
827 seq_printf(seq, " %ums logging transaction\n",
828 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
829 seq_printf(seq, " %lu handles per transaction\n",
830 s->stats->u.run.rs_handle_count / s->stats->ts_tid);
831 seq_printf(seq, " %lu blocks per transaction\n",
832 s->stats->u.run.rs_blocks / s->stats->ts_tid);
833 seq_printf(seq, " %lu logged blocks per transaction\n",
834 s->stats->u.run.rs_blocks_logged / s->stats->ts_tid);
835 return 0;
836}
837
/*
 * seq_file ->stop for the "info" file.  ->start hands out a constant token
 * and takes no locks, so there is nothing to undo.
 */
static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
{
}
841
842static struct seq_operations jbd2_seq_info_ops = {
843 .start = jbd2_seq_info_start,
844 .next = jbd2_seq_info_next,
845 .stop = jbd2_seq_info_stop,
846 .show = jbd2_seq_info_show,
847};
848
849static int jbd2_seq_info_open(struct inode *inode, struct file *file)
850{
851 journal_t *journal = PDE(inode)->data;
852 struct jbd2_stats_proc_session *s;
853 int rc, size;
854
855 s = kmalloc(sizeof(*s), GFP_KERNEL);
856 if (s == NULL)
857 return -ENOMEM;
858 size = sizeof(struct transaction_stats_s);
859 s->stats = kmalloc(size, GFP_KERNEL);
860 if (s->stats == NULL) {
861 kfree(s);
862 return -ENOMEM;
863 }
864 spin_lock(&journal->j_history_lock);
865 memcpy(s->stats, &journal->j_stats, size);
866 s->journal = journal;
867 spin_unlock(&journal->j_history_lock);
868
869 rc = seq_open(file, &jbd2_seq_info_ops);
870 if (rc == 0) {
871 struct seq_file *m = file->private_data;
872 m->private = s;
873 } else {
874 kfree(s->stats);
875 kfree(s);
876 }
877 return rc;
878
879}
880
881static int jbd2_seq_info_release(struct inode *inode, struct file *file)
882{
883 struct seq_file *seq = file->private_data;
884 struct jbd2_stats_proc_session *s = seq->private;
885 kfree(s->stats);
886 kfree(s);
887 return seq_release(inode, file);
888}
889
890static struct file_operations jbd2_seq_info_fops = {
891 .owner = THIS_MODULE,
892 .open = jbd2_seq_info_open,
893 .read = seq_read,
894 .llseek = seq_lseek,
895 .release = jbd2_seq_info_release,
896};
897
/*
 * Parent proc directory under which every journal gets its own statistics
 * subdirectory.  Remains NULL until the module-level directory is created.
 */
static struct proc_dir_entry *proc_jbd2_stats;
899
900static void jbd2_stats_proc_init(journal_t *journal)
901{
902 char name[BDEVNAME_SIZE];
903
904 snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name));
905 journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
906 if (journal->j_proc_entry) {
907 struct proc_dir_entry *p;
908 p = create_proc_entry("history", S_IRUGO,
909 journal->j_proc_entry);
910 if (p) {
911 p->proc_fops = &jbd2_seq_history_fops;
912 p->data = journal;
913 p = create_proc_entry("info", S_IRUGO,
914 journal->j_proc_entry);
915 if (p) {
916 p->proc_fops = &jbd2_seq_info_fops;
917 p->data = journal;
918 }
919 }
920 }
921}
922
923static void jbd2_stats_proc_exit(journal_t *journal)
924{
925 char name[BDEVNAME_SIZE];
926
927 snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name));
928 remove_proc_entry("info", journal->j_proc_entry);
929 remove_proc_entry("history", journal->j_proc_entry);
930 remove_proc_entry(name, proc_jbd2_stats);
931}
932
933static void journal_init_stats(journal_t *journal)
934{
935 int size;
936
937 if (!proc_jbd2_stats)
938 return;
939
940 journal->j_history_max = 100;
941 size = sizeof(struct transaction_stats_s) * journal->j_history_max;
942 journal->j_history = kzalloc(size, GFP_KERNEL);
943 if (!journal->j_history) {
944 journal->j_history_max = 0;
945 return;
946 }
947 spin_lock_init(&journal->j_history_lock);
948}
949
643/* 950/*
644 * Management for journal control blocks: functions to create and 951 * Management for journal control blocks: functions to create and
645 * destroy journal_t structures, and to initialise and read existing 952 * destroy journal_t structures, and to initialise and read existing
@@ -681,6 +988,9 @@ static journal_t * journal_init_common (void)
681 kfree(journal); 988 kfree(journal);
682 goto fail; 989 goto fail;
683 } 990 }
991
992 journal_init_stats(journal);
993
684 return journal; 994 return journal;
685fail: 995fail:
686 return NULL; 996 return NULL;
@@ -735,6 +1045,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
735 journal->j_fs_dev = fs_dev; 1045 journal->j_fs_dev = fs_dev;
736 journal->j_blk_offset = start; 1046 journal->j_blk_offset = start;
737 journal->j_maxlen = len; 1047 journal->j_maxlen = len;
1048 jbd2_stats_proc_init(journal);
738 1049
739 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 1050 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
740 J_ASSERT(bh != NULL); 1051 J_ASSERT(bh != NULL);
@@ -773,6 +1084,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
773 1084
774 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; 1085 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
775 journal->j_blocksize = inode->i_sb->s_blocksize; 1086 journal->j_blocksize = inode->i_sb->s_blocksize;
1087 jbd2_stats_proc_init(journal);
776 1088
777 /* journal descriptor can store up to n blocks -bzzz */ 1089 /* journal descriptor can store up to n blocks -bzzz */
778 n = journal->j_blocksize / sizeof(journal_block_tag_t); 1090 n = journal->j_blocksize / sizeof(journal_block_tag_t);
@@ -1153,6 +1465,8 @@ void jbd2_journal_destroy(journal_t *journal)
1153 brelse(journal->j_sb_buffer); 1465 brelse(journal->j_sb_buffer);
1154 } 1466 }
1155 1467
1468 if (journal->j_proc_entry)
1469 jbd2_stats_proc_exit(journal);
1156 if (journal->j_inode) 1470 if (journal->j_inode)
1157 iput(journal->j_inode); 1471 iput(journal->j_inode);
1158 if (journal->j_revoke) 1472 if (journal->j_revoke)
@@ -1264,6 +1578,32 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1264 return 1; 1578 return 1;
1265} 1579}
1266 1580
1581/*
1582 * jbd2_journal_clear_features () - Clear a given journal feature in the
1583 * superblock
1584 * @journal: Journal to act on.
1585 * @compat: bitmask of compatible features
1586 * @ro: bitmask of features that force read-only mount
1587 * @incompat: bitmask of incompatible features
1588 *
1589 * Clear a given journal feature as present on the
1590 * superblock.
1591 */
1592void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
1593 unsigned long ro, unsigned long incompat)
1594{
1595 journal_superblock_t *sb;
1596
1597 jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
1598 compat, ro, incompat);
1599
1600 sb = journal->j_superblock;
1601
1602 sb->s_feature_compat &= ~cpu_to_be32(compat);
1603 sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
1604 sb->s_feature_incompat &= ~cpu_to_be32(incompat);
1605}
1606EXPORT_SYMBOL(jbd2_journal_clear_features);
1267 1607
1268/** 1608/**
1269 * int jbd2_journal_update_format () - Update on-disk journal structure. 1609 * int jbd2_journal_update_format () - Update on-disk journal structure.
@@ -1633,7 +1973,7 @@ static int journal_init_jbd2_journal_head_cache(void)
1633 jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", 1973 jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
1634 sizeof(struct journal_head), 1974 sizeof(struct journal_head),
1635 0, /* offset */ 1975 0, /* offset */
1636 0, /* flags */ 1976 SLAB_TEMPORARY, /* flags */
1637 NULL); /* ctor */ 1977 NULL); /* ctor */
1638 retval = 0; 1978 retval = 0;
1639 if (jbd2_journal_head_cache == 0) { 1979 if (jbd2_journal_head_cache == 0) {
@@ -1900,6 +2240,28 @@ static void __exit jbd2_remove_debugfs_entry(void)
1900 2240
1901#endif 2241#endif
1902 2242
#ifdef CONFIG_PROC_FS

#define JBD2_STATS_PROC_NAME "fs/jbd2"

/*
 * Create the top-level jbd2 statistics directory in procfs.  On failure
 * proc_jbd2_stats simply stays NULL.
 */
static void __init jbd2_create_jbd_stats_proc_entry(void)
{
	proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL);
}

/* Remove the top-level statistics directory if it was ever created. */
static void __exit jbd2_remove_jbd_stats_proc_entry(void)
{
	if (!proc_jbd2_stats)
		return;
	remove_proc_entry(JBD2_STATS_PROC_NAME, NULL);
}

#else

/* Without procfs both helpers compile away to nothing. */
#define jbd2_create_jbd_stats_proc_entry() do {} while (0)
#define jbd2_remove_jbd_stats_proc_entry() do {} while (0)

#endif
2264
1903struct kmem_cache *jbd2_handle_cache; 2265struct kmem_cache *jbd2_handle_cache;
1904 2266
1905static int __init journal_init_handle_cache(void) 2267static int __init journal_init_handle_cache(void)
@@ -1907,7 +2269,7 @@ static int __init journal_init_handle_cache(void)
1907 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", 2269 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle",
1908 sizeof(handle_t), 2270 sizeof(handle_t),
1909 0, /* offset */ 2271 0, /* offset */
1910 0, /* flags */ 2272 SLAB_TEMPORARY, /* flags */
1911 NULL); /* ctor */ 2273 NULL); /* ctor */
1912 if (jbd2_handle_cache == NULL) { 2274 if (jbd2_handle_cache == NULL) {
1913 printk(KERN_EMERG "JBD: failed to create handle cache\n"); 2275 printk(KERN_EMERG "JBD: failed to create handle cache\n");
@@ -1955,6 +2317,7 @@ static int __init journal_init(void)
1955 if (ret != 0) 2317 if (ret != 0)
1956 jbd2_journal_destroy_caches(); 2318 jbd2_journal_destroy_caches();
1957 jbd2_create_debugfs_entry(); 2319 jbd2_create_debugfs_entry();
2320 jbd2_create_jbd_stats_proc_entry();
1958 return ret; 2321 return ret;
1959} 2322}
1960 2323
@@ -1966,6 +2329,7 @@ static void __exit journal_exit(void)
1966 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); 2329 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
1967#endif 2330#endif
1968 jbd2_remove_debugfs_entry(); 2331 jbd2_remove_debugfs_entry();
2332 jbd2_remove_jbd_stats_proc_entry();
1969 jbd2_journal_destroy_caches(); 2333 jbd2_journal_destroy_caches();
1970} 2334}
1971 2335
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index d0ce627539ef..921680663fa2 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -21,6 +21,7 @@
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/crc32.h>
24#endif 25#endif
25 26
26/* 27/*
@@ -316,6 +317,37 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag
316 return block; 317 return block;
317} 318}
318 319
320/*
321 * calc_chksums calculates the checksums for the blocks described in the
322 * descriptor block.
323 */
324static int calc_chksums(journal_t *journal, struct buffer_head *bh,
325 unsigned long *next_log_block, __u32 *crc32_sum)
326{
327 int i, num_blks, err;
328 unsigned long io_block;
329 struct buffer_head *obh;
330
331 num_blks = count_tags(journal, bh);
332 /* Calculate checksum of the descriptor block. */
333 *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
334
335 for (i = 0; i < num_blks; i++) {
336 io_block = (*next_log_block)++;
337 wrap(journal, *next_log_block);
338 err = jread(&obh, journal, io_block);
339 if (err) {
340 printk(KERN_ERR "JBD: IO error %d recovering block "
341 "%lu in log\n", err, io_block);
342 return 1;
343 } else {
344 *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
345 obh->b_size);
346 }
347 }
348 return 0;
349}
350
319static int do_one_pass(journal_t *journal, 351static int do_one_pass(journal_t *journal,
320 struct recovery_info *info, enum passtype pass) 352 struct recovery_info *info, enum passtype pass)
321{ 353{
@@ -328,6 +360,7 @@ static int do_one_pass(journal_t *journal,
328 unsigned int sequence; 360 unsigned int sequence;
329 int blocktype; 361 int blocktype;
330 int tag_bytes = journal_tag_bytes(journal); 362 int tag_bytes = journal_tag_bytes(journal);
363 __u32 crc32_sum = ~0; /* Transactional Checksums */
331 364
332 /* Precompute the maximum metadata descriptors in a descriptor block */ 365 /* Precompute the maximum metadata descriptors in a descriptor block */
333 int MAX_BLOCKS_PER_DESC; 366 int MAX_BLOCKS_PER_DESC;
@@ -419,12 +452,26 @@ static int do_one_pass(journal_t *journal,
419 switch(blocktype) { 452 switch(blocktype) {
420 case JBD2_DESCRIPTOR_BLOCK: 453 case JBD2_DESCRIPTOR_BLOCK:
421 /* If it is a valid descriptor block, replay it 454 /* If it is a valid descriptor block, replay it
422 * in pass REPLAY; otherwise, just skip over the 455 * in pass REPLAY; if journal_checksums enabled, then
423 * blocks it describes. */ 456 * calculate checksums in PASS_SCAN, otherwise,
457 * just skip over the blocks it describes. */
424 if (pass != PASS_REPLAY) { 458 if (pass != PASS_REPLAY) {
459 if (pass == PASS_SCAN &&
460 JBD2_HAS_COMPAT_FEATURE(journal,
461 JBD2_FEATURE_COMPAT_CHECKSUM) &&
462 !info->end_transaction) {
463 if (calc_chksums(journal, bh,
464 &next_log_block,
465 &crc32_sum)) {
466 put_bh(bh);
467 break;
468 }
469 put_bh(bh);
470 continue;
471 }
425 next_log_block += count_tags(journal, bh); 472 next_log_block += count_tags(journal, bh);
426 wrap(journal, next_log_block); 473 wrap(journal, next_log_block);
427 brelse(bh); 474 put_bh(bh);
428 continue; 475 continue;
429 } 476 }
430 477
@@ -516,9 +563,96 @@ static int do_one_pass(journal_t *journal,
516 continue; 563 continue;
517 564
518 case JBD2_COMMIT_BLOCK: 565 case JBD2_COMMIT_BLOCK:
519 /* Found an expected commit block: not much to 566 /* How to differentiate between interrupted commit
520 * do other than move on to the next sequence 567 * and journal corruption ?
568 *
569 * {nth transaction}
570 * Checksum Verification Failed
571 * |
572 * ____________________
573 * | |
574 * async_commit sync_commit
575 * | |
576 * | GO TO NEXT "Journal Corruption"
577 * | TRANSACTION
578 * |
579 * {(n+1)th transaction}
580 * |
581 * _______|______________
582 * | |
583 * Commit block found Commit block not found
584 * | |
585 * "Journal Corruption" |
586 * _____________|_________
587 * | |
588 * nth trans corrupt OR nth trans
589 * and (n+1)th interrupted interrupted
590 * before commit block
591 * could reach the disk.
592 * (Cannot find the difference in above
593 * mentioned conditions. Hence assume
594 * "Interrupted Commit".)
595 */
596
597 /* Found an expected commit block: if checksums
598 * are present verify them in PASS_SCAN; else not
599 * much to do other than move on to the next sequence
521 * number. */ 600 * number. */
601 if (pass == PASS_SCAN &&
602 JBD2_HAS_COMPAT_FEATURE(journal,
603 JBD2_FEATURE_COMPAT_CHECKSUM)) {
604 int chksum_err, chksum_seen;
605 struct commit_header *cbh =
606 (struct commit_header *)bh->b_data;
607 unsigned found_chksum =
608 be32_to_cpu(cbh->h_chksum[0]);
609
610 chksum_err = chksum_seen = 0;
611
612 if (info->end_transaction) {
613 printk(KERN_ERR "JBD: Transaction %u "
614 "found to be corrupt.\n",
615 next_commit_ID - 1);
616 brelse(bh);
617 break;
618 }
619
620 if (crc32_sum == found_chksum &&
621 cbh->h_chksum_type == JBD2_CRC32_CHKSUM &&
622 cbh->h_chksum_size ==
623 JBD2_CRC32_CHKSUM_SIZE)
624 chksum_seen = 1;
625 else if (!(cbh->h_chksum_type == 0 &&
626 cbh->h_chksum_size == 0 &&
627 found_chksum == 0 &&
628 !chksum_seen))
629 /*
630 * If fs is mounted using an old kernel and then
631 * kernel with journal_chksum is used then we
632 * get a situation where the journal flag has
633 * checksum flag set but checksums are not
634 * present i.e chksum = 0, in the individual
635 * commit blocks.
636 * Hence to avoid checksum failures, in this
637 * situation, this extra check is added.
638 */
639 chksum_err = 1;
640
641 if (chksum_err) {
642 info->end_transaction = next_commit_ID;
643
644 if (!JBD2_HAS_COMPAT_FEATURE(journal,
645 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
646 printk(KERN_ERR
647 "JBD: Transaction %u "
648 "found to be corrupt.\n",
649 next_commit_ID);
650 brelse(bh);
651 break;
652 }
653 }
654 crc32_sum = ~0;
655 }
522 brelse(bh); 656 brelse(bh);
523 next_commit_ID++; 657 next_commit_ID++;
524 continue; 658 continue;
@@ -554,9 +688,10 @@ static int do_one_pass(journal_t *journal,
554 * transaction marks the end of the valid log. 688 * transaction marks the end of the valid log.
555 */ 689 */
556 690
557 if (pass == PASS_SCAN) 691 if (pass == PASS_SCAN) {
558 info->end_transaction = next_commit_ID; 692 if (!info->end_transaction)
559 else { 693 info->end_transaction = next_commit_ID;
694 } else {
560 /* It's really bad news if different passes end up at 695 /* It's really bad news if different passes end up at
561 * different places (but possible due to IO errors). */ 696 * different places (but possible due to IO errors). */
562 if (info->end_transaction != next_commit_ID) { 697 if (info->end_transaction != next_commit_ID) {
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 3595fd432d5b..df36f42e19e1 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -171,13 +171,15 @@ int __init jbd2_journal_init_revoke_caches(void)
171{ 171{
172 jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", 172 jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record",
173 sizeof(struct jbd2_revoke_record_s), 173 sizeof(struct jbd2_revoke_record_s),
174 0, SLAB_HWCACHE_ALIGN, NULL); 174 0,
175 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
176 NULL);
175 if (jbd2_revoke_record_cache == 0) 177 if (jbd2_revoke_record_cache == 0)
176 return -ENOMEM; 178 return -ENOMEM;
177 179
178 jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", 180 jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
179 sizeof(struct jbd2_revoke_table_s), 181 sizeof(struct jbd2_revoke_table_s),
180 0, 0, NULL); 182 0, SLAB_TEMPORARY, NULL);
181 if (jbd2_revoke_table_cache == 0) { 183 if (jbd2_revoke_table_cache == 0) {
182 kmem_cache_destroy(jbd2_revoke_record_cache); 184 kmem_cache_destroy(jbd2_revoke_record_cache);
183 jbd2_revoke_record_cache = NULL; 185 jbd2_revoke_record_cache = NULL;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index b1fcf2b3dca3..b9b0b6f899b9 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -54,11 +54,13 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
54 spin_lock_init(&transaction->t_handle_lock); 54 spin_lock_init(&transaction->t_handle_lock);
55 55
56 /* Set up the commit timer for the new transaction. */ 56 /* Set up the commit timer for the new transaction. */
57 journal->j_commit_timer.expires = transaction->t_expires; 57 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
58 add_timer(&journal->j_commit_timer); 58 add_timer(&journal->j_commit_timer);
59 59
60 J_ASSERT(journal->j_running_transaction == NULL); 60 J_ASSERT(journal->j_running_transaction == NULL);
61 journal->j_running_transaction = transaction; 61 journal->j_running_transaction = transaction;
62 transaction->t_max_wait = 0;
63 transaction->t_start = jiffies;
62 64
63 return transaction; 65 return transaction;
64} 66}
@@ -85,6 +87,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
85 int nblocks = handle->h_buffer_credits; 87 int nblocks = handle->h_buffer_credits;
86 transaction_t *new_transaction = NULL; 88 transaction_t *new_transaction = NULL;
87 int ret = 0; 89 int ret = 0;
90 unsigned long ts = jiffies;
88 91
89 if (nblocks > journal->j_max_transaction_buffers) { 92 if (nblocks > journal->j_max_transaction_buffers) {
90 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 93 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -217,6 +220,12 @@ repeat_locked:
217 /* OK, account for the buffers that this operation expects to 220 /* OK, account for the buffers that this operation expects to
218 * use and add the handle to the running transaction. */ 221 * use and add the handle to the running transaction. */
219 222
223 if (time_after(transaction->t_start, ts)) {
224 ts = jbd2_time_diff(ts, transaction->t_start);
225 if (ts > transaction->t_max_wait)
226 transaction->t_max_wait = ts;
227 }
228
220 handle->h_transaction = transaction; 229 handle->h_transaction = transaction;
221 transaction->t_outstanding_credits += nblocks; 230 transaction->t_outstanding_credits += nblocks;
222 transaction->t_updates++; 231 transaction->t_updates++;
@@ -232,6 +241,8 @@ out:
232 return ret; 241 return ret;
233} 242}
234 243
244static struct lock_class_key jbd2_handle_key;
245
235/* Allocate a new handle. This should probably be in a slab... */ 246/* Allocate a new handle. This should probably be in a slab... */
236static handle_t *new_handle(int nblocks) 247static handle_t *new_handle(int nblocks)
237{ 248{
@@ -242,6 +253,9 @@ static handle_t *new_handle(int nblocks)
242 handle->h_buffer_credits = nblocks; 253 handle->h_buffer_credits = nblocks;
243 handle->h_ref = 1; 254 handle->h_ref = 1;
244 255
256 lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle",
257 &jbd2_handle_key, 0);
258
245 return handle; 259 return handle;
246} 260}
247 261
@@ -284,7 +298,11 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
284 jbd2_free_handle(handle); 298 jbd2_free_handle(handle);
285 current->journal_info = NULL; 299 current->journal_info = NULL;
286 handle = ERR_PTR(err); 300 handle = ERR_PTR(err);
301 goto out;
287 } 302 }
303
304 lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
305out:
288 return handle; 306 return handle;
289} 307}
290 308
@@ -1164,7 +1182,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1164 } 1182 }
1165 1183
1166 /* That test should have eliminated the following case: */ 1184 /* That test should have eliminated the following case: */
1167 J_ASSERT_JH(jh, jh->b_frozen_data == 0); 1185 J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
1168 1186
1169 JBUFFER_TRACE(jh, "file as BJ_Metadata"); 1187 JBUFFER_TRACE(jh, "file as BJ_Metadata");
1170 spin_lock(&journal->j_list_lock); 1188 spin_lock(&journal->j_list_lock);
@@ -1410,6 +1428,8 @@ int jbd2_journal_stop(handle_t *handle)
1410 spin_unlock(&journal->j_state_lock); 1428 spin_unlock(&journal->j_state_lock);
1411 } 1429 }
1412 1430
1431 lock_release(&handle->h_lockdep_map, 1, _THIS_IP_);
1432
1413 jbd2_free_handle(handle); 1433 jbd2_free_handle(handle);
1414 return err; 1434 return err;
1415} 1435}
@@ -1512,7 +1532,7 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1512 1532
1513 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); 1533 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1514 if (jh->b_jlist != BJ_None) 1534 if (jh->b_jlist != BJ_None)
1515 J_ASSERT_JH(jh, transaction != 0); 1535 J_ASSERT_JH(jh, transaction != NULL);
1516 1536
1517 switch (jh->b_jlist) { 1537 switch (jh->b_jlist) {
1518 case BJ_None: 1538 case BJ_None:
@@ -1581,11 +1601,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1581 if (buffer_locked(bh) || buffer_dirty(bh)) 1601 if (buffer_locked(bh) || buffer_dirty(bh))
1582 goto out; 1602 goto out;
1583 1603
1584 if (jh->b_next_transaction != 0) 1604 if (jh->b_next_transaction != NULL)
1585 goto out; 1605 goto out;
1586 1606
1587 spin_lock(&journal->j_list_lock); 1607 spin_lock(&journal->j_list_lock);
1588 if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) { 1608 if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
1589 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { 1609 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
1590 /* A written-back ordered data buffer */ 1610 /* A written-back ordered data buffer */
1591 JBUFFER_TRACE(jh, "release data"); 1611 JBUFFER_TRACE(jh, "release data");
@@ -1593,7 +1613,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1593 jbd2_journal_remove_journal_head(bh); 1613 jbd2_journal_remove_journal_head(bh);
1594 __brelse(bh); 1614 __brelse(bh);
1595 } 1615 }
1596 } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) { 1616 } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1597 /* written-back checkpointed metadata buffer */ 1617 /* written-back checkpointed metadata buffer */
1598 if (jh->b_jlist == BJ_None) { 1618 if (jh->b_jlist == BJ_None) {
1599 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1619 JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1953,7 +1973,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
1953 1973
1954 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); 1974 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1955 J_ASSERT_JH(jh, jh->b_transaction == transaction || 1975 J_ASSERT_JH(jh, jh->b_transaction == transaction ||
1956 jh->b_transaction == 0); 1976 jh->b_transaction == NULL);
1957 1977
1958 if (jh->b_transaction && jh->b_jlist == jlist) 1978 if (jh->b_transaction && jh->b_jlist == jlist)
1959 return; 1979 return;
diff --git a/fs/read_write.c b/fs/read_write.c
index c4d3d17923f1..1c177f29e1b7 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -446,6 +446,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
446 } 446 }
447 return seg; 447 return seg;
448} 448}
449EXPORT_SYMBOL(iov_shorten);
449 450
450ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, 451ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
451 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) 452 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)