diff options
46 files changed, 7679 insertions, 804 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 6a4adcae9f9a..560f88dc7090 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt | |||
@@ -86,9 +86,21 @@ Alex is working on a new set of patches right now. | |||
86 | When mounting an ext4 filesystem, the following option are accepted: | 86 | When mounting an ext4 filesystem, the following option are accepted: |
87 | (*) == default | 87 | (*) == default |
88 | 88 | ||
89 | extents ext4 will use extents to address file data. The | 89 | extents (*) ext4 will use extents to address file data. The |
90 | file system will no longer be mountable by ext3. | 90 | file system will no longer be mountable by ext3. |
91 | 91 | ||
92 | noextents ext4 will not use extents for newly created files | ||
93 | |||
94 | journal_checksum Enable checksumming of the journal transactions. | ||
95 | This will allow the recovery code in e2fsck and the | ||
96 | kernel to detect corruption in the kernel. It is a | ||
97 | compatible change and will be ignored by older kernels. | ||
98 | |||
99 | journal_async_commit Commit block can be written to disk without waiting | ||
100 | for descriptor blocks. If enabled older kernels cannot | ||
101 | mount the device. This will enable 'journal_checksum' | ||
102 | internally. | ||
103 | |||
92 | journal=update Update the ext4 file system's journal to the current | 104 | journal=update Update the ext4 file system's journal to the current |
93 | format. | 105 | format. |
94 | 106 | ||
@@ -196,6 +208,12 @@ nobh (a) cache disk block mapping information | |||
196 | "nobh" option tries to avoid associating buffer | 208 | "nobh" option tries to avoid associating buffer |
197 | heads (supported only for "writeback" mode). | 209 | heads (supported only for "writeback" mode). |
198 | 210 | ||
211 | mballoc (*) Use the multiple block allocator for block allocation | ||
212 | nomballoc disabled multiple block allocator for block allocation. | ||
213 | stripe=n Number of filesystem blocks that mballoc will try | ||
214 | to use for allocation size and alignment. For RAID5/6 | ||
215 | systems this should be the number of data | ||
216 | disks * RAID chunk size in file system blocks. | ||
199 | 217 | ||
200 | Data Mode | 218 | Data Mode |
201 | --------- | 219 | --------- |
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index dec99455321f..4413a2d4646f 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -857,6 +857,45 @@ CPUs. | |||
857 | The "procs_blocked" line gives the number of processes currently blocked, | 857 | The "procs_blocked" line gives the number of processes currently blocked, |
858 | waiting for I/O to complete. | 858 | waiting for I/O to complete. |
859 | 859 | ||
860 | 1.9 Ext4 file system parameters | ||
861 | ------------------------------ | ||
862 | Ext4 file system have one directory per partition under /proc/fs/ext4/ | ||
863 | # ls /proc/fs/ext4/hdc/ | ||
864 | group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req | ||
865 | stats stream_req | ||
866 | |||
867 | mb_groups: | ||
868 | This file gives the details of mutiblock allocator buddy cache of free blocks | ||
869 | |||
870 | mb_history: | ||
871 | Multiblock allocation history. | ||
872 | |||
873 | stats: | ||
874 | This file indicate whether the multiblock allocator should start collecting | ||
875 | statistics. The statistics are shown during unmount | ||
876 | |||
877 | group_prealloc: | ||
878 | The multiblock allocator normalize the block allocation request to | ||
879 | group_prealloc filesystem blocks if we don't have strip value set. | ||
880 | The stripe value can be specified at mount time or during mke2fs. | ||
881 | |||
882 | max_to_scan: | ||
883 | How long multiblock allocator can look for a best extent (in found extents) | ||
884 | |||
885 | min_to_scan: | ||
886 | How long multiblock allocator must look for a best extent | ||
887 | |||
888 | order2_req: | ||
889 | Multiblock allocator use 2^N search using buddies only for requests greater | ||
890 | than or equal to order2_req. The request size is specfied in file system | ||
891 | blocks. A value of 2 indicate only if the requests are greater than or equal | ||
892 | to 4 blocks. | ||
893 | |||
894 | stream_req: | ||
895 | Files smaller than stream_req are served by the stream allocator, whose | ||
896 | purpose is to pack requests as close each to other as possible to | ||
897 | produce smooth I/O traffic. Avalue of 16 indicate that file smaller than 16 | ||
898 | filesystem block size will use group based preallocation. | ||
860 | 899 | ||
861 | ------------------------------------------------------------------------------ | 900 | ------------------------------------------------------------------------------ |
862 | Summary | 901 | Summary |
diff --git a/fs/Kconfig b/fs/Kconfig index 9656139d2e99..219ec06a8c7e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig | |||
@@ -236,6 +236,7 @@ config JBD_DEBUG | |||
236 | 236 | ||
237 | config JBD2 | 237 | config JBD2 |
238 | tristate | 238 | tristate |
239 | select CRC32 | ||
239 | help | 240 | help |
240 | This is a generic journaling layer for block devices that support | 241 | This is a generic journaling layer for block devices that support |
241 | both 32-bit and 64-bit block numbers. It is currently used by | 242 | both 32-bit and 64-bit block numbers. It is currently used by |
diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 33fe39ad4e03..0cc3597c1197 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c | |||
@@ -546,11 +546,11 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, | |||
546 | dentry->d_op = &afs_fs_dentry_operations; | 546 | dentry->d_op = &afs_fs_dentry_operations; |
547 | 547 | ||
548 | d_add(dentry, inode); | 548 | d_add(dentry, inode); |
549 | _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%lu }", | 549 | _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }", |
550 | fid.vnode, | 550 | fid.vnode, |
551 | fid.unique, | 551 | fid.unique, |
552 | dentry->d_inode->i_ino, | 552 | dentry->d_inode->i_ino, |
553 | dentry->d_inode->i_version); | 553 | (unsigned long long)dentry->d_inode->i_version); |
554 | 554 | ||
555 | return NULL; | 555 | return NULL; |
556 | } | 556 | } |
@@ -630,9 +630,10 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) | |||
630 | * been deleted and replaced, and the original vnode ID has | 630 | * been deleted and replaced, and the original vnode ID has |
631 | * been reused */ | 631 | * been reused */ |
632 | if (fid.unique != vnode->fid.unique) { | 632 | if (fid.unique != vnode->fid.unique) { |
633 | _debug("%s: file deleted (uq %u -> %u I:%lu)", | 633 | _debug("%s: file deleted (uq %u -> %u I:%llu)", |
634 | dentry->d_name.name, fid.unique, | 634 | dentry->d_name.name, fid.unique, |
635 | vnode->fid.unique, dentry->d_inode->i_version); | 635 | vnode->fid.unique, |
636 | (unsigned long long)dentry->d_inode->i_version); | ||
636 | spin_lock(&vnode->lock); | 637 | spin_lock(&vnode->lock); |
637 | set_bit(AFS_VNODE_DELETED, &vnode->flags); | 638 | set_bit(AFS_VNODE_DELETED, &vnode->flags); |
638 | spin_unlock(&vnode->lock); | 639 | spin_unlock(&vnode->lock); |
diff --git a/fs/afs/inode.c b/fs/afs/inode.c index d196840127c6..84750c8e9f95 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c | |||
@@ -301,7 +301,8 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
301 | 301 | ||
302 | inode = dentry->d_inode; | 302 | inode = dentry->d_inode; |
303 | 303 | ||
304 | _enter("{ ino=%lu v=%lu }", inode->i_ino, inode->i_version); | 304 | _enter("{ ino=%lu v=%llu }", inode->i_ino, |
305 | (unsigned long long)inode->i_version); | ||
305 | 306 | ||
306 | generic_fillattr(inode, stat); | 307 | generic_fillattr(inode, stat); |
307 | return 0; | 308 | return 0; |
diff --git a/fs/buffer.c b/fs/buffer.c index 7249e014819e..456c9ab7705b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -3213,6 +3213,50 @@ static int buffer_cpu_notify(struct notifier_block *self, | |||
3213 | return NOTIFY_OK; | 3213 | return NOTIFY_OK; |
3214 | } | 3214 | } |
3215 | 3215 | ||
3216 | /** | ||
3217 | * bh_uptodate_or_lock: Test whether the buffer is uptodate | ||
3218 | * @bh: struct buffer_head | ||
3219 | * | ||
3220 | * Return true if the buffer is up-to-date and false, | ||
3221 | * with the buffer locked, if not. | ||
3222 | */ | ||
3223 | int bh_uptodate_or_lock(struct buffer_head *bh) | ||
3224 | { | ||
3225 | if (!buffer_uptodate(bh)) { | ||
3226 | lock_buffer(bh); | ||
3227 | if (!buffer_uptodate(bh)) | ||
3228 | return 0; | ||
3229 | unlock_buffer(bh); | ||
3230 | } | ||
3231 | return 1; | ||
3232 | } | ||
3233 | EXPORT_SYMBOL(bh_uptodate_or_lock); | ||
3234 | |||
3235 | /** | ||
3236 | * bh_submit_read: Submit a locked buffer for reading | ||
3237 | * @bh: struct buffer_head | ||
3238 | * | ||
3239 | * Returns zero on success and -EIO on error. | ||
3240 | */ | ||
3241 | int bh_submit_read(struct buffer_head *bh) | ||
3242 | { | ||
3243 | BUG_ON(!buffer_locked(bh)); | ||
3244 | |||
3245 | if (buffer_uptodate(bh)) { | ||
3246 | unlock_buffer(bh); | ||
3247 | return 0; | ||
3248 | } | ||
3249 | |||
3250 | get_bh(bh); | ||
3251 | bh->b_end_io = end_buffer_read_sync; | ||
3252 | submit_bh(READ, bh); | ||
3253 | wait_on_buffer(bh); | ||
3254 | if (buffer_uptodate(bh)) | ||
3255 | return 0; | ||
3256 | return -EIO; | ||
3257 | } | ||
3258 | EXPORT_SYMBOL(bh_submit_read); | ||
3259 | |||
3216 | void __init buffer_init(void) | 3260 | void __init buffer_init(void) |
3217 | { | 3261 | { |
3218 | int nrpages; | 3262 | int nrpages; |
diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 154e25f13d77..6abaf75163f0 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c | |||
@@ -680,11 +680,31 @@ static int ext2_check_descriptors (struct super_block * sb) | |||
680 | static loff_t ext2_max_size(int bits) | 680 | static loff_t ext2_max_size(int bits) |
681 | { | 681 | { |
682 | loff_t res = EXT2_NDIR_BLOCKS; | 682 | loff_t res = EXT2_NDIR_BLOCKS; |
683 | /* This constant is calculated to be the largest file size for a | 683 | int meta_blocks; |
684 | * dense, 4k-blocksize file such that the total number of | 684 | loff_t upper_limit; |
685 | |||
686 | /* This is calculated to be the largest file size for a | ||
687 | * dense, file such that the total number of | ||
685 | * sectors in the file, including data and all indirect blocks, | 688 | * sectors in the file, including data and all indirect blocks, |
686 | * does not exceed 2^32. */ | 689 | * does not exceed 2^32 -1 |
687 | const loff_t upper_limit = 0x1ff7fffd000LL; | 690 | * __u32 i_blocks representing the total number of |
691 | * 512 bytes blocks of the file | ||
692 | */ | ||
693 | upper_limit = (1LL << 32) - 1; | ||
694 | |||
695 | /* total blocks in file system block size */ | ||
696 | upper_limit >>= (bits - 9); | ||
697 | |||
698 | |||
699 | /* indirect blocks */ | ||
700 | meta_blocks = 1; | ||
701 | /* double indirect blocks */ | ||
702 | meta_blocks += 1 + (1LL << (bits-2)); | ||
703 | /* tripple indirect blocks */ | ||
704 | meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); | ||
705 | |||
706 | upper_limit -= meta_blocks; | ||
707 | upper_limit <<= bits; | ||
688 | 708 | ||
689 | res += 1LL << (bits-2); | 709 | res += 1LL << (bits-2); |
690 | res += 1LL << (2*(bits-2)); | 710 | res += 1LL << (2*(bits-2)); |
@@ -692,6 +712,10 @@ static loff_t ext2_max_size(int bits) | |||
692 | res <<= bits; | 712 | res <<= bits; |
693 | if (res > upper_limit) | 713 | if (res > upper_limit) |
694 | res = upper_limit; | 714 | res = upper_limit; |
715 | |||
716 | if (res > MAX_LFS_FILESIZE) | ||
717 | res = MAX_LFS_FILESIZE; | ||
718 | |||
695 | return res; | 719 | return res; |
696 | } | 720 | } |
697 | 721 | ||
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index cb14de1502c3..f3675cc630e9 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c | |||
@@ -1436,11 +1436,31 @@ static void ext3_orphan_cleanup (struct super_block * sb, | |||
1436 | static loff_t ext3_max_size(int bits) | 1436 | static loff_t ext3_max_size(int bits) |
1437 | { | 1437 | { |
1438 | loff_t res = EXT3_NDIR_BLOCKS; | 1438 | loff_t res = EXT3_NDIR_BLOCKS; |
1439 | /* This constant is calculated to be the largest file size for a | 1439 | int meta_blocks; |
1440 | * dense, 4k-blocksize file such that the total number of | 1440 | loff_t upper_limit; |
1441 | |||
1442 | /* This is calculated to be the largest file size for a | ||
1443 | * dense, file such that the total number of | ||
1441 | * sectors in the file, including data and all indirect blocks, | 1444 | * sectors in the file, including data and all indirect blocks, |
1442 | * does not exceed 2^32. */ | 1445 | * does not exceed 2^32 -1 |
1443 | const loff_t upper_limit = 0x1ff7fffd000LL; | 1446 | * __u32 i_blocks representing the total number of |
1447 | * 512 bytes blocks of the file | ||
1448 | */ | ||
1449 | upper_limit = (1LL << 32) - 1; | ||
1450 | |||
1451 | /* total blocks in file system block size */ | ||
1452 | upper_limit >>= (bits - 9); | ||
1453 | |||
1454 | |||
1455 | /* indirect blocks */ | ||
1456 | meta_blocks = 1; | ||
1457 | /* double indirect blocks */ | ||
1458 | meta_blocks += 1 + (1LL << (bits-2)); | ||
1459 | /* tripple indirect blocks */ | ||
1460 | meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); | ||
1461 | |||
1462 | upper_limit -= meta_blocks; | ||
1463 | upper_limit <<= bits; | ||
1444 | 1464 | ||
1445 | res += 1LL << (bits-2); | 1465 | res += 1LL << (bits-2); |
1446 | res += 1LL << (2*(bits-2)); | 1466 | res += 1LL << (2*(bits-2)); |
@@ -1448,6 +1468,10 @@ static loff_t ext3_max_size(int bits) | |||
1448 | res <<= bits; | 1468 | res <<= bits; |
1449 | if (res > upper_limit) | 1469 | if (res > upper_limit) |
1450 | res = upper_limit; | 1470 | res = upper_limit; |
1471 | |||
1472 | if (res > MAX_LFS_FILESIZE) | ||
1473 | res = MAX_LFS_FILESIZE; | ||
1474 | |||
1451 | return res; | 1475 | return res; |
1452 | } | 1476 | } |
1453 | 1477 | ||
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index ae6e7e502ac9..ac6fa8ca0a2f 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile | |||
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o | |||
6 | 6 | ||
7 | ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ | 7 | ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ |
8 | ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ | 8 | ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ |
9 | ext4_jbd2.o | 9 | ext4_jbd2.o migrate.o mballoc.o |
10 | 10 | ||
11 | ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o | 11 | ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o |
12 | ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o | 12 | ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o |
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 71ee95e534fd..ac75ea953d83 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c | |||
@@ -29,7 +29,7 @@ | |||
29 | * Calculate the block group number and offset, given a block number | 29 | * Calculate the block group number and offset, given a block number |
30 | */ | 30 | */ |
31 | void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, | 31 | void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, |
32 | unsigned long *blockgrpp, ext4_grpblk_t *offsetp) | 32 | ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) |
33 | { | 33 | { |
34 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; | 34 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; |
35 | ext4_grpblk_t offset; | 35 | ext4_grpblk_t offset; |
@@ -46,7 +46,7 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, | |||
46 | /* Initializes an uninitialized block bitmap if given, and returns the | 46 | /* Initializes an uninitialized block bitmap if given, and returns the |
47 | * number of blocks free in the group. */ | 47 | * number of blocks free in the group. */ |
48 | unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, | 48 | unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, |
49 | int block_group, struct ext4_group_desc *gdp) | 49 | ext4_group_t block_group, struct ext4_group_desc *gdp) |
50 | { | 50 | { |
51 | unsigned long start; | 51 | unsigned long start; |
52 | int bit, bit_max; | 52 | int bit, bit_max; |
@@ -60,7 +60,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, | |||
60 | * essentially implementing a per-group read-only flag. */ | 60 | * essentially implementing a per-group read-only flag. */ |
61 | if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { | 61 | if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { |
62 | ext4_error(sb, __FUNCTION__, | 62 | ext4_error(sb, __FUNCTION__, |
63 | "Checksum bad for group %u\n", block_group); | 63 | "Checksum bad for group %lu\n", block_group); |
64 | gdp->bg_free_blocks_count = 0; | 64 | gdp->bg_free_blocks_count = 0; |
65 | gdp->bg_free_inodes_count = 0; | 65 | gdp->bg_free_inodes_count = 0; |
66 | gdp->bg_itable_unused = 0; | 66 | gdp->bg_itable_unused = 0; |
@@ -153,7 +153,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, | |||
153 | * group descriptor | 153 | * group descriptor |
154 | */ | 154 | */ |
155 | struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, | 155 | struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, |
156 | unsigned int block_group, | 156 | ext4_group_t block_group, |
157 | struct buffer_head ** bh) | 157 | struct buffer_head ** bh) |
158 | { | 158 | { |
159 | unsigned long group_desc; | 159 | unsigned long group_desc; |
@@ -164,7 +164,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, | |||
164 | if (block_group >= sbi->s_groups_count) { | 164 | if (block_group >= sbi->s_groups_count) { |
165 | ext4_error (sb, "ext4_get_group_desc", | 165 | ext4_error (sb, "ext4_get_group_desc", |
166 | "block_group >= groups_count - " | 166 | "block_group >= groups_count - " |
167 | "block_group = %d, groups_count = %lu", | 167 | "block_group = %lu, groups_count = %lu", |
168 | block_group, sbi->s_groups_count); | 168 | block_group, sbi->s_groups_count); |
169 | 169 | ||
170 | return NULL; | 170 | return NULL; |
@@ -176,7 +176,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, | |||
176 | if (!sbi->s_group_desc[group_desc]) { | 176 | if (!sbi->s_group_desc[group_desc]) { |
177 | ext4_error (sb, "ext4_get_group_desc", | 177 | ext4_error (sb, "ext4_get_group_desc", |
178 | "Group descriptor not loaded - " | 178 | "Group descriptor not loaded - " |
179 | "block_group = %d, group_desc = %lu, desc = %lu", | 179 | "block_group = %lu, group_desc = %lu, desc = %lu", |
180 | block_group, group_desc, offset); | 180 | block_group, group_desc, offset); |
181 | return NULL; | 181 | return NULL; |
182 | } | 182 | } |
@@ -189,18 +189,70 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, | |||
189 | return desc; | 189 | return desc; |
190 | } | 190 | } |
191 | 191 | ||
192 | static int ext4_valid_block_bitmap(struct super_block *sb, | ||
193 | struct ext4_group_desc *desc, | ||
194 | unsigned int block_group, | ||
195 | struct buffer_head *bh) | ||
196 | { | ||
197 | ext4_grpblk_t offset; | ||
198 | ext4_grpblk_t next_zero_bit; | ||
199 | ext4_fsblk_t bitmap_blk; | ||
200 | ext4_fsblk_t group_first_block; | ||
201 | |||
202 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { | ||
203 | /* with FLEX_BG, the inode/block bitmaps and itable | ||
204 | * blocks may not be in the group at all | ||
205 | * so the bitmap validation will be skipped for those groups | ||
206 | * or it has to also read the block group where the bitmaps | ||
207 | * are located to verify they are set. | ||
208 | */ | ||
209 | return 1; | ||
210 | } | ||
211 | group_first_block = ext4_group_first_block_no(sb, block_group); | ||
212 | |||
213 | /* check whether block bitmap block number is set */ | ||
214 | bitmap_blk = ext4_block_bitmap(sb, desc); | ||
215 | offset = bitmap_blk - group_first_block; | ||
216 | if (!ext4_test_bit(offset, bh->b_data)) | ||
217 | /* bad block bitmap */ | ||
218 | goto err_out; | ||
219 | |||
220 | /* check whether the inode bitmap block number is set */ | ||
221 | bitmap_blk = ext4_inode_bitmap(sb, desc); | ||
222 | offset = bitmap_blk - group_first_block; | ||
223 | if (!ext4_test_bit(offset, bh->b_data)) | ||
224 | /* bad block bitmap */ | ||
225 | goto err_out; | ||
226 | |||
227 | /* check whether the inode table block number is set */ | ||
228 | bitmap_blk = ext4_inode_table(sb, desc); | ||
229 | offset = bitmap_blk - group_first_block; | ||
230 | next_zero_bit = ext4_find_next_zero_bit(bh->b_data, | ||
231 | offset + EXT4_SB(sb)->s_itb_per_group, | ||
232 | offset); | ||
233 | if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group) | ||
234 | /* good bitmap for inode tables */ | ||
235 | return 1; | ||
236 | |||
237 | err_out: | ||
238 | ext4_error(sb, __FUNCTION__, | ||
239 | "Invalid block bitmap - " | ||
240 | "block_group = %d, block = %llu", | ||
241 | block_group, bitmap_blk); | ||
242 | return 0; | ||
243 | } | ||
192 | /** | 244 | /** |
193 | * read_block_bitmap() | 245 | * read_block_bitmap() |
194 | * @sb: super block | 246 | * @sb: super block |
195 | * @block_group: given block group | 247 | * @block_group: given block group |
196 | * | 248 | * |
197 | * Read the bitmap for a given block_group, reading into the specified | 249 | * Read the bitmap for a given block_group,and validate the |
198 | * slot in the superblock's bitmap cache. | 250 | * bits for block/inode/inode tables are set in the bitmaps |
199 | * | 251 | * |
200 | * Return buffer_head on success or NULL in case of failure. | 252 | * Return buffer_head on success or NULL in case of failure. |
201 | */ | 253 | */ |
202 | struct buffer_head * | 254 | struct buffer_head * |
203 | read_block_bitmap(struct super_block *sb, unsigned int block_group) | 255 | read_block_bitmap(struct super_block *sb, ext4_group_t block_group) |
204 | { | 256 | { |
205 | struct ext4_group_desc * desc; | 257 | struct ext4_group_desc * desc; |
206 | struct buffer_head * bh = NULL; | 258 | struct buffer_head * bh = NULL; |
@@ -210,25 +262,36 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) | |||
210 | if (!desc) | 262 | if (!desc) |
211 | return NULL; | 263 | return NULL; |
212 | bitmap_blk = ext4_block_bitmap(sb, desc); | 264 | bitmap_blk = ext4_block_bitmap(sb, desc); |
265 | bh = sb_getblk(sb, bitmap_blk); | ||
266 | if (unlikely(!bh)) { | ||
267 | ext4_error(sb, __FUNCTION__, | ||
268 | "Cannot read block bitmap - " | ||
269 | "block_group = %d, block_bitmap = %llu", | ||
270 | (int)block_group, (unsigned long long)bitmap_blk); | ||
271 | return NULL; | ||
272 | } | ||
273 | if (bh_uptodate_or_lock(bh)) | ||
274 | return bh; | ||
275 | |||
213 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 276 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
214 | bh = sb_getblk(sb, bitmap_blk); | 277 | ext4_init_block_bitmap(sb, bh, block_group, desc); |
215 | if (!buffer_uptodate(bh)) { | 278 | set_buffer_uptodate(bh); |
216 | lock_buffer(bh); | 279 | unlock_buffer(bh); |
217 | if (!buffer_uptodate(bh)) { | 280 | return bh; |
218 | ext4_init_block_bitmap(sb, bh, block_group, | ||
219 | desc); | ||
220 | set_buffer_uptodate(bh); | ||
221 | } | ||
222 | unlock_buffer(bh); | ||
223 | } | ||
224 | } else { | ||
225 | bh = sb_bread(sb, bitmap_blk); | ||
226 | } | 281 | } |
227 | if (!bh) | 282 | if (bh_submit_read(bh) < 0) { |
228 | ext4_error (sb, __FUNCTION__, | 283 | put_bh(bh); |
284 | ext4_error(sb, __FUNCTION__, | ||
229 | "Cannot read block bitmap - " | 285 | "Cannot read block bitmap - " |
230 | "block_group = %d, block_bitmap = %llu", | 286 | "block_group = %d, block_bitmap = %llu", |
231 | block_group, bitmap_blk); | 287 | (int)block_group, (unsigned long long)bitmap_blk); |
288 | return NULL; | ||
289 | } | ||
290 | if (!ext4_valid_block_bitmap(sb, desc, block_group, bh)) { | ||
291 | put_bh(bh); | ||
292 | return NULL; | ||
293 | } | ||
294 | |||
232 | return bh; | 295 | return bh; |
233 | } | 296 | } |
234 | /* | 297 | /* |
@@ -320,7 +383,7 @@ restart: | |||
320 | */ | 383 | */ |
321 | static int | 384 | static int |
322 | goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal, | 385 | goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal, |
323 | unsigned int group, struct super_block * sb) | 386 | ext4_group_t group, struct super_block *sb) |
324 | { | 387 | { |
325 | ext4_fsblk_t group_first_block, group_last_block; | 388 | ext4_fsblk_t group_first_block, group_last_block; |
326 | 389 | ||
@@ -463,7 +526,7 @@ static inline int rsv_is_empty(struct ext4_reserve_window *rsv) | |||
463 | * when setting the reservation window size through ioctl before the file | 526 | * when setting the reservation window size through ioctl before the file |
464 | * is open for write (needs block allocation). | 527 | * is open for write (needs block allocation). |
465 | * | 528 | * |
466 | * Needs truncate_mutex protection prior to call this function. | 529 | * Needs down_write(i_data_sem) protection prior to call this function. |
467 | */ | 530 | */ |
468 | void ext4_init_block_alloc_info(struct inode *inode) | 531 | void ext4_init_block_alloc_info(struct inode *inode) |
469 | { | 532 | { |
@@ -514,6 +577,8 @@ void ext4_discard_reservation(struct inode *inode) | |||
514 | struct ext4_reserve_window_node *rsv; | 577 | struct ext4_reserve_window_node *rsv; |
515 | spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock; | 578 | spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock; |
516 | 579 | ||
580 | ext4_mb_discard_inode_preallocations(inode); | ||
581 | |||
517 | if (!block_i) | 582 | if (!block_i) |
518 | return; | 583 | return; |
519 | 584 | ||
@@ -540,7 +605,7 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, | |||
540 | { | 605 | { |
541 | struct buffer_head *bitmap_bh = NULL; | 606 | struct buffer_head *bitmap_bh = NULL; |
542 | struct buffer_head *gd_bh; | 607 | struct buffer_head *gd_bh; |
543 | unsigned long block_group; | 608 | ext4_group_t block_group; |
544 | ext4_grpblk_t bit; | 609 | ext4_grpblk_t bit; |
545 | unsigned long i; | 610 | unsigned long i; |
546 | unsigned long overflow; | 611 | unsigned long overflow; |
@@ -587,11 +652,13 @@ do_more: | |||
587 | in_range(ext4_inode_bitmap(sb, desc), block, count) || | 652 | in_range(ext4_inode_bitmap(sb, desc), block, count) || |
588 | in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || | 653 | in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || |
589 | in_range(block + count - 1, ext4_inode_table(sb, desc), | 654 | in_range(block + count - 1, ext4_inode_table(sb, desc), |
590 | sbi->s_itb_per_group)) | 655 | sbi->s_itb_per_group)) { |
591 | ext4_error (sb, "ext4_free_blocks", | 656 | ext4_error (sb, "ext4_free_blocks", |
592 | "Freeing blocks in system zones - " | 657 | "Freeing blocks in system zones - " |
593 | "Block = %llu, count = %lu", | 658 | "Block = %llu, count = %lu", |
594 | block, count); | 659 | block, count); |
660 | goto error_return; | ||
661 | } | ||
595 | 662 | ||
596 | /* | 663 | /* |
597 | * We are about to start releasing blocks in the bitmap, | 664 | * We are about to start releasing blocks in the bitmap, |
@@ -720,19 +787,29 @@ error_return: | |||
720 | * @inode: inode | 787 | * @inode: inode |
721 | * @block: start physical block to free | 788 | * @block: start physical block to free |
722 | * @count: number of blocks to count | 789 | * @count: number of blocks to count |
790 | * @metadata: Are these metadata blocks | ||
723 | */ | 791 | */ |
724 | void ext4_free_blocks(handle_t *handle, struct inode *inode, | 792 | void ext4_free_blocks(handle_t *handle, struct inode *inode, |
725 | ext4_fsblk_t block, unsigned long count) | 793 | ext4_fsblk_t block, unsigned long count, |
794 | int metadata) | ||
726 | { | 795 | { |
727 | struct super_block * sb; | 796 | struct super_block * sb; |
728 | unsigned long dquot_freed_blocks; | 797 | unsigned long dquot_freed_blocks; |
729 | 798 | ||
799 | /* this isn't the right place to decide whether block is metadata | ||
800 | * inode.c/extents.c knows better, but for safety ... */ | ||
801 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || | ||
802 | ext4_should_journal_data(inode)) | ||
803 | metadata = 1; | ||
804 | |||
730 | sb = inode->i_sb; | 805 | sb = inode->i_sb; |
731 | if (!sb) { | 806 | |
732 | printk ("ext4_free_blocks: nonexistent device"); | 807 | if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info) |
733 | return; | 808 | ext4_free_blocks_sb(handle, sb, block, count, |
734 | } | 809 | &dquot_freed_blocks); |
735 | ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); | 810 | else |
811 | ext4_mb_free_blocks(handle, inode, block, count, | ||
812 | metadata, &dquot_freed_blocks); | ||
736 | if (dquot_freed_blocks) | 813 | if (dquot_freed_blocks) |
737 | DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); | 814 | DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); |
738 | return; | 815 | return; |
@@ -920,9 +997,10 @@ claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh) | |||
920 | * ext4_journal_release_buffer(), else we'll run out of credits. | 997 | * ext4_journal_release_buffer(), else we'll run out of credits. |
921 | */ | 998 | */ |
922 | static ext4_grpblk_t | 999 | static ext4_grpblk_t |
923 | ext4_try_to_allocate(struct super_block *sb, handle_t *handle, int group, | 1000 | ext4_try_to_allocate(struct super_block *sb, handle_t *handle, |
924 | struct buffer_head *bitmap_bh, ext4_grpblk_t grp_goal, | 1001 | ext4_group_t group, struct buffer_head *bitmap_bh, |
925 | unsigned long *count, struct ext4_reserve_window *my_rsv) | 1002 | ext4_grpblk_t grp_goal, unsigned long *count, |
1003 | struct ext4_reserve_window *my_rsv) | ||
926 | { | 1004 | { |
927 | ext4_fsblk_t group_first_block; | 1005 | ext4_fsblk_t group_first_block; |
928 | ext4_grpblk_t start, end; | 1006 | ext4_grpblk_t start, end; |
@@ -1156,7 +1234,7 @@ static int find_next_reservable_window( | |||
1156 | */ | 1234 | */ |
1157 | static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv, | 1235 | static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv, |
1158 | ext4_grpblk_t grp_goal, struct super_block *sb, | 1236 | ext4_grpblk_t grp_goal, struct super_block *sb, |
1159 | unsigned int group, struct buffer_head *bitmap_bh) | 1237 | ext4_group_t group, struct buffer_head *bitmap_bh) |
1160 | { | 1238 | { |
1161 | struct ext4_reserve_window_node *search_head; | 1239 | struct ext4_reserve_window_node *search_head; |
1162 | ext4_fsblk_t group_first_block, group_end_block, start_block; | 1240 | ext4_fsblk_t group_first_block, group_end_block, start_block; |
@@ -1354,7 +1432,7 @@ static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv, | |||
1354 | */ | 1432 | */ |
1355 | static ext4_grpblk_t | 1433 | static ext4_grpblk_t |
1356 | ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, | 1434 | ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, |
1357 | unsigned int group, struct buffer_head *bitmap_bh, | 1435 | ext4_group_t group, struct buffer_head *bitmap_bh, |
1358 | ext4_grpblk_t grp_goal, | 1436 | ext4_grpblk_t grp_goal, |
1359 | struct ext4_reserve_window_node * my_rsv, | 1437 | struct ext4_reserve_window_node * my_rsv, |
1360 | unsigned long *count, int *errp) | 1438 | unsigned long *count, int *errp) |
@@ -1510,7 +1588,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) | |||
1510 | } | 1588 | } |
1511 | 1589 | ||
1512 | /** | 1590 | /** |
1513 | * ext4_new_blocks() -- core block(s) allocation function | 1591 | * ext4_new_blocks_old() -- core block(s) allocation function |
1514 | * @handle: handle to this transaction | 1592 | * @handle: handle to this transaction |
1515 | * @inode: file inode | 1593 | * @inode: file inode |
1516 | * @goal: given target block(filesystem wide) | 1594 | * @goal: given target block(filesystem wide) |
@@ -1523,17 +1601,17 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) | |||
1523 | * any specific goal block. | 1601 | * any specific goal block. |
1524 | * | 1602 | * |
1525 | */ | 1603 | */ |
1526 | ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, | 1604 | ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, |
1527 | ext4_fsblk_t goal, unsigned long *count, int *errp) | 1605 | ext4_fsblk_t goal, unsigned long *count, int *errp) |
1528 | { | 1606 | { |
1529 | struct buffer_head *bitmap_bh = NULL; | 1607 | struct buffer_head *bitmap_bh = NULL; |
1530 | struct buffer_head *gdp_bh; | 1608 | struct buffer_head *gdp_bh; |
1531 | unsigned long group_no; | 1609 | ext4_group_t group_no; |
1532 | int goal_group; | 1610 | ext4_group_t goal_group; |
1533 | ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */ | 1611 | ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */ |
1534 | ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ | 1612 | ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ |
1535 | ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */ | 1613 | ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */ |
1536 | int bgi; /* blockgroup iteration index */ | 1614 | ext4_group_t bgi; /* blockgroup iteration index */ |
1537 | int fatal = 0, err; | 1615 | int fatal = 0, err; |
1538 | int performed_allocation = 0; | 1616 | int performed_allocation = 0; |
1539 | ext4_grpblk_t free_blocks; /* number of free blocks in a group */ | 1617 | ext4_grpblk_t free_blocks; /* number of free blocks in a group */ |
@@ -1544,10 +1622,7 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, | |||
1544 | struct ext4_reserve_window_node *my_rsv = NULL; | 1622 | struct ext4_reserve_window_node *my_rsv = NULL; |
1545 | struct ext4_block_alloc_info *block_i; | 1623 | struct ext4_block_alloc_info *block_i; |
1546 | unsigned short windowsz = 0; | 1624 | unsigned short windowsz = 0; |
1547 | #ifdef EXT4FS_DEBUG | 1625 | ext4_group_t ngroups; |
1548 | static int goal_hits, goal_attempts; | ||
1549 | #endif | ||
1550 | unsigned long ngroups; | ||
1551 | unsigned long num = *count; | 1626 | unsigned long num = *count; |
1552 | 1627 | ||
1553 | *errp = -ENOSPC; | 1628 | *errp = -ENOSPC; |
@@ -1567,7 +1642,7 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, | |||
1567 | 1642 | ||
1568 | sbi = EXT4_SB(sb); | 1643 | sbi = EXT4_SB(sb); |
1569 | es = EXT4_SB(sb)->s_es; | 1644 | es = EXT4_SB(sb)->s_es; |
1570 | ext4_debug("goal=%lu.\n", goal); | 1645 | ext4_debug("goal=%llu.\n", goal); |
1571 | /* | 1646 | /* |
1572 | * Allocate a block from reservation only when | 1647 | * Allocate a block from reservation only when |
1573 | * filesystem is mounted with reservation(default,-o reservation), and | 1648 | * filesystem is mounted with reservation(default,-o reservation), and |
@@ -1677,7 +1752,7 @@ retry_alloc: | |||
1677 | 1752 | ||
1678 | allocated: | 1753 | allocated: |
1679 | 1754 | ||
1680 | ext4_debug("using block group %d(%d)\n", | 1755 | ext4_debug("using block group %lu(%d)\n", |
1681 | group_no, gdp->bg_free_blocks_count); | 1756 | group_no, gdp->bg_free_blocks_count); |
1682 | 1757 | ||
1683 | BUFFER_TRACE(gdp_bh, "get_write_access"); | 1758 | BUFFER_TRACE(gdp_bh, "get_write_access"); |
@@ -1692,11 +1767,13 @@ allocated: | |||
1692 | in_range(ret_block, ext4_inode_table(sb, gdp), | 1767 | in_range(ret_block, ext4_inode_table(sb, gdp), |
1693 | EXT4_SB(sb)->s_itb_per_group) || | 1768 | EXT4_SB(sb)->s_itb_per_group) || |
1694 | in_range(ret_block + num - 1, ext4_inode_table(sb, gdp), | 1769 | in_range(ret_block + num - 1, ext4_inode_table(sb, gdp), |
1695 | EXT4_SB(sb)->s_itb_per_group)) | 1770 | EXT4_SB(sb)->s_itb_per_group)) { |
1696 | ext4_error(sb, "ext4_new_block", | 1771 | ext4_error(sb, "ext4_new_block", |
1697 | "Allocating block in system zone - " | 1772 | "Allocating block in system zone - " |
1698 | "blocks from %llu, length %lu", | 1773 | "blocks from %llu, length %lu", |
1699 | ret_block, num); | 1774 | ret_block, num); |
1775 | goto out; | ||
1776 | } | ||
1700 | 1777 | ||
1701 | performed_allocation = 1; | 1778 | performed_allocation = 1; |
1702 | 1779 | ||
@@ -1743,9 +1820,6 @@ allocated: | |||
1743 | * list of some description. We don't know in advance whether | 1820 | * list of some description. We don't know in advance whether |
1744 | * the caller wants to use it as metadata or data. | 1821 | * the caller wants to use it as metadata or data. |
1745 | */ | 1822 | */ |
1746 | ext4_debug("allocating block %lu. Goal hits %d of %d.\n", | ||
1747 | ret_block, goal_hits, goal_attempts); | ||
1748 | |||
1749 | spin_lock(sb_bgl_lock(sbi, group_no)); | 1823 | spin_lock(sb_bgl_lock(sbi, group_no)); |
1750 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) | 1824 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) |
1751 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); | 1825 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); |
@@ -1787,13 +1861,46 @@ out: | |||
1787 | } | 1861 | } |
1788 | 1862 | ||
1789 | ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, | 1863 | ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, |
1790 | ext4_fsblk_t goal, int *errp) | 1864 | ext4_fsblk_t goal, int *errp) |
1791 | { | 1865 | { |
1792 | unsigned long count = 1; | 1866 | struct ext4_allocation_request ar; |
1867 | ext4_fsblk_t ret; | ||
1793 | 1868 | ||
1794 | return ext4_new_blocks(handle, inode, goal, &count, errp); | 1869 | if (!test_opt(inode->i_sb, MBALLOC)) { |
1870 | unsigned long count = 1; | ||
1871 | ret = ext4_new_blocks_old(handle, inode, goal, &count, errp); | ||
1872 | return ret; | ||
1873 | } | ||
1874 | |||
1875 | memset(&ar, 0, sizeof(ar)); | ||
1876 | ar.inode = inode; | ||
1877 | ar.goal = goal; | ||
1878 | ar.len = 1; | ||
1879 | ret = ext4_mb_new_blocks(handle, &ar, errp); | ||
1880 | return ret; | ||
1881 | } | ||
1882 | |||
1883 | ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, | ||
1884 | ext4_fsblk_t goal, unsigned long *count, int *errp) | ||
1885 | { | ||
1886 | struct ext4_allocation_request ar; | ||
1887 | ext4_fsblk_t ret; | ||
1888 | |||
1889 | if (!test_opt(inode->i_sb, MBALLOC)) { | ||
1890 | ret = ext4_new_blocks_old(handle, inode, goal, count, errp); | ||
1891 | return ret; | ||
1892 | } | ||
1893 | |||
1894 | memset(&ar, 0, sizeof(ar)); | ||
1895 | ar.inode = inode; | ||
1896 | ar.goal = goal; | ||
1897 | ar.len = *count; | ||
1898 | ret = ext4_mb_new_blocks(handle, &ar, errp); | ||
1899 | *count = ar.len; | ||
1900 | return ret; | ||
1795 | } | 1901 | } |
1796 | 1902 | ||
1903 | |||
1797 | /** | 1904 | /** |
1798 | * ext4_count_free_blocks() -- count filesystem free blocks | 1905 | * ext4_count_free_blocks() -- count filesystem free blocks |
1799 | * @sb: superblock | 1906 | * @sb: superblock |
@@ -1804,8 +1911,8 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) | |||
1804 | { | 1911 | { |
1805 | ext4_fsblk_t desc_count; | 1912 | ext4_fsblk_t desc_count; |
1806 | struct ext4_group_desc *gdp; | 1913 | struct ext4_group_desc *gdp; |
1807 | int i; | 1914 | ext4_group_t i; |
1808 | unsigned long ngroups = EXT4_SB(sb)->s_groups_count; | 1915 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; |
1809 | #ifdef EXT4FS_DEBUG | 1916 | #ifdef EXT4FS_DEBUG |
1810 | struct ext4_super_block *es; | 1917 | struct ext4_super_block *es; |
1811 | ext4_fsblk_t bitmap_count; | 1918 | ext4_fsblk_t bitmap_count; |
@@ -1829,14 +1936,14 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) | |||
1829 | continue; | 1936 | continue; |
1830 | 1937 | ||
1831 | x = ext4_count_free(bitmap_bh, sb->s_blocksize); | 1938 | x = ext4_count_free(bitmap_bh, sb->s_blocksize); |
1832 | printk("group %d: stored = %d, counted = %lu\n", | 1939 | printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", |
1833 | i, le16_to_cpu(gdp->bg_free_blocks_count), x); | 1940 | i, le16_to_cpu(gdp->bg_free_blocks_count), x); |
1834 | bitmap_count += x; | 1941 | bitmap_count += x; |
1835 | } | 1942 | } |
1836 | brelse(bitmap_bh); | 1943 | brelse(bitmap_bh); |
1837 | printk("ext4_count_free_blocks: stored = %llu" | 1944 | printk("ext4_count_free_blocks: stored = %llu" |
1838 | ", computed = %llu, %llu\n", | 1945 | ", computed = %llu, %llu\n", |
1839 | EXT4_FREE_BLOCKS_COUNT(es), | 1946 | ext4_free_blocks_count(es), |
1840 | desc_count, bitmap_count); | 1947 | desc_count, bitmap_count); |
1841 | return bitmap_count; | 1948 | return bitmap_count; |
1842 | #else | 1949 | #else |
@@ -1853,7 +1960,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) | |||
1853 | #endif | 1960 | #endif |
1854 | } | 1961 | } |
1855 | 1962 | ||
1856 | static inline int test_root(int a, int b) | 1963 | static inline int test_root(ext4_group_t a, int b) |
1857 | { | 1964 | { |
1858 | int num = b; | 1965 | int num = b; |
1859 | 1966 | ||
@@ -1862,7 +1969,7 @@ static inline int test_root(int a, int b) | |||
1862 | return num == a; | 1969 | return num == a; |
1863 | } | 1970 | } |
1864 | 1971 | ||
1865 | static int ext4_group_sparse(int group) | 1972 | static int ext4_group_sparse(ext4_group_t group) |
1866 | { | 1973 | { |
1867 | if (group <= 1) | 1974 | if (group <= 1) |
1868 | return 1; | 1975 | return 1; |
@@ -1880,7 +1987,7 @@ static int ext4_group_sparse(int group) | |||
1880 | * Return the number of blocks used by the superblock (primary or backup) | 1987 | * Return the number of blocks used by the superblock (primary or backup) |
1881 | * in this group. Currently this will be only 0 or 1. | 1988 | * in this group. Currently this will be only 0 or 1. |
1882 | */ | 1989 | */ |
1883 | int ext4_bg_has_super(struct super_block *sb, int group) | 1990 | int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) |
1884 | { | 1991 | { |
1885 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 1992 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
1886 | EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && | 1993 | EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && |
@@ -1889,18 +1996,20 @@ int ext4_bg_has_super(struct super_block *sb, int group) | |||
1889 | return 1; | 1996 | return 1; |
1890 | } | 1997 | } |
1891 | 1998 | ||
1892 | static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, int group) | 1999 | static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, |
2000 | ext4_group_t group) | ||
1893 | { | 2001 | { |
1894 | unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); | 2002 | unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); |
1895 | unsigned long first = metagroup * EXT4_DESC_PER_BLOCK(sb); | 2003 | ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); |
1896 | unsigned long last = first + EXT4_DESC_PER_BLOCK(sb) - 1; | 2004 | ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; |
1897 | 2005 | ||
1898 | if (group == first || group == first + 1 || group == last) | 2006 | if (group == first || group == first + 1 || group == last) |
1899 | return 1; | 2007 | return 1; |
1900 | return 0; | 2008 | return 0; |
1901 | } | 2009 | } |
1902 | 2010 | ||
1903 | static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group) | 2011 | static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, |
2012 | ext4_group_t group) | ||
1904 | { | 2013 | { |
1905 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | 2014 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, |
1906 | EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && | 2015 | EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && |
@@ -1918,7 +2027,7 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group) | |||
1918 | * (primary or backup) in this group. In the future there may be a | 2027 | * (primary or backup) in this group. In the future there may be a |
1919 | * different number of descriptor blocks in each group. | 2028 | * different number of descriptor blocks in each group. |
1920 | */ | 2029 | */ |
1921 | unsigned long ext4_bg_num_gdb(struct super_block *sb, int group) | 2030 | unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) |
1922 | { | 2031 | { |
1923 | unsigned long first_meta_bg = | 2032 | unsigned long first_meta_bg = |
1924 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); | 2033 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); |
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index f612bef98315..33888bb58144 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c | |||
@@ -67,7 +67,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir, | |||
67 | unsigned long offset) | 67 | unsigned long offset) |
68 | { | 68 | { |
69 | const char * error_msg = NULL; | 69 | const char * error_msg = NULL; |
70 | const int rlen = le16_to_cpu(de->rec_len); | 70 | const int rlen = ext4_rec_len_from_disk(de->rec_len); |
71 | 71 | ||
72 | if (rlen < EXT4_DIR_REC_LEN(1)) | 72 | if (rlen < EXT4_DIR_REC_LEN(1)) |
73 | error_msg = "rec_len is smaller than minimal"; | 73 | error_msg = "rec_len is smaller than minimal"; |
@@ -124,7 +124,7 @@ static int ext4_readdir(struct file * filp, | |||
124 | offset = filp->f_pos & (sb->s_blocksize - 1); | 124 | offset = filp->f_pos & (sb->s_blocksize - 1); |
125 | 125 | ||
126 | while (!error && !stored && filp->f_pos < inode->i_size) { | 126 | while (!error && !stored && filp->f_pos < inode->i_size) { |
127 | unsigned long blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); | 127 | ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); |
128 | struct buffer_head map_bh; | 128 | struct buffer_head map_bh; |
129 | struct buffer_head *bh = NULL; | 129 | struct buffer_head *bh = NULL; |
130 | 130 | ||
@@ -172,10 +172,10 @@ revalidate: | |||
172 | * least that it is non-zero. A | 172 | * least that it is non-zero. A |
173 | * failure will be detected in the | 173 | * failure will be detected in the |
174 | * dirent test below. */ | 174 | * dirent test below. */ |
175 | if (le16_to_cpu(de->rec_len) < | 175 | if (ext4_rec_len_from_disk(de->rec_len) |
176 | EXT4_DIR_REC_LEN(1)) | 176 | < EXT4_DIR_REC_LEN(1)) |
177 | break; | 177 | break; |
178 | i += le16_to_cpu(de->rec_len); | 178 | i += ext4_rec_len_from_disk(de->rec_len); |
179 | } | 179 | } |
180 | offset = i; | 180 | offset = i; |
181 | filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) | 181 | filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) |
@@ -197,7 +197,7 @@ revalidate: | |||
197 | ret = stored; | 197 | ret = stored; |
198 | goto out; | 198 | goto out; |
199 | } | 199 | } |
200 | offset += le16_to_cpu(de->rec_len); | 200 | offset += ext4_rec_len_from_disk(de->rec_len); |
201 | if (le32_to_cpu(de->inode)) { | 201 | if (le32_to_cpu(de->inode)) { |
202 | /* We might block in the next section | 202 | /* We might block in the next section |
203 | * if the data destination is | 203 | * if the data destination is |
@@ -219,7 +219,7 @@ revalidate: | |||
219 | goto revalidate; | 219 | goto revalidate; |
220 | stored ++; | 220 | stored ++; |
221 | } | 221 | } |
222 | filp->f_pos += le16_to_cpu(de->rec_len); | 222 | filp->f_pos += ext4_rec_len_from_disk(de->rec_len); |
223 | } | 223 | } |
224 | offset = 0; | 224 | offset = 0; |
225 | brelse (bh); | 225 | brelse (bh); |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 85287742f2ae..bc7081f1fbe8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -61,7 +61,7 @@ static ext4_fsblk_t ext_pblock(struct ext4_extent *ex) | |||
61 | * idx_pblock: | 61 | * idx_pblock: |
62 | * combine low and high parts of a leaf physical block number into ext4_fsblk_t | 62 | * combine low and high parts of a leaf physical block number into ext4_fsblk_t |
63 | */ | 63 | */ |
64 | static ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix) | 64 | ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix) |
65 | { | 65 | { |
66 | ext4_fsblk_t block; | 66 | ext4_fsblk_t block; |
67 | 67 | ||
@@ -75,7 +75,7 @@ static ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix) | |||
75 | * stores a large physical block number into an extent struct, | 75 | * stores a large physical block number into an extent struct, |
76 | * breaking it into parts | 76 | * breaking it into parts |
77 | */ | 77 | */ |
78 | static void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) | 78 | void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) |
79 | { | 79 | { |
80 | ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); | 80 | ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); |
81 | ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); | 81 | ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); |
@@ -144,7 +144,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode, | |||
144 | 144 | ||
145 | static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, | 145 | static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, |
146 | struct ext4_ext_path *path, | 146 | struct ext4_ext_path *path, |
147 | ext4_fsblk_t block) | 147 | ext4_lblk_t block) |
148 | { | 148 | { |
149 | struct ext4_inode_info *ei = EXT4_I(inode); | 149 | struct ext4_inode_info *ei = EXT4_I(inode); |
150 | ext4_fsblk_t bg_start; | 150 | ext4_fsblk_t bg_start; |
@@ -367,13 +367,14 @@ static void ext4_ext_drop_refs(struct ext4_ext_path *path) | |||
367 | * the header must be checked before calling this | 367 | * the header must be checked before calling this |
368 | */ | 368 | */ |
369 | static void | 369 | static void |
370 | ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block) | 370 | ext4_ext_binsearch_idx(struct inode *inode, |
371 | struct ext4_ext_path *path, ext4_lblk_t block) | ||
371 | { | 372 | { |
372 | struct ext4_extent_header *eh = path->p_hdr; | 373 | struct ext4_extent_header *eh = path->p_hdr; |
373 | struct ext4_extent_idx *r, *l, *m; | 374 | struct ext4_extent_idx *r, *l, *m; |
374 | 375 | ||
375 | 376 | ||
376 | ext_debug("binsearch for %d(idx): ", block); | 377 | ext_debug("binsearch for %u(idx): ", block); |
377 | 378 | ||
378 | l = EXT_FIRST_INDEX(eh) + 1; | 379 | l = EXT_FIRST_INDEX(eh) + 1; |
379 | r = EXT_LAST_INDEX(eh); | 380 | r = EXT_LAST_INDEX(eh); |
@@ -425,7 +426,8 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int bloc | |||
425 | * the header must be checked before calling this | 426 | * the header must be checked before calling this |
426 | */ | 427 | */ |
427 | static void | 428 | static void |
428 | ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block) | 429 | ext4_ext_binsearch(struct inode *inode, |
430 | struct ext4_ext_path *path, ext4_lblk_t block) | ||
429 | { | 431 | { |
430 | struct ext4_extent_header *eh = path->p_hdr; | 432 | struct ext4_extent_header *eh = path->p_hdr; |
431 | struct ext4_extent *r, *l, *m; | 433 | struct ext4_extent *r, *l, *m; |
@@ -438,7 +440,7 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block) | |||
438 | return; | 440 | return; |
439 | } | 441 | } |
440 | 442 | ||
441 | ext_debug("binsearch for %d: ", block); | 443 | ext_debug("binsearch for %u: ", block); |
442 | 444 | ||
443 | l = EXT_FIRST_EXTENT(eh) + 1; | 445 | l = EXT_FIRST_EXTENT(eh) + 1; |
444 | r = EXT_LAST_EXTENT(eh); | 446 | r = EXT_LAST_EXTENT(eh); |
@@ -494,7 +496,8 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) | |||
494 | } | 496 | } |
495 | 497 | ||
496 | struct ext4_ext_path * | 498 | struct ext4_ext_path * |
497 | ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path) | 499 | ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, |
500 | struct ext4_ext_path *path) | ||
498 | { | 501 | { |
499 | struct ext4_extent_header *eh; | 502 | struct ext4_extent_header *eh; |
500 | struct buffer_head *bh; | 503 | struct buffer_head *bh; |
@@ -763,7 +766,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, | |||
763 | while (k--) { | 766 | while (k--) { |
764 | oldblock = newblock; | 767 | oldblock = newblock; |
765 | newblock = ablocks[--a]; | 768 | newblock = ablocks[--a]; |
766 | bh = sb_getblk(inode->i_sb, (ext4_fsblk_t)newblock); | 769 | bh = sb_getblk(inode->i_sb, newblock); |
767 | if (!bh) { | 770 | if (!bh) { |
768 | err = -EIO; | 771 | err = -EIO; |
769 | goto cleanup; | 772 | goto cleanup; |
@@ -783,9 +786,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, | |||
783 | fidx->ei_block = border; | 786 | fidx->ei_block = border; |
784 | ext4_idx_store_pblock(fidx, oldblock); | 787 | ext4_idx_store_pblock(fidx, oldblock); |
785 | 788 | ||
786 | ext_debug("int.index at %d (block %llu): %lu -> %llu\n", i, | 789 | ext_debug("int.index at %d (block %llu): %u -> %llu\n", |
787 | newblock, (unsigned long) le32_to_cpu(border), | 790 | i, newblock, le32_to_cpu(border), oldblock); |
788 | oldblock); | ||
789 | /* copy indexes */ | 791 | /* copy indexes */ |
790 | m = 0; | 792 | m = 0; |
791 | path[i].p_idx++; | 793 | path[i].p_idx++; |
@@ -851,7 +853,7 @@ cleanup: | |||
851 | for (i = 0; i < depth; i++) { | 853 | for (i = 0; i < depth; i++) { |
852 | if (!ablocks[i]) | 854 | if (!ablocks[i]) |
853 | continue; | 855 | continue; |
854 | ext4_free_blocks(handle, inode, ablocks[i], 1); | 856 | ext4_free_blocks(handle, inode, ablocks[i], 1, 1); |
855 | } | 857 | } |
856 | } | 858 | } |
857 | kfree(ablocks); | 859 | kfree(ablocks); |
@@ -979,8 +981,8 @@ repeat: | |||
979 | /* refill path */ | 981 | /* refill path */ |
980 | ext4_ext_drop_refs(path); | 982 | ext4_ext_drop_refs(path); |
981 | path = ext4_ext_find_extent(inode, | 983 | path = ext4_ext_find_extent(inode, |
982 | le32_to_cpu(newext->ee_block), | 984 | (ext4_lblk_t)le32_to_cpu(newext->ee_block), |
983 | path); | 985 | path); |
984 | if (IS_ERR(path)) | 986 | if (IS_ERR(path)) |
985 | err = PTR_ERR(path); | 987 | err = PTR_ERR(path); |
986 | } else { | 988 | } else { |
@@ -992,8 +994,8 @@ repeat: | |||
992 | /* refill path */ | 994 | /* refill path */ |
993 | ext4_ext_drop_refs(path); | 995 | ext4_ext_drop_refs(path); |
994 | path = ext4_ext_find_extent(inode, | 996 | path = ext4_ext_find_extent(inode, |
995 | le32_to_cpu(newext->ee_block), | 997 | (ext4_lblk_t)le32_to_cpu(newext->ee_block), |
996 | path); | 998 | path); |
997 | if (IS_ERR(path)) { | 999 | if (IS_ERR(path)) { |
998 | err = PTR_ERR(path); | 1000 | err = PTR_ERR(path); |
999 | goto out; | 1001 | goto out; |
@@ -1015,13 +1017,157 @@ out: | |||
1015 | } | 1017 | } |
1016 | 1018 | ||
1017 | /* | 1019 | /* |
1020 | * search the closest allocated block to the left for *logical | ||
1021 | * and returns it at @logical + it's physical address at @phys | ||
1022 | * if *logical is the smallest allocated block, the function | ||
1023 | * returns 0 at @phys | ||
1024 | * return value contains 0 (success) or error code | ||
1025 | */ | ||
1026 | int | ||
1027 | ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, | ||
1028 | ext4_lblk_t *logical, ext4_fsblk_t *phys) | ||
1029 | { | ||
1030 | struct ext4_extent_idx *ix; | ||
1031 | struct ext4_extent *ex; | ||
1032 | int depth, ee_len; | ||
1033 | |||
1034 | BUG_ON(path == NULL); | ||
1035 | depth = path->p_depth; | ||
1036 | *phys = 0; | ||
1037 | |||
1038 | if (depth == 0 && path->p_ext == NULL) | ||
1039 | return 0; | ||
1040 | |||
1041 | /* usually extent in the path covers blocks smaller | ||
1042 | * then *logical, but it can be that extent is the | ||
1043 | * first one in the file */ | ||
1044 | |||
1045 | ex = path[depth].p_ext; | ||
1046 | ee_len = ext4_ext_get_actual_len(ex); | ||
1047 | if (*logical < le32_to_cpu(ex->ee_block)) { | ||
1048 | BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); | ||
1049 | while (--depth >= 0) { | ||
1050 | ix = path[depth].p_idx; | ||
1051 | BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); | ||
1052 | } | ||
1053 | return 0; | ||
1054 | } | ||
1055 | |||
1056 | BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); | ||
1057 | |||
1058 | *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; | ||
1059 | *phys = ext_pblock(ex) + ee_len - 1; | ||
1060 | return 0; | ||
1061 | } | ||
1062 | |||
1063 | /* | ||
1064 | * search the closest allocated block to the right for *logical | ||
1065 | * and returns it at @logical + it's physical address at @phys | ||
1066 | * if *logical is the smallest allocated block, the function | ||
1067 | * returns 0 at @phys | ||
1068 | * return value contains 0 (success) or error code | ||
1069 | */ | ||
1070 | int | ||
1071 | ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, | ||
1072 | ext4_lblk_t *logical, ext4_fsblk_t *phys) | ||
1073 | { | ||
1074 | struct buffer_head *bh = NULL; | ||
1075 | struct ext4_extent_header *eh; | ||
1076 | struct ext4_extent_idx *ix; | ||
1077 | struct ext4_extent *ex; | ||
1078 | ext4_fsblk_t block; | ||
1079 | int depth, ee_len; | ||
1080 | |||
1081 | BUG_ON(path == NULL); | ||
1082 | depth = path->p_depth; | ||
1083 | *phys = 0; | ||
1084 | |||
1085 | if (depth == 0 && path->p_ext == NULL) | ||
1086 | return 0; | ||
1087 | |||
1088 | /* usually extent in the path covers blocks smaller | ||
1089 | * then *logical, but it can be that extent is the | ||
1090 | * first one in the file */ | ||
1091 | |||
1092 | ex = path[depth].p_ext; | ||
1093 | ee_len = ext4_ext_get_actual_len(ex); | ||
1094 | if (*logical < le32_to_cpu(ex->ee_block)) { | ||
1095 | BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); | ||
1096 | while (--depth >= 0) { | ||
1097 | ix = path[depth].p_idx; | ||
1098 | BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); | ||
1099 | } | ||
1100 | *logical = le32_to_cpu(ex->ee_block); | ||
1101 | *phys = ext_pblock(ex); | ||
1102 | return 0; | ||
1103 | } | ||
1104 | |||
1105 | BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); | ||
1106 | |||
1107 | if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { | ||
1108 | /* next allocated block in this leaf */ | ||
1109 | ex++; | ||
1110 | *logical = le32_to_cpu(ex->ee_block); | ||
1111 | *phys = ext_pblock(ex); | ||
1112 | return 0; | ||
1113 | } | ||
1114 | |||
1115 | /* go up and search for index to the right */ | ||
1116 | while (--depth >= 0) { | ||
1117 | ix = path[depth].p_idx; | ||
1118 | if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) | ||
1119 | break; | ||
1120 | } | ||
1121 | |||
1122 | if (depth < 0) { | ||
1123 | /* we've gone up to the root and | ||
1124 | * found no index to the right */ | ||
1125 | return 0; | ||
1126 | } | ||
1127 | |||
1128 | /* we've found index to the right, let's | ||
1129 | * follow it and find the closest allocated | ||
1130 | * block to the right */ | ||
1131 | ix++; | ||
1132 | block = idx_pblock(ix); | ||
1133 | while (++depth < path->p_depth) { | ||
1134 | bh = sb_bread(inode->i_sb, block); | ||
1135 | if (bh == NULL) | ||
1136 | return -EIO; | ||
1137 | eh = ext_block_hdr(bh); | ||
1138 | if (ext4_ext_check_header(inode, eh, depth)) { | ||
1139 | put_bh(bh); | ||
1140 | return -EIO; | ||
1141 | } | ||
1142 | ix = EXT_FIRST_INDEX(eh); | ||
1143 | block = idx_pblock(ix); | ||
1144 | put_bh(bh); | ||
1145 | } | ||
1146 | |||
1147 | bh = sb_bread(inode->i_sb, block); | ||
1148 | if (bh == NULL) | ||
1149 | return -EIO; | ||
1150 | eh = ext_block_hdr(bh); | ||
1151 | if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { | ||
1152 | put_bh(bh); | ||
1153 | return -EIO; | ||
1154 | } | ||
1155 | ex = EXT_FIRST_EXTENT(eh); | ||
1156 | *logical = le32_to_cpu(ex->ee_block); | ||
1157 | *phys = ext_pblock(ex); | ||
1158 | put_bh(bh); | ||
1159 | return 0; | ||
1160 | |||
1161 | } | ||
1162 | |||
1163 | /* | ||
1018 | * ext4_ext_next_allocated_block: | 1164 | * ext4_ext_next_allocated_block: |
1019 | * returns allocated block in subsequent extent or EXT_MAX_BLOCK. | 1165 | * returns allocated block in subsequent extent or EXT_MAX_BLOCK. |
1020 | * NOTE: it considers block number from index entry as | 1166 | * NOTE: it considers block number from index entry as |
1021 | * allocated block. Thus, index entries have to be consistent | 1167 | * allocated block. Thus, index entries have to be consistent |
1022 | * with leaves. | 1168 | * with leaves. |
1023 | */ | 1169 | */ |
1024 | static unsigned long | 1170 | static ext4_lblk_t |
1025 | ext4_ext_next_allocated_block(struct ext4_ext_path *path) | 1171 | ext4_ext_next_allocated_block(struct ext4_ext_path *path) |
1026 | { | 1172 | { |
1027 | int depth; | 1173 | int depth; |
@@ -1054,7 +1200,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) | |||
1054 | * ext4_ext_next_leaf_block: | 1200 | * ext4_ext_next_leaf_block: |
1055 | * returns first allocated block from next leaf or EXT_MAX_BLOCK | 1201 | * returns first allocated block from next leaf or EXT_MAX_BLOCK |
1056 | */ | 1202 | */ |
1057 | static unsigned ext4_ext_next_leaf_block(struct inode *inode, | 1203 | static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, |
1058 | struct ext4_ext_path *path) | 1204 | struct ext4_ext_path *path) |
1059 | { | 1205 | { |
1060 | int depth; | 1206 | int depth; |
@@ -1072,7 +1218,8 @@ static unsigned ext4_ext_next_leaf_block(struct inode *inode, | |||
1072 | while (depth >= 0) { | 1218 | while (depth >= 0) { |
1073 | if (path[depth].p_idx != | 1219 | if (path[depth].p_idx != |
1074 | EXT_LAST_INDEX(path[depth].p_hdr)) | 1220 | EXT_LAST_INDEX(path[depth].p_hdr)) |
1075 | return le32_to_cpu(path[depth].p_idx[1].ei_block); | 1221 | return (ext4_lblk_t) |
1222 | le32_to_cpu(path[depth].p_idx[1].ei_block); | ||
1076 | depth--; | 1223 | depth--; |
1077 | } | 1224 | } |
1078 | 1225 | ||
@@ -1085,7 +1232,7 @@ static unsigned ext4_ext_next_leaf_block(struct inode *inode, | |||
1085 | * then we have to correct all indexes above. | 1232 | * then we have to correct all indexes above. |
1086 | * TODO: do we need to correct tree in all cases? | 1233 | * TODO: do we need to correct tree in all cases? |
1087 | */ | 1234 | */ |
1088 | int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, | 1235 | static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, |
1089 | struct ext4_ext_path *path) | 1236 | struct ext4_ext_path *path) |
1090 | { | 1237 | { |
1091 | struct ext4_extent_header *eh; | 1238 | struct ext4_extent_header *eh; |
@@ -1171,7 +1318,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, | |||
1171 | if (ext1_ee_len + ext2_ee_len > max_len) | 1318 | if (ext1_ee_len + ext2_ee_len > max_len) |
1172 | return 0; | 1319 | return 0; |
1173 | #ifdef AGGRESSIVE_TEST | 1320 | #ifdef AGGRESSIVE_TEST |
1174 | if (le16_to_cpu(ex1->ee_len) >= 4) | 1321 | if (ext1_ee_len >= 4) |
1175 | return 0; | 1322 | return 0; |
1176 | #endif | 1323 | #endif |
1177 | 1324 | ||
@@ -1239,7 +1386,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode, | |||
1239 | struct ext4_extent *newext, | 1386 | struct ext4_extent *newext, |
1240 | struct ext4_ext_path *path) | 1387 | struct ext4_ext_path *path) |
1241 | { | 1388 | { |
1242 | unsigned long b1, b2; | 1389 | ext4_lblk_t b1, b2; |
1243 | unsigned int depth, len1; | 1390 | unsigned int depth, len1; |
1244 | unsigned int ret = 0; | 1391 | unsigned int ret = 0; |
1245 | 1392 | ||
@@ -1260,7 +1407,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode, | |||
1260 | goto out; | 1407 | goto out; |
1261 | } | 1408 | } |
1262 | 1409 | ||
1263 | /* check for wrap through zero */ | 1410 | /* check for wrap through zero on extent logical start block*/ |
1264 | if (b1 + len1 < b1) { | 1411 | if (b1 + len1 < b1) { |
1265 | len1 = EXT_MAX_BLOCK - b1; | 1412 | len1 = EXT_MAX_BLOCK - b1; |
1266 | newext->ee_len = cpu_to_le16(len1); | 1413 | newext->ee_len = cpu_to_le16(len1); |
@@ -1290,7 +1437,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, | |||
1290 | struct ext4_extent *ex, *fex; | 1437 | struct ext4_extent *ex, *fex; |
1291 | struct ext4_extent *nearex; /* nearest extent */ | 1438 | struct ext4_extent *nearex; /* nearest extent */ |
1292 | struct ext4_ext_path *npath = NULL; | 1439 | struct ext4_ext_path *npath = NULL; |
1293 | int depth, len, err, next; | 1440 | int depth, len, err; |
1441 | ext4_lblk_t next; | ||
1294 | unsigned uninitialized = 0; | 1442 | unsigned uninitialized = 0; |
1295 | 1443 | ||
1296 | BUG_ON(ext4_ext_get_actual_len(newext) == 0); | 1444 | BUG_ON(ext4_ext_get_actual_len(newext) == 0); |
@@ -1435,114 +1583,8 @@ cleanup: | |||
1435 | return err; | 1583 | return err; |
1436 | } | 1584 | } |
1437 | 1585 | ||
1438 | int ext4_ext_walk_space(struct inode *inode, unsigned long block, | ||
1439 | unsigned long num, ext_prepare_callback func, | ||
1440 | void *cbdata) | ||
1441 | { | ||
1442 | struct ext4_ext_path *path = NULL; | ||
1443 | struct ext4_ext_cache cbex; | ||
1444 | struct ext4_extent *ex; | ||
1445 | unsigned long next, start = 0, end = 0; | ||
1446 | unsigned long last = block + num; | ||
1447 | int depth, exists, err = 0; | ||
1448 | |||
1449 | BUG_ON(func == NULL); | ||
1450 | BUG_ON(inode == NULL); | ||
1451 | |||
1452 | while (block < last && block != EXT_MAX_BLOCK) { | ||
1453 | num = last - block; | ||
1454 | /* find extent for this block */ | ||
1455 | path = ext4_ext_find_extent(inode, block, path); | ||
1456 | if (IS_ERR(path)) { | ||
1457 | err = PTR_ERR(path); | ||
1458 | path = NULL; | ||
1459 | break; | ||
1460 | } | ||
1461 | |||
1462 | depth = ext_depth(inode); | ||
1463 | BUG_ON(path[depth].p_hdr == NULL); | ||
1464 | ex = path[depth].p_ext; | ||
1465 | next = ext4_ext_next_allocated_block(path); | ||
1466 | |||
1467 | exists = 0; | ||
1468 | if (!ex) { | ||
1469 | /* there is no extent yet, so try to allocate | ||
1470 | * all requested space */ | ||
1471 | start = block; | ||
1472 | end = block + num; | ||
1473 | } else if (le32_to_cpu(ex->ee_block) > block) { | ||
1474 | /* need to allocate space before found extent */ | ||
1475 | start = block; | ||
1476 | end = le32_to_cpu(ex->ee_block); | ||
1477 | if (block + num < end) | ||
1478 | end = block + num; | ||
1479 | } else if (block >= le32_to_cpu(ex->ee_block) | ||
1480 | + ext4_ext_get_actual_len(ex)) { | ||
1481 | /* need to allocate space after found extent */ | ||
1482 | start = block; | ||
1483 | end = block + num; | ||
1484 | if (end >= next) | ||
1485 | end = next; | ||
1486 | } else if (block >= le32_to_cpu(ex->ee_block)) { | ||
1487 | /* | ||
1488 | * some part of requested space is covered | ||
1489 | * by found extent | ||
1490 | */ | ||
1491 | start = block; | ||
1492 | end = le32_to_cpu(ex->ee_block) | ||
1493 | + ext4_ext_get_actual_len(ex); | ||
1494 | if (block + num < end) | ||
1495 | end = block + num; | ||
1496 | exists = 1; | ||
1497 | } else { | ||
1498 | BUG(); | ||
1499 | } | ||
1500 | BUG_ON(end <= start); | ||
1501 | |||
1502 | if (!exists) { | ||
1503 | cbex.ec_block = start; | ||
1504 | cbex.ec_len = end - start; | ||
1505 | cbex.ec_start = 0; | ||
1506 | cbex.ec_type = EXT4_EXT_CACHE_GAP; | ||
1507 | } else { | ||
1508 | cbex.ec_block = le32_to_cpu(ex->ee_block); | ||
1509 | cbex.ec_len = ext4_ext_get_actual_len(ex); | ||
1510 | cbex.ec_start = ext_pblock(ex); | ||
1511 | cbex.ec_type = EXT4_EXT_CACHE_EXTENT; | ||
1512 | } | ||
1513 | |||
1514 | BUG_ON(cbex.ec_len == 0); | ||
1515 | err = func(inode, path, &cbex, cbdata); | ||
1516 | ext4_ext_drop_refs(path); | ||
1517 | |||
1518 | if (err < 0) | ||
1519 | break; | ||
1520 | if (err == EXT_REPEAT) | ||
1521 | continue; | ||
1522 | else if (err == EXT_BREAK) { | ||
1523 | err = 0; | ||
1524 | break; | ||
1525 | } | ||
1526 | |||
1527 | if (ext_depth(inode) != depth) { | ||
1528 | /* depth was changed. we have to realloc path */ | ||
1529 | kfree(path); | ||
1530 | path = NULL; | ||
1531 | } | ||
1532 | |||
1533 | block = cbex.ec_block + cbex.ec_len; | ||
1534 | } | ||
1535 | |||
1536 | if (path) { | ||
1537 | ext4_ext_drop_refs(path); | ||
1538 | kfree(path); | ||
1539 | } | ||
1540 | |||
1541 | return err; | ||
1542 | } | ||
1543 | |||
1544 | static void | 1586 | static void |
1545 | ext4_ext_put_in_cache(struct inode *inode, __u32 block, | 1587 | ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, |
1546 | __u32 len, ext4_fsblk_t start, int type) | 1588 | __u32 len, ext4_fsblk_t start, int type) |
1547 | { | 1589 | { |
1548 | struct ext4_ext_cache *cex; | 1590 | struct ext4_ext_cache *cex; |
@@ -1561,10 +1603,11 @@ ext4_ext_put_in_cache(struct inode *inode, __u32 block, | |||
1561 | */ | 1603 | */ |
1562 | static void | 1604 | static void |
1563 | ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, | 1605 | ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, |
1564 | unsigned long block) | 1606 | ext4_lblk_t block) |
1565 | { | 1607 | { |
1566 | int depth = ext_depth(inode); | 1608 | int depth = ext_depth(inode); |
1567 | unsigned long lblock, len; | 1609 | unsigned long len; |
1610 | ext4_lblk_t lblock; | ||
1568 | struct ext4_extent *ex; | 1611 | struct ext4_extent *ex; |
1569 | 1612 | ||
1570 | ex = path[depth].p_ext; | 1613 | ex = path[depth].p_ext; |
@@ -1576,32 +1619,34 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, | |||
1576 | } else if (block < le32_to_cpu(ex->ee_block)) { | 1619 | } else if (block < le32_to_cpu(ex->ee_block)) { |
1577 | lblock = block; | 1620 | lblock = block; |
1578 | len = le32_to_cpu(ex->ee_block) - block; | 1621 | len = le32_to_cpu(ex->ee_block) - block; |
1579 | ext_debug("cache gap(before): %lu [%lu:%lu]", | 1622 | ext_debug("cache gap(before): %u [%u:%u]", |
1580 | (unsigned long) block, | 1623 | block, |
1581 | (unsigned long) le32_to_cpu(ex->ee_block), | 1624 | le32_to_cpu(ex->ee_block), |
1582 | (unsigned long) ext4_ext_get_actual_len(ex)); | 1625 | ext4_ext_get_actual_len(ex)); |
1583 | } else if (block >= le32_to_cpu(ex->ee_block) | 1626 | } else if (block >= le32_to_cpu(ex->ee_block) |
1584 | + ext4_ext_get_actual_len(ex)) { | 1627 | + ext4_ext_get_actual_len(ex)) { |
1628 | ext4_lblk_t next; | ||
1585 | lblock = le32_to_cpu(ex->ee_block) | 1629 | lblock = le32_to_cpu(ex->ee_block) |
1586 | + ext4_ext_get_actual_len(ex); | 1630 | + ext4_ext_get_actual_len(ex); |
1587 | len = ext4_ext_next_allocated_block(path); | 1631 | |
1588 | ext_debug("cache gap(after): [%lu:%lu] %lu", | 1632 | next = ext4_ext_next_allocated_block(path); |
1589 | (unsigned long) le32_to_cpu(ex->ee_block), | 1633 | ext_debug("cache gap(after): [%u:%u] %u", |
1590 | (unsigned long) ext4_ext_get_actual_len(ex), | 1634 | le32_to_cpu(ex->ee_block), |
1591 | (unsigned long) block); | 1635 | ext4_ext_get_actual_len(ex), |
1592 | BUG_ON(len == lblock); | 1636 | block); |
1593 | len = len - lblock; | 1637 | BUG_ON(next == lblock); |
1638 | len = next - lblock; | ||
1594 | } else { | 1639 | } else { |
1595 | lblock = len = 0; | 1640 | lblock = len = 0; |
1596 | BUG(); | 1641 | BUG(); |
1597 | } | 1642 | } |
1598 | 1643 | ||
1599 | ext_debug(" -> %lu:%lu\n", (unsigned long) lblock, len); | 1644 | ext_debug(" -> %u:%lu\n", lblock, len); |
1600 | ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); | 1645 | ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); |
1601 | } | 1646 | } |
1602 | 1647 | ||
1603 | static int | 1648 | static int |
1604 | ext4_ext_in_cache(struct inode *inode, unsigned long block, | 1649 | ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, |
1605 | struct ext4_extent *ex) | 1650 | struct ext4_extent *ex) |
1606 | { | 1651 | { |
1607 | struct ext4_ext_cache *cex; | 1652 | struct ext4_ext_cache *cex; |
@@ -1618,11 +1663,9 @@ ext4_ext_in_cache(struct inode *inode, unsigned long block, | |||
1618 | ex->ee_block = cpu_to_le32(cex->ec_block); | 1663 | ex->ee_block = cpu_to_le32(cex->ec_block); |
1619 | ext4_ext_store_pblock(ex, cex->ec_start); | 1664 | ext4_ext_store_pblock(ex, cex->ec_start); |
1620 | ex->ee_len = cpu_to_le16(cex->ec_len); | 1665 | ex->ee_len = cpu_to_le16(cex->ec_len); |
1621 | ext_debug("%lu cached by %lu:%lu:%llu\n", | 1666 | ext_debug("%u cached by %u:%u:%llu\n", |
1622 | (unsigned long) block, | 1667 | block, |
1623 | (unsigned long) cex->ec_block, | 1668 | cex->ec_block, cex->ec_len, cex->ec_start); |
1624 | (unsigned long) cex->ec_len, | ||
1625 | cex->ec_start); | ||
1626 | return cex->ec_type; | 1669 | return cex->ec_type; |
1627 | } | 1670 | } |
1628 | 1671 | ||
@@ -1636,7 +1679,7 @@ ext4_ext_in_cache(struct inode *inode, unsigned long block, | |||
1636 | * It's used in truncate case only, thus all requests are for | 1679 | * It's used in truncate case only, thus all requests are for |
1637 | * last index in the block only. | 1680 | * last index in the block only. |
1638 | */ | 1681 | */ |
1639 | int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, | 1682 | static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, |
1640 | struct ext4_ext_path *path) | 1683 | struct ext4_ext_path *path) |
1641 | { | 1684 | { |
1642 | struct buffer_head *bh; | 1685 | struct buffer_head *bh; |
@@ -1657,7 +1700,7 @@ int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, | |||
1657 | ext_debug("index is empty, remove it, free block %llu\n", leaf); | 1700 | ext_debug("index is empty, remove it, free block %llu\n", leaf); |
1658 | bh = sb_find_get_block(inode->i_sb, leaf); | 1701 | bh = sb_find_get_block(inode->i_sb, leaf); |
1659 | ext4_forget(handle, 1, inode, bh, leaf); | 1702 | ext4_forget(handle, 1, inode, bh, leaf); |
1660 | ext4_free_blocks(handle, inode, leaf, 1); | 1703 | ext4_free_blocks(handle, inode, leaf, 1, 1); |
1661 | return err; | 1704 | return err; |
1662 | } | 1705 | } |
1663 | 1706 | ||
@@ -1666,7 +1709,7 @@ int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, | |||
1666 | * This routine returns max. credits that the extent tree can consume. | 1709 | * This routine returns max. credits that the extent tree can consume. |
1667 | * It should be OK for low-performance paths like ->writepage() | 1710 | * It should be OK for low-performance paths like ->writepage() |
1668 | * To allow many writing processes to fit into a single transaction, | 1711 | * To allow many writing processes to fit into a single transaction, |
1669 | * the caller should calculate credits under truncate_mutex and | 1712 | * the caller should calculate credits under i_data_sem and |
1670 | * pass the actual path. | 1713 | * pass the actual path. |
1671 | */ | 1714 | */ |
1672 | int ext4_ext_calc_credits_for_insert(struct inode *inode, | 1715 | int ext4_ext_calc_credits_for_insert(struct inode *inode, |
@@ -1714,12 +1757,14 @@ int ext4_ext_calc_credits_for_insert(struct inode *inode, | |||
1714 | 1757 | ||
1715 | static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | 1758 | static int ext4_remove_blocks(handle_t *handle, struct inode *inode, |
1716 | struct ext4_extent *ex, | 1759 | struct ext4_extent *ex, |
1717 | unsigned long from, unsigned long to) | 1760 | ext4_lblk_t from, ext4_lblk_t to) |
1718 | { | 1761 | { |
1719 | struct buffer_head *bh; | 1762 | struct buffer_head *bh; |
1720 | unsigned short ee_len = ext4_ext_get_actual_len(ex); | 1763 | unsigned short ee_len = ext4_ext_get_actual_len(ex); |
1721 | int i; | 1764 | int i, metadata = 0; |
1722 | 1765 | ||
1766 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
1767 | metadata = 1; | ||
1723 | #ifdef EXTENTS_STATS | 1768 | #ifdef EXTENTS_STATS |
1724 | { | 1769 | { |
1725 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1770 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
@@ -1738,42 +1783,45 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | |||
1738 | if (from >= le32_to_cpu(ex->ee_block) | 1783 | if (from >= le32_to_cpu(ex->ee_block) |
1739 | && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { | 1784 | && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { |
1740 | /* tail removal */ | 1785 | /* tail removal */ |
1741 | unsigned long num; | 1786 | ext4_lblk_t num; |
1742 | ext4_fsblk_t start; | 1787 | ext4_fsblk_t start; |
1788 | |||
1743 | num = le32_to_cpu(ex->ee_block) + ee_len - from; | 1789 | num = le32_to_cpu(ex->ee_block) + ee_len - from; |
1744 | start = ext_pblock(ex) + ee_len - num; | 1790 | start = ext_pblock(ex) + ee_len - num; |
1745 | ext_debug("free last %lu blocks starting %llu\n", num, start); | 1791 | ext_debug("free last %u blocks starting %llu\n", num, start); |
1746 | for (i = 0; i < num; i++) { | 1792 | for (i = 0; i < num; i++) { |
1747 | bh = sb_find_get_block(inode->i_sb, start + i); | 1793 | bh = sb_find_get_block(inode->i_sb, start + i); |
1748 | ext4_forget(handle, 0, inode, bh, start + i); | 1794 | ext4_forget(handle, 0, inode, bh, start + i); |
1749 | } | 1795 | } |
1750 | ext4_free_blocks(handle, inode, start, num); | 1796 | ext4_free_blocks(handle, inode, start, num, metadata); |
1751 | } else if (from == le32_to_cpu(ex->ee_block) | 1797 | } else if (from == le32_to_cpu(ex->ee_block) |
1752 | && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { | 1798 | && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { |
1753 | printk("strange request: removal %lu-%lu from %u:%u\n", | 1799 | printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", |
1754 | from, to, le32_to_cpu(ex->ee_block), ee_len); | 1800 | from, to, le32_to_cpu(ex->ee_block), ee_len); |
1755 | } else { | 1801 | } else { |
1756 | printk("strange request: removal(2) %lu-%lu from %u:%u\n", | 1802 | printk(KERN_INFO "strange request: removal(2) " |
1757 | from, to, le32_to_cpu(ex->ee_block), ee_len); | 1803 | "%u-%u from %u:%u\n", |
1804 | from, to, le32_to_cpu(ex->ee_block), ee_len); | ||
1758 | } | 1805 | } |
1759 | return 0; | 1806 | return 0; |
1760 | } | 1807 | } |
1761 | 1808 | ||
1762 | static int | 1809 | static int |
1763 | ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | 1810 | ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, |
1764 | struct ext4_ext_path *path, unsigned long start) | 1811 | struct ext4_ext_path *path, ext4_lblk_t start) |
1765 | { | 1812 | { |
1766 | int err = 0, correct_index = 0; | 1813 | int err = 0, correct_index = 0; |
1767 | int depth = ext_depth(inode), credits; | 1814 | int depth = ext_depth(inode), credits; |
1768 | struct ext4_extent_header *eh; | 1815 | struct ext4_extent_header *eh; |
1769 | unsigned a, b, block, num; | 1816 | ext4_lblk_t a, b, block; |
1770 | unsigned long ex_ee_block; | 1817 | unsigned num; |
1818 | ext4_lblk_t ex_ee_block; | ||
1771 | unsigned short ex_ee_len; | 1819 | unsigned short ex_ee_len; |
1772 | unsigned uninitialized = 0; | 1820 | unsigned uninitialized = 0; |
1773 | struct ext4_extent *ex; | 1821 | struct ext4_extent *ex; |
1774 | 1822 | ||
1775 | /* the header must be checked already in ext4_ext_remove_space() */ | 1823 | /* the header must be checked already in ext4_ext_remove_space() */ |
1776 | ext_debug("truncate since %lu in leaf\n", start); | 1824 | ext_debug("truncate since %u in leaf\n", start); |
1777 | if (!path[depth].p_hdr) | 1825 | if (!path[depth].p_hdr) |
1778 | path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); | 1826 | path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); |
1779 | eh = path[depth].p_hdr; | 1827 | eh = path[depth].p_hdr; |
@@ -1904,7 +1952,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) | |||
1904 | return 1; | 1952 | return 1; |
1905 | } | 1953 | } |
1906 | 1954 | ||
1907 | int ext4_ext_remove_space(struct inode *inode, unsigned long start) | 1955 | static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) |
1908 | { | 1956 | { |
1909 | struct super_block *sb = inode->i_sb; | 1957 | struct super_block *sb = inode->i_sb; |
1910 | int depth = ext_depth(inode); | 1958 | int depth = ext_depth(inode); |
@@ -1912,7 +1960,7 @@ int ext4_ext_remove_space(struct inode *inode, unsigned long start) | |||
1912 | handle_t *handle; | 1960 | handle_t *handle; |
1913 | int i = 0, err = 0; | 1961 | int i = 0, err = 0; |
1914 | 1962 | ||
1915 | ext_debug("truncate since %lu\n", start); | 1963 | ext_debug("truncate since %u\n", start); |
1916 | 1964 | ||
1917 | /* probably first extent we're gonna free will be last in block */ | 1965 | /* probably first extent we're gonna free will be last in block */ |
1918 | handle = ext4_journal_start(inode, depth + 1); | 1966 | handle = ext4_journal_start(inode, depth + 1); |
@@ -2094,17 +2142,19 @@ void ext4_ext_release(struct super_block *sb) | |||
2094 | * b> Splits in two extents: Write is happening at either end of the extent | 2142 | * b> Splits in two extents: Write is happening at either end of the extent |
2095 | * c> Splits in three extents: Somone is writing in middle of the extent | 2143 | * c> Splits in three extents: Somone is writing in middle of the extent |
2096 | */ | 2144 | */ |
2097 | int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, | 2145 | static int ext4_ext_convert_to_initialized(handle_t *handle, |
2098 | struct ext4_ext_path *path, | 2146 | struct inode *inode, |
2099 | ext4_fsblk_t iblock, | 2147 | struct ext4_ext_path *path, |
2100 | unsigned long max_blocks) | 2148 | ext4_lblk_t iblock, |
2149 | unsigned long max_blocks) | ||
2101 | { | 2150 | { |
2102 | struct ext4_extent *ex, newex; | 2151 | struct ext4_extent *ex, newex; |
2103 | struct ext4_extent *ex1 = NULL; | 2152 | struct ext4_extent *ex1 = NULL; |
2104 | struct ext4_extent *ex2 = NULL; | 2153 | struct ext4_extent *ex2 = NULL; |
2105 | struct ext4_extent *ex3 = NULL; | 2154 | struct ext4_extent *ex3 = NULL; |
2106 | struct ext4_extent_header *eh; | 2155 | struct ext4_extent_header *eh; |
2107 | unsigned int allocated, ee_block, ee_len, depth; | 2156 | ext4_lblk_t ee_block; |
2157 | unsigned int allocated, ee_len, depth; | ||
2108 | ext4_fsblk_t newblock; | 2158 | ext4_fsblk_t newblock; |
2109 | int err = 0; | 2159 | int err = 0; |
2110 | int ret = 0; | 2160 | int ret = 0; |
@@ -2225,8 +2275,13 @@ out: | |||
2225 | return err ? err : allocated; | 2275 | return err ? err : allocated; |
2226 | } | 2276 | } |
2227 | 2277 | ||
2278 | /* | ||
2279 | * Need to be called with | ||
2280 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block | ||
2281 | * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) | ||
2282 | */ | ||
2228 | int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | 2283 | int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, |
2229 | ext4_fsblk_t iblock, | 2284 | ext4_lblk_t iblock, |
2230 | unsigned long max_blocks, struct buffer_head *bh_result, | 2285 | unsigned long max_blocks, struct buffer_head *bh_result, |
2231 | int create, int extend_disksize) | 2286 | int create, int extend_disksize) |
2232 | { | 2287 | { |
@@ -2236,11 +2291,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2236 | ext4_fsblk_t goal, newblock; | 2291 | ext4_fsblk_t goal, newblock; |
2237 | int err = 0, depth, ret; | 2292 | int err = 0, depth, ret; |
2238 | unsigned long allocated = 0; | 2293 | unsigned long allocated = 0; |
2294 | struct ext4_allocation_request ar; | ||
2239 | 2295 | ||
2240 | __clear_bit(BH_New, &bh_result->b_state); | 2296 | __clear_bit(BH_New, &bh_result->b_state); |
2241 | ext_debug("blocks %d/%lu requested for inode %u\n", (int) iblock, | 2297 | ext_debug("blocks %u/%lu requested for inode %u\n", |
2242 | max_blocks, (unsigned) inode->i_ino); | 2298 | iblock, max_blocks, inode->i_ino); |
2243 | mutex_lock(&EXT4_I(inode)->truncate_mutex); | ||
2244 | 2299 | ||
2245 | /* check in cache */ | 2300 | /* check in cache */ |
2246 | goal = ext4_ext_in_cache(inode, iblock, &newex); | 2301 | goal = ext4_ext_in_cache(inode, iblock, &newex); |
@@ -2260,7 +2315,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2260 | - le32_to_cpu(newex.ee_block) | 2315 | - le32_to_cpu(newex.ee_block) |
2261 | + ext_pblock(&newex); | 2316 | + ext_pblock(&newex); |
2262 | /* number of remaining blocks in the extent */ | 2317 | /* number of remaining blocks in the extent */ |
2263 | allocated = le16_to_cpu(newex.ee_len) - | 2318 | allocated = ext4_ext_get_actual_len(&newex) - |
2264 | (iblock - le32_to_cpu(newex.ee_block)); | 2319 | (iblock - le32_to_cpu(newex.ee_block)); |
2265 | goto out; | 2320 | goto out; |
2266 | } else { | 2321 | } else { |
@@ -2288,7 +2343,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2288 | 2343 | ||
2289 | ex = path[depth].p_ext; | 2344 | ex = path[depth].p_ext; |
2290 | if (ex) { | 2345 | if (ex) { |
2291 | unsigned long ee_block = le32_to_cpu(ex->ee_block); | 2346 | ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); |
2292 | ext4_fsblk_t ee_start = ext_pblock(ex); | 2347 | ext4_fsblk_t ee_start = ext_pblock(ex); |
2293 | unsigned short ee_len; | 2348 | unsigned short ee_len; |
2294 | 2349 | ||
@@ -2302,7 +2357,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2302 | newblock = iblock - ee_block + ee_start; | 2357 | newblock = iblock - ee_block + ee_start; |
2303 | /* number of remaining blocks in the extent */ | 2358 | /* number of remaining blocks in the extent */ |
2304 | allocated = ee_len - (iblock - ee_block); | 2359 | allocated = ee_len - (iblock - ee_block); |
2305 | ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock, | 2360 | ext_debug("%u fit into %lu:%d -> %llu\n", iblock, |
2306 | ee_block, ee_len, newblock); | 2361 | ee_block, ee_len, newblock); |
2307 | 2362 | ||
2308 | /* Do not put uninitialized extent in the cache */ | 2363 | /* Do not put uninitialized extent in the cache */ |
@@ -2320,9 +2375,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2320 | ret = ext4_ext_convert_to_initialized(handle, inode, | 2375 | ret = ext4_ext_convert_to_initialized(handle, inode, |
2321 | path, iblock, | 2376 | path, iblock, |
2322 | max_blocks); | 2377 | max_blocks); |
2323 | if (ret <= 0) | 2378 | if (ret <= 0) { |
2379 | err = ret; | ||
2324 | goto out2; | 2380 | goto out2; |
2325 | else | 2381 | } else |
2326 | allocated = ret; | 2382 | allocated = ret; |
2327 | goto outnew; | 2383 | goto outnew; |
2328 | } | 2384 | } |
@@ -2347,8 +2403,15 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2347 | if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info)) | 2403 | if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info)) |
2348 | ext4_init_block_alloc_info(inode); | 2404 | ext4_init_block_alloc_info(inode); |
2349 | 2405 | ||
2350 | /* allocate new block */ | 2406 | /* find neighbour allocated blocks */ |
2351 | goal = ext4_ext_find_goal(inode, path, iblock); | 2407 | ar.lleft = iblock; |
2408 | err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); | ||
2409 | if (err) | ||
2410 | goto out2; | ||
2411 | ar.lright = iblock; | ||
2412 | err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); | ||
2413 | if (err) | ||
2414 | goto out2; | ||
2352 | 2415 | ||
2353 | /* | 2416 | /* |
2354 | * See if request is beyond maximum number of blocks we can have in | 2417 | * See if request is beyond maximum number of blocks we can have in |
@@ -2368,10 +2431,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2368 | newex.ee_len = cpu_to_le16(max_blocks); | 2431 | newex.ee_len = cpu_to_le16(max_blocks); |
2369 | err = ext4_ext_check_overlap(inode, &newex, path); | 2432 | err = ext4_ext_check_overlap(inode, &newex, path); |
2370 | if (err) | 2433 | if (err) |
2371 | allocated = le16_to_cpu(newex.ee_len); | 2434 | allocated = ext4_ext_get_actual_len(&newex); |
2372 | else | 2435 | else |
2373 | allocated = max_blocks; | 2436 | allocated = max_blocks; |
2374 | newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err); | 2437 | |
2438 | /* allocate new block */ | ||
2439 | ar.inode = inode; | ||
2440 | ar.goal = ext4_ext_find_goal(inode, path, iblock); | ||
2441 | ar.logical = iblock; | ||
2442 | ar.len = allocated; | ||
2443 | if (S_ISREG(inode->i_mode)) | ||
2444 | ar.flags = EXT4_MB_HINT_DATA; | ||
2445 | else | ||
2446 | /* disable in-core preallocation for non-regular files */ | ||
2447 | ar.flags = 0; | ||
2448 | newblock = ext4_mb_new_blocks(handle, &ar, &err); | ||
2375 | if (!newblock) | 2449 | if (!newblock) |
2376 | goto out2; | 2450 | goto out2; |
2377 | ext_debug("allocate new block: goal %llu, found %llu/%lu\n", | 2451 | ext_debug("allocate new block: goal %llu, found %llu/%lu\n", |
@@ -2379,14 +2453,17 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2379 | 2453 | ||
2380 | /* try to insert new extent into found leaf and return */ | 2454 | /* try to insert new extent into found leaf and return */ |
2381 | ext4_ext_store_pblock(&newex, newblock); | 2455 | ext4_ext_store_pblock(&newex, newblock); |
2382 | newex.ee_len = cpu_to_le16(allocated); | 2456 | newex.ee_len = cpu_to_le16(ar.len); |
2383 | if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ | 2457 | if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ |
2384 | ext4_ext_mark_uninitialized(&newex); | 2458 | ext4_ext_mark_uninitialized(&newex); |
2385 | err = ext4_ext_insert_extent(handle, inode, path, &newex); | 2459 | err = ext4_ext_insert_extent(handle, inode, path, &newex); |
2386 | if (err) { | 2460 | if (err) { |
2387 | /* free data blocks we just allocated */ | 2461 | /* free data blocks we just allocated */ |
2462 | /* not a good idea to call discard here directly, | ||
2463 | * but otherwise we'd need to call it every free() */ | ||
2464 | ext4_mb_discard_inode_preallocations(inode); | ||
2388 | ext4_free_blocks(handle, inode, ext_pblock(&newex), | 2465 | ext4_free_blocks(handle, inode, ext_pblock(&newex), |
2389 | le16_to_cpu(newex.ee_len)); | 2466 | ext4_ext_get_actual_len(&newex), 0); |
2390 | goto out2; | 2467 | goto out2; |
2391 | } | 2468 | } |
2392 | 2469 | ||
@@ -2395,6 +2472,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2395 | 2472 | ||
2396 | /* previous routine could use block we allocated */ | 2473 | /* previous routine could use block we allocated */ |
2397 | newblock = ext_pblock(&newex); | 2474 | newblock = ext_pblock(&newex); |
2475 | allocated = ext4_ext_get_actual_len(&newex); | ||
2398 | outnew: | 2476 | outnew: |
2399 | __set_bit(BH_New, &bh_result->b_state); | 2477 | __set_bit(BH_New, &bh_result->b_state); |
2400 | 2478 | ||
@@ -2414,8 +2492,6 @@ out2: | |||
2414 | ext4_ext_drop_refs(path); | 2492 | ext4_ext_drop_refs(path); |
2415 | kfree(path); | 2493 | kfree(path); |
2416 | } | 2494 | } |
2417 | mutex_unlock(&EXT4_I(inode)->truncate_mutex); | ||
2418 | |||
2419 | return err ? err : allocated; | 2495 | return err ? err : allocated; |
2420 | } | 2496 | } |
2421 | 2497 | ||
@@ -2423,7 +2499,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) | |||
2423 | { | 2499 | { |
2424 | struct address_space *mapping = inode->i_mapping; | 2500 | struct address_space *mapping = inode->i_mapping; |
2425 | struct super_block *sb = inode->i_sb; | 2501 | struct super_block *sb = inode->i_sb; |
2426 | unsigned long last_block; | 2502 | ext4_lblk_t last_block; |
2427 | handle_t *handle; | 2503 | handle_t *handle; |
2428 | int err = 0; | 2504 | int err = 0; |
2429 | 2505 | ||
@@ -2445,9 +2521,11 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) | |||
2445 | if (page) | 2521 | if (page) |
2446 | ext4_block_truncate_page(handle, page, mapping, inode->i_size); | 2522 | ext4_block_truncate_page(handle, page, mapping, inode->i_size); |
2447 | 2523 | ||
2448 | mutex_lock(&EXT4_I(inode)->truncate_mutex); | 2524 | down_write(&EXT4_I(inode)->i_data_sem); |
2449 | ext4_ext_invalidate_cache(inode); | 2525 | ext4_ext_invalidate_cache(inode); |
2450 | 2526 | ||
2527 | ext4_mb_discard_inode_preallocations(inode); | ||
2528 | |||
2451 | /* | 2529 | /* |
2452 | * TODO: optimization is possible here. | 2530 | * TODO: optimization is possible here. |
2453 | * Probably we need not scan at all, | 2531 | * Probably we need not scan at all, |
@@ -2481,7 +2559,7 @@ out_stop: | |||
2481 | if (inode->i_nlink) | 2559 | if (inode->i_nlink) |
2482 | ext4_orphan_del(handle, inode); | 2560 | ext4_orphan_del(handle, inode); |
2483 | 2561 | ||
2484 | mutex_unlock(&EXT4_I(inode)->truncate_mutex); | 2562 | up_write(&EXT4_I(inode)->i_data_sem); |
2485 | ext4_journal_stop(handle); | 2563 | ext4_journal_stop(handle); |
2486 | } | 2564 | } |
2487 | 2565 | ||
@@ -2516,7 +2594,8 @@ int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) | |||
2516 | long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) | 2594 | long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) |
2517 | { | 2595 | { |
2518 | handle_t *handle; | 2596 | handle_t *handle; |
2519 | ext4_fsblk_t block, max_blocks; | 2597 | ext4_lblk_t block; |
2598 | unsigned long max_blocks; | ||
2520 | ext4_fsblk_t nblocks = 0; | 2599 | ext4_fsblk_t nblocks = 0; |
2521 | int ret = 0; | 2600 | int ret = 0; |
2522 | int ret2 = 0; | 2601 | int ret2 = 0; |
@@ -2544,6 +2623,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) | |||
2544 | * modify 1 super block, 1 block bitmap and 1 group descriptor. | 2623 | * modify 1 super block, 1 block bitmap and 1 group descriptor. |
2545 | */ | 2624 | */ |
2546 | credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; | 2625 | credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; |
2626 | down_write((&EXT4_I(inode)->i_data_sem)); | ||
2547 | retry: | 2627 | retry: |
2548 | while (ret >= 0 && ret < max_blocks) { | 2628 | while (ret >= 0 && ret < max_blocks) { |
2549 | block = block + ret; | 2629 | block = block + ret; |
@@ -2557,12 +2637,12 @@ retry: | |||
2557 | ret = ext4_ext_get_blocks(handle, inode, block, | 2637 | ret = ext4_ext_get_blocks(handle, inode, block, |
2558 | max_blocks, &map_bh, | 2638 | max_blocks, &map_bh, |
2559 | EXT4_CREATE_UNINITIALIZED_EXT, 0); | 2639 | EXT4_CREATE_UNINITIALIZED_EXT, 0); |
2560 | WARN_ON(!ret); | 2640 | WARN_ON(ret <= 0); |
2561 | if (!ret) { | 2641 | if (ret <= 0) { |
2562 | ext4_error(inode->i_sb, "ext4_fallocate", | 2642 | ext4_error(inode->i_sb, "ext4_fallocate", |
2563 | "ext4_ext_get_blocks returned 0! inode#%lu" | 2643 | "ext4_ext_get_blocks returned error: " |
2564 | ", block=%llu, max_blocks=%llu", | 2644 | "inode#%lu, block=%u, max_blocks=%lu", |
2565 | inode->i_ino, block, max_blocks); | 2645 | inode->i_ino, block, max_blocks); |
2566 | ret = -EIO; | 2646 | ret = -EIO; |
2567 | ext4_mark_inode_dirty(handle, inode); | 2647 | ext4_mark_inode_dirty(handle, inode); |
2568 | ret2 = ext4_journal_stop(handle); | 2648 | ret2 = ext4_journal_stop(handle); |
@@ -2600,6 +2680,7 @@ retry: | |||
2600 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 2680 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
2601 | goto retry; | 2681 | goto retry; |
2602 | 2682 | ||
2683 | up_write((&EXT4_I(inode)->i_data_sem)); | ||
2603 | /* | 2684 | /* |
2604 | * Time to update the file size. | 2685 | * Time to update the file size. |
2605 | * Update only when preallocation was requested beyond the file size. | 2686 | * Update only when preallocation was requested beyond the file size. |
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 1a81cd66d63b..ac35ec58db55 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
@@ -37,9 +37,9 @@ static int ext4_release_file (struct inode * inode, struct file * filp) | |||
37 | if ((filp->f_mode & FMODE_WRITE) && | 37 | if ((filp->f_mode & FMODE_WRITE) && |
38 | (atomic_read(&inode->i_writecount) == 1)) | 38 | (atomic_read(&inode->i_writecount) == 1)) |
39 | { | 39 | { |
40 | mutex_lock(&EXT4_I(inode)->truncate_mutex); | 40 | down_write(&EXT4_I(inode)->i_data_sem); |
41 | ext4_discard_reservation(inode); | 41 | ext4_discard_reservation(inode); |
42 | mutex_unlock(&EXT4_I(inode)->truncate_mutex); | 42 | up_write(&EXT4_I(inode)->i_data_sem); |
43 | } | 43 | } |
44 | if (is_dx(inode) && filp->private_data) | 44 | if (is_dx(inode) && filp->private_data) |
45 | ext4_htree_free_dir_info(filp->private_data); | 45 | ext4_htree_free_dir_info(filp->private_data); |
@@ -56,8 +56,25 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, | |||
56 | ssize_t ret; | 56 | ssize_t ret; |
57 | int err; | 57 | int err; |
58 | 58 | ||
59 | ret = generic_file_aio_write(iocb, iov, nr_segs, pos); | 59 | /* |
60 | * If we have encountered a bitmap-format file, the size limit | ||
61 | * is smaller than s_maxbytes, which is for extent-mapped files. | ||
62 | */ | ||
63 | |||
64 | if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { | ||
65 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
66 | size_t length = iov_length(iov, nr_segs); | ||
60 | 67 | ||
68 | if (pos > sbi->s_bitmap_maxbytes) | ||
69 | return -EFBIG; | ||
70 | |||
71 | if (pos + length > sbi->s_bitmap_maxbytes) { | ||
72 | nr_segs = iov_shorten((struct iovec *)iov, nr_segs, | ||
73 | sbi->s_bitmap_maxbytes - pos); | ||
74 | } | ||
75 | } | ||
76 | |||
77 | ret = generic_file_aio_write(iocb, iov, nr_segs, pos); | ||
61 | /* | 78 | /* |
62 | * Skip flushing if there was an error, or if nothing was written. | 79 | * Skip flushing if there was an error, or if nothing was written. |
63 | */ | 80 | */ |
diff --git a/fs/ext4/group.h b/fs/ext4/group.h index 1577910bb58b..7eb0604e7eea 100644 --- a/fs/ext4/group.h +++ b/fs/ext4/group.h | |||
@@ -14,14 +14,16 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, | |||
14 | extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, | 14 | extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, |
15 | struct ext4_group_desc *gdp); | 15 | struct ext4_group_desc *gdp); |
16 | struct buffer_head *read_block_bitmap(struct super_block *sb, | 16 | struct buffer_head *read_block_bitmap(struct super_block *sb, |
17 | unsigned int block_group); | 17 | ext4_group_t block_group); |
18 | extern unsigned ext4_init_block_bitmap(struct super_block *sb, | 18 | extern unsigned ext4_init_block_bitmap(struct super_block *sb, |
19 | struct buffer_head *bh, int group, | 19 | struct buffer_head *bh, |
20 | ext4_group_t group, | ||
20 | struct ext4_group_desc *desc); | 21 | struct ext4_group_desc *desc); |
21 | #define ext4_free_blocks_after_init(sb, group, desc) \ | 22 | #define ext4_free_blocks_after_init(sb, group, desc) \ |
22 | ext4_init_block_bitmap(sb, NULL, group, desc) | 23 | ext4_init_block_bitmap(sb, NULL, group, desc) |
23 | extern unsigned ext4_init_inode_bitmap(struct super_block *sb, | 24 | extern unsigned ext4_init_inode_bitmap(struct super_block *sb, |
24 | struct buffer_head *bh, int group, | 25 | struct buffer_head *bh, |
26 | ext4_group_t group, | ||
25 | struct ext4_group_desc *desc); | 27 | struct ext4_group_desc *desc); |
26 | extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); | 28 | extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); |
27 | #endif /* _LINUX_EXT4_GROUP_H */ | 29 | #endif /* _LINUX_EXT4_GROUP_H */ |
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index c61f37fd3f05..575b5215c808 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
@@ -64,8 +64,8 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) | |||
64 | } | 64 | } |
65 | 65 | ||
66 | /* Initializes an uninitialized inode bitmap */ | 66 | /* Initializes an uninitialized inode bitmap */ |
67 | unsigned ext4_init_inode_bitmap(struct super_block *sb, | 67 | unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, |
68 | struct buffer_head *bh, int block_group, | 68 | ext4_group_t block_group, |
69 | struct ext4_group_desc *gdp) | 69 | struct ext4_group_desc *gdp) |
70 | { | 70 | { |
71 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 71 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
@@ -75,7 +75,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, | |||
75 | /* If checksum is bad mark all blocks and inodes use to prevent | 75 | /* If checksum is bad mark all blocks and inodes use to prevent |
76 | * allocation, essentially implementing a per-group read-only flag. */ | 76 | * allocation, essentially implementing a per-group read-only flag. */ |
77 | if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { | 77 | if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { |
78 | ext4_error(sb, __FUNCTION__, "Checksum bad for group %u\n", | 78 | ext4_error(sb, __FUNCTION__, "Checksum bad for group %lu\n", |
79 | block_group); | 79 | block_group); |
80 | gdp->bg_free_blocks_count = 0; | 80 | gdp->bg_free_blocks_count = 0; |
81 | gdp->bg_free_inodes_count = 0; | 81 | gdp->bg_free_inodes_count = 0; |
@@ -98,7 +98,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, | |||
98 | * Return buffer_head of bitmap on success or NULL. | 98 | * Return buffer_head of bitmap on success or NULL. |
99 | */ | 99 | */ |
100 | static struct buffer_head * | 100 | static struct buffer_head * |
101 | read_inode_bitmap(struct super_block * sb, unsigned long block_group) | 101 | read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) |
102 | { | 102 | { |
103 | struct ext4_group_desc *desc; | 103 | struct ext4_group_desc *desc; |
104 | struct buffer_head *bh = NULL; | 104 | struct buffer_head *bh = NULL; |
@@ -152,7 +152,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) | |||
152 | unsigned long ino; | 152 | unsigned long ino; |
153 | struct buffer_head *bitmap_bh = NULL; | 153 | struct buffer_head *bitmap_bh = NULL; |
154 | struct buffer_head *bh2; | 154 | struct buffer_head *bh2; |
155 | unsigned long block_group; | 155 | ext4_group_t block_group; |
156 | unsigned long bit; | 156 | unsigned long bit; |
157 | struct ext4_group_desc * gdp; | 157 | struct ext4_group_desc * gdp; |
158 | struct ext4_super_block * es; | 158 | struct ext4_super_block * es; |
@@ -260,12 +260,14 @@ error_return: | |||
260 | * For other inodes, search forward from the parent directory\'s block | 260 | * For other inodes, search forward from the parent directory\'s block |
261 | * group to find a free inode. | 261 | * group to find a free inode. |
262 | */ | 262 | */ |
263 | static int find_group_dir(struct super_block *sb, struct inode *parent) | 263 | static int find_group_dir(struct super_block *sb, struct inode *parent, |
264 | ext4_group_t *best_group) | ||
264 | { | 265 | { |
265 | int ngroups = EXT4_SB(sb)->s_groups_count; | 266 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; |
266 | unsigned int freei, avefreei; | 267 | unsigned int freei, avefreei; |
267 | struct ext4_group_desc *desc, *best_desc = NULL; | 268 | struct ext4_group_desc *desc, *best_desc = NULL; |
268 | int group, best_group = -1; | 269 | ext4_group_t group; |
270 | int ret = -1; | ||
269 | 271 | ||
270 | freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter); | 272 | freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter); |
271 | avefreei = freei / ngroups; | 273 | avefreei = freei / ngroups; |
@@ -279,11 +281,12 @@ static int find_group_dir(struct super_block *sb, struct inode *parent) | |||
279 | if (!best_desc || | 281 | if (!best_desc || |
280 | (le16_to_cpu(desc->bg_free_blocks_count) > | 282 | (le16_to_cpu(desc->bg_free_blocks_count) > |
281 | le16_to_cpu(best_desc->bg_free_blocks_count))) { | 283 | le16_to_cpu(best_desc->bg_free_blocks_count))) { |
282 | best_group = group; | 284 | *best_group = group; |
283 | best_desc = desc; | 285 | best_desc = desc; |
286 | ret = 0; | ||
284 | } | 287 | } |
285 | } | 288 | } |
286 | return best_group; | 289 | return ret; |
287 | } | 290 | } |
288 | 291 | ||
289 | /* | 292 | /* |
@@ -314,12 +317,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent) | |||
314 | #define INODE_COST 64 | 317 | #define INODE_COST 64 |
315 | #define BLOCK_COST 256 | 318 | #define BLOCK_COST 256 |
316 | 319 | ||
317 | static int find_group_orlov(struct super_block *sb, struct inode *parent) | 320 | static int find_group_orlov(struct super_block *sb, struct inode *parent, |
321 | ext4_group_t *group) | ||
318 | { | 322 | { |
319 | int parent_group = EXT4_I(parent)->i_block_group; | 323 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; |
320 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 324 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
321 | struct ext4_super_block *es = sbi->s_es; | 325 | struct ext4_super_block *es = sbi->s_es; |
322 | int ngroups = sbi->s_groups_count; | 326 | ext4_group_t ngroups = sbi->s_groups_count; |
323 | int inodes_per_group = EXT4_INODES_PER_GROUP(sb); | 327 | int inodes_per_group = EXT4_INODES_PER_GROUP(sb); |
324 | unsigned int freei, avefreei; | 328 | unsigned int freei, avefreei; |
325 | ext4_fsblk_t freeb, avefreeb; | 329 | ext4_fsblk_t freeb, avefreeb; |
@@ -327,7 +331,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) | |||
327 | unsigned int ndirs; | 331 | unsigned int ndirs; |
328 | int max_debt, max_dirs, min_inodes; | 332 | int max_debt, max_dirs, min_inodes; |
329 | ext4_grpblk_t min_blocks; | 333 | ext4_grpblk_t min_blocks; |
330 | int group = -1, i; | 334 | ext4_group_t i; |
331 | struct ext4_group_desc *desc; | 335 | struct ext4_group_desc *desc; |
332 | 336 | ||
333 | freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); | 337 | freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); |
@@ -340,13 +344,14 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) | |||
340 | if ((parent == sb->s_root->d_inode) || | 344 | if ((parent == sb->s_root->d_inode) || |
341 | (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { | 345 | (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { |
342 | int best_ndir = inodes_per_group; | 346 | int best_ndir = inodes_per_group; |
343 | int best_group = -1; | 347 | ext4_group_t grp; |
348 | int ret = -1; | ||
344 | 349 | ||
345 | get_random_bytes(&group, sizeof(group)); | 350 | get_random_bytes(&grp, sizeof(grp)); |
346 | parent_group = (unsigned)group % ngroups; | 351 | parent_group = (unsigned)grp % ngroups; |
347 | for (i = 0; i < ngroups; i++) { | 352 | for (i = 0; i < ngroups; i++) { |
348 | group = (parent_group + i) % ngroups; | 353 | grp = (parent_group + i) % ngroups; |
349 | desc = ext4_get_group_desc (sb, group, NULL); | 354 | desc = ext4_get_group_desc(sb, grp, NULL); |
350 | if (!desc || !desc->bg_free_inodes_count) | 355 | if (!desc || !desc->bg_free_inodes_count) |
351 | continue; | 356 | continue; |
352 | if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) | 357 | if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) |
@@ -355,11 +360,12 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) | |||
355 | continue; | 360 | continue; |
356 | if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) | 361 | if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) |
357 | continue; | 362 | continue; |
358 | best_group = group; | 363 | *group = grp; |
364 | ret = 0; | ||
359 | best_ndir = le16_to_cpu(desc->bg_used_dirs_count); | 365 | best_ndir = le16_to_cpu(desc->bg_used_dirs_count); |
360 | } | 366 | } |
361 | if (best_group >= 0) | 367 | if (ret == 0) |
362 | return best_group; | 368 | return ret; |
363 | goto fallback; | 369 | goto fallback; |
364 | } | 370 | } |
365 | 371 | ||
@@ -380,8 +386,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) | |||
380 | max_debt = 1; | 386 | max_debt = 1; |
381 | 387 | ||
382 | for (i = 0; i < ngroups; i++) { | 388 | for (i = 0; i < ngroups; i++) { |
383 | group = (parent_group + i) % ngroups; | 389 | *group = (parent_group + i) % ngroups; |
384 | desc = ext4_get_group_desc (sb, group, NULL); | 390 | desc = ext4_get_group_desc(sb, *group, NULL); |
385 | if (!desc || !desc->bg_free_inodes_count) | 391 | if (!desc || !desc->bg_free_inodes_count) |
386 | continue; | 392 | continue; |
387 | if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) | 393 | if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) |
@@ -390,17 +396,16 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) | |||
390 | continue; | 396 | continue; |
391 | if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) | 397 | if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) |
392 | continue; | 398 | continue; |
393 | return group; | 399 | return 0; |
394 | } | 400 | } |
395 | 401 | ||
396 | fallback: | 402 | fallback: |
397 | for (i = 0; i < ngroups; i++) { | 403 | for (i = 0; i < ngroups; i++) { |
398 | group = (parent_group + i) % ngroups; | 404 | *group = (parent_group + i) % ngroups; |
399 | desc = ext4_get_group_desc (sb, group, NULL); | 405 | desc = ext4_get_group_desc(sb, *group, NULL); |
400 | if (!desc || !desc->bg_free_inodes_count) | 406 | if (desc && desc->bg_free_inodes_count && |
401 | continue; | 407 | le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) |
402 | if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) | 408 | return 0; |
403 | return group; | ||
404 | } | 409 | } |
405 | 410 | ||
406 | if (avefreei) { | 411 | if (avefreei) { |
@@ -415,21 +420,22 @@ fallback: | |||
415 | return -1; | 420 | return -1; |
416 | } | 421 | } |
417 | 422 | ||
418 | static int find_group_other(struct super_block *sb, struct inode *parent) | 423 | static int find_group_other(struct super_block *sb, struct inode *parent, |
424 | ext4_group_t *group) | ||
419 | { | 425 | { |
420 | int parent_group = EXT4_I(parent)->i_block_group; | 426 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; |
421 | int ngroups = EXT4_SB(sb)->s_groups_count; | 427 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; |
422 | struct ext4_group_desc *desc; | 428 | struct ext4_group_desc *desc; |
423 | int group, i; | 429 | ext4_group_t i; |
424 | 430 | ||
425 | /* | 431 | /* |
426 | * Try to place the inode in its parent directory | 432 | * Try to place the inode in its parent directory |
427 | */ | 433 | */ |
428 | group = parent_group; | 434 | *group = parent_group; |
429 | desc = ext4_get_group_desc (sb, group, NULL); | 435 | desc = ext4_get_group_desc(sb, *group, NULL); |
430 | if (desc && le16_to_cpu(desc->bg_free_inodes_count) && | 436 | if (desc && le16_to_cpu(desc->bg_free_inodes_count) && |
431 | le16_to_cpu(desc->bg_free_blocks_count)) | 437 | le16_to_cpu(desc->bg_free_blocks_count)) |
432 | return group; | 438 | return 0; |
433 | 439 | ||
434 | /* | 440 | /* |
435 | * We're going to place this inode in a different blockgroup from its | 441 | * We're going to place this inode in a different blockgroup from its |
@@ -440,33 +446,33 @@ static int find_group_other(struct super_block *sb, struct inode *parent) | |||
440 | * | 446 | * |
441 | * So add our directory's i_ino into the starting point for the hash. | 447 | * So add our directory's i_ino into the starting point for the hash. |
442 | */ | 448 | */ |
443 | group = (group + parent->i_ino) % ngroups; | 449 | *group = (*group + parent->i_ino) % ngroups; |
444 | 450 | ||
445 | /* | 451 | /* |
446 | * Use a quadratic hash to find a group with a free inode and some free | 452 | * Use a quadratic hash to find a group with a free inode and some free |
447 | * blocks. | 453 | * blocks. |
448 | */ | 454 | */ |
449 | for (i = 1; i < ngroups; i <<= 1) { | 455 | for (i = 1; i < ngroups; i <<= 1) { |
450 | group += i; | 456 | *group += i; |
451 | if (group >= ngroups) | 457 | if (*group >= ngroups) |
452 | group -= ngroups; | 458 | *group -= ngroups; |
453 | desc = ext4_get_group_desc (sb, group, NULL); | 459 | desc = ext4_get_group_desc(sb, *group, NULL); |
454 | if (desc && le16_to_cpu(desc->bg_free_inodes_count) && | 460 | if (desc && le16_to_cpu(desc->bg_free_inodes_count) && |
455 | le16_to_cpu(desc->bg_free_blocks_count)) | 461 | le16_to_cpu(desc->bg_free_blocks_count)) |
456 | return group; | 462 | return 0; |
457 | } | 463 | } |
458 | 464 | ||
459 | /* | 465 | /* |
460 | * That failed: try linear search for a free inode, even if that group | 466 | * That failed: try linear search for a free inode, even if that group |
461 | * has no free blocks. | 467 | * has no free blocks. |
462 | */ | 468 | */ |
463 | group = parent_group; | 469 | *group = parent_group; |
464 | for (i = 0; i < ngroups; i++) { | 470 | for (i = 0; i < ngroups; i++) { |
465 | if (++group >= ngroups) | 471 | if (++*group >= ngroups) |
466 | group = 0; | 472 | *group = 0; |
467 | desc = ext4_get_group_desc (sb, group, NULL); | 473 | desc = ext4_get_group_desc(sb, *group, NULL); |
468 | if (desc && le16_to_cpu(desc->bg_free_inodes_count)) | 474 | if (desc && le16_to_cpu(desc->bg_free_inodes_count)) |
469 | return group; | 475 | return 0; |
470 | } | 476 | } |
471 | 477 | ||
472 | return -1; | 478 | return -1; |
@@ -487,16 +493,17 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) | |||
487 | struct super_block *sb; | 493 | struct super_block *sb; |
488 | struct buffer_head *bitmap_bh = NULL; | 494 | struct buffer_head *bitmap_bh = NULL; |
489 | struct buffer_head *bh2; | 495 | struct buffer_head *bh2; |
490 | int group; | 496 | ext4_group_t group = 0; |
491 | unsigned long ino = 0; | 497 | unsigned long ino = 0; |
492 | struct inode * inode; | 498 | struct inode * inode; |
493 | struct ext4_group_desc * gdp = NULL; | 499 | struct ext4_group_desc * gdp = NULL; |
494 | struct ext4_super_block * es; | 500 | struct ext4_super_block * es; |
495 | struct ext4_inode_info *ei; | 501 | struct ext4_inode_info *ei; |
496 | struct ext4_sb_info *sbi; | 502 | struct ext4_sb_info *sbi; |
497 | int err = 0; | 503 | int ret2, err = 0; |
498 | struct inode *ret; | 504 | struct inode *ret; |
499 | int i, free = 0; | 505 | ext4_group_t i; |
506 | int free = 0; | ||
500 | 507 | ||
501 | /* Cannot create files in a deleted directory */ | 508 | /* Cannot create files in a deleted directory */ |
502 | if (!dir || !dir->i_nlink) | 509 | if (!dir || !dir->i_nlink) |
@@ -512,14 +519,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) | |||
512 | es = sbi->s_es; | 519 | es = sbi->s_es; |
513 | if (S_ISDIR(mode)) { | 520 | if (S_ISDIR(mode)) { |
514 | if (test_opt (sb, OLDALLOC)) | 521 | if (test_opt (sb, OLDALLOC)) |
515 | group = find_group_dir(sb, dir); | 522 | ret2 = find_group_dir(sb, dir, &group); |
516 | else | 523 | else |
517 | group = find_group_orlov(sb, dir); | 524 | ret2 = find_group_orlov(sb, dir, &group); |
518 | } else | 525 | } else |
519 | group = find_group_other(sb, dir); | 526 | ret2 = find_group_other(sb, dir, &group); |
520 | 527 | ||
521 | err = -ENOSPC; | 528 | err = -ENOSPC; |
522 | if (group == -1) | 529 | if (ret2 == -1) |
523 | goto out; | 530 | goto out; |
524 | 531 | ||
525 | for (i = 0; i < sbi->s_groups_count; i++) { | 532 | for (i = 0; i < sbi->s_groups_count; i++) { |
@@ -583,7 +590,7 @@ got: | |||
583 | ino > EXT4_INODES_PER_GROUP(sb)) { | 590 | ino > EXT4_INODES_PER_GROUP(sb)) { |
584 | ext4_error(sb, __FUNCTION__, | 591 | ext4_error(sb, __FUNCTION__, |
585 | "reserved inode or inode > inodes count - " | 592 | "reserved inode or inode > inodes count - " |
586 | "block_group = %d, inode=%lu", group, | 593 | "block_group = %lu, inode=%lu", group, |
587 | ino + group * EXT4_INODES_PER_GROUP(sb)); | 594 | ino + group * EXT4_INODES_PER_GROUP(sb)); |
588 | err = -EIO; | 595 | err = -EIO; |
589 | goto fail; | 596 | goto fail; |
@@ -702,7 +709,6 @@ got: | |||
702 | if (!S_ISDIR(mode)) | 709 | if (!S_ISDIR(mode)) |
703 | ei->i_flags &= ~EXT4_DIRSYNC_FL; | 710 | ei->i_flags &= ~EXT4_DIRSYNC_FL; |
704 | ei->i_file_acl = 0; | 711 | ei->i_file_acl = 0; |
705 | ei->i_dir_acl = 0; | ||
706 | ei->i_dtime = 0; | 712 | ei->i_dtime = 0; |
707 | ei->i_block_alloc_info = NULL; | 713 | ei->i_block_alloc_info = NULL; |
708 | ei->i_block_group = group; | 714 | ei->i_block_group = group; |
@@ -741,13 +747,10 @@ got: | |||
741 | if (test_opt(sb, EXTENTS)) { | 747 | if (test_opt(sb, EXTENTS)) { |
742 | EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; | 748 | EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; |
743 | ext4_ext_tree_init(handle, inode); | 749 | ext4_ext_tree_init(handle, inode); |
744 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { | 750 | err = ext4_update_incompat_feature(handle, sb, |
745 | err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); | 751 | EXT4_FEATURE_INCOMPAT_EXTENTS); |
746 | if (err) goto fail; | 752 | if (err) |
747 | EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS); | 753 | goto fail; |
748 | BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "call ext4_journal_dirty_metadata"); | ||
749 | err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); | ||
750 | } | ||
751 | } | 754 | } |
752 | 755 | ||
753 | ext4_debug("allocating inode %lu\n", inode->i_ino); | 756 | ext4_debug("allocating inode %lu\n", inode->i_ino); |
@@ -777,7 +780,7 @@ fail_drop: | |||
777 | struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) | 780 | struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) |
778 | { | 781 | { |
779 | unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); | 782 | unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); |
780 | unsigned long block_group; | 783 | ext4_group_t block_group; |
781 | int bit; | 784 | int bit; |
782 | struct buffer_head *bitmap_bh = NULL; | 785 | struct buffer_head *bitmap_bh = NULL; |
783 | struct inode *inode = NULL; | 786 | struct inode *inode = NULL; |
@@ -833,7 +836,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) | |||
833 | { | 836 | { |
834 | unsigned long desc_count; | 837 | unsigned long desc_count; |
835 | struct ext4_group_desc *gdp; | 838 | struct ext4_group_desc *gdp; |
836 | int i; | 839 | ext4_group_t i; |
837 | #ifdef EXT4FS_DEBUG | 840 | #ifdef EXT4FS_DEBUG |
838 | struct ext4_super_block *es; | 841 | struct ext4_super_block *es; |
839 | unsigned long bitmap_count, x; | 842 | unsigned long bitmap_count, x; |
@@ -854,7 +857,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) | |||
854 | continue; | 857 | continue; |
855 | 858 | ||
856 | x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); | 859 | x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); |
857 | printk("group %d: stored = %d, counted = %lu\n", | 860 | printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", |
858 | i, le16_to_cpu(gdp->bg_free_inodes_count), x); | 861 | i, le16_to_cpu(gdp->bg_free_inodes_count), x); |
859 | bitmap_count += x; | 862 | bitmap_count += x; |
860 | } | 863 | } |
@@ -879,7 +882,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) | |||
879 | unsigned long ext4_count_dirs (struct super_block * sb) | 882 | unsigned long ext4_count_dirs (struct super_block * sb) |
880 | { | 883 | { |
881 | unsigned long count = 0; | 884 | unsigned long count = 0; |
882 | int i; | 885 | ext4_group_t i; |
883 | 886 | ||
884 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { | 887 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { |
885 | struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL); | 888 | struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL); |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5489703d9573..bb717cbb749c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -105,7 +105,7 @@ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, | |||
105 | */ | 105 | */ |
106 | static unsigned long blocks_for_truncate(struct inode *inode) | 106 | static unsigned long blocks_for_truncate(struct inode *inode) |
107 | { | 107 | { |
108 | unsigned long needed; | 108 | ext4_lblk_t needed; |
109 | 109 | ||
110 | needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); | 110 | needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); |
111 | 111 | ||
@@ -243,13 +243,6 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) | |||
243 | p->bh = bh; | 243 | p->bh = bh; |
244 | } | 244 | } |
245 | 245 | ||
246 | static int verify_chain(Indirect *from, Indirect *to) | ||
247 | { | ||
248 | while (from <= to && from->key == *from->p) | ||
249 | from++; | ||
250 | return (from > to); | ||
251 | } | ||
252 | |||
253 | /** | 246 | /** |
254 | * ext4_block_to_path - parse the block number into array of offsets | 247 | * ext4_block_to_path - parse the block number into array of offsets |
255 | * @inode: inode in question (we are only interested in its superblock) | 248 | * @inode: inode in question (we are only interested in its superblock) |
@@ -282,7 +275,8 @@ static int verify_chain(Indirect *from, Indirect *to) | |||
282 | */ | 275 | */ |
283 | 276 | ||
284 | static int ext4_block_to_path(struct inode *inode, | 277 | static int ext4_block_to_path(struct inode *inode, |
285 | long i_block, int offsets[4], int *boundary) | 278 | ext4_lblk_t i_block, |
279 | ext4_lblk_t offsets[4], int *boundary) | ||
286 | { | 280 | { |
287 | int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); | 281 | int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); |
288 | int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); | 282 | int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); |
@@ -313,7 +307,10 @@ static int ext4_block_to_path(struct inode *inode, | |||
313 | offsets[n++] = i_block & (ptrs - 1); | 307 | offsets[n++] = i_block & (ptrs - 1); |
314 | final = ptrs; | 308 | final = ptrs; |
315 | } else { | 309 | } else { |
316 | ext4_warning(inode->i_sb, "ext4_block_to_path", "block > big"); | 310 | ext4_warning(inode->i_sb, "ext4_block_to_path", |
311 | "block %lu > max", | ||
312 | i_block + direct_blocks + | ||
313 | indirect_blocks + double_blocks); | ||
317 | } | 314 | } |
318 | if (boundary) | 315 | if (boundary) |
319 | *boundary = final - 1 - (i_block & (ptrs - 1)); | 316 | *boundary = final - 1 - (i_block & (ptrs - 1)); |
@@ -344,12 +341,14 @@ static int ext4_block_to_path(struct inode *inode, | |||
344 | * (pointer to last triple returned, *@err == 0) | 341 | * (pointer to last triple returned, *@err == 0) |
345 | * or when it gets an IO error reading an indirect block | 342 | * or when it gets an IO error reading an indirect block |
346 | * (ditto, *@err == -EIO) | 343 | * (ditto, *@err == -EIO) |
347 | * or when it notices that chain had been changed while it was reading | ||
348 | * (ditto, *@err == -EAGAIN) | ||
349 | * or when it reads all @depth-1 indirect blocks successfully and finds | 344 | * or when it reads all @depth-1 indirect blocks successfully and finds |
350 | * the whole chain, all way to the data (returns %NULL, *err == 0). | 345 | * the whole chain, all way to the data (returns %NULL, *err == 0). |
346 | * | ||
347 | * Need to be called with | ||
348 | * down_read(&EXT4_I(inode)->i_data_sem) | ||
351 | */ | 349 | */ |
352 | static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets, | 350 | static Indirect *ext4_get_branch(struct inode *inode, int depth, |
351 | ext4_lblk_t *offsets, | ||
353 | Indirect chain[4], int *err) | 352 | Indirect chain[4], int *err) |
354 | { | 353 | { |
355 | struct super_block *sb = inode->i_sb; | 354 | struct super_block *sb = inode->i_sb; |
@@ -365,9 +364,6 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets, | |||
365 | bh = sb_bread(sb, le32_to_cpu(p->key)); | 364 | bh = sb_bread(sb, le32_to_cpu(p->key)); |
366 | if (!bh) | 365 | if (!bh) |
367 | goto failure; | 366 | goto failure; |
368 | /* Reader: pointers */ | ||
369 | if (!verify_chain(chain, p)) | ||
370 | goto changed; | ||
371 | add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); | 367 | add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); |
372 | /* Reader: end */ | 368 | /* Reader: end */ |
373 | if (!p->key) | 369 | if (!p->key) |
@@ -375,10 +371,6 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets, | |||
375 | } | 371 | } |
376 | return NULL; | 372 | return NULL; |
377 | 373 | ||
378 | changed: | ||
379 | brelse(bh); | ||
380 | *err = -EAGAIN; | ||
381 | goto no_block; | ||
382 | failure: | 374 | failure: |
383 | *err = -EIO; | 375 | *err = -EIO; |
384 | no_block: | 376 | no_block: |
@@ -445,7 +437,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) | |||
445 | * stores it in *@goal and returns zero. | 437 | * stores it in *@goal and returns zero. |
446 | */ | 438 | */ |
447 | 439 | ||
448 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, long block, | 440 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, |
449 | Indirect chain[4], Indirect *partial) | 441 | Indirect chain[4], Indirect *partial) |
450 | { | 442 | { |
451 | struct ext4_block_alloc_info *block_i; | 443 | struct ext4_block_alloc_info *block_i; |
@@ -559,7 +551,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |||
559 | return ret; | 551 | return ret; |
560 | failed_out: | 552 | failed_out: |
561 | for (i = 0; i <index; i++) | 553 | for (i = 0; i <index; i++) |
562 | ext4_free_blocks(handle, inode, new_blocks[i], 1); | 554 | ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); |
563 | return ret; | 555 | return ret; |
564 | } | 556 | } |
565 | 557 | ||
@@ -590,7 +582,7 @@ failed_out: | |||
590 | */ | 582 | */ |
591 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | 583 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, |
592 | int indirect_blks, int *blks, ext4_fsblk_t goal, | 584 | int indirect_blks, int *blks, ext4_fsblk_t goal, |
593 | int *offsets, Indirect *branch) | 585 | ext4_lblk_t *offsets, Indirect *branch) |
594 | { | 586 | { |
595 | int blocksize = inode->i_sb->s_blocksize; | 587 | int blocksize = inode->i_sb->s_blocksize; |
596 | int i, n = 0; | 588 | int i, n = 0; |
@@ -658,9 +650,9 @@ failed: | |||
658 | ext4_journal_forget(handle, branch[i].bh); | 650 | ext4_journal_forget(handle, branch[i].bh); |
659 | } | 651 | } |
660 | for (i = 0; i <indirect_blks; i++) | 652 | for (i = 0; i <indirect_blks; i++) |
661 | ext4_free_blocks(handle, inode, new_blocks[i], 1); | 653 | ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); |
662 | 654 | ||
663 | ext4_free_blocks(handle, inode, new_blocks[i], num); | 655 | ext4_free_blocks(handle, inode, new_blocks[i], num, 0); |
664 | 656 | ||
665 | return err; | 657 | return err; |
666 | } | 658 | } |
@@ -680,7 +672,7 @@ failed: | |||
680 | * chain to new block and return 0. | 672 | * chain to new block and return 0. |
681 | */ | 673 | */ |
682 | static int ext4_splice_branch(handle_t *handle, struct inode *inode, | 674 | static int ext4_splice_branch(handle_t *handle, struct inode *inode, |
683 | long block, Indirect *where, int num, int blks) | 675 | ext4_lblk_t block, Indirect *where, int num, int blks) |
684 | { | 676 | { |
685 | int i; | 677 | int i; |
686 | int err = 0; | 678 | int err = 0; |
@@ -757,9 +749,10 @@ err_out: | |||
757 | for (i = 1; i <= num; i++) { | 749 | for (i = 1; i <= num; i++) { |
758 | BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); | 750 | BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); |
759 | ext4_journal_forget(handle, where[i].bh); | 751 | ext4_journal_forget(handle, where[i].bh); |
760 | ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); | 752 | ext4_free_blocks(handle, inode, |
753 | le32_to_cpu(where[i-1].key), 1, 0); | ||
761 | } | 754 | } |
762 | ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); | 755 | ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); |
763 | 756 | ||
764 | return err; | 757 | return err; |
765 | } | 758 | } |
@@ -782,14 +775,19 @@ err_out: | |||
782 | * return > 0, # of blocks mapped or allocated. | 775 | * return > 0, # of blocks mapped or allocated. |
783 | * return = 0, if plain lookup failed. | 776 | * return = 0, if plain lookup failed. |
784 | * return < 0, error case. | 777 | * return < 0, error case. |
778 | * | ||
779 | * | ||
780 | * Need to be called with | ||
781 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block | ||
782 | * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) | ||
785 | */ | 783 | */ |
786 | int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | 784 | int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, |
787 | sector_t iblock, unsigned long maxblocks, | 785 | ext4_lblk_t iblock, unsigned long maxblocks, |
788 | struct buffer_head *bh_result, | 786 | struct buffer_head *bh_result, |
789 | int create, int extend_disksize) | 787 | int create, int extend_disksize) |
790 | { | 788 | { |
791 | int err = -EIO; | 789 | int err = -EIO; |
792 | int offsets[4]; | 790 | ext4_lblk_t offsets[4]; |
793 | Indirect chain[4]; | 791 | Indirect chain[4]; |
794 | Indirect *partial; | 792 | Indirect *partial; |
795 | ext4_fsblk_t goal; | 793 | ext4_fsblk_t goal; |
@@ -803,7 +801,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
803 | 801 | ||
804 | J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); | 802 | J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); |
805 | J_ASSERT(handle != NULL || create == 0); | 803 | J_ASSERT(handle != NULL || create == 0); |
806 | depth = ext4_block_to_path(inode,iblock,offsets,&blocks_to_boundary); | 804 | depth = ext4_block_to_path(inode, iblock, offsets, |
805 | &blocks_to_boundary); | ||
807 | 806 | ||
808 | if (depth == 0) | 807 | if (depth == 0) |
809 | goto out; | 808 | goto out; |
@@ -819,18 +818,6 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
819 | while (count < maxblocks && count <= blocks_to_boundary) { | 818 | while (count < maxblocks && count <= blocks_to_boundary) { |
820 | ext4_fsblk_t blk; | 819 | ext4_fsblk_t blk; |
821 | 820 | ||
822 | if (!verify_chain(chain, partial)) { | ||
823 | /* | ||
824 | * Indirect block might be removed by | ||
825 | * truncate while we were reading it. | ||
826 | * Handling of that case: forget what we've | ||
827 | * got now. Flag the err as EAGAIN, so it | ||
828 | * will reread. | ||
829 | */ | ||
830 | err = -EAGAIN; | ||
831 | count = 0; | ||
832 | break; | ||
833 | } | ||
834 | blk = le32_to_cpu(*(chain[depth-1].p + count)); | 821 | blk = le32_to_cpu(*(chain[depth-1].p + count)); |
835 | 822 | ||
836 | if (blk == first_block + count) | 823 | if (blk == first_block + count) |
@@ -838,44 +825,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
838 | else | 825 | else |
839 | break; | 826 | break; |
840 | } | 827 | } |
841 | if (err != -EAGAIN) | 828 | goto got_it; |
842 | goto got_it; | ||
843 | } | 829 | } |
844 | 830 | ||
845 | /* Next simple case - plain lookup or failed read of indirect block */ | 831 | /* Next simple case - plain lookup or failed read of indirect block */ |
846 | if (!create || err == -EIO) | 832 | if (!create || err == -EIO) |
847 | goto cleanup; | 833 | goto cleanup; |
848 | 834 | ||
849 | mutex_lock(&ei->truncate_mutex); | ||
850 | |||
851 | /* | ||
852 | * If the indirect block is missing while we are reading | ||
853 | * the chain(ext4_get_branch() returns -EAGAIN err), or | ||
854 | * if the chain has been changed after we grab the semaphore, | ||
855 | * (either because another process truncated this branch, or | ||
856 | * another get_block allocated this branch) re-grab the chain to see if | ||
857 | * the request block has been allocated or not. | ||
858 | * | ||
859 | * Since we already block the truncate/other get_block | ||
860 | * at this point, we will have the current copy of the chain when we | ||
861 | * splice the branch into the tree. | ||
862 | */ | ||
863 | if (err == -EAGAIN || !verify_chain(chain, partial)) { | ||
864 | while (partial > chain) { | ||
865 | brelse(partial->bh); | ||
866 | partial--; | ||
867 | } | ||
868 | partial = ext4_get_branch(inode, depth, offsets, chain, &err); | ||
869 | if (!partial) { | ||
870 | count++; | ||
871 | mutex_unlock(&ei->truncate_mutex); | ||
872 | if (err) | ||
873 | goto cleanup; | ||
874 | clear_buffer_new(bh_result); | ||
875 | goto got_it; | ||
876 | } | ||
877 | } | ||
878 | |||
879 | /* | 835 | /* |
880 | * Okay, we need to do block allocation. Lazily initialize the block | 836 | * Okay, we need to do block allocation. Lazily initialize the block |
881 | * allocation info here if necessary | 837 | * allocation info here if necessary |
@@ -911,13 +867,12 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
911 | err = ext4_splice_branch(handle, inode, iblock, | 867 | err = ext4_splice_branch(handle, inode, iblock, |
912 | partial, indirect_blks, count); | 868 | partial, indirect_blks, count); |
913 | /* | 869 | /* |
914 | * i_disksize growing is protected by truncate_mutex. Don't forget to | 870 | * i_disksize growing is protected by i_data_sem. Don't forget to |
915 | * protect it if you're about to implement concurrent | 871 | * protect it if you're about to implement concurrent |
916 | * ext4_get_block() -bzzz | 872 | * ext4_get_block() -bzzz |
917 | */ | 873 | */ |
918 | if (!err && extend_disksize && inode->i_size > ei->i_disksize) | 874 | if (!err && extend_disksize && inode->i_size > ei->i_disksize) |
919 | ei->i_disksize = inode->i_size; | 875 | ei->i_disksize = inode->i_size; |
920 | mutex_unlock(&ei->truncate_mutex); | ||
921 | if (err) | 876 | if (err) |
922 | goto cleanup; | 877 | goto cleanup; |
923 | 878 | ||
@@ -942,6 +897,47 @@ out: | |||
942 | 897 | ||
943 | #define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32) | 898 | #define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32) |
944 | 899 | ||
900 | int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | ||
901 | unsigned long max_blocks, struct buffer_head *bh, | ||
902 | int create, int extend_disksize) | ||
903 | { | ||
904 | int retval; | ||
905 | /* | ||
906 | * Try to see if we can get the block without requesting | ||
907 | * for new file system block. | ||
908 | */ | ||
909 | down_read((&EXT4_I(inode)->i_data_sem)); | ||
910 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | ||
911 | retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, | ||
912 | bh, 0, 0); | ||
913 | } else { | ||
914 | retval = ext4_get_blocks_handle(handle, | ||
915 | inode, block, max_blocks, bh, 0, 0); | ||
916 | } | ||
917 | up_read((&EXT4_I(inode)->i_data_sem)); | ||
918 | if (!create || (retval > 0)) | ||
919 | return retval; | ||
920 | |||
921 | /* | ||
922 | * We need to allocate new blocks which will result | ||
923 | * in i_data update | ||
924 | */ | ||
925 | down_write((&EXT4_I(inode)->i_data_sem)); | ||
926 | /* | ||
927 | * We need to check for EXT4 here because migrate | ||
928 | * could have changed the inode type in between | ||
929 | */ | ||
930 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | ||
931 | retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, | ||
932 | bh, create, extend_disksize); | ||
933 | } else { | ||
934 | retval = ext4_get_blocks_handle(handle, inode, block, | ||
935 | max_blocks, bh, create, extend_disksize); | ||
936 | } | ||
937 | up_write((&EXT4_I(inode)->i_data_sem)); | ||
938 | return retval; | ||
939 | } | ||
940 | |||
945 | static int ext4_get_block(struct inode *inode, sector_t iblock, | 941 | static int ext4_get_block(struct inode *inode, sector_t iblock, |
946 | struct buffer_head *bh_result, int create) | 942 | struct buffer_head *bh_result, int create) |
947 | { | 943 | { |
@@ -996,7 +992,7 @@ get_block: | |||
996 | * `handle' can be NULL if create is zero | 992 | * `handle' can be NULL if create is zero |
997 | */ | 993 | */ |
998 | struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, | 994 | struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, |
999 | long block, int create, int *errp) | 995 | ext4_lblk_t block, int create, int *errp) |
1000 | { | 996 | { |
1001 | struct buffer_head dummy; | 997 | struct buffer_head dummy; |
1002 | int fatal = 0, err; | 998 | int fatal = 0, err; |
@@ -1063,7 +1059,7 @@ err: | |||
1063 | } | 1059 | } |
1064 | 1060 | ||
1065 | struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, | 1061 | struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, |
1066 | int block, int create, int *err) | 1062 | ext4_lblk_t block, int create, int *err) |
1067 | { | 1063 | { |
1068 | struct buffer_head * bh; | 1064 | struct buffer_head * bh; |
1069 | 1065 | ||
@@ -1446,7 +1442,7 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) | |||
1446 | * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... | 1442 | * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... |
1447 | * | 1443 | * |
1448 | * Same applies to ext4_get_block(). We will deadlock on various things like | 1444 | * Same applies to ext4_get_block(). We will deadlock on various things like |
1449 | * lock_journal and i_truncate_mutex. | 1445 | * lock_journal and i_data_sem |
1450 | * | 1446 | * |
1451 | * Setting PF_MEMALLOC here doesn't work - too many internal memory | 1447 | * Setting PF_MEMALLOC here doesn't work - too many internal memory |
1452 | * allocations fail. | 1448 | * allocations fail. |
@@ -1828,7 +1824,8 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, | |||
1828 | { | 1824 | { |
1829 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; | 1825 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; |
1830 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | 1826 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
1831 | unsigned blocksize, iblock, length, pos; | 1827 | unsigned blocksize, length, pos; |
1828 | ext4_lblk_t iblock; | ||
1832 | struct inode *inode = mapping->host; | 1829 | struct inode *inode = mapping->host; |
1833 | struct buffer_head *bh; | 1830 | struct buffer_head *bh; |
1834 | int err = 0; | 1831 | int err = 0; |
@@ -1964,7 +1961,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q) | |||
1964 | * (no partially truncated stuff there). */ | 1961 | * (no partially truncated stuff there). */ |
1965 | 1962 | ||
1966 | static Indirect *ext4_find_shared(struct inode *inode, int depth, | 1963 | static Indirect *ext4_find_shared(struct inode *inode, int depth, |
1967 | int offsets[4], Indirect chain[4], __le32 *top) | 1964 | ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) |
1968 | { | 1965 | { |
1969 | Indirect *partial, *p; | 1966 | Indirect *partial, *p; |
1970 | int k, err; | 1967 | int k, err; |
@@ -2048,15 +2045,15 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, | |||
2048 | for (p = first; p < last; p++) { | 2045 | for (p = first; p < last; p++) { |
2049 | u32 nr = le32_to_cpu(*p); | 2046 | u32 nr = le32_to_cpu(*p); |
2050 | if (nr) { | 2047 | if (nr) { |
2051 | struct buffer_head *bh; | 2048 | struct buffer_head *tbh; |
2052 | 2049 | ||
2053 | *p = 0; | 2050 | *p = 0; |
2054 | bh = sb_find_get_block(inode->i_sb, nr); | 2051 | tbh = sb_find_get_block(inode->i_sb, nr); |
2055 | ext4_forget(handle, 0, inode, bh, nr); | 2052 | ext4_forget(handle, 0, inode, tbh, nr); |
2056 | } | 2053 | } |
2057 | } | 2054 | } |
2058 | 2055 | ||
2059 | ext4_free_blocks(handle, inode, block_to_free, count); | 2056 | ext4_free_blocks(handle, inode, block_to_free, count, 0); |
2060 | } | 2057 | } |
2061 | 2058 | ||
2062 | /** | 2059 | /** |
@@ -2229,7 +2226,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
2229 | ext4_journal_test_restart(handle, inode); | 2226 | ext4_journal_test_restart(handle, inode); |
2230 | } | 2227 | } |
2231 | 2228 | ||
2232 | ext4_free_blocks(handle, inode, nr, 1); | 2229 | ext4_free_blocks(handle, inode, nr, 1, 1); |
2233 | 2230 | ||
2234 | if (parent_bh) { | 2231 | if (parent_bh) { |
2235 | /* | 2232 | /* |
@@ -2289,12 +2286,12 @@ void ext4_truncate(struct inode *inode) | |||
2289 | __le32 *i_data = ei->i_data; | 2286 | __le32 *i_data = ei->i_data; |
2290 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | 2287 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); |
2291 | struct address_space *mapping = inode->i_mapping; | 2288 | struct address_space *mapping = inode->i_mapping; |
2292 | int offsets[4]; | 2289 | ext4_lblk_t offsets[4]; |
2293 | Indirect chain[4]; | 2290 | Indirect chain[4]; |
2294 | Indirect *partial; | 2291 | Indirect *partial; |
2295 | __le32 nr = 0; | 2292 | __le32 nr = 0; |
2296 | int n; | 2293 | int n; |
2297 | long last_block; | 2294 | ext4_lblk_t last_block; |
2298 | unsigned blocksize = inode->i_sb->s_blocksize; | 2295 | unsigned blocksize = inode->i_sb->s_blocksize; |
2299 | struct page *page; | 2296 | struct page *page; |
2300 | 2297 | ||
@@ -2320,8 +2317,10 @@ void ext4_truncate(struct inode *inode) | |||
2320 | return; | 2317 | return; |
2321 | } | 2318 | } |
2322 | 2319 | ||
2323 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | 2320 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { |
2324 | return ext4_ext_truncate(inode, page); | 2321 | ext4_ext_truncate(inode, page); |
2322 | return; | ||
2323 | } | ||
2325 | 2324 | ||
2326 | handle = start_transaction(inode); | 2325 | handle = start_transaction(inode); |
2327 | if (IS_ERR(handle)) { | 2326 | if (IS_ERR(handle)) { |
@@ -2369,7 +2368,7 @@ void ext4_truncate(struct inode *inode) | |||
2369 | * From here we block out all ext4_get_block() callers who want to | 2368 | * From here we block out all ext4_get_block() callers who want to |
2370 | * modify the block allocation tree. | 2369 | * modify the block allocation tree. |
2371 | */ | 2370 | */ |
2372 | mutex_lock(&ei->truncate_mutex); | 2371 | down_write(&ei->i_data_sem); |
2373 | 2372 | ||
2374 | if (n == 1) { /* direct blocks */ | 2373 | if (n == 1) { /* direct blocks */ |
2375 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | 2374 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], |
@@ -2433,7 +2432,7 @@ do_indirects: | |||
2433 | 2432 | ||
2434 | ext4_discard_reservation(inode); | 2433 | ext4_discard_reservation(inode); |
2435 | 2434 | ||
2436 | mutex_unlock(&ei->truncate_mutex); | 2435 | up_write(&ei->i_data_sem); |
2437 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | 2436 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); |
2438 | ext4_mark_inode_dirty(handle, inode); | 2437 | ext4_mark_inode_dirty(handle, inode); |
2439 | 2438 | ||
@@ -2460,7 +2459,8 @@ out_stop: | |||
2460 | static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, | 2459 | static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, |
2461 | unsigned long ino, struct ext4_iloc *iloc) | 2460 | unsigned long ino, struct ext4_iloc *iloc) |
2462 | { | 2461 | { |
2463 | unsigned long desc, group_desc, block_group; | 2462 | unsigned long desc, group_desc; |
2463 | ext4_group_t block_group; | ||
2464 | unsigned long offset; | 2464 | unsigned long offset; |
2465 | ext4_fsblk_t block; | 2465 | ext4_fsblk_t block; |
2466 | struct buffer_head *bh; | 2466 | struct buffer_head *bh; |
@@ -2547,7 +2547,7 @@ static int __ext4_get_inode_loc(struct inode *inode, | |||
2547 | struct ext4_group_desc *desc; | 2547 | struct ext4_group_desc *desc; |
2548 | int inodes_per_buffer; | 2548 | int inodes_per_buffer; |
2549 | int inode_offset, i; | 2549 | int inode_offset, i; |
2550 | int block_group; | 2550 | ext4_group_t block_group; |
2551 | int start; | 2551 | int start; |
2552 | 2552 | ||
2553 | block_group = (inode->i_ino - 1) / | 2553 | block_group = (inode->i_ino - 1) / |
@@ -2660,6 +2660,28 @@ void ext4_get_inode_flags(struct ext4_inode_info *ei) | |||
2660 | if (flags & S_DIRSYNC) | 2660 | if (flags & S_DIRSYNC) |
2661 | ei->i_flags |= EXT4_DIRSYNC_FL; | 2661 | ei->i_flags |= EXT4_DIRSYNC_FL; |
2662 | } | 2662 | } |
2663 | static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, | ||
2664 | struct ext4_inode_info *ei) | ||
2665 | { | ||
2666 | blkcnt_t i_blocks ; | ||
2667 | struct inode *inode = &(ei->vfs_inode); | ||
2668 | struct super_block *sb = inode->i_sb; | ||
2669 | |||
2670 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, | ||
2671 | EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { | ||
2672 | /* we are using combined 48 bit field */ | ||
2673 | i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | | ||
2674 | le32_to_cpu(raw_inode->i_blocks_lo); | ||
2675 | if (ei->i_flags & EXT4_HUGE_FILE_FL) { | ||
2676 | /* i_blocks represent file system block size */ | ||
2677 | return i_blocks << (inode->i_blkbits - 9); | ||
2678 | } else { | ||
2679 | return i_blocks; | ||
2680 | } | ||
2681 | } else { | ||
2682 | return le32_to_cpu(raw_inode->i_blocks_lo); | ||
2683 | } | ||
2684 | } | ||
2663 | 2685 | ||
2664 | void ext4_read_inode(struct inode * inode) | 2686 | void ext4_read_inode(struct inode * inode) |
2665 | { | 2687 | { |
@@ -2687,7 +2709,6 @@ void ext4_read_inode(struct inode * inode) | |||
2687 | inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; | 2709 | inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; |
2688 | } | 2710 | } |
2689 | inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); | 2711 | inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); |
2690 | inode->i_size = le32_to_cpu(raw_inode->i_size); | ||
2691 | 2712 | ||
2692 | ei->i_state = 0; | 2713 | ei->i_state = 0; |
2693 | ei->i_dir_start_lookup = 0; | 2714 | ei->i_dir_start_lookup = 0; |
@@ -2709,19 +2730,15 @@ void ext4_read_inode(struct inode * inode) | |||
2709 | * recovery code: that's fine, we're about to complete | 2730 | * recovery code: that's fine, we're about to complete |
2710 | * the process of deleting those. */ | 2731 | * the process of deleting those. */ |
2711 | } | 2732 | } |
2712 | inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); | ||
2713 | ei->i_flags = le32_to_cpu(raw_inode->i_flags); | 2733 | ei->i_flags = le32_to_cpu(raw_inode->i_flags); |
2714 | ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); | 2734 | inode->i_blocks = ext4_inode_blocks(raw_inode, ei); |
2735 | ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); | ||
2715 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != | 2736 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != |
2716 | cpu_to_le32(EXT4_OS_HURD)) | 2737 | cpu_to_le32(EXT4_OS_HURD)) { |
2717 | ei->i_file_acl |= | 2738 | ei->i_file_acl |= |
2718 | ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; | 2739 | ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; |
2719 | if (!S_ISREG(inode->i_mode)) { | ||
2720 | ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); | ||
2721 | } else { | ||
2722 | inode->i_size |= | ||
2723 | ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; | ||
2724 | } | 2740 | } |
2741 | inode->i_size = ext4_isize(raw_inode); | ||
2725 | ei->i_disksize = inode->i_size; | 2742 | ei->i_disksize = inode->i_size; |
2726 | inode->i_generation = le32_to_cpu(raw_inode->i_generation); | 2743 | inode->i_generation = le32_to_cpu(raw_inode->i_generation); |
2727 | ei->i_block_group = iloc.block_group; | 2744 | ei->i_block_group = iloc.block_group; |
@@ -2765,6 +2782,13 @@ void ext4_read_inode(struct inode * inode) | |||
2765 | EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); | 2782 | EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); |
2766 | EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); | 2783 | EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); |
2767 | 2784 | ||
2785 | inode->i_version = le32_to_cpu(raw_inode->i_disk_version); | ||
2786 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { | ||
2787 | if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) | ||
2788 | inode->i_version |= | ||
2789 | (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; | ||
2790 | } | ||
2791 | |||
2768 | if (S_ISREG(inode->i_mode)) { | 2792 | if (S_ISREG(inode->i_mode)) { |
2769 | inode->i_op = &ext4_file_inode_operations; | 2793 | inode->i_op = &ext4_file_inode_operations; |
2770 | inode->i_fop = &ext4_file_operations; | 2794 | inode->i_fop = &ext4_file_operations; |
@@ -2797,6 +2821,55 @@ bad_inode: | |||
2797 | return; | 2821 | return; |
2798 | } | 2822 | } |
2799 | 2823 | ||
2824 | static int ext4_inode_blocks_set(handle_t *handle, | ||
2825 | struct ext4_inode *raw_inode, | ||
2826 | struct ext4_inode_info *ei) | ||
2827 | { | ||
2828 | struct inode *inode = &(ei->vfs_inode); | ||
2829 | u64 i_blocks = inode->i_blocks; | ||
2830 | struct super_block *sb = inode->i_sb; | ||
2831 | int err = 0; | ||
2832 | |||
2833 | if (i_blocks <= ~0U) { | ||
2834 | /* | ||
2835 | * i_blocks can be represnted in a 32 bit variable | ||
2836 | * as multiple of 512 bytes | ||
2837 | */ | ||
2838 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); | ||
2839 | raw_inode->i_blocks_high = 0; | ||
2840 | ei->i_flags &= ~EXT4_HUGE_FILE_FL; | ||
2841 | } else if (i_blocks <= 0xffffffffffffULL) { | ||
2842 | /* | ||
2843 | * i_blocks can be represented in a 48 bit variable | ||
2844 | * as multiple of 512 bytes | ||
2845 | */ | ||
2846 | err = ext4_update_rocompat_feature(handle, sb, | ||
2847 | EXT4_FEATURE_RO_COMPAT_HUGE_FILE); | ||
2848 | if (err) | ||
2849 | goto err_out; | ||
2850 | /* i_block is stored in the split 48 bit fields */ | ||
2851 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); | ||
2852 | raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); | ||
2853 | ei->i_flags &= ~EXT4_HUGE_FILE_FL; | ||
2854 | } else { | ||
2855 | /* | ||
2856 | * i_blocks should be represented in a 48 bit variable | ||
2857 | * as multiple of file system block size | ||
2858 | */ | ||
2859 | err = ext4_update_rocompat_feature(handle, sb, | ||
2860 | EXT4_FEATURE_RO_COMPAT_HUGE_FILE); | ||
2861 | if (err) | ||
2862 | goto err_out; | ||
2863 | ei->i_flags |= EXT4_HUGE_FILE_FL; | ||
2864 | /* i_block is stored in file system block size */ | ||
2865 | i_blocks = i_blocks >> (inode->i_blkbits - 9); | ||
2866 | raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); | ||
2867 | raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); | ||
2868 | } | ||
2869 | err_out: | ||
2870 | return err; | ||
2871 | } | ||
2872 | |||
2800 | /* | 2873 | /* |
2801 | * Post the struct inode info into an on-disk inode location in the | 2874 | * Post the struct inode info into an on-disk inode location in the |
2802 | * buffer-cache. This gobbles the caller's reference to the | 2875 | * buffer-cache. This gobbles the caller's reference to the |
@@ -2845,47 +2918,42 @@ static int ext4_do_update_inode(handle_t *handle, | |||
2845 | raw_inode->i_gid_high = 0; | 2918 | raw_inode->i_gid_high = 0; |
2846 | } | 2919 | } |
2847 | raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); | 2920 | raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); |
2848 | raw_inode->i_size = cpu_to_le32(ei->i_disksize); | ||
2849 | 2921 | ||
2850 | EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); | 2922 | EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); |
2851 | EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); | 2923 | EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); |
2852 | EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); | 2924 | EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); |
2853 | EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); | 2925 | EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); |
2854 | 2926 | ||
2855 | raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); | 2927 | if (ext4_inode_blocks_set(handle, raw_inode, ei)) |
2928 | goto out_brelse; | ||
2856 | raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); | 2929 | raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); |
2857 | raw_inode->i_flags = cpu_to_le32(ei->i_flags); | 2930 | raw_inode->i_flags = cpu_to_le32(ei->i_flags); |
2858 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != | 2931 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != |
2859 | cpu_to_le32(EXT4_OS_HURD)) | 2932 | cpu_to_le32(EXT4_OS_HURD)) |
2860 | raw_inode->i_file_acl_high = | 2933 | raw_inode->i_file_acl_high = |
2861 | cpu_to_le16(ei->i_file_acl >> 32); | 2934 | cpu_to_le16(ei->i_file_acl >> 32); |
2862 | raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); | 2935 | raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); |
2863 | if (!S_ISREG(inode->i_mode)) { | 2936 | ext4_isize_set(raw_inode, ei->i_disksize); |
2864 | raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); | 2937 | if (ei->i_disksize > 0x7fffffffULL) { |
2865 | } else { | 2938 | struct super_block *sb = inode->i_sb; |
2866 | raw_inode->i_size_high = | 2939 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, |
2867 | cpu_to_le32(ei->i_disksize >> 32); | 2940 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || |
2868 | if (ei->i_disksize > 0x7fffffffULL) { | 2941 | EXT4_SB(sb)->s_es->s_rev_level == |
2869 | struct super_block *sb = inode->i_sb; | 2942 | cpu_to_le32(EXT4_GOOD_OLD_REV)) { |
2870 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, | 2943 | /* If this is the first large file |
2871 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || | 2944 | * created, add a flag to the superblock. |
2872 | EXT4_SB(sb)->s_es->s_rev_level == | 2945 | */ |
2873 | cpu_to_le32(EXT4_GOOD_OLD_REV)) { | 2946 | err = ext4_journal_get_write_access(handle, |
2874 | /* If this is the first large file | 2947 | EXT4_SB(sb)->s_sbh); |
2875 | * created, add a flag to the superblock. | 2948 | if (err) |
2876 | */ | 2949 | goto out_brelse; |
2877 | err = ext4_journal_get_write_access(handle, | 2950 | ext4_update_dynamic_rev(sb); |
2878 | EXT4_SB(sb)->s_sbh); | 2951 | EXT4_SET_RO_COMPAT_FEATURE(sb, |
2879 | if (err) | ||
2880 | goto out_brelse; | ||
2881 | ext4_update_dynamic_rev(sb); | ||
2882 | EXT4_SET_RO_COMPAT_FEATURE(sb, | ||
2883 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE); | 2952 | EXT4_FEATURE_RO_COMPAT_LARGE_FILE); |
2884 | sb->s_dirt = 1; | 2953 | sb->s_dirt = 1; |
2885 | handle->h_sync = 1; | 2954 | handle->h_sync = 1; |
2886 | err = ext4_journal_dirty_metadata(handle, | 2955 | err = ext4_journal_dirty_metadata(handle, |
2887 | EXT4_SB(sb)->s_sbh); | 2956 | EXT4_SB(sb)->s_sbh); |
2888 | } | ||
2889 | } | 2957 | } |
2890 | } | 2958 | } |
2891 | raw_inode->i_generation = cpu_to_le32(inode->i_generation); | 2959 | raw_inode->i_generation = cpu_to_le32(inode->i_generation); |
@@ -2903,8 +2971,14 @@ static int ext4_do_update_inode(handle_t *handle, | |||
2903 | } else for (block = 0; block < EXT4_N_BLOCKS; block++) | 2971 | } else for (block = 0; block < EXT4_N_BLOCKS; block++) |
2904 | raw_inode->i_block[block] = ei->i_data[block]; | 2972 | raw_inode->i_block[block] = ei->i_data[block]; |
2905 | 2973 | ||
2906 | if (ei->i_extra_isize) | 2974 | raw_inode->i_disk_version = cpu_to_le32(inode->i_version); |
2975 | if (ei->i_extra_isize) { | ||
2976 | if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) | ||
2977 | raw_inode->i_version_hi = | ||
2978 | cpu_to_le32(inode->i_version >> 32); | ||
2907 | raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); | 2979 | raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); |
2980 | } | ||
2981 | |||
2908 | 2982 | ||
2909 | BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); | 2983 | BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); |
2910 | rc = ext4_journal_dirty_metadata(handle, bh); | 2984 | rc = ext4_journal_dirty_metadata(handle, bh); |
@@ -3024,6 +3098,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
3024 | ext4_journal_stop(handle); | 3098 | ext4_journal_stop(handle); |
3025 | } | 3099 | } |
3026 | 3100 | ||
3101 | if (attr->ia_valid & ATTR_SIZE) { | ||
3102 | if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { | ||
3103 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
3104 | |||
3105 | if (attr->ia_size > sbi->s_bitmap_maxbytes) { | ||
3106 | error = -EFBIG; | ||
3107 | goto err_out; | ||
3108 | } | ||
3109 | } | ||
3110 | } | ||
3111 | |||
3027 | if (S_ISREG(inode->i_mode) && | 3112 | if (S_ISREG(inode->i_mode) && |
3028 | attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { | 3113 | attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { |
3029 | handle_t *handle; | 3114 | handle_t *handle; |
@@ -3120,6 +3205,9 @@ int ext4_mark_iloc_dirty(handle_t *handle, | |||
3120 | { | 3205 | { |
3121 | int err = 0; | 3206 | int err = 0; |
3122 | 3207 | ||
3208 | if (test_opt(inode->i_sb, I_VERSION)) | ||
3209 | inode_inc_iversion(inode); | ||
3210 | |||
3123 | /* the do_update_inode consumes one bh->b_count */ | 3211 | /* the do_update_inode consumes one bh->b_count */ |
3124 | get_bh(iloc->bh); | 3212 | get_bh(iloc->bh); |
3125 | 3213 | ||
@@ -3158,8 +3246,10 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, | |||
3158 | * Expand an inode by new_extra_isize bytes. | 3246 | * Expand an inode by new_extra_isize bytes. |
3159 | * Returns 0 on success or negative error number on failure. | 3247 | * Returns 0 on success or negative error number on failure. |
3160 | */ | 3248 | */ |
3161 | int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, | 3249 | static int ext4_expand_extra_isize(struct inode *inode, |
3162 | struct ext4_iloc iloc, handle_t *handle) | 3250 | unsigned int new_extra_isize, |
3251 | struct ext4_iloc iloc, | ||
3252 | handle_t *handle) | ||
3163 | { | 3253 | { |
3164 | struct ext4_inode *raw_inode; | 3254 | struct ext4_inode *raw_inode; |
3165 | struct ext4_xattr_ibody_header *header; | 3255 | struct ext4_xattr_ibody_header *header; |
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index e7f894bdb420..2ed7c37f897e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c | |||
@@ -199,7 +199,7 @@ flags_err: | |||
199 | * need to allocate reservation structure for this inode | 199 | * need to allocate reservation structure for this inode |
200 | * before set the window size | 200 | * before set the window size |
201 | */ | 201 | */ |
202 | mutex_lock(&ei->truncate_mutex); | 202 | down_write(&ei->i_data_sem); |
203 | if (!ei->i_block_alloc_info) | 203 | if (!ei->i_block_alloc_info) |
204 | ext4_init_block_alloc_info(inode); | 204 | ext4_init_block_alloc_info(inode); |
205 | 205 | ||
@@ -207,7 +207,7 @@ flags_err: | |||
207 | struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; | 207 | struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; |
208 | rsv->rsv_goal_size = rsv_window_size; | 208 | rsv->rsv_goal_size = rsv_window_size; |
209 | } | 209 | } |
210 | mutex_unlock(&ei->truncate_mutex); | 210 | up_write(&ei->i_data_sem); |
211 | return 0; | 211 | return 0; |
212 | } | 212 | } |
213 | case EXT4_IOC_GROUP_EXTEND: { | 213 | case EXT4_IOC_GROUP_EXTEND: { |
@@ -254,6 +254,9 @@ flags_err: | |||
254 | return err; | 254 | return err; |
255 | } | 255 | } |
256 | 256 | ||
257 | case EXT4_IOC_MIGRATE: | ||
258 | return ext4_ext_migrate(inode, filp, cmd, arg); | ||
259 | |||
257 | default: | 260 | default: |
258 | return -ENOTTY; | 261 | return -ENOTTY; |
259 | } | 262 | } |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c new file mode 100644 index 000000000000..76e5fedc0a0b --- /dev/null +++ b/fs/ext4/mballoc.c | |||
@@ -0,0 +1,4552 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com | ||
3 | * Written by Alex Tomas <alex@clusterfs.com> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License version 2 as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public Licens | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- | ||
17 | */ | ||
18 | |||
19 | |||
20 | /* | ||
21 | * mballoc.c contains the multiblocks allocation routines | ||
22 | */ | ||
23 | |||
24 | #include <linux/time.h> | ||
25 | #include <linux/fs.h> | ||
26 | #include <linux/namei.h> | ||
27 | #include <linux/ext4_jbd2.h> | ||
28 | #include <linux/ext4_fs.h> | ||
29 | #include <linux/quotaops.h> | ||
30 | #include <linux/buffer_head.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <linux/swap.h> | ||
33 | #include <linux/proc_fs.h> | ||
34 | #include <linux/pagemap.h> | ||
35 | #include <linux/seq_file.h> | ||
36 | #include <linux/version.h> | ||
37 | #include "group.h" | ||
38 | |||
39 | /* | ||
40 | * MUSTDO: | ||
41 | * - test ext4_ext_search_left() and ext4_ext_search_right() | ||
42 | * - search for metadata in few groups | ||
43 | * | ||
44 | * TODO v4: | ||
45 | * - normalization should take into account whether file is still open | ||
46 | * - discard preallocations if no free space left (policy?) | ||
47 | * - don't normalize tails | ||
48 | * - quota | ||
49 | * - reservation for superuser | ||
50 | * | ||
51 | * TODO v3: | ||
52 | * - bitmap read-ahead (proposed by Oleg Drokin aka green) | ||
53 | * - track min/max extents in each group for better group selection | ||
54 | * - mb_mark_used() may allocate chunk right after splitting buddy | ||
55 | * - tree of groups sorted by number of free blocks | ||
56 | * - error handling | ||
57 | */ | ||
58 | |||
59 | /* | ||
60 | * The allocation request involve request for multiple number of blocks | ||
61 | * near to the goal(block) value specified. | ||
62 | * | ||
63 | * During initialization phase of the allocator we decide to use the group | ||
64 | * preallocation or inode preallocation depending on the size file. The | ||
65 | * size of the file could be the resulting file size we would have after | ||
66 | * allocation or the current file size which ever is larger. If the size is | ||
67 | * less that sbi->s_mb_stream_request we select the group | ||
68 | * preallocation. The default value of s_mb_stream_request is 16 | ||
69 | * blocks. This can also be tuned via | ||
70 | * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms | ||
71 | * of number of blocks. | ||
72 | * | ||
73 | * The main motivation for having small file use group preallocation is to | ||
74 | * ensure that we have small file closer in the disk. | ||
75 | * | ||
76 | * First stage the allocator looks at the inode prealloc list | ||
77 | * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for | ||
78 | * this particular inode. The inode prealloc space is represented as: | ||
79 | * | ||
80 | * pa_lstart -> the logical start block for this prealloc space | ||
81 | * pa_pstart -> the physical start block for this prealloc space | ||
82 | * pa_len -> lenght for this prealloc space | ||
83 | * pa_free -> free space available in this prealloc space | ||
84 | * | ||
85 | * The inode preallocation space is used looking at the _logical_ start | ||
86 | * block. If only the logical file block falls within the range of prealloc | ||
87 | * space we will consume the particular prealloc space. This make sure that | ||
88 | * that the we have contiguous physical blocks representing the file blocks | ||
89 | * | ||
90 | * The important thing to be noted in case of inode prealloc space is that | ||
91 | * we don't modify the values associated to inode prealloc space except | ||
92 | * pa_free. | ||
93 | * | ||
94 | * If we are not able to find blocks in the inode prealloc space and if we | ||
95 | * have the group allocation flag set then we look at the locality group | ||
96 | * prealloc space. These are per CPU prealloc list repreasented as | ||
97 | * | ||
98 | * ext4_sb_info.s_locality_groups[smp_processor_id()] | ||
99 | * | ||
100 | * The reason for having a per cpu locality group is to reduce the contention | ||
101 | * between CPUs. It is possible to get scheduled at this point. | ||
102 | * | ||
103 | * The locality group prealloc space is used looking at whether we have | ||
104 | * enough free space (pa_free) withing the prealloc space. | ||
105 | * | ||
106 | * If we can't allocate blocks via inode prealloc or/and locality group | ||
107 | * prealloc then we look at the buddy cache. The buddy cache is represented | ||
108 | * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets | ||
109 | * mapped to the buddy and bitmap information regarding different | ||
110 | * groups. The buddy information is attached to buddy cache inode so that | ||
111 | * we can access them through the page cache. The information regarding | ||
112 | * each group is loaded via ext4_mb_load_buddy. The information involve | ||
113 | * block bitmap and buddy information. The information are stored in the | ||
114 | * inode as: | ||
115 | * | ||
116 | * { page } | ||
117 | * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... | ||
118 | * | ||
119 | * | ||
120 | * one block each for bitmap and buddy information. So for each group we | ||
121 | * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE / | ||
122 | * blocksize) blocks. So it can have information regarding groups_per_page | ||
123 | * which is blocks_per_page/2 | ||
124 | * | ||
125 | * The buddy cache inode is not stored on disk. The inode is thrown | ||
126 | * away when the filesystem is unmounted. | ||
127 | * | ||
128 | * We look for count number of blocks in the buddy cache. If we were able | ||
129 | * to locate that many free blocks we return with additional information | ||
130 | * regarding rest of the contiguous physical block available | ||
131 | * | ||
132 | * Before allocating blocks via buddy cache we normalize the request | ||
133 | * blocks. This ensure we ask for more blocks that we needed. The extra | ||
134 | * blocks that we get after allocation is added to the respective prealloc | ||
135 | * list. In case of inode preallocation we follow a list of heuristics | ||
136 | * based on file size. This can be found in ext4_mb_normalize_request. If | ||
137 | * we are doing a group prealloc we try to normalize the request to | ||
138 | * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to | ||
139 | * 512 blocks. This can be tuned via | ||
140 | * /proc/fs/ext4/<partition/group_prealloc. The value is represented in | ||
141 | * terms of number of blocks. If we have mounted the file system with -O | ||
142 | * stripe=<value> option the group prealloc request is normalized to the | ||
143 | * stripe value (sbi->s_stripe) | ||
144 | * | ||
145 | * The regular allocator(using the buddy cache) support few tunables. | ||
146 | * | ||
147 | * /proc/fs/ext4/<partition>/min_to_scan | ||
148 | * /proc/fs/ext4/<partition>/max_to_scan | ||
149 | * /proc/fs/ext4/<partition>/order2_req | ||
150 | * | ||
151 | * The regular allocator use buddy scan only if the request len is power of | ||
152 | * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The | ||
153 | * value of s_mb_order2_reqs can be tuned via | ||
154 | * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to | ||
155 | * stripe size (sbi->s_stripe), we try to search for contigous block in | ||
156 | * stripe size. This should result in better allocation on RAID setup. If | ||
157 | * not we search in the specific group using bitmap for best extents. The | ||
158 | * tunable min_to_scan and max_to_scan controll the behaviour here. | ||
159 | * min_to_scan indicate how long the mballoc __must__ look for a best | ||
160 | * extent and max_to_scanindicate how long the mballoc __can__ look for a | ||
161 | * best extent in the found extents. Searching for the blocks starts with | ||
162 | * the group specified as the goal value in allocation context via | ||
163 | * ac_g_ex. Each group is first checked based on the criteria whether it | ||
164 | * can used for allocation. ext4_mb_good_group explains how the groups are | ||
165 | * checked. | ||
166 | * | ||
167 | * Both the prealloc space are getting populated as above. So for the first | ||
168 | * request we will hit the buddy cache which will result in this prealloc | ||
169 | * space getting filled. The prealloc space is then later used for the | ||
170 | * subsequent request. | ||
171 | */ | ||
172 | |||
173 | /* | ||
174 | * mballoc operates on the following data: | ||
175 | * - on-disk bitmap | ||
176 | * - in-core buddy (actually includes buddy and bitmap) | ||
177 | * - preallocation descriptors (PAs) | ||
178 | * | ||
179 | * there are two types of preallocations: | ||
180 | * - inode | ||
181 | * assiged to specific inode and can be used for this inode only. | ||
182 | * it describes part of inode's space preallocated to specific | ||
183 | * physical blocks. any block from that preallocated can be used | ||
184 | * independent. the descriptor just tracks number of blocks left | ||
185 | * unused. so, before taking some block from descriptor, one must | ||
186 | * make sure corresponded logical block isn't allocated yet. this | ||
187 | * also means that freeing any block within descriptor's range | ||
188 | * must discard all preallocated blocks. | ||
189 | * - locality group | ||
190 | * assigned to specific locality group which does not translate to | ||
191 | * permanent set of inodes: inode can join and leave group. space | ||
192 | * from this type of preallocation can be used for any inode. thus | ||
193 | * it's consumed from the beginning to the end. | ||
194 | * | ||
195 | * relation between them can be expressed as: | ||
196 | * in-core buddy = on-disk bitmap + preallocation descriptors | ||
197 | * | ||
198 | * this mean blocks mballoc considers used are: | ||
199 | * - allocated blocks (persistent) | ||
200 | * - preallocated blocks (non-persistent) | ||
201 | * | ||
202 | * consistency in mballoc world means that at any time a block is either | ||
203 | * free or used in ALL structures. notice: "any time" should not be read | ||
204 | * literally -- time is discrete and delimited by locks. | ||
205 | * | ||
206 | * to keep it simple, we don't use block numbers, instead we count number of | ||
207 | * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. | ||
208 | * | ||
209 | * all operations can be expressed as: | ||
210 | * - init buddy: buddy = on-disk + PAs | ||
211 | * - new PA: buddy += N; PA = N | ||
212 | * - use inode PA: on-disk += N; PA -= N | ||
213 | * - discard inode PA buddy -= on-disk - PA; PA = 0 | ||
214 | * - use locality group PA on-disk += N; PA -= N | ||
215 | * - discard locality group PA buddy -= PA; PA = 0 | ||
216 | * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap | ||
217 | * is used in real operation because we can't know actual used | ||
218 | * bits from PA, only from on-disk bitmap | ||
219 | * | ||
220 | * if we follow this strict logic, then all operations above should be atomic. | ||
221 | * given some of them can block, we'd have to use something like semaphores | ||
222 | * killing performance on high-end SMP hardware. let's try to relax it using | ||
223 | * the following knowledge: | ||
224 | * 1) if buddy is referenced, it's already initialized | ||
225 | * 2) while block is used in buddy and the buddy is referenced, | ||
226 | * nobody can re-allocate that block | ||
227 | * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has | ||
228 | * bit set and PA claims same block, it's OK. IOW, one can set bit in | ||
229 | * on-disk bitmap if buddy has same bit set or/and PA covers corresponded | ||
230 | * block | ||
231 | * | ||
232 | * so, now we're building a concurrency table: | ||
233 | * - init buddy vs. | ||
234 | * - new PA | ||
235 | * blocks for PA are allocated in the buddy, buddy must be referenced | ||
236 | * until PA is linked to allocation group to avoid concurrent buddy init | ||
237 | * - use inode PA | ||
238 | * we need to make sure that either on-disk bitmap or PA has uptodate data | ||
239 | * given (3) we care that PA-=N operation doesn't interfere with init | ||
240 | * - discard inode PA | ||
241 | * the simplest way would be to have buddy initialized by the discard | ||
242 | * - use locality group PA | ||
243 | * again PA-=N must be serialized with init | ||
244 | * - discard locality group PA | ||
245 | * the simplest way would be to have buddy initialized by the discard | ||
246 | * - new PA vs. | ||
247 | * - use inode PA | ||
248 | * i_data_sem serializes them | ||
249 | * - discard inode PA | ||
250 | * discard process must wait until PA isn't used by another process | ||
251 | * - use locality group PA | ||
252 | * some mutex should serialize them | ||
253 | * - discard locality group PA | ||
254 | * discard process must wait until PA isn't used by another process | ||
255 | * - use inode PA | ||
256 | * - use inode PA | ||
257 | * i_data_sem or another mutex should serializes them | ||
258 | * - discard inode PA | ||
259 | * discard process must wait until PA isn't used by another process | ||
260 | * - use locality group PA | ||
261 | * nothing wrong here -- they're different PAs covering different blocks | ||
262 | * - discard locality group PA | ||
263 | * discard process must wait until PA isn't used by another process | ||
264 | * | ||
265 | * now we're ready to make few consequences: | ||
266 | * - PA is referenced and while it is no discard is possible | ||
267 | * - PA is referenced until block isn't marked in on-disk bitmap | ||
268 | * - PA changes only after on-disk bitmap | ||
269 | * - discard must not compete with init. either init is done before | ||
270 | * any discard or they're serialized somehow | ||
271 | * - buddy init as sum of on-disk bitmap and PAs is done atomically | ||
272 | * | ||
273 | * a special case when we've used PA to emptiness. no need to modify buddy | ||
274 | * in this case, but we should care about concurrent init | ||
275 | * | ||
276 | */ | ||
277 | |||
278 | /* | ||
279 | * Logic in few words: | ||
280 | * | ||
281 | * - allocation: | ||
282 | * load group | ||
283 | * find blocks | ||
284 | * mark bits in on-disk bitmap | ||
285 | * release group | ||
286 | * | ||
287 | * - use preallocation: | ||
288 | * find proper PA (per-inode or group) | ||
289 | * load group | ||
290 | * mark bits in on-disk bitmap | ||
291 | * release group | ||
292 | * release PA | ||
293 | * | ||
294 | * - free: | ||
295 | * load group | ||
296 | * mark bits in on-disk bitmap | ||
297 | * release group | ||
298 | * | ||
299 | * - discard preallocations in group: | ||
300 | * mark PAs deleted | ||
301 | * move them onto local list | ||
302 | * load on-disk bitmap | ||
303 | * load group | ||
304 | * remove PA from object (inode or locality group) | ||
305 | * mark free blocks in-core | ||
306 | * | ||
307 | * - discard inode's preallocations: | ||
308 | */ | ||
309 | |||
310 | /* | ||
311 | * Locking rules | ||
312 | * | ||
313 | * Locks: | ||
314 | * - bitlock on a group (group) | ||
315 | * - object (inode/locality) (object) | ||
316 | * - per-pa lock (pa) | ||
317 | * | ||
318 | * Paths: | ||
319 | * - new pa | ||
320 | * object | ||
321 | * group | ||
322 | * | ||
323 | * - find and use pa: | ||
324 | * pa | ||
325 | * | ||
326 | * - release consumed pa: | ||
327 | * pa | ||
328 | * group | ||
329 | * object | ||
330 | * | ||
331 | * - generate in-core bitmap: | ||
332 | * group | ||
333 | * pa | ||
334 | * | ||
335 | * - discard all for given object (inode, locality group): | ||
336 | * object | ||
337 | * pa | ||
338 | * group | ||
339 | * | ||
340 | * - discard all for given group: | ||
341 | * group | ||
342 | * pa | ||
343 | * group | ||
344 | * object | ||
345 | * | ||
346 | */ | ||
347 | |||
348 | /* | ||
349 | * with AGGRESSIVE_CHECK allocator runs consistency checks over | ||
350 | * structures. these checks slow things down a lot | ||
351 | */ | ||
352 | #define AGGRESSIVE_CHECK__ | ||
353 | |||
354 | /* | ||
355 | * with DOUBLE_CHECK defined mballoc creates persistent in-core | ||
356 | * bitmaps, maintains and uses them to check for double allocations | ||
357 | */ | ||
358 | #define DOUBLE_CHECK__ | ||
359 | |||
360 | /* | ||
361 | */ | ||
362 | #define MB_DEBUG__ | ||
363 | #ifdef MB_DEBUG | ||
364 | #define mb_debug(fmt, a...) printk(fmt, ##a) | ||
365 | #else | ||
366 | #define mb_debug(fmt, a...) | ||
367 | #endif | ||
368 | |||
369 | /* | ||
370 | * with EXT4_MB_HISTORY mballoc stores last N allocations in memory | ||
371 | * and you can monitor it in /proc/fs/ext4/<dev>/mb_history | ||
372 | */ | ||
373 | #define EXT4_MB_HISTORY | ||
374 | #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ | ||
375 | #define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ | ||
376 | #define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */ | ||
377 | #define EXT4_MB_HISTORY_FREE 8 /* free */ | ||
378 | |||
379 | #define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \ | ||
380 | EXT4_MB_HISTORY_PREALLOC) | ||
381 | |||
382 | /* | ||
383 | * How long mballoc can look for a best extent (in found extents) | ||
384 | */ | ||
385 | #define MB_DEFAULT_MAX_TO_SCAN 200 | ||
386 | |||
387 | /* | ||
388 | * How long mballoc must look for a best extent | ||
389 | */ | ||
390 | #define MB_DEFAULT_MIN_TO_SCAN 10 | ||
391 | |||
392 | /* | ||
393 | * How many groups mballoc will scan looking for the best chunk | ||
394 | */ | ||
395 | #define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 | ||
396 | |||
397 | /* | ||
398 | * with 'ext4_mb_stats' allocator will collect stats that will be | ||
399 | * shown at umount. The collecting costs though! | ||
400 | */ | ||
401 | #define MB_DEFAULT_STATS 1 | ||
402 | |||
403 | /* | ||
404 | * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served | ||
405 | * by the stream allocator, which purpose is to pack requests | ||
406 | * as close each to other as possible to produce smooth I/O traffic | ||
407 | * We use locality group prealloc space for stream request. | ||
408 | * We can tune the same via /proc/fs/ext4/<parition>/stream_req | ||
409 | */ | ||
410 | #define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ | ||
411 | |||
412 | /* | ||
413 | * for which requests use 2^N search using buddies | ||
414 | */ | ||
415 | #define MB_DEFAULT_ORDER2_REQS 2 | ||
416 | |||
417 | /* | ||
418 | * default group prealloc size 512 blocks | ||
419 | */ | ||
420 | #define MB_DEFAULT_GROUP_PREALLOC 512 | ||
421 | |||
422 | static struct kmem_cache *ext4_pspace_cachep; | ||
423 | |||
424 | #ifdef EXT4_BB_MAX_BLOCKS | ||
425 | #undef EXT4_BB_MAX_BLOCKS | ||
426 | #endif | ||
427 | #define EXT4_BB_MAX_BLOCKS 30 | ||
428 | |||
429 | struct ext4_free_metadata { | ||
430 | ext4_group_t group; | ||
431 | unsigned short num; | ||
432 | ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; | ||
433 | struct list_head list; | ||
434 | }; | ||
435 | |||
436 | struct ext4_group_info { | ||
437 | unsigned long bb_state; | ||
438 | unsigned long bb_tid; | ||
439 | struct ext4_free_metadata *bb_md_cur; | ||
440 | unsigned short bb_first_free; | ||
441 | unsigned short bb_free; | ||
442 | unsigned short bb_fragments; | ||
443 | struct list_head bb_prealloc_list; | ||
444 | #ifdef DOUBLE_CHECK | ||
445 | void *bb_bitmap; | ||
446 | #endif | ||
447 | unsigned short bb_counters[]; | ||
448 | }; | ||
449 | |||
450 | #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 | ||
451 | #define EXT4_GROUP_INFO_LOCKED_BIT 1 | ||
452 | |||
453 | #define EXT4_MB_GRP_NEED_INIT(grp) \ | ||
454 | (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) | ||
455 | |||
456 | |||
457 | struct ext4_prealloc_space { | ||
458 | struct list_head pa_inode_list; | ||
459 | struct list_head pa_group_list; | ||
460 | union { | ||
461 | struct list_head pa_tmp_list; | ||
462 | struct rcu_head pa_rcu; | ||
463 | } u; | ||
464 | spinlock_t pa_lock; | ||
465 | atomic_t pa_count; | ||
466 | unsigned pa_deleted; | ||
467 | ext4_fsblk_t pa_pstart; /* phys. block */ | ||
468 | ext4_lblk_t pa_lstart; /* log. block */ | ||
469 | unsigned short pa_len; /* len of preallocated chunk */ | ||
470 | unsigned short pa_free; /* how many blocks are free */ | ||
471 | unsigned short pa_linear; /* consumed in one direction | ||
472 | * strictly, for grp prealloc */ | ||
473 | spinlock_t *pa_obj_lock; | ||
474 | struct inode *pa_inode; /* hack, for history only */ | ||
475 | }; | ||
476 | |||
477 | |||
478 | struct ext4_free_extent { | ||
479 | ext4_lblk_t fe_logical; | ||
480 | ext4_grpblk_t fe_start; | ||
481 | ext4_group_t fe_group; | ||
482 | int fe_len; | ||
483 | }; | ||
484 | |||
485 | /* | ||
486 | * Locality group: | ||
487 | * we try to group all related changes together | ||
488 | * so that writeback can flush/allocate them together as well | ||
489 | */ | ||
490 | struct ext4_locality_group { | ||
491 | /* for allocator */ | ||
492 | struct mutex lg_mutex; /* to serialize allocates */ | ||
493 | struct list_head lg_prealloc_list;/* list of preallocations */ | ||
494 | spinlock_t lg_prealloc_lock; | ||
495 | }; | ||
496 | |||
497 | struct ext4_allocation_context { | ||
498 | struct inode *ac_inode; | ||
499 | struct super_block *ac_sb; | ||
500 | |||
501 | /* original request */ | ||
502 | struct ext4_free_extent ac_o_ex; | ||
503 | |||
504 | /* goal request (after normalization) */ | ||
505 | struct ext4_free_extent ac_g_ex; | ||
506 | |||
507 | /* the best found extent */ | ||
508 | struct ext4_free_extent ac_b_ex; | ||
509 | |||
510 | /* copy of the bext found extent taken before preallocation efforts */ | ||
511 | struct ext4_free_extent ac_f_ex; | ||
512 | |||
513 | /* number of iterations done. we have to track to limit searching */ | ||
514 | unsigned long ac_ex_scanned; | ||
515 | __u16 ac_groups_scanned; | ||
516 | __u16 ac_found; | ||
517 | __u16 ac_tail; | ||
518 | __u16 ac_buddy; | ||
519 | __u16 ac_flags; /* allocation hints */ | ||
520 | __u8 ac_status; | ||
521 | __u8 ac_criteria; | ||
522 | __u8 ac_repeats; | ||
523 | __u8 ac_2order; /* if request is to allocate 2^N blocks and | ||
524 | * N > 0, the field stores N, otherwise 0 */ | ||
525 | __u8 ac_op; /* operation, for history only */ | ||
526 | struct page *ac_bitmap_page; | ||
527 | struct page *ac_buddy_page; | ||
528 | struct ext4_prealloc_space *ac_pa; | ||
529 | struct ext4_locality_group *ac_lg; | ||
530 | }; | ||
531 | |||
532 | #define AC_STATUS_CONTINUE 1 | ||
533 | #define AC_STATUS_FOUND 2 | ||
534 | #define AC_STATUS_BREAK 3 | ||
535 | |||
536 | struct ext4_mb_history { | ||
537 | struct ext4_free_extent orig; /* orig allocation */ | ||
538 | struct ext4_free_extent goal; /* goal allocation */ | ||
539 | struct ext4_free_extent result; /* result allocation */ | ||
540 | unsigned pid; | ||
541 | unsigned ino; | ||
542 | __u16 found; /* how many extents have been found */ | ||
543 | __u16 groups; /* how many groups have been scanned */ | ||
544 | __u16 tail; /* what tail broke some buddy */ | ||
545 | __u16 buddy; /* buddy the tail ^^^ broke */ | ||
546 | __u16 flags; | ||
547 | __u8 cr:3; /* which phase the result extent was found at */ | ||
548 | __u8 op:4; | ||
549 | __u8 merged:1; | ||
550 | }; | ||
551 | |||
552 | struct ext4_buddy { | ||
553 | struct page *bd_buddy_page; | ||
554 | void *bd_buddy; | ||
555 | struct page *bd_bitmap_page; | ||
556 | void *bd_bitmap; | ||
557 | struct ext4_group_info *bd_info; | ||
558 | struct super_block *bd_sb; | ||
559 | __u16 bd_blkbits; | ||
560 | ext4_group_t bd_group; | ||
561 | }; | ||
562 | #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) | ||
563 | #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) | ||
564 | |||
565 | #ifndef EXT4_MB_HISTORY | ||
566 | static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) | ||
567 | { | ||
568 | return; | ||
569 | } | ||
570 | #else | ||
571 | static void ext4_mb_store_history(struct ext4_allocation_context *ac); | ||
572 | #endif | ||
573 | |||
574 | #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) | ||
575 | |||
576 | static struct proc_dir_entry *proc_root_ext4; | ||
577 | struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); | ||
578 | ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, | ||
579 | ext4_fsblk_t goal, unsigned long *count, int *errp); | ||
580 | |||
581 | static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | ||
582 | ext4_group_t group); | ||
583 | static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); | ||
584 | static void ext4_mb_free_committed_blocks(struct super_block *); | ||
585 | static void ext4_mb_return_to_preallocation(struct inode *inode, | ||
586 | struct ext4_buddy *e4b, sector_t block, | ||
587 | int count); | ||
588 | static void ext4_mb_put_pa(struct ext4_allocation_context *, | ||
589 | struct super_block *, struct ext4_prealloc_space *pa); | ||
590 | static int ext4_mb_init_per_dev_proc(struct super_block *sb); | ||
591 | static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); | ||
592 | |||
593 | |||
594 | static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) | ||
595 | { | ||
596 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | ||
597 | |||
598 | bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); | ||
599 | } | ||
600 | |||
601 | static inline void ext4_unlock_group(struct super_block *sb, | ||
602 | ext4_group_t group) | ||
603 | { | ||
604 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | ||
605 | |||
606 | bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); | ||
607 | } | ||
608 | |||
609 | static inline int ext4_is_group_locked(struct super_block *sb, | ||
610 | ext4_group_t group) | ||
611 | { | ||
612 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | ||
613 | |||
614 | return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, | ||
615 | &(grinfo->bb_state)); | ||
616 | } | ||
617 | |||
618 | static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, | ||
619 | struct ext4_free_extent *fex) | ||
620 | { | ||
621 | ext4_fsblk_t block; | ||
622 | |||
623 | block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb) | ||
624 | + fex->fe_start | ||
625 | + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | ||
626 | return block; | ||
627 | } | ||
628 | |||
629 | #if BITS_PER_LONG == 64 | ||
630 | #define mb_correct_addr_and_bit(bit, addr) \ | ||
631 | { \ | ||
632 | bit += ((unsigned long) addr & 7UL) << 3; \ | ||
633 | addr = (void *) ((unsigned long) addr & ~7UL); \ | ||
634 | } | ||
635 | #elif BITS_PER_LONG == 32 | ||
636 | #define mb_correct_addr_and_bit(bit, addr) \ | ||
637 | { \ | ||
638 | bit += ((unsigned long) addr & 3UL) << 3; \ | ||
639 | addr = (void *) ((unsigned long) addr & ~3UL); \ | ||
640 | } | ||
641 | #else | ||
642 | #error "how many bits you are?!" | ||
643 | #endif | ||
644 | |||
645 | static inline int mb_test_bit(int bit, void *addr) | ||
646 | { | ||
647 | /* | ||
648 | * ext4_test_bit on architecture like powerpc | ||
649 | * needs unsigned long aligned address | ||
650 | */ | ||
651 | mb_correct_addr_and_bit(bit, addr); | ||
652 | return ext4_test_bit(bit, addr); | ||
653 | } | ||
654 | |||
655 | static inline void mb_set_bit(int bit, void *addr) | ||
656 | { | ||
657 | mb_correct_addr_and_bit(bit, addr); | ||
658 | ext4_set_bit(bit, addr); | ||
659 | } | ||
660 | |||
661 | static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) | ||
662 | { | ||
663 | mb_correct_addr_and_bit(bit, addr); | ||
664 | ext4_set_bit_atomic(lock, bit, addr); | ||
665 | } | ||
666 | |||
667 | static inline void mb_clear_bit(int bit, void *addr) | ||
668 | { | ||
669 | mb_correct_addr_and_bit(bit, addr); | ||
670 | ext4_clear_bit(bit, addr); | ||
671 | } | ||
672 | |||
673 | static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) | ||
674 | { | ||
675 | mb_correct_addr_and_bit(bit, addr); | ||
676 | ext4_clear_bit_atomic(lock, bit, addr); | ||
677 | } | ||
678 | |||
679 | static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) | ||
680 | { | ||
681 | char *bb; | ||
682 | |||
683 | /* FIXME!! is this needed */ | ||
684 | BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); | ||
685 | BUG_ON(max == NULL); | ||
686 | |||
687 | if (order > e4b->bd_blkbits + 1) { | ||
688 | *max = 0; | ||
689 | return NULL; | ||
690 | } | ||
691 | |||
692 | /* at order 0 we see each particular block */ | ||
693 | *max = 1 << (e4b->bd_blkbits + 3); | ||
694 | if (order == 0) | ||
695 | return EXT4_MB_BITMAP(e4b); | ||
696 | |||
697 | bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; | ||
698 | *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; | ||
699 | |||
700 | return bb; | ||
701 | } | ||
702 | |||
703 | #ifdef DOUBLE_CHECK | ||
704 | static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, | ||
705 | int first, int count) | ||
706 | { | ||
707 | int i; | ||
708 | struct super_block *sb = e4b->bd_sb; | ||
709 | |||
710 | if (unlikely(e4b->bd_info->bb_bitmap == NULL)) | ||
711 | return; | ||
712 | BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); | ||
713 | for (i = 0; i < count; i++) { | ||
714 | if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { | ||
715 | ext4_fsblk_t blocknr; | ||
716 | blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); | ||
717 | blocknr += first + i; | ||
718 | blocknr += | ||
719 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | ||
720 | |||
721 | ext4_error(sb, __FUNCTION__, "double-free of inode" | ||
722 | " %lu's block %llu(bit %u in group %lu)\n", | ||
723 | inode ? inode->i_ino : 0, blocknr, | ||
724 | first + i, e4b->bd_group); | ||
725 | } | ||
726 | mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); | ||
727 | } | ||
728 | } | ||
729 | |||
730 | static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) | ||
731 | { | ||
732 | int i; | ||
733 | |||
734 | if (unlikely(e4b->bd_info->bb_bitmap == NULL)) | ||
735 | return; | ||
736 | BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); | ||
737 | for (i = 0; i < count; i++) { | ||
738 | BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); | ||
739 | mb_set_bit(first + i, e4b->bd_info->bb_bitmap); | ||
740 | } | ||
741 | } | ||
742 | |||
743 | static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) | ||
744 | { | ||
745 | if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { | ||
746 | unsigned char *b1, *b2; | ||
747 | int i; | ||
748 | b1 = (unsigned char *) e4b->bd_info->bb_bitmap; | ||
749 | b2 = (unsigned char *) bitmap; | ||
750 | for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { | ||
751 | if (b1[i] != b2[i]) { | ||
752 | printk("corruption in group %lu at byte %u(%u):" | ||
753 | " %x in copy != %x on disk/prealloc\n", | ||
754 | e4b->bd_group, i, i * 8, b1[i], b2[i]); | ||
755 | BUG(); | ||
756 | } | ||
757 | } | ||
758 | } | ||
759 | } | ||
760 | |||
761 | #else | ||
762 | static inline void mb_free_blocks_double(struct inode *inode, | ||
763 | struct ext4_buddy *e4b, int first, int count) | ||
764 | { | ||
765 | return; | ||
766 | } | ||
767 | static inline void mb_mark_used_double(struct ext4_buddy *e4b, | ||
768 | int first, int count) | ||
769 | { | ||
770 | return; | ||
771 | } | ||
772 | static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) | ||
773 | { | ||
774 | return; | ||
775 | } | ||
776 | #endif | ||
777 | |||
778 | #ifdef AGGRESSIVE_CHECK | ||
779 | |||
780 | #define MB_CHECK_ASSERT(assert) \ | ||
781 | do { \ | ||
782 | if (!(assert)) { \ | ||
783 | printk(KERN_EMERG \ | ||
784 | "Assertion failure in %s() at %s:%d: \"%s\"\n", \ | ||
785 | function, file, line, # assert); \ | ||
786 | BUG(); \ | ||
787 | } \ | ||
788 | } while (0) | ||
789 | |||
790 | static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, | ||
791 | const char *function, int line) | ||
792 | { | ||
793 | struct super_block *sb = e4b->bd_sb; | ||
794 | int order = e4b->bd_blkbits + 1; | ||
795 | int max; | ||
796 | int max2; | ||
797 | int i; | ||
798 | int j; | ||
799 | int k; | ||
800 | int count; | ||
801 | struct ext4_group_info *grp; | ||
802 | int fragments = 0; | ||
803 | int fstart; | ||
804 | struct list_head *cur; | ||
805 | void *buddy; | ||
806 | void *buddy2; | ||
807 | |||
808 | if (!test_opt(sb, MBALLOC)) | ||
809 | return 0; | ||
810 | |||
811 | { | ||
812 | static int mb_check_counter; | ||
813 | if (mb_check_counter++ % 100 != 0) | ||
814 | return 0; | ||
815 | } | ||
816 | |||
817 | while (order > 1) { | ||
818 | buddy = mb_find_buddy(e4b, order, &max); | ||
819 | MB_CHECK_ASSERT(buddy); | ||
820 | buddy2 = mb_find_buddy(e4b, order - 1, &max2); | ||
821 | MB_CHECK_ASSERT(buddy2); | ||
822 | MB_CHECK_ASSERT(buddy != buddy2); | ||
823 | MB_CHECK_ASSERT(max * 2 == max2); | ||
824 | |||
825 | count = 0; | ||
826 | for (i = 0; i < max; i++) { | ||
827 | |||
828 | if (mb_test_bit(i, buddy)) { | ||
829 | /* only single bit in buddy2 may be 1 */ | ||
830 | if (!mb_test_bit(i << 1, buddy2)) { | ||
831 | MB_CHECK_ASSERT( | ||
832 | mb_test_bit((i<<1)+1, buddy2)); | ||
833 | } else if (!mb_test_bit((i << 1) + 1, buddy2)) { | ||
834 | MB_CHECK_ASSERT( | ||
835 | mb_test_bit(i << 1, buddy2)); | ||
836 | } | ||
837 | continue; | ||
838 | } | ||
839 | |||
840 | /* both bits in buddy2 must be 0 */ | ||
841 | MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); | ||
842 | MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); | ||
843 | |||
844 | for (j = 0; j < (1 << order); j++) { | ||
845 | k = (i * (1 << order)) + j; | ||
846 | MB_CHECK_ASSERT( | ||
847 | !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); | ||
848 | } | ||
849 | count++; | ||
850 | } | ||
851 | MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); | ||
852 | order--; | ||
853 | } | ||
854 | |||
855 | fstart = -1; | ||
856 | buddy = mb_find_buddy(e4b, 0, &max); | ||
857 | for (i = 0; i < max; i++) { | ||
858 | if (!mb_test_bit(i, buddy)) { | ||
859 | MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); | ||
860 | if (fstart == -1) { | ||
861 | fragments++; | ||
862 | fstart = i; | ||
863 | } | ||
864 | continue; | ||
865 | } | ||
866 | fstart = -1; | ||
867 | /* check used bits only */ | ||
868 | for (j = 0; j < e4b->bd_blkbits + 1; j++) { | ||
869 | buddy2 = mb_find_buddy(e4b, j, &max2); | ||
870 | k = i >> j; | ||
871 | MB_CHECK_ASSERT(k < max2); | ||
872 | MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); | ||
873 | } | ||
874 | } | ||
875 | MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); | ||
876 | MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); | ||
877 | |||
878 | grp = ext4_get_group_info(sb, e4b->bd_group); | ||
879 | buddy = mb_find_buddy(e4b, 0, &max); | ||
880 | list_for_each(cur, &grp->bb_prealloc_list) { | ||
881 | ext4_group_t groupnr; | ||
882 | struct ext4_prealloc_space *pa; | ||
883 | pa = list_entry(cur, struct ext4_prealloc_space, group_list); | ||
884 | ext4_get_group_no_and_offset(sb, pa->pstart, &groupnr, &k); | ||
885 | MB_CHECK_ASSERT(groupnr == e4b->bd_group); | ||
886 | for (i = 0; i < pa->len; i++) | ||
887 | MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); | ||
888 | } | ||
889 | return 0; | ||
890 | } | ||
891 | #undef MB_CHECK_ASSERT | ||
892 | #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ | ||
893 | __FILE__, __FUNCTION__, __LINE__) | ||
894 | #else | ||
895 | #define mb_check_buddy(e4b) | ||
896 | #endif | ||
897 | |||
898 | /* FIXME!! need more doc */ | ||
899 | static void ext4_mb_mark_free_simple(struct super_block *sb, | ||
900 | void *buddy, unsigned first, int len, | ||
901 | struct ext4_group_info *grp) | ||
902 | { | ||
903 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
904 | unsigned short min; | ||
905 | unsigned short max; | ||
906 | unsigned short chunk; | ||
907 | unsigned short border; | ||
908 | |||
909 | BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb)); | ||
910 | |||
911 | border = 2 << sb->s_blocksize_bits; | ||
912 | |||
913 | while (len > 0) { | ||
914 | /* find how many blocks can be covered since this position */ | ||
915 | max = ffs(first | border) - 1; | ||
916 | |||
917 | /* find how many blocks of power 2 we need to mark */ | ||
918 | min = fls(len) - 1; | ||
919 | |||
920 | if (max < min) | ||
921 | min = max; | ||
922 | chunk = 1 << min; | ||
923 | |||
924 | /* mark multiblock chunks only */ | ||
925 | grp->bb_counters[min]++; | ||
926 | if (min > 0) | ||
927 | mb_clear_bit(first >> min, | ||
928 | buddy + sbi->s_mb_offsets[min]); | ||
929 | |||
930 | len -= chunk; | ||
931 | first += chunk; | ||
932 | } | ||
933 | } | ||
934 | |||
935 | static void ext4_mb_generate_buddy(struct super_block *sb, | ||
936 | void *buddy, void *bitmap, ext4_group_t group) | ||
937 | { | ||
938 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); | ||
939 | unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); | ||
940 | unsigned short i = 0; | ||
941 | unsigned short first; | ||
942 | unsigned short len; | ||
943 | unsigned free = 0; | ||
944 | unsigned fragments = 0; | ||
945 | unsigned long long period = get_cycles(); | ||
946 | |||
947 | /* initialize buddy from bitmap which is aggregation | ||
948 | * of on-disk bitmap and preallocations */ | ||
949 | i = ext4_find_next_zero_bit(bitmap, max, 0); | ||
950 | grp->bb_first_free = i; | ||
951 | while (i < max) { | ||
952 | fragments++; | ||
953 | first = i; | ||
954 | i = ext4_find_next_bit(bitmap, max, i); | ||
955 | len = i - first; | ||
956 | free += len; | ||
957 | if (len > 1) | ||
958 | ext4_mb_mark_free_simple(sb, buddy, first, len, grp); | ||
959 | else | ||
960 | grp->bb_counters[0]++; | ||
961 | if (i < max) | ||
962 | i = ext4_find_next_zero_bit(bitmap, max, i); | ||
963 | } | ||
964 | grp->bb_fragments = fragments; | ||
965 | |||
966 | if (free != grp->bb_free) { | ||
967 | printk(KERN_DEBUG | ||
968 | "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", | ||
969 | group, free, grp->bb_free); | ||
970 | grp->bb_free = free; | ||
971 | } | ||
972 | |||
973 | clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); | ||
974 | |||
975 | period = get_cycles() - period; | ||
976 | spin_lock(&EXT4_SB(sb)->s_bal_lock); | ||
977 | EXT4_SB(sb)->s_mb_buddies_generated++; | ||
978 | EXT4_SB(sb)->s_mb_generation_time += period; | ||
979 | spin_unlock(&EXT4_SB(sb)->s_bal_lock); | ||
980 | } | ||
981 | |||
982 | /* The buddy information is attached the buddy cache inode | ||
983 | * for convenience. The information regarding each group | ||
984 | * is loaded via ext4_mb_load_buddy. The information involve | ||
985 | * block bitmap and buddy information. The information are | ||
986 | * stored in the inode as | ||
987 | * | ||
988 | * { page } | ||
989 | * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... | ||
990 | * | ||
991 | * | ||
992 | * one block each for bitmap and buddy information. | ||
993 | * So for each group we take up 2 blocks. A page can | ||
994 | * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. | ||
995 | * So it can have information regarding groups_per_page which | ||
996 | * is blocks_per_page/2 | ||
997 | */ | ||
998 | |||
999 | static int ext4_mb_init_cache(struct page *page, char *incore) | ||
1000 | { | ||
1001 | int blocksize; | ||
1002 | int blocks_per_page; | ||
1003 | int groups_per_page; | ||
1004 | int err = 0; | ||
1005 | int i; | ||
1006 | ext4_group_t first_group; | ||
1007 | int first_block; | ||
1008 | struct super_block *sb; | ||
1009 | struct buffer_head *bhs; | ||
1010 | struct buffer_head **bh; | ||
1011 | struct inode *inode; | ||
1012 | char *data; | ||
1013 | char *bitmap; | ||
1014 | |||
1015 | mb_debug("init page %lu\n", page->index); | ||
1016 | |||
1017 | inode = page->mapping->host; | ||
1018 | sb = inode->i_sb; | ||
1019 | blocksize = 1 << inode->i_blkbits; | ||
1020 | blocks_per_page = PAGE_CACHE_SIZE / blocksize; | ||
1021 | |||
1022 | groups_per_page = blocks_per_page >> 1; | ||
1023 | if (groups_per_page == 0) | ||
1024 | groups_per_page = 1; | ||
1025 | |||
1026 | /* allocate buffer_heads to read bitmaps */ | ||
1027 | if (groups_per_page > 1) { | ||
1028 | err = -ENOMEM; | ||
1029 | i = sizeof(struct buffer_head *) * groups_per_page; | ||
1030 | bh = kzalloc(i, GFP_NOFS); | ||
1031 | if (bh == NULL) | ||
1032 | goto out; | ||
1033 | } else | ||
1034 | bh = &bhs; | ||
1035 | |||
1036 | first_group = page->index * blocks_per_page / 2; | ||
1037 | |||
1038 | /* read all groups the page covers into the cache */ | ||
1039 | for (i = 0; i < groups_per_page; i++) { | ||
1040 | struct ext4_group_desc *desc; | ||
1041 | |||
1042 | if (first_group + i >= EXT4_SB(sb)->s_groups_count) | ||
1043 | break; | ||
1044 | |||
1045 | err = -EIO; | ||
1046 | desc = ext4_get_group_desc(sb, first_group + i, NULL); | ||
1047 | if (desc == NULL) | ||
1048 | goto out; | ||
1049 | |||
1050 | err = -ENOMEM; | ||
1051 | bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); | ||
1052 | if (bh[i] == NULL) | ||
1053 | goto out; | ||
1054 | |||
1055 | if (bh_uptodate_or_lock(bh[i])) | ||
1056 | continue; | ||
1057 | |||
1058 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
1059 | ext4_init_block_bitmap(sb, bh[i], | ||
1060 | first_group + i, desc); | ||
1061 | set_buffer_uptodate(bh[i]); | ||
1062 | unlock_buffer(bh[i]); | ||
1063 | continue; | ||
1064 | } | ||
1065 | get_bh(bh[i]); | ||
1066 | bh[i]->b_end_io = end_buffer_read_sync; | ||
1067 | submit_bh(READ, bh[i]); | ||
1068 | mb_debug("read bitmap for group %lu\n", first_group + i); | ||
1069 | } | ||
1070 | |||
1071 | /* wait for I/O completion */ | ||
1072 | for (i = 0; i < groups_per_page && bh[i]; i++) | ||
1073 | wait_on_buffer(bh[i]); | ||
1074 | |||
1075 | err = -EIO; | ||
1076 | for (i = 0; i < groups_per_page && bh[i]; i++) | ||
1077 | if (!buffer_uptodate(bh[i])) | ||
1078 | goto out; | ||
1079 | |||
1080 | first_block = page->index * blocks_per_page; | ||
1081 | for (i = 0; i < blocks_per_page; i++) { | ||
1082 | int group; | ||
1083 | struct ext4_group_info *grinfo; | ||
1084 | |||
1085 | group = (first_block + i) >> 1; | ||
1086 | if (group >= EXT4_SB(sb)->s_groups_count) | ||
1087 | break; | ||
1088 | |||
1089 | /* | ||
1090 | * data carry information regarding this | ||
1091 | * particular group in the format specified | ||
1092 | * above | ||
1093 | * | ||
1094 | */ | ||
1095 | data = page_address(page) + (i * blocksize); | ||
1096 | bitmap = bh[group - first_group]->b_data; | ||
1097 | |||
1098 | /* | ||
1099 | * We place the buddy block and bitmap block | ||
1100 | * close together | ||
1101 | */ | ||
1102 | if ((first_block + i) & 1) { | ||
1103 | /* this is block of buddy */ | ||
1104 | BUG_ON(incore == NULL); | ||
1105 | mb_debug("put buddy for group %u in page %lu/%x\n", | ||
1106 | group, page->index, i * blocksize); | ||
1107 | memset(data, 0xff, blocksize); | ||
1108 | grinfo = ext4_get_group_info(sb, group); | ||
1109 | grinfo->bb_fragments = 0; | ||
1110 | memset(grinfo->bb_counters, 0, | ||
1111 | sizeof(unsigned short)*(sb->s_blocksize_bits+2)); | ||
1112 | /* | ||
1113 | * incore got set to the group block bitmap below | ||
1114 | */ | ||
1115 | ext4_mb_generate_buddy(sb, data, incore, group); | ||
1116 | incore = NULL; | ||
1117 | } else { | ||
1118 | /* this is block of bitmap */ | ||
1119 | BUG_ON(incore != NULL); | ||
1120 | mb_debug("put bitmap for group %u in page %lu/%x\n", | ||
1121 | group, page->index, i * blocksize); | ||
1122 | |||
1123 | /* see comments in ext4_mb_put_pa() */ | ||
1124 | ext4_lock_group(sb, group); | ||
1125 | memcpy(data, bitmap, blocksize); | ||
1126 | |||
1127 | /* mark all preallocated blks used in in-core bitmap */ | ||
1128 | ext4_mb_generate_from_pa(sb, data, group); | ||
1129 | ext4_unlock_group(sb, group); | ||
1130 | |||
1131 | /* set incore so that the buddy information can be | ||
1132 | * generated using this | ||
1133 | */ | ||
1134 | incore = data; | ||
1135 | } | ||
1136 | } | ||
1137 | SetPageUptodate(page); | ||
1138 | |||
1139 | out: | ||
1140 | if (bh) { | ||
1141 | for (i = 0; i < groups_per_page && bh[i]; i++) | ||
1142 | brelse(bh[i]); | ||
1143 | if (bh != &bhs) | ||
1144 | kfree(bh); | ||
1145 | } | ||
1146 | return err; | ||
1147 | } | ||
1148 | |||
1149 | static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | ||
1150 | struct ext4_buddy *e4b) | ||
1151 | { | ||
1152 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1153 | struct inode *inode = sbi->s_buddy_cache; | ||
1154 | int blocks_per_page; | ||
1155 | int block; | ||
1156 | int pnum; | ||
1157 | int poff; | ||
1158 | struct page *page; | ||
1159 | |||
1160 | mb_debug("load group %lu\n", group); | ||
1161 | |||
1162 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
1163 | |||
1164 | e4b->bd_blkbits = sb->s_blocksize_bits; | ||
1165 | e4b->bd_info = ext4_get_group_info(sb, group); | ||
1166 | e4b->bd_sb = sb; | ||
1167 | e4b->bd_group = group; | ||
1168 | e4b->bd_buddy_page = NULL; | ||
1169 | e4b->bd_bitmap_page = NULL; | ||
1170 | |||
1171 | /* | ||
1172 | * the buddy cache inode stores the block bitmap | ||
1173 | * and buddy information in consecutive blocks. | ||
1174 | * So for each group we need two blocks. | ||
1175 | */ | ||
1176 | block = group * 2; | ||
1177 | pnum = block / blocks_per_page; | ||
1178 | poff = block % blocks_per_page; | ||
1179 | |||
1180 | /* we could use find_or_create_page(), but it locks page | ||
1181 | * what we'd like to avoid in fast path ... */ | ||
1182 | page = find_get_page(inode->i_mapping, pnum); | ||
1183 | if (page == NULL || !PageUptodate(page)) { | ||
1184 | if (page) | ||
1185 | page_cache_release(page); | ||
1186 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
1187 | if (page) { | ||
1188 | BUG_ON(page->mapping != inode->i_mapping); | ||
1189 | if (!PageUptodate(page)) { | ||
1190 | ext4_mb_init_cache(page, NULL); | ||
1191 | mb_cmp_bitmaps(e4b, page_address(page) + | ||
1192 | (poff * sb->s_blocksize)); | ||
1193 | } | ||
1194 | unlock_page(page); | ||
1195 | } | ||
1196 | } | ||
1197 | if (page == NULL || !PageUptodate(page)) | ||
1198 | goto err; | ||
1199 | e4b->bd_bitmap_page = page; | ||
1200 | e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); | ||
1201 | mark_page_accessed(page); | ||
1202 | |||
1203 | block++; | ||
1204 | pnum = block / blocks_per_page; | ||
1205 | poff = block % blocks_per_page; | ||
1206 | |||
1207 | page = find_get_page(inode->i_mapping, pnum); | ||
1208 | if (page == NULL || !PageUptodate(page)) { | ||
1209 | if (page) | ||
1210 | page_cache_release(page); | ||
1211 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
1212 | if (page) { | ||
1213 | BUG_ON(page->mapping != inode->i_mapping); | ||
1214 | if (!PageUptodate(page)) | ||
1215 | ext4_mb_init_cache(page, e4b->bd_bitmap); | ||
1216 | |||
1217 | unlock_page(page); | ||
1218 | } | ||
1219 | } | ||
1220 | if (page == NULL || !PageUptodate(page)) | ||
1221 | goto err; | ||
1222 | e4b->bd_buddy_page = page; | ||
1223 | e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); | ||
1224 | mark_page_accessed(page); | ||
1225 | |||
1226 | BUG_ON(e4b->bd_bitmap_page == NULL); | ||
1227 | BUG_ON(e4b->bd_buddy_page == NULL); | ||
1228 | |||
1229 | return 0; | ||
1230 | |||
1231 | err: | ||
1232 | if (e4b->bd_bitmap_page) | ||
1233 | page_cache_release(e4b->bd_bitmap_page); | ||
1234 | if (e4b->bd_buddy_page) | ||
1235 | page_cache_release(e4b->bd_buddy_page); | ||
1236 | e4b->bd_buddy = NULL; | ||
1237 | e4b->bd_bitmap = NULL; | ||
1238 | return -EIO; | ||
1239 | } | ||
1240 | |||
1241 | static void ext4_mb_release_desc(struct ext4_buddy *e4b) | ||
1242 | { | ||
1243 | if (e4b->bd_bitmap_page) | ||
1244 | page_cache_release(e4b->bd_bitmap_page); | ||
1245 | if (e4b->bd_buddy_page) | ||
1246 | page_cache_release(e4b->bd_buddy_page); | ||
1247 | } | ||
1248 | |||
1249 | |||
1250 | static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) | ||
1251 | { | ||
1252 | int order = 1; | ||
1253 | void *bb; | ||
1254 | |||
1255 | BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); | ||
1256 | BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); | ||
1257 | |||
1258 | bb = EXT4_MB_BUDDY(e4b); | ||
1259 | while (order <= e4b->bd_blkbits + 1) { | ||
1260 | block = block >> 1; | ||
1261 | if (!mb_test_bit(block, bb)) { | ||
1262 | /* this block is part of buddy of order 'order' */ | ||
1263 | return order; | ||
1264 | } | ||
1265 | bb += 1 << (e4b->bd_blkbits - order); | ||
1266 | order++; | ||
1267 | } | ||
1268 | return 0; | ||
1269 | } | ||
1270 | |||
1271 | static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) | ||
1272 | { | ||
1273 | __u32 *addr; | ||
1274 | |||
1275 | len = cur + len; | ||
1276 | while (cur < len) { | ||
1277 | if ((cur & 31) == 0 && (len - cur) >= 32) { | ||
1278 | /* fast path: clear whole word at once */ | ||
1279 | addr = bm + (cur >> 3); | ||
1280 | *addr = 0; | ||
1281 | cur += 32; | ||
1282 | continue; | ||
1283 | } | ||
1284 | mb_clear_bit_atomic(lock, cur, bm); | ||
1285 | cur++; | ||
1286 | } | ||
1287 | } | ||
1288 | |||
1289 | static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) | ||
1290 | { | ||
1291 | __u32 *addr; | ||
1292 | |||
1293 | len = cur + len; | ||
1294 | while (cur < len) { | ||
1295 | if ((cur & 31) == 0 && (len - cur) >= 32) { | ||
1296 | /* fast path: set whole word at once */ | ||
1297 | addr = bm + (cur >> 3); | ||
1298 | *addr = 0xffffffff; | ||
1299 | cur += 32; | ||
1300 | continue; | ||
1301 | } | ||
1302 | mb_set_bit_atomic(lock, cur, bm); | ||
1303 | cur++; | ||
1304 | } | ||
1305 | } | ||
1306 | |||
1307 | static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | ||
1308 | int first, int count) | ||
1309 | { | ||
1310 | int block = 0; | ||
1311 | int max = 0; | ||
1312 | int order; | ||
1313 | void *buddy; | ||
1314 | void *buddy2; | ||
1315 | struct super_block *sb = e4b->bd_sb; | ||
1316 | |||
1317 | BUG_ON(first + count > (sb->s_blocksize << 3)); | ||
1318 | BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); | ||
1319 | mb_check_buddy(e4b); | ||
1320 | mb_free_blocks_double(inode, e4b, first, count); | ||
1321 | |||
1322 | e4b->bd_info->bb_free += count; | ||
1323 | if (first < e4b->bd_info->bb_first_free) | ||
1324 | e4b->bd_info->bb_first_free = first; | ||
1325 | |||
1326 | /* let's maintain fragments counter */ | ||
1327 | if (first != 0) | ||
1328 | block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); | ||
1329 | if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) | ||
1330 | max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); | ||
1331 | if (block && max) | ||
1332 | e4b->bd_info->bb_fragments--; | ||
1333 | else if (!block && !max) | ||
1334 | e4b->bd_info->bb_fragments++; | ||
1335 | |||
1336 | /* let's maintain buddy itself */ | ||
1337 | while (count-- > 0) { | ||
1338 | block = first++; | ||
1339 | order = 0; | ||
1340 | |||
1341 | if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { | ||
1342 | ext4_fsblk_t blocknr; | ||
1343 | blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); | ||
1344 | blocknr += block; | ||
1345 | blocknr += | ||
1346 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | ||
1347 | |||
1348 | ext4_error(sb, __FUNCTION__, "double-free of inode" | ||
1349 | " %lu's block %llu(bit %u in group %lu)\n", | ||
1350 | inode ? inode->i_ino : 0, blocknr, block, | ||
1351 | e4b->bd_group); | ||
1352 | } | ||
1353 | mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); | ||
1354 | e4b->bd_info->bb_counters[order]++; | ||
1355 | |||
1356 | /* start of the buddy */ | ||
1357 | buddy = mb_find_buddy(e4b, order, &max); | ||
1358 | |||
1359 | do { | ||
1360 | block &= ~1UL; | ||
1361 | if (mb_test_bit(block, buddy) || | ||
1362 | mb_test_bit(block + 1, buddy)) | ||
1363 | break; | ||
1364 | |||
1365 | /* both the buddies are free, try to coalesce them */ | ||
1366 | buddy2 = mb_find_buddy(e4b, order + 1, &max); | ||
1367 | |||
1368 | if (!buddy2) | ||
1369 | break; | ||
1370 | |||
1371 | if (order > 0) { | ||
1372 | /* for special purposes, we don't set | ||
1373 | * free bits in bitmap */ | ||
1374 | mb_set_bit(block, buddy); | ||
1375 | mb_set_bit(block + 1, buddy); | ||
1376 | } | ||
1377 | e4b->bd_info->bb_counters[order]--; | ||
1378 | e4b->bd_info->bb_counters[order]--; | ||
1379 | |||
1380 | block = block >> 1; | ||
1381 | order++; | ||
1382 | e4b->bd_info->bb_counters[order]++; | ||
1383 | |||
1384 | mb_clear_bit(block, buddy2); | ||
1385 | buddy = buddy2; | ||
1386 | } while (1); | ||
1387 | } | ||
1388 | mb_check_buddy(e4b); | ||
1389 | |||
1390 | return 0; | ||
1391 | } | ||
1392 | |||
1393 | static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, | ||
1394 | int needed, struct ext4_free_extent *ex) | ||
1395 | { | ||
1396 | int next = block; | ||
1397 | int max; | ||
1398 | int ord; | ||
1399 | void *buddy; | ||
1400 | |||
1401 | BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); | ||
1402 | BUG_ON(ex == NULL); | ||
1403 | |||
1404 | buddy = mb_find_buddy(e4b, order, &max); | ||
1405 | BUG_ON(buddy == NULL); | ||
1406 | BUG_ON(block >= max); | ||
1407 | if (mb_test_bit(block, buddy)) { | ||
1408 | ex->fe_len = 0; | ||
1409 | ex->fe_start = 0; | ||
1410 | ex->fe_group = 0; | ||
1411 | return 0; | ||
1412 | } | ||
1413 | |||
1414 | /* FIXME dorp order completely ? */ | ||
1415 | if (likely(order == 0)) { | ||
1416 | /* find actual order */ | ||
1417 | order = mb_find_order_for_block(e4b, block); | ||
1418 | block = block >> order; | ||
1419 | } | ||
1420 | |||
1421 | ex->fe_len = 1 << order; | ||
1422 | ex->fe_start = block << order; | ||
1423 | ex->fe_group = e4b->bd_group; | ||
1424 | |||
1425 | /* calc difference from given start */ | ||
1426 | next = next - ex->fe_start; | ||
1427 | ex->fe_len -= next; | ||
1428 | ex->fe_start += next; | ||
1429 | |||
1430 | while (needed > ex->fe_len && | ||
1431 | (buddy = mb_find_buddy(e4b, order, &max))) { | ||
1432 | |||
1433 | if (block + 1 >= max) | ||
1434 | break; | ||
1435 | |||
1436 | next = (block + 1) * (1 << order); | ||
1437 | if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) | ||
1438 | break; | ||
1439 | |||
1440 | ord = mb_find_order_for_block(e4b, next); | ||
1441 | |||
1442 | order = ord; | ||
1443 | block = next >> order; | ||
1444 | ex->fe_len += 1 << order; | ||
1445 | } | ||
1446 | |||
1447 | BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))); | ||
1448 | return ex->fe_len; | ||
1449 | } | ||
1450 | |||
1451 | static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) | ||
1452 | { | ||
1453 | int ord; | ||
1454 | int mlen = 0; | ||
1455 | int max = 0; | ||
1456 | int cur; | ||
1457 | int start = ex->fe_start; | ||
1458 | int len = ex->fe_len; | ||
1459 | unsigned ret = 0; | ||
1460 | int len0 = len; | ||
1461 | void *buddy; | ||
1462 | |||
1463 | BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); | ||
1464 | BUG_ON(e4b->bd_group != ex->fe_group); | ||
1465 | BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); | ||
1466 | mb_check_buddy(e4b); | ||
1467 | mb_mark_used_double(e4b, start, len); | ||
1468 | |||
1469 | e4b->bd_info->bb_free -= len; | ||
1470 | if (e4b->bd_info->bb_first_free == start) | ||
1471 | e4b->bd_info->bb_first_free += len; | ||
1472 | |||
1473 | /* let's maintain fragments counter */ | ||
1474 | if (start != 0) | ||
1475 | mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); | ||
1476 | if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) | ||
1477 | max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); | ||
1478 | if (mlen && max) | ||
1479 | e4b->bd_info->bb_fragments++; | ||
1480 | else if (!mlen && !max) | ||
1481 | e4b->bd_info->bb_fragments--; | ||
1482 | |||
1483 | /* let's maintain buddy itself */ | ||
1484 | while (len) { | ||
1485 | ord = mb_find_order_for_block(e4b, start); | ||
1486 | |||
1487 | if (((start >> ord) << ord) == start && len >= (1 << ord)) { | ||
1488 | /* the whole chunk may be allocated at once! */ | ||
1489 | mlen = 1 << ord; | ||
1490 | buddy = mb_find_buddy(e4b, ord, &max); | ||
1491 | BUG_ON((start >> ord) >= max); | ||
1492 | mb_set_bit(start >> ord, buddy); | ||
1493 | e4b->bd_info->bb_counters[ord]--; | ||
1494 | start += mlen; | ||
1495 | len -= mlen; | ||
1496 | BUG_ON(len < 0); | ||
1497 | continue; | ||
1498 | } | ||
1499 | |||
1500 | /* store for history */ | ||
1501 | if (ret == 0) | ||
1502 | ret = len | (ord << 16); | ||
1503 | |||
1504 | /* we have to split large buddy */ | ||
1505 | BUG_ON(ord <= 0); | ||
1506 | buddy = mb_find_buddy(e4b, ord, &max); | ||
1507 | mb_set_bit(start >> ord, buddy); | ||
1508 | e4b->bd_info->bb_counters[ord]--; | ||
1509 | |||
1510 | ord--; | ||
1511 | cur = (start >> ord) & ~1U; | ||
1512 | buddy = mb_find_buddy(e4b, ord, &max); | ||
1513 | mb_clear_bit(cur, buddy); | ||
1514 | mb_clear_bit(cur + 1, buddy); | ||
1515 | e4b->bd_info->bb_counters[ord]++; | ||
1516 | e4b->bd_info->bb_counters[ord]++; | ||
1517 | } | ||
1518 | |||
1519 | mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), | ||
1520 | EXT4_MB_BITMAP(e4b), ex->fe_start, len0); | ||
1521 | mb_check_buddy(e4b); | ||
1522 | |||
1523 | return ret; | ||
1524 | } | ||
1525 | |||
1526 | /* | ||
1527 | * Must be called under group lock! | ||
1528 | */ | ||
1529 | static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, | ||
1530 | struct ext4_buddy *e4b) | ||
1531 | { | ||
1532 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); | ||
1533 | int ret; | ||
1534 | |||
1535 | BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); | ||
1536 | BUG_ON(ac->ac_status == AC_STATUS_FOUND); | ||
1537 | |||
1538 | ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); | ||
1539 | ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; | ||
1540 | ret = mb_mark_used(e4b, &ac->ac_b_ex); | ||
1541 | |||
1542 | /* preallocation can change ac_b_ex, thus we store actually | ||
1543 | * allocated blocks for history */ | ||
1544 | ac->ac_f_ex = ac->ac_b_ex; | ||
1545 | |||
1546 | ac->ac_status = AC_STATUS_FOUND; | ||
1547 | ac->ac_tail = ret & 0xffff; | ||
1548 | ac->ac_buddy = ret >> 16; | ||
1549 | |||
1550 | /* XXXXXXX: SUCH A HORRIBLE **CK */ | ||
1551 | /*FIXME!! Why ? */ | ||
1552 | ac->ac_bitmap_page = e4b->bd_bitmap_page; | ||
1553 | get_page(ac->ac_bitmap_page); | ||
1554 | ac->ac_buddy_page = e4b->bd_buddy_page; | ||
1555 | get_page(ac->ac_buddy_page); | ||
1556 | |||
1557 | /* store last allocated for subsequent stream allocation */ | ||
1558 | if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { | ||
1559 | spin_lock(&sbi->s_md_lock); | ||
1560 | sbi->s_mb_last_group = ac->ac_f_ex.fe_group; | ||
1561 | sbi->s_mb_last_start = ac->ac_f_ex.fe_start; | ||
1562 | spin_unlock(&sbi->s_md_lock); | ||
1563 | } | ||
1564 | } | ||
1565 | |||
1566 | /* | ||
1567 | * regular allocator, for general purposes allocation | ||
1568 | */ | ||
1569 | |||
1570 | static void ext4_mb_check_limits(struct ext4_allocation_context *ac, | ||
1571 | struct ext4_buddy *e4b, | ||
1572 | int finish_group) | ||
1573 | { | ||
1574 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); | ||
1575 | struct ext4_free_extent *bex = &ac->ac_b_ex; | ||
1576 | struct ext4_free_extent *gex = &ac->ac_g_ex; | ||
1577 | struct ext4_free_extent ex; | ||
1578 | int max; | ||
1579 | |||
1580 | /* | ||
1581 | * We don't want to scan for a whole year | ||
1582 | */ | ||
1583 | if (ac->ac_found > sbi->s_mb_max_to_scan && | ||
1584 | !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { | ||
1585 | ac->ac_status = AC_STATUS_BREAK; | ||
1586 | return; | ||
1587 | } | ||
1588 | |||
1589 | /* | ||
1590 | * Haven't found good chunk so far, let's continue | ||
1591 | */ | ||
1592 | if (bex->fe_len < gex->fe_len) | ||
1593 | return; | ||
1594 | |||
1595 | if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) | ||
1596 | && bex->fe_group == e4b->bd_group) { | ||
1597 | /* recheck chunk's availability - we don't know | ||
1598 | * when it was found (within this lock-unlock | ||
1599 | * period or not) */ | ||
1600 | max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); | ||
1601 | if (max >= gex->fe_len) { | ||
1602 | ext4_mb_use_best_found(ac, e4b); | ||
1603 | return; | ||
1604 | } | ||
1605 | } | ||
1606 | } | ||
1607 | |||
1608 | /* | ||
1609 | * The routine checks whether found extent is good enough. If it is, | ||
1610 | * then the extent gets marked used and flag is set to the context | ||
1611 | * to stop scanning. Otherwise, the extent is compared with the | ||
1612 | * previous found extent and if new one is better, then it's stored | ||
1613 | * in the context. Later, the best found extent will be used, if | ||
1614 | * mballoc can't find good enough extent. | ||
1615 | * | ||
1616 | * FIXME: real allocation policy is to be designed yet! | ||
1617 | */ | ||
1618 | static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, | ||
1619 | struct ext4_free_extent *ex, | ||
1620 | struct ext4_buddy *e4b) | ||
1621 | { | ||
1622 | struct ext4_free_extent *bex = &ac->ac_b_ex; | ||
1623 | struct ext4_free_extent *gex = &ac->ac_g_ex; | ||
1624 | |||
1625 | BUG_ON(ex->fe_len <= 0); | ||
1626 | BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); | ||
1627 | BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); | ||
1628 | BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); | ||
1629 | |||
1630 | ac->ac_found++; | ||
1631 | |||
1632 | /* | ||
1633 | * The special case - take what you catch first | ||
1634 | */ | ||
1635 | if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { | ||
1636 | *bex = *ex; | ||
1637 | ext4_mb_use_best_found(ac, e4b); | ||
1638 | return; | ||
1639 | } | ||
1640 | |||
1641 | /* | ||
1642 | * Let's check whether the chuck is good enough | ||
1643 | */ | ||
1644 | if (ex->fe_len == gex->fe_len) { | ||
1645 | *bex = *ex; | ||
1646 | ext4_mb_use_best_found(ac, e4b); | ||
1647 | return; | ||
1648 | } | ||
1649 | |||
1650 | /* | ||
1651 | * If this is first found extent, just store it in the context | ||
1652 | */ | ||
1653 | if (bex->fe_len == 0) { | ||
1654 | *bex = *ex; | ||
1655 | return; | ||
1656 | } | ||
1657 | |||
1658 | /* | ||
1659 | * If new found extent is better, store it in the context | ||
1660 | */ | ||
1661 | if (bex->fe_len < gex->fe_len) { | ||
1662 | /* if the request isn't satisfied, any found extent | ||
1663 | * larger than previous best one is better */ | ||
1664 | if (ex->fe_len > bex->fe_len) | ||
1665 | *bex = *ex; | ||
1666 | } else if (ex->fe_len > gex->fe_len) { | ||
1667 | /* if the request is satisfied, then we try to find | ||
1668 | * an extent that still satisfy the request, but is | ||
1669 | * smaller than previous one */ | ||
1670 | if (ex->fe_len < bex->fe_len) | ||
1671 | *bex = *ex; | ||
1672 | } | ||
1673 | |||
1674 | ext4_mb_check_limits(ac, e4b, 0); | ||
1675 | } | ||
1676 | |||
1677 | static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, | ||
1678 | struct ext4_buddy *e4b) | ||
1679 | { | ||
1680 | struct ext4_free_extent ex = ac->ac_b_ex; | ||
1681 | ext4_group_t group = ex.fe_group; | ||
1682 | int max; | ||
1683 | int err; | ||
1684 | |||
1685 | BUG_ON(ex.fe_len <= 0); | ||
1686 | err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); | ||
1687 | if (err) | ||
1688 | return err; | ||
1689 | |||
1690 | ext4_lock_group(ac->ac_sb, group); | ||
1691 | max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); | ||
1692 | |||
1693 | if (max > 0) { | ||
1694 | ac->ac_b_ex = ex; | ||
1695 | ext4_mb_use_best_found(ac, e4b); | ||
1696 | } | ||
1697 | |||
1698 | ext4_unlock_group(ac->ac_sb, group); | ||
1699 | ext4_mb_release_desc(e4b); | ||
1700 | |||
1701 | return 0; | ||
1702 | } | ||
1703 | |||
1704 | static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, | ||
1705 | struct ext4_buddy *e4b) | ||
1706 | { | ||
1707 | ext4_group_t group = ac->ac_g_ex.fe_group; | ||
1708 | int max; | ||
1709 | int err; | ||
1710 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); | ||
1711 | struct ext4_super_block *es = sbi->s_es; | ||
1712 | struct ext4_free_extent ex; | ||
1713 | |||
1714 | if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) | ||
1715 | return 0; | ||
1716 | |||
1717 | err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); | ||
1718 | if (err) | ||
1719 | return err; | ||
1720 | |||
1721 | ext4_lock_group(ac->ac_sb, group); | ||
1722 | max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, | ||
1723 | ac->ac_g_ex.fe_len, &ex); | ||
1724 | |||
1725 | if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { | ||
1726 | ext4_fsblk_t start; | ||
1727 | |||
1728 | start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + | ||
1729 | ex.fe_start + le32_to_cpu(es->s_first_data_block); | ||
1730 | /* use do_div to get remainder (would be 64-bit modulo) */ | ||
1731 | if (do_div(start, sbi->s_stripe) == 0) { | ||
1732 | ac->ac_found++; | ||
1733 | ac->ac_b_ex = ex; | ||
1734 | ext4_mb_use_best_found(ac, e4b); | ||
1735 | } | ||
1736 | } else if (max >= ac->ac_g_ex.fe_len) { | ||
1737 | BUG_ON(ex.fe_len <= 0); | ||
1738 | BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); | ||
1739 | BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); | ||
1740 | ac->ac_found++; | ||
1741 | ac->ac_b_ex = ex; | ||
1742 | ext4_mb_use_best_found(ac, e4b); | ||
1743 | } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { | ||
1744 | /* Sometimes, caller may want to merge even small | ||
1745 | * number of blocks to an existing extent */ | ||
1746 | BUG_ON(ex.fe_len <= 0); | ||
1747 | BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); | ||
1748 | BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); | ||
1749 | ac->ac_found++; | ||
1750 | ac->ac_b_ex = ex; | ||
1751 | ext4_mb_use_best_found(ac, e4b); | ||
1752 | } | ||
1753 | ext4_unlock_group(ac->ac_sb, group); | ||
1754 | ext4_mb_release_desc(e4b); | ||
1755 | |||
1756 | return 0; | ||
1757 | } | ||
1758 | |||
1759 | /* | ||
1760 | * The routine scans buddy structures (not bitmap!) from given order | ||
1761 | * to max order and tries to find big enough chunk to satisfy the req | ||
1762 | */ | ||
1763 | static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, | ||
1764 | struct ext4_buddy *e4b) | ||
1765 | { | ||
1766 | struct super_block *sb = ac->ac_sb; | ||
1767 | struct ext4_group_info *grp = e4b->bd_info; | ||
1768 | void *buddy; | ||
1769 | int i; | ||
1770 | int k; | ||
1771 | int max; | ||
1772 | |||
1773 | BUG_ON(ac->ac_2order <= 0); | ||
1774 | for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { | ||
1775 | if (grp->bb_counters[i] == 0) | ||
1776 | continue; | ||
1777 | |||
1778 | buddy = mb_find_buddy(e4b, i, &max); | ||
1779 | BUG_ON(buddy == NULL); | ||
1780 | |||
1781 | k = ext4_find_next_zero_bit(buddy, max, 0); | ||
1782 | BUG_ON(k >= max); | ||
1783 | |||
1784 | ac->ac_found++; | ||
1785 | |||
1786 | ac->ac_b_ex.fe_len = 1 << i; | ||
1787 | ac->ac_b_ex.fe_start = k << i; | ||
1788 | ac->ac_b_ex.fe_group = e4b->bd_group; | ||
1789 | |||
1790 | ext4_mb_use_best_found(ac, e4b); | ||
1791 | |||
1792 | BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); | ||
1793 | |||
1794 | if (EXT4_SB(sb)->s_mb_stats) | ||
1795 | atomic_inc(&EXT4_SB(sb)->s_bal_2orders); | ||
1796 | |||
1797 | break; | ||
1798 | } | ||
1799 | } | ||
1800 | |||
1801 | /* | ||
1802 | * The routine scans the group and measures all found extents. | ||
1803 | * In order to optimize scanning, caller must pass number of | ||
1804 | * free blocks in the group, so the routine can know upper limit. | ||
1805 | */ | ||
1806 | static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, | ||
1807 | struct ext4_buddy *e4b) | ||
1808 | { | ||
1809 | struct super_block *sb = ac->ac_sb; | ||
1810 | void *bitmap = EXT4_MB_BITMAP(e4b); | ||
1811 | struct ext4_free_extent ex; | ||
1812 | int i; | ||
1813 | int free; | ||
1814 | |||
1815 | free = e4b->bd_info->bb_free; | ||
1816 | BUG_ON(free <= 0); | ||
1817 | |||
1818 | i = e4b->bd_info->bb_first_free; | ||
1819 | |||
1820 | while (free && ac->ac_status == AC_STATUS_CONTINUE) { | ||
1821 | i = ext4_find_next_zero_bit(bitmap, | ||
1822 | EXT4_BLOCKS_PER_GROUP(sb), i); | ||
1823 | if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { | ||
1824 | BUG_ON(free != 0); | ||
1825 | break; | ||
1826 | } | ||
1827 | |||
1828 | mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); | ||
1829 | BUG_ON(ex.fe_len <= 0); | ||
1830 | BUG_ON(free < ex.fe_len); | ||
1831 | |||
1832 | ext4_mb_measure_extent(ac, &ex, e4b); | ||
1833 | |||
1834 | i += ex.fe_len; | ||
1835 | free -= ex.fe_len; | ||
1836 | } | ||
1837 | |||
1838 | ext4_mb_check_limits(ac, e4b, 1); | ||
1839 | } | ||
1840 | |||
1841 | /* | ||
1842 | * This is a special case for storages like raid5 | ||
1843 | * we try to find stripe-aligned chunks for stripe-size requests | ||
1844 | * XXX should do so at least for multiples of stripe size as well | ||
1845 | */ | ||
1846 | static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, | ||
1847 | struct ext4_buddy *e4b) | ||
1848 | { | ||
1849 | struct super_block *sb = ac->ac_sb; | ||
1850 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1851 | void *bitmap = EXT4_MB_BITMAP(e4b); | ||
1852 | struct ext4_free_extent ex; | ||
1853 | ext4_fsblk_t first_group_block; | ||
1854 | ext4_fsblk_t a; | ||
1855 | ext4_grpblk_t i; | ||
1856 | int max; | ||
1857 | |||
1858 | BUG_ON(sbi->s_stripe == 0); | ||
1859 | |||
1860 | /* find first stripe-aligned block in group */ | ||
1861 | first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) | ||
1862 | + le32_to_cpu(sbi->s_es->s_first_data_block); | ||
1863 | a = first_group_block + sbi->s_stripe - 1; | ||
1864 | do_div(a, sbi->s_stripe); | ||
1865 | i = (a * sbi->s_stripe) - first_group_block; | ||
1866 | |||
1867 | while (i < EXT4_BLOCKS_PER_GROUP(sb)) { | ||
1868 | if (!mb_test_bit(i, bitmap)) { | ||
1869 | max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); | ||
1870 | if (max >= sbi->s_stripe) { | ||
1871 | ac->ac_found++; | ||
1872 | ac->ac_b_ex = ex; | ||
1873 | ext4_mb_use_best_found(ac, e4b); | ||
1874 | break; | ||
1875 | } | ||
1876 | } | ||
1877 | i += sbi->s_stripe; | ||
1878 | } | ||
1879 | } | ||
1880 | |||
1881 | static int ext4_mb_good_group(struct ext4_allocation_context *ac, | ||
1882 | ext4_group_t group, int cr) | ||
1883 | { | ||
1884 | unsigned free, fragments; | ||
1885 | unsigned i, bits; | ||
1886 | struct ext4_group_desc *desc; | ||
1887 | struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); | ||
1888 | |||
1889 | BUG_ON(cr < 0 || cr >= 4); | ||
1890 | BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); | ||
1891 | |||
1892 | free = grp->bb_free; | ||
1893 | fragments = grp->bb_fragments; | ||
1894 | if (free == 0) | ||
1895 | return 0; | ||
1896 | if (fragments == 0) | ||
1897 | return 0; | ||
1898 | |||
1899 | switch (cr) { | ||
1900 | case 0: | ||
1901 | BUG_ON(ac->ac_2order == 0); | ||
1902 | /* If this group is uninitialized, skip it initially */ | ||
1903 | desc = ext4_get_group_desc(ac->ac_sb, group, NULL); | ||
1904 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) | ||
1905 | return 0; | ||
1906 | |||
1907 | bits = ac->ac_sb->s_blocksize_bits + 1; | ||
1908 | for (i = ac->ac_2order; i <= bits; i++) | ||
1909 | if (grp->bb_counters[i] > 0) | ||
1910 | return 1; | ||
1911 | break; | ||
1912 | case 1: | ||
1913 | if ((free / fragments) >= ac->ac_g_ex.fe_len) | ||
1914 | return 1; | ||
1915 | break; | ||
1916 | case 2: | ||
1917 | if (free >= ac->ac_g_ex.fe_len) | ||
1918 | return 1; | ||
1919 | break; | ||
1920 | case 3: | ||
1921 | return 1; | ||
1922 | default: | ||
1923 | BUG(); | ||
1924 | } | ||
1925 | |||
1926 | return 0; | ||
1927 | } | ||
1928 | |||
1929 | static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | ||
1930 | { | ||
1931 | ext4_group_t group; | ||
1932 | ext4_group_t i; | ||
1933 | int cr; | ||
1934 | int err = 0; | ||
1935 | int bsbits; | ||
1936 | struct ext4_sb_info *sbi; | ||
1937 | struct super_block *sb; | ||
1938 | struct ext4_buddy e4b; | ||
1939 | loff_t size, isize; | ||
1940 | |||
1941 | sb = ac->ac_sb; | ||
1942 | sbi = EXT4_SB(sb); | ||
1943 | BUG_ON(ac->ac_status == AC_STATUS_FOUND); | ||
1944 | |||
1945 | /* first, try the goal */ | ||
1946 | err = ext4_mb_find_by_goal(ac, &e4b); | ||
1947 | if (err || ac->ac_status == AC_STATUS_FOUND) | ||
1948 | goto out; | ||
1949 | |||
1950 | if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) | ||
1951 | goto out; | ||
1952 | |||
1953 | /* | ||
1954 | * ac->ac2_order is set only if the fe_len is a power of 2 | ||
1955 | * if ac2_order is set we also set criteria to 0 so that we | ||
1956 | * try exact allocation using buddy. | ||
1957 | */ | ||
1958 | i = fls(ac->ac_g_ex.fe_len); | ||
1959 | ac->ac_2order = 0; | ||
1960 | /* | ||
1961 | * We search using buddy data only if the order of the request | ||
1962 | * is greater than equal to the sbi_s_mb_order2_reqs | ||
1963 | * You can tune it via /proc/fs/ext4/<partition>/order2_req | ||
1964 | */ | ||
1965 | if (i >= sbi->s_mb_order2_reqs) { | ||
1966 | /* | ||
1967 | * This should tell if fe_len is exactly power of 2 | ||
1968 | */ | ||
1969 | if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) | ||
1970 | ac->ac_2order = i - 1; | ||
1971 | } | ||
1972 | |||
1973 | bsbits = ac->ac_sb->s_blocksize_bits; | ||
1974 | /* if stream allocation is enabled, use global goal */ | ||
1975 | size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; | ||
1976 | isize = i_size_read(ac->ac_inode) >> bsbits; | ||
1977 | if (size < isize) | ||
1978 | size = isize; | ||
1979 | |||
1980 | if (size < sbi->s_mb_stream_request && | ||
1981 | (ac->ac_flags & EXT4_MB_HINT_DATA)) { | ||
1982 | /* TBD: may be hot point */ | ||
1983 | spin_lock(&sbi->s_md_lock); | ||
1984 | ac->ac_g_ex.fe_group = sbi->s_mb_last_group; | ||
1985 | ac->ac_g_ex.fe_start = sbi->s_mb_last_start; | ||
1986 | spin_unlock(&sbi->s_md_lock); | ||
1987 | } | ||
1988 | |||
1989 | /* searching for the right group start from the goal value specified */ | ||
1990 | group = ac->ac_g_ex.fe_group; | ||
1991 | |||
1992 | /* Let's just scan groups to find more-less suitable blocks */ | ||
1993 | cr = ac->ac_2order ? 0 : 1; | ||
1994 | /* | ||
1995 | * cr == 0 try to get exact allocation, | ||
1996 | * cr == 3 try to get anything | ||
1997 | */ | ||
1998 | repeat: | ||
1999 | for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { | ||
2000 | ac->ac_criteria = cr; | ||
2001 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { | ||
2002 | struct ext4_group_info *grp; | ||
2003 | struct ext4_group_desc *desc; | ||
2004 | |||
2005 | if (group == EXT4_SB(sb)->s_groups_count) | ||
2006 | group = 0; | ||
2007 | |||
2008 | /* quick check to skip empty groups */ | ||
2009 | grp = ext4_get_group_info(ac->ac_sb, group); | ||
2010 | if (grp->bb_free == 0) | ||
2011 | continue; | ||
2012 | |||
2013 | /* | ||
2014 | * if the group is already init we check whether it is | ||
2015 | * a good group and if not we don't load the buddy | ||
2016 | */ | ||
2017 | if (EXT4_MB_GRP_NEED_INIT(grp)) { | ||
2018 | /* | ||
2019 | * we need full data about the group | ||
2020 | * to make a good selection | ||
2021 | */ | ||
2022 | err = ext4_mb_load_buddy(sb, group, &e4b); | ||
2023 | if (err) | ||
2024 | goto out; | ||
2025 | ext4_mb_release_desc(&e4b); | ||
2026 | } | ||
2027 | |||
2028 | /* | ||
2029 | * If the particular group doesn't satisfy our | ||
2030 | * criteria we continue with the next group | ||
2031 | */ | ||
2032 | if (!ext4_mb_good_group(ac, group, cr)) | ||
2033 | continue; | ||
2034 | |||
2035 | err = ext4_mb_load_buddy(sb, group, &e4b); | ||
2036 | if (err) | ||
2037 | goto out; | ||
2038 | |||
2039 | ext4_lock_group(sb, group); | ||
2040 | if (!ext4_mb_good_group(ac, group, cr)) { | ||
2041 | /* someone did allocation from this group */ | ||
2042 | ext4_unlock_group(sb, group); | ||
2043 | ext4_mb_release_desc(&e4b); | ||
2044 | continue; | ||
2045 | } | ||
2046 | |||
2047 | ac->ac_groups_scanned++; | ||
2048 | desc = ext4_get_group_desc(sb, group, NULL); | ||
2049 | if (cr == 0 || (desc->bg_flags & | ||
2050 | cpu_to_le16(EXT4_BG_BLOCK_UNINIT) && | ||
2051 | ac->ac_2order != 0)) | ||
2052 | ext4_mb_simple_scan_group(ac, &e4b); | ||
2053 | else if (cr == 1 && | ||
2054 | ac->ac_g_ex.fe_len == sbi->s_stripe) | ||
2055 | ext4_mb_scan_aligned(ac, &e4b); | ||
2056 | else | ||
2057 | ext4_mb_complex_scan_group(ac, &e4b); | ||
2058 | |||
2059 | ext4_unlock_group(sb, group); | ||
2060 | ext4_mb_release_desc(&e4b); | ||
2061 | |||
2062 | if (ac->ac_status != AC_STATUS_CONTINUE) | ||
2063 | break; | ||
2064 | } | ||
2065 | } | ||
2066 | |||
2067 | if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && | ||
2068 | !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { | ||
2069 | /* | ||
2070 | * We've been searching too long. Let's try to allocate | ||
2071 | * the best chunk we've found so far | ||
2072 | */ | ||
2073 | |||
2074 | ext4_mb_try_best_found(ac, &e4b); | ||
2075 | if (ac->ac_status != AC_STATUS_FOUND) { | ||
2076 | /* | ||
2077 | * Someone more lucky has already allocated it. | ||
2078 | * The only thing we can do is just take first | ||
2079 | * found block(s) | ||
2080 | printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); | ||
2081 | */ | ||
2082 | ac->ac_b_ex.fe_group = 0; | ||
2083 | ac->ac_b_ex.fe_start = 0; | ||
2084 | ac->ac_b_ex.fe_len = 0; | ||
2085 | ac->ac_status = AC_STATUS_CONTINUE; | ||
2086 | ac->ac_flags |= EXT4_MB_HINT_FIRST; | ||
2087 | cr = 3; | ||
2088 | atomic_inc(&sbi->s_mb_lost_chunks); | ||
2089 | goto repeat; | ||
2090 | } | ||
2091 | } | ||
2092 | out: | ||
2093 | return err; | ||
2094 | } | ||
2095 | |||
2096 | #ifdef EXT4_MB_HISTORY | ||
2097 | struct ext4_mb_proc_session { | ||
2098 | struct ext4_mb_history *history; | ||
2099 | struct super_block *sb; | ||
2100 | int start; | ||
2101 | int max; | ||
2102 | }; | ||
2103 | |||
2104 | static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s, | ||
2105 | struct ext4_mb_history *hs, | ||
2106 | int first) | ||
2107 | { | ||
2108 | if (hs == s->history + s->max) | ||
2109 | hs = s->history; | ||
2110 | if (!first && hs == s->history + s->start) | ||
2111 | return NULL; | ||
2112 | while (hs->orig.fe_len == 0) { | ||
2113 | hs++; | ||
2114 | if (hs == s->history + s->max) | ||
2115 | hs = s->history; | ||
2116 | if (hs == s->history + s->start) | ||
2117 | return NULL; | ||
2118 | } | ||
2119 | return hs; | ||
2120 | } | ||
2121 | |||
2122 | static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos) | ||
2123 | { | ||
2124 | struct ext4_mb_proc_session *s = seq->private; | ||
2125 | struct ext4_mb_history *hs; | ||
2126 | int l = *pos; | ||
2127 | |||
2128 | if (l == 0) | ||
2129 | return SEQ_START_TOKEN; | ||
2130 | hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1); | ||
2131 | if (!hs) | ||
2132 | return NULL; | ||
2133 | while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL); | ||
2134 | return hs; | ||
2135 | } | ||
2136 | |||
2137 | static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v, | ||
2138 | loff_t *pos) | ||
2139 | { | ||
2140 | struct ext4_mb_proc_session *s = seq->private; | ||
2141 | struct ext4_mb_history *hs = v; | ||
2142 | |||
2143 | ++*pos; | ||
2144 | if (v == SEQ_START_TOKEN) | ||
2145 | return ext4_mb_history_skip_empty(s, s->history + s->start, 1); | ||
2146 | else | ||
2147 | return ext4_mb_history_skip_empty(s, ++hs, 0); | ||
2148 | } | ||
2149 | |||
2150 | static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) | ||
2151 | { | ||
2152 | char buf[25], buf2[25], buf3[25], *fmt; | ||
2153 | struct ext4_mb_history *hs = v; | ||
2154 | |||
2155 | if (v == SEQ_START_TOKEN) { | ||
2156 | seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " | ||
2157 | "%-5s %-2s %-5s %-5s %-5s %-6s\n", | ||
2158 | "pid", "inode", "original", "goal", "result", "found", | ||
2159 | "grps", "cr", "flags", "merge", "tail", "broken"); | ||
2160 | return 0; | ||
2161 | } | ||
2162 | |||
2163 | if (hs->op == EXT4_MB_HISTORY_ALLOC) { | ||
2164 | fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " | ||
2165 | "%-5u %-5s %-5u %-6u\n"; | ||
2166 | sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, | ||
2167 | hs->result.fe_start, hs->result.fe_len, | ||
2168 | hs->result.fe_logical); | ||
2169 | sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, | ||
2170 | hs->orig.fe_start, hs->orig.fe_len, | ||
2171 | hs->orig.fe_logical); | ||
2172 | sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, | ||
2173 | hs->goal.fe_start, hs->goal.fe_len, | ||
2174 | hs->goal.fe_logical); | ||
2175 | seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, | ||
2176 | hs->found, hs->groups, hs->cr, hs->flags, | ||
2177 | hs->merged ? "M" : "", hs->tail, | ||
2178 | hs->buddy ? 1 << hs->buddy : 0); | ||
2179 | } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { | ||
2180 | fmt = "%-5u %-8u %-23s %-23s %-23s\n"; | ||
2181 | sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, | ||
2182 | hs->result.fe_start, hs->result.fe_len, | ||
2183 | hs->result.fe_logical); | ||
2184 | sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, | ||
2185 | hs->orig.fe_start, hs->orig.fe_len, | ||
2186 | hs->orig.fe_logical); | ||
2187 | seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); | ||
2188 | } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { | ||
2189 | sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, | ||
2190 | hs->result.fe_start, hs->result.fe_len); | ||
2191 | seq_printf(seq, "%-5u %-8u %-23s discard\n", | ||
2192 | hs->pid, hs->ino, buf2); | ||
2193 | } else if (hs->op == EXT4_MB_HISTORY_FREE) { | ||
2194 | sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, | ||
2195 | hs->result.fe_start, hs->result.fe_len); | ||
2196 | seq_printf(seq, "%-5u %-8u %-23s free\n", | ||
2197 | hs->pid, hs->ino, buf2); | ||
2198 | } | ||
2199 | return 0; | ||
2200 | } | ||
2201 | |||
2202 | static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v) | ||
2203 | { | ||
2204 | } | ||
2205 | |||
2206 | static struct seq_operations ext4_mb_seq_history_ops = { | ||
2207 | .start = ext4_mb_seq_history_start, | ||
2208 | .next = ext4_mb_seq_history_next, | ||
2209 | .stop = ext4_mb_seq_history_stop, | ||
2210 | .show = ext4_mb_seq_history_show, | ||
2211 | }; | ||
2212 | |||
2213 | static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) | ||
2214 | { | ||
2215 | struct super_block *sb = PDE(inode)->data; | ||
2216 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2217 | struct ext4_mb_proc_session *s; | ||
2218 | int rc; | ||
2219 | int size; | ||
2220 | |||
2221 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
2222 | if (s == NULL) | ||
2223 | return -ENOMEM; | ||
2224 | s->sb = sb; | ||
2225 | size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max; | ||
2226 | s->history = kmalloc(size, GFP_KERNEL); | ||
2227 | if (s->history == NULL) { | ||
2228 | kfree(s); | ||
2229 | return -ENOMEM; | ||
2230 | } | ||
2231 | |||
2232 | spin_lock(&sbi->s_mb_history_lock); | ||
2233 | memcpy(s->history, sbi->s_mb_history, size); | ||
2234 | s->max = sbi->s_mb_history_max; | ||
2235 | s->start = sbi->s_mb_history_cur % s->max; | ||
2236 | spin_unlock(&sbi->s_mb_history_lock); | ||
2237 | |||
2238 | rc = seq_open(file, &ext4_mb_seq_history_ops); | ||
2239 | if (rc == 0) { | ||
2240 | struct seq_file *m = (struct seq_file *)file->private_data; | ||
2241 | m->private = s; | ||
2242 | } else { | ||
2243 | kfree(s->history); | ||
2244 | kfree(s); | ||
2245 | } | ||
2246 | return rc; | ||
2247 | |||
2248 | } | ||
2249 | |||
2250 | static int ext4_mb_seq_history_release(struct inode *inode, struct file *file) | ||
2251 | { | ||
2252 | struct seq_file *seq = (struct seq_file *)file->private_data; | ||
2253 | struct ext4_mb_proc_session *s = seq->private; | ||
2254 | kfree(s->history); | ||
2255 | kfree(s); | ||
2256 | return seq_release(inode, file); | ||
2257 | } | ||
2258 | |||
2259 | static ssize_t ext4_mb_seq_history_write(struct file *file, | ||
2260 | const char __user *buffer, | ||
2261 | size_t count, loff_t *ppos) | ||
2262 | { | ||
2263 | struct seq_file *seq = (struct seq_file *)file->private_data; | ||
2264 | struct ext4_mb_proc_session *s = seq->private; | ||
2265 | struct super_block *sb = s->sb; | ||
2266 | char str[32]; | ||
2267 | int value; | ||
2268 | |||
2269 | if (count >= sizeof(str)) { | ||
2270 | printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n", | ||
2271 | "mb_history", (int)sizeof(str)); | ||
2272 | return -EOVERFLOW; | ||
2273 | } | ||
2274 | |||
2275 | if (copy_from_user(str, buffer, count)) | ||
2276 | return -EFAULT; | ||
2277 | |||
2278 | value = simple_strtol(str, NULL, 0); | ||
2279 | if (value < 0) | ||
2280 | return -ERANGE; | ||
2281 | EXT4_SB(sb)->s_mb_history_filter = value; | ||
2282 | |||
2283 | return count; | ||
2284 | } | ||
2285 | |||
2286 | static struct file_operations ext4_mb_seq_history_fops = { | ||
2287 | .owner = THIS_MODULE, | ||
2288 | .open = ext4_mb_seq_history_open, | ||
2289 | .read = seq_read, | ||
2290 | .write = ext4_mb_seq_history_write, | ||
2291 | .llseek = seq_lseek, | ||
2292 | .release = ext4_mb_seq_history_release, | ||
2293 | }; | ||
2294 | |||
2295 | static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) | ||
2296 | { | ||
2297 | struct super_block *sb = seq->private; | ||
2298 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2299 | ext4_group_t group; | ||
2300 | |||
2301 | if (*pos < 0 || *pos >= sbi->s_groups_count) | ||
2302 | return NULL; | ||
2303 | |||
2304 | group = *pos + 1; | ||
2305 | return (void *) group; | ||
2306 | } | ||
2307 | |||
2308 | static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) | ||
2309 | { | ||
2310 | struct super_block *sb = seq->private; | ||
2311 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2312 | ext4_group_t group; | ||
2313 | |||
2314 | ++*pos; | ||
2315 | if (*pos < 0 || *pos >= sbi->s_groups_count) | ||
2316 | return NULL; | ||
2317 | group = *pos + 1; | ||
2318 | return (void *) group;; | ||
2319 | } | ||
2320 | |||
2321 | static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) | ||
2322 | { | ||
2323 | struct super_block *sb = seq->private; | ||
2324 | long group = (long) v; | ||
2325 | int i; | ||
2326 | int err; | ||
2327 | struct ext4_buddy e4b; | ||
2328 | struct sg { | ||
2329 | struct ext4_group_info info; | ||
2330 | unsigned short counters[16]; | ||
2331 | } sg; | ||
2332 | |||
2333 | group--; | ||
2334 | if (group == 0) | ||
2335 | seq_printf(seq, "#%-5s: %-5s %-5s %-5s " | ||
2336 | "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " | ||
2337 | "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", | ||
2338 | "group", "free", "frags", "first", | ||
2339 | "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", | ||
2340 | "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); | ||
2341 | |||
2342 | i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + | ||
2343 | sizeof(struct ext4_group_info); | ||
2344 | err = ext4_mb_load_buddy(sb, group, &e4b); | ||
2345 | if (err) { | ||
2346 | seq_printf(seq, "#%-5lu: I/O error\n", group); | ||
2347 | return 0; | ||
2348 | } | ||
2349 | ext4_lock_group(sb, group); | ||
2350 | memcpy(&sg, ext4_get_group_info(sb, group), i); | ||
2351 | ext4_unlock_group(sb, group); | ||
2352 | ext4_mb_release_desc(&e4b); | ||
2353 | |||
2354 | seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, | ||
2355 | sg.info.bb_fragments, sg.info.bb_first_free); | ||
2356 | for (i = 0; i <= 13; i++) | ||
2357 | seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? | ||
2358 | sg.info.bb_counters[i] : 0); | ||
2359 | seq_printf(seq, " ]\n"); | ||
2360 | |||
2361 | return 0; | ||
2362 | } | ||
2363 | |||
2364 | static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) | ||
2365 | { | ||
2366 | } | ||
2367 | |||
2368 | static struct seq_operations ext4_mb_seq_groups_ops = { | ||
2369 | .start = ext4_mb_seq_groups_start, | ||
2370 | .next = ext4_mb_seq_groups_next, | ||
2371 | .stop = ext4_mb_seq_groups_stop, | ||
2372 | .show = ext4_mb_seq_groups_show, | ||
2373 | }; | ||
2374 | |||
2375 | static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) | ||
2376 | { | ||
2377 | struct super_block *sb = PDE(inode)->data; | ||
2378 | int rc; | ||
2379 | |||
2380 | rc = seq_open(file, &ext4_mb_seq_groups_ops); | ||
2381 | if (rc == 0) { | ||
2382 | struct seq_file *m = (struct seq_file *)file->private_data; | ||
2383 | m->private = sb; | ||
2384 | } | ||
2385 | return rc; | ||
2386 | |||
2387 | } | ||
2388 | |||
2389 | static struct file_operations ext4_mb_seq_groups_fops = { | ||
2390 | .owner = THIS_MODULE, | ||
2391 | .open = ext4_mb_seq_groups_open, | ||
2392 | .read = seq_read, | ||
2393 | .llseek = seq_lseek, | ||
2394 | .release = seq_release, | ||
2395 | }; | ||
2396 | |||
2397 | static void ext4_mb_history_release(struct super_block *sb) | ||
2398 | { | ||
2399 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2400 | |||
2401 | remove_proc_entry("mb_groups", sbi->s_mb_proc); | ||
2402 | remove_proc_entry("mb_history", sbi->s_mb_proc); | ||
2403 | |||
2404 | kfree(sbi->s_mb_history); | ||
2405 | } | ||
2406 | |||
2407 | static void ext4_mb_history_init(struct super_block *sb) | ||
2408 | { | ||
2409 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2410 | int i; | ||
2411 | |||
2412 | if (sbi->s_mb_proc != NULL) { | ||
2413 | struct proc_dir_entry *p; | ||
2414 | p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); | ||
2415 | if (p) { | ||
2416 | p->proc_fops = &ext4_mb_seq_history_fops; | ||
2417 | p->data = sb; | ||
2418 | } | ||
2419 | p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); | ||
2420 | if (p) { | ||
2421 | p->proc_fops = &ext4_mb_seq_groups_fops; | ||
2422 | p->data = sb; | ||
2423 | } | ||
2424 | } | ||
2425 | |||
2426 | sbi->s_mb_history_max = 1000; | ||
2427 | sbi->s_mb_history_cur = 0; | ||
2428 | spin_lock_init(&sbi->s_mb_history_lock); | ||
2429 | i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); | ||
2430 | sbi->s_mb_history = kmalloc(i, GFP_KERNEL); | ||
2431 | if (likely(sbi->s_mb_history != NULL)) | ||
2432 | memset(sbi->s_mb_history, 0, i); | ||
2433 | /* if we can't allocate history, then we simple won't use it */ | ||
2434 | } | ||
2435 | |||
2436 | static void ext4_mb_store_history(struct ext4_allocation_context *ac) | ||
2437 | { | ||
2438 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); | ||
2439 | struct ext4_mb_history h; | ||
2440 | |||
2441 | if (unlikely(sbi->s_mb_history == NULL)) | ||
2442 | return; | ||
2443 | |||
2444 | if (!(ac->ac_op & sbi->s_mb_history_filter)) | ||
2445 | return; | ||
2446 | |||
2447 | h.op = ac->ac_op; | ||
2448 | h.pid = current->pid; | ||
2449 | h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0; | ||
2450 | h.orig = ac->ac_o_ex; | ||
2451 | h.result = ac->ac_b_ex; | ||
2452 | h.flags = ac->ac_flags; | ||
2453 | h.found = ac->ac_found; | ||
2454 | h.groups = ac->ac_groups_scanned; | ||
2455 | h.cr = ac->ac_criteria; | ||
2456 | h.tail = ac->ac_tail; | ||
2457 | h.buddy = ac->ac_buddy; | ||
2458 | h.merged = 0; | ||
2459 | if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) { | ||
2460 | if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && | ||
2461 | ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) | ||
2462 | h.merged = 1; | ||
2463 | h.goal = ac->ac_g_ex; | ||
2464 | h.result = ac->ac_f_ex; | ||
2465 | } | ||
2466 | |||
2467 | spin_lock(&sbi->s_mb_history_lock); | ||
2468 | memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); | ||
2469 | if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) | ||
2470 | sbi->s_mb_history_cur = 0; | ||
2471 | spin_unlock(&sbi->s_mb_history_lock); | ||
2472 | } | ||
2473 | |||
2474 | #else | ||
2475 | #define ext4_mb_history_release(sb) | ||
2476 | #define ext4_mb_history_init(sb) | ||
2477 | #endif | ||
2478 | |||
2479 | static int ext4_mb_init_backend(struct super_block *sb) | ||
2480 | { | ||
2481 | ext4_group_t i; | ||
2482 | int j, len, metalen; | ||
2483 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2484 | int num_meta_group_infos = | ||
2485 | (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >> | ||
2486 | EXT4_DESC_PER_BLOCK_BITS(sb); | ||
2487 | struct ext4_group_info **meta_group_info; | ||
2488 | |||
2489 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte | ||
2490 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. | ||
2491 | * So a two level scheme suffices for now. */ | ||
2492 | sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * | ||
2493 | num_meta_group_infos, GFP_KERNEL); | ||
2494 | if (sbi->s_group_info == NULL) { | ||
2495 | printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); | ||
2496 | return -ENOMEM; | ||
2497 | } | ||
2498 | sbi->s_buddy_cache = new_inode(sb); | ||
2499 | if (sbi->s_buddy_cache == NULL) { | ||
2500 | printk(KERN_ERR "EXT4-fs: can't get new inode\n"); | ||
2501 | goto err_freesgi; | ||
2502 | } | ||
2503 | EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; | ||
2504 | |||
2505 | metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); | ||
2506 | for (i = 0; i < num_meta_group_infos; i++) { | ||
2507 | if ((i + 1) == num_meta_group_infos) | ||
2508 | metalen = sizeof(*meta_group_info) * | ||
2509 | (sbi->s_groups_count - | ||
2510 | (i << EXT4_DESC_PER_BLOCK_BITS(sb))); | ||
2511 | meta_group_info = kmalloc(metalen, GFP_KERNEL); | ||
2512 | if (meta_group_info == NULL) { | ||
2513 | printk(KERN_ERR "EXT4-fs: can't allocate mem for a " | ||
2514 | "buddy group\n"); | ||
2515 | goto err_freemeta; | ||
2516 | } | ||
2517 | sbi->s_group_info[i] = meta_group_info; | ||
2518 | } | ||
2519 | |||
2520 | /* | ||
2521 | * calculate needed size. if change bb_counters size, | ||
2522 | * don't forget about ext4_mb_generate_buddy() | ||
2523 | */ | ||
2524 | len = sizeof(struct ext4_group_info); | ||
2525 | len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); | ||
2526 | for (i = 0; i < sbi->s_groups_count; i++) { | ||
2527 | struct ext4_group_desc *desc; | ||
2528 | |||
2529 | meta_group_info = | ||
2530 | sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)]; | ||
2531 | j = i & (EXT4_DESC_PER_BLOCK(sb) - 1); | ||
2532 | |||
2533 | meta_group_info[j] = kzalloc(len, GFP_KERNEL); | ||
2534 | if (meta_group_info[j] == NULL) { | ||
2535 | printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); | ||
2536 | i--; | ||
2537 | goto err_freebuddy; | ||
2538 | } | ||
2539 | desc = ext4_get_group_desc(sb, i, NULL); | ||
2540 | if (desc == NULL) { | ||
2541 | printk(KERN_ERR | ||
2542 | "EXT4-fs: can't read descriptor %lu\n", i); | ||
2543 | goto err_freebuddy; | ||
2544 | } | ||
2545 | memset(meta_group_info[j], 0, len); | ||
2546 | set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, | ||
2547 | &(meta_group_info[j]->bb_state)); | ||
2548 | |||
2549 | /* | ||
2550 | * initialize bb_free to be able to skip | ||
2551 | * empty groups without initialization | ||
2552 | */ | ||
2553 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
2554 | meta_group_info[j]->bb_free = | ||
2555 | ext4_free_blocks_after_init(sb, i, desc); | ||
2556 | } else { | ||
2557 | meta_group_info[j]->bb_free = | ||
2558 | le16_to_cpu(desc->bg_free_blocks_count); | ||
2559 | } | ||
2560 | |||
2561 | INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list); | ||
2562 | |||
2563 | #ifdef DOUBLE_CHECK | ||
2564 | { | ||
2565 | struct buffer_head *bh; | ||
2566 | meta_group_info[j]->bb_bitmap = | ||
2567 | kmalloc(sb->s_blocksize, GFP_KERNEL); | ||
2568 | BUG_ON(meta_group_info[j]->bb_bitmap == NULL); | ||
2569 | bh = read_block_bitmap(sb, i); | ||
2570 | BUG_ON(bh == NULL); | ||
2571 | memcpy(meta_group_info[j]->bb_bitmap, bh->b_data, | ||
2572 | sb->s_blocksize); | ||
2573 | put_bh(bh); | ||
2574 | } | ||
2575 | #endif | ||
2576 | |||
2577 | } | ||
2578 | |||
2579 | return 0; | ||
2580 | |||
2581 | err_freebuddy: | ||
2582 | while (i >= 0) { | ||
2583 | kfree(ext4_get_group_info(sb, i)); | ||
2584 | i--; | ||
2585 | } | ||
2586 | i = num_meta_group_infos; | ||
2587 | err_freemeta: | ||
2588 | while (--i >= 0) | ||
2589 | kfree(sbi->s_group_info[i]); | ||
2590 | iput(sbi->s_buddy_cache); | ||
2591 | err_freesgi: | ||
2592 | kfree(sbi->s_group_info); | ||
2593 | return -ENOMEM; | ||
2594 | } | ||
2595 | |||
2596 | int ext4_mb_init(struct super_block *sb, int needs_recovery) | ||
2597 | { | ||
2598 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2599 | unsigned i; | ||
2600 | unsigned offset; | ||
2601 | unsigned max; | ||
2602 | |||
2603 | if (!test_opt(sb, MBALLOC)) | ||
2604 | return 0; | ||
2605 | |||
2606 | i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); | ||
2607 | |||
2608 | sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); | ||
2609 | if (sbi->s_mb_offsets == NULL) { | ||
2610 | clear_opt(sbi->s_mount_opt, MBALLOC); | ||
2611 | return -ENOMEM; | ||
2612 | } | ||
2613 | sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); | ||
2614 | if (sbi->s_mb_maxs == NULL) { | ||
2615 | clear_opt(sbi->s_mount_opt, MBALLOC); | ||
2616 | kfree(sbi->s_mb_maxs); | ||
2617 | return -ENOMEM; | ||
2618 | } | ||
2619 | |||
2620 | /* order 0 is regular bitmap */ | ||
2621 | sbi->s_mb_maxs[0] = sb->s_blocksize << 3; | ||
2622 | sbi->s_mb_offsets[0] = 0; | ||
2623 | |||
2624 | i = 1; | ||
2625 | offset = 0; | ||
2626 | max = sb->s_blocksize << 2; | ||
2627 | do { | ||
2628 | sbi->s_mb_offsets[i] = offset; | ||
2629 | sbi->s_mb_maxs[i] = max; | ||
2630 | offset += 1 << (sb->s_blocksize_bits - i); | ||
2631 | max = max >> 1; | ||
2632 | i++; | ||
2633 | } while (i <= sb->s_blocksize_bits + 1); | ||
2634 | |||
2635 | /* init file for buddy data */ | ||
2636 | i = ext4_mb_init_backend(sb); | ||
2637 | if (i) { | ||
2638 | clear_opt(sbi->s_mount_opt, MBALLOC); | ||
2639 | kfree(sbi->s_mb_offsets); | ||
2640 | kfree(sbi->s_mb_maxs); | ||
2641 | return i; | ||
2642 | } | ||
2643 | |||
2644 | spin_lock_init(&sbi->s_md_lock); | ||
2645 | INIT_LIST_HEAD(&sbi->s_active_transaction); | ||
2646 | INIT_LIST_HEAD(&sbi->s_closed_transaction); | ||
2647 | INIT_LIST_HEAD(&sbi->s_committed_transaction); | ||
2648 | spin_lock_init(&sbi->s_bal_lock); | ||
2649 | |||
2650 | sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; | ||
2651 | sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; | ||
2652 | sbi->s_mb_stats = MB_DEFAULT_STATS; | ||
2653 | sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; | ||
2654 | sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; | ||
2655 | sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; | ||
2656 | sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; | ||
2657 | |||
2658 | i = sizeof(struct ext4_locality_group) * NR_CPUS; | ||
2659 | sbi->s_locality_groups = kmalloc(i, GFP_KERNEL); | ||
2660 | if (sbi->s_locality_groups == NULL) { | ||
2661 | clear_opt(sbi->s_mount_opt, MBALLOC); | ||
2662 | kfree(sbi->s_mb_offsets); | ||
2663 | kfree(sbi->s_mb_maxs); | ||
2664 | return -ENOMEM; | ||
2665 | } | ||
2666 | for (i = 0; i < NR_CPUS; i++) { | ||
2667 | struct ext4_locality_group *lg; | ||
2668 | lg = &sbi->s_locality_groups[i]; | ||
2669 | mutex_init(&lg->lg_mutex); | ||
2670 | INIT_LIST_HEAD(&lg->lg_prealloc_list); | ||
2671 | spin_lock_init(&lg->lg_prealloc_lock); | ||
2672 | } | ||
2673 | |||
2674 | ext4_mb_init_per_dev_proc(sb); | ||
2675 | ext4_mb_history_init(sb); | ||
2676 | |||
2677 | printk("EXT4-fs: mballoc enabled\n"); | ||
2678 | return 0; | ||
2679 | } | ||
2680 | |||
2681 | /* need to called with ext4 group lock (ext4_lock_group) */ | ||
2682 | static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) | ||
2683 | { | ||
2684 | struct ext4_prealloc_space *pa; | ||
2685 | struct list_head *cur, *tmp; | ||
2686 | int count = 0; | ||
2687 | |||
2688 | list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { | ||
2689 | pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); | ||
2690 | list_del(&pa->pa_group_list); | ||
2691 | count++; | ||
2692 | kfree(pa); | ||
2693 | } | ||
2694 | if (count) | ||
2695 | mb_debug("mballoc: %u PAs left\n", count); | ||
2696 | |||
2697 | } | ||
2698 | |||
2699 | int ext4_mb_release(struct super_block *sb) | ||
2700 | { | ||
2701 | ext4_group_t i; | ||
2702 | int num_meta_group_infos; | ||
2703 | struct ext4_group_info *grinfo; | ||
2704 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2705 | |||
2706 | if (!test_opt(sb, MBALLOC)) | ||
2707 | return 0; | ||
2708 | |||
2709 | /* release freed, non-committed blocks */ | ||
2710 | spin_lock(&sbi->s_md_lock); | ||
2711 | list_splice_init(&sbi->s_closed_transaction, | ||
2712 | &sbi->s_committed_transaction); | ||
2713 | list_splice_init(&sbi->s_active_transaction, | ||
2714 | &sbi->s_committed_transaction); | ||
2715 | spin_unlock(&sbi->s_md_lock); | ||
2716 | ext4_mb_free_committed_blocks(sb); | ||
2717 | |||
2718 | if (sbi->s_group_info) { | ||
2719 | for (i = 0; i < sbi->s_groups_count; i++) { | ||
2720 | grinfo = ext4_get_group_info(sb, i); | ||
2721 | #ifdef DOUBLE_CHECK | ||
2722 | kfree(grinfo->bb_bitmap); | ||
2723 | #endif | ||
2724 | ext4_lock_group(sb, i); | ||
2725 | ext4_mb_cleanup_pa(grinfo); | ||
2726 | ext4_unlock_group(sb, i); | ||
2727 | kfree(grinfo); | ||
2728 | } | ||
2729 | num_meta_group_infos = (sbi->s_groups_count + | ||
2730 | EXT4_DESC_PER_BLOCK(sb) - 1) >> | ||
2731 | EXT4_DESC_PER_BLOCK_BITS(sb); | ||
2732 | for (i = 0; i < num_meta_group_infos; i++) | ||
2733 | kfree(sbi->s_group_info[i]); | ||
2734 | kfree(sbi->s_group_info); | ||
2735 | } | ||
2736 | kfree(sbi->s_mb_offsets); | ||
2737 | kfree(sbi->s_mb_maxs); | ||
2738 | if (sbi->s_buddy_cache) | ||
2739 | iput(sbi->s_buddy_cache); | ||
2740 | if (sbi->s_mb_stats) { | ||
2741 | printk(KERN_INFO | ||
2742 | "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", | ||
2743 | atomic_read(&sbi->s_bal_allocated), | ||
2744 | atomic_read(&sbi->s_bal_reqs), | ||
2745 | atomic_read(&sbi->s_bal_success)); | ||
2746 | printk(KERN_INFO | ||
2747 | "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " | ||
2748 | "%u 2^N hits, %u breaks, %u lost\n", | ||
2749 | atomic_read(&sbi->s_bal_ex_scanned), | ||
2750 | atomic_read(&sbi->s_bal_goals), | ||
2751 | atomic_read(&sbi->s_bal_2orders), | ||
2752 | atomic_read(&sbi->s_bal_breaks), | ||
2753 | atomic_read(&sbi->s_mb_lost_chunks)); | ||
2754 | printk(KERN_INFO | ||
2755 | "EXT4-fs: mballoc: %lu generated and it took %Lu\n", | ||
2756 | sbi->s_mb_buddies_generated++, | ||
2757 | sbi->s_mb_generation_time); | ||
2758 | printk(KERN_INFO | ||
2759 | "EXT4-fs: mballoc: %u preallocated, %u discarded\n", | ||
2760 | atomic_read(&sbi->s_mb_preallocated), | ||
2761 | atomic_read(&sbi->s_mb_discarded)); | ||
2762 | } | ||
2763 | |||
2764 | kfree(sbi->s_locality_groups); | ||
2765 | |||
2766 | ext4_mb_history_release(sb); | ||
2767 | ext4_mb_destroy_per_dev_proc(sb); | ||
2768 | |||
2769 | return 0; | ||
2770 | } | ||
2771 | |||
2772 | static void ext4_mb_free_committed_blocks(struct super_block *sb) | ||
2773 | { | ||
2774 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2775 | int err; | ||
2776 | int i; | ||
2777 | int count = 0; | ||
2778 | int count2 = 0; | ||
2779 | struct ext4_free_metadata *md; | ||
2780 | struct ext4_buddy e4b; | ||
2781 | |||
2782 | if (list_empty(&sbi->s_committed_transaction)) | ||
2783 | return; | ||
2784 | |||
2785 | /* there is committed blocks to be freed yet */ | ||
2786 | do { | ||
2787 | /* get next array of blocks */ | ||
2788 | md = NULL; | ||
2789 | spin_lock(&sbi->s_md_lock); | ||
2790 | if (!list_empty(&sbi->s_committed_transaction)) { | ||
2791 | md = list_entry(sbi->s_committed_transaction.next, | ||
2792 | struct ext4_free_metadata, list); | ||
2793 | list_del(&md->list); | ||
2794 | } | ||
2795 | spin_unlock(&sbi->s_md_lock); | ||
2796 | |||
2797 | if (md == NULL) | ||
2798 | break; | ||
2799 | |||
2800 | mb_debug("gonna free %u blocks in group %lu (0x%p):", | ||
2801 | md->num, md->group, md); | ||
2802 | |||
2803 | err = ext4_mb_load_buddy(sb, md->group, &e4b); | ||
2804 | /* we expect to find existing buddy because it's pinned */ | ||
2805 | BUG_ON(err != 0); | ||
2806 | |||
2807 | /* there are blocks to put in buddy to make them really free */ | ||
2808 | count += md->num; | ||
2809 | count2++; | ||
2810 | ext4_lock_group(sb, md->group); | ||
2811 | for (i = 0; i < md->num; i++) { | ||
2812 | mb_debug(" %u", md->blocks[i]); | ||
2813 | err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1); | ||
2814 | BUG_ON(err != 0); | ||
2815 | } | ||
2816 | mb_debug("\n"); | ||
2817 | ext4_unlock_group(sb, md->group); | ||
2818 | |||
2819 | /* balance refcounts from ext4_mb_free_metadata() */ | ||
2820 | page_cache_release(e4b.bd_buddy_page); | ||
2821 | page_cache_release(e4b.bd_bitmap_page); | ||
2822 | |||
2823 | kfree(md); | ||
2824 | ext4_mb_release_desc(&e4b); | ||
2825 | |||
2826 | } while (md); | ||
2827 | |||
2828 | mb_debug("freed %u blocks in %u structures\n", count, count2); | ||
2829 | } | ||
2830 | |||
2831 | #define EXT4_ROOT "ext4" | ||
2832 | #define EXT4_MB_STATS_NAME "stats" | ||
2833 | #define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" | ||
2834 | #define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" | ||
2835 | #define EXT4_MB_ORDER2_REQ "order2_req" | ||
2836 | #define EXT4_MB_STREAM_REQ "stream_req" | ||
2837 | #define EXT4_MB_GROUP_PREALLOC "group_prealloc" | ||
2838 | |||
2839 | |||
2840 | |||
2841 | #define MB_PROC_VALUE_READ(name) \ | ||
2842 | static int ext4_mb_read_##name(char *page, char **start, \ | ||
2843 | off_t off, int count, int *eof, void *data) \ | ||
2844 | { \ | ||
2845 | struct ext4_sb_info *sbi = data; \ | ||
2846 | int len; \ | ||
2847 | *eof = 1; \ | ||
2848 | if (off != 0) \ | ||
2849 | return 0; \ | ||
2850 | len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ | ||
2851 | *start = page; \ | ||
2852 | return len; \ | ||
2853 | } | ||
2854 | |||
2855 | #define MB_PROC_VALUE_WRITE(name) \ | ||
2856 | static int ext4_mb_write_##name(struct file *file, \ | ||
2857 | const char __user *buf, unsigned long cnt, void *data) \ | ||
2858 | { \ | ||
2859 | struct ext4_sb_info *sbi = data; \ | ||
2860 | char str[32]; \ | ||
2861 | long value; \ | ||
2862 | if (cnt >= sizeof(str)) \ | ||
2863 | return -EINVAL; \ | ||
2864 | if (copy_from_user(str, buf, cnt)) \ | ||
2865 | return -EFAULT; \ | ||
2866 | value = simple_strtol(str, NULL, 0); \ | ||
2867 | if (value <= 0) \ | ||
2868 | return -ERANGE; \ | ||
2869 | sbi->s_mb_##name = value; \ | ||
2870 | return cnt; \ | ||
2871 | } | ||
2872 | |||
2873 | MB_PROC_VALUE_READ(stats); | ||
2874 | MB_PROC_VALUE_WRITE(stats); | ||
2875 | MB_PROC_VALUE_READ(max_to_scan); | ||
2876 | MB_PROC_VALUE_WRITE(max_to_scan); | ||
2877 | MB_PROC_VALUE_READ(min_to_scan); | ||
2878 | MB_PROC_VALUE_WRITE(min_to_scan); | ||
2879 | MB_PROC_VALUE_READ(order2_reqs); | ||
2880 | MB_PROC_VALUE_WRITE(order2_reqs); | ||
2881 | MB_PROC_VALUE_READ(stream_request); | ||
2882 | MB_PROC_VALUE_WRITE(stream_request); | ||
2883 | MB_PROC_VALUE_READ(group_prealloc); | ||
2884 | MB_PROC_VALUE_WRITE(group_prealloc); | ||
2885 | |||
2886 | #define MB_PROC_HANDLER(name, var) \ | ||
2887 | do { \ | ||
2888 | proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ | ||
2889 | if (proc == NULL) { \ | ||
2890 | printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ | ||
2891 | goto err_out; \ | ||
2892 | } \ | ||
2893 | proc->data = sbi; \ | ||
2894 | proc->read_proc = ext4_mb_read_##var ; \ | ||
2895 | proc->write_proc = ext4_mb_write_##var; \ | ||
2896 | } while (0) | ||
2897 | |||
2898 | static int ext4_mb_init_per_dev_proc(struct super_block *sb) | ||
2899 | { | ||
2900 | mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; | ||
2901 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2902 | struct proc_dir_entry *proc; | ||
2903 | char devname[64]; | ||
2904 | |||
2905 | snprintf(devname, sizeof(devname) - 1, "%s", | ||
2906 | bdevname(sb->s_bdev, devname)); | ||
2907 | sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); | ||
2908 | |||
2909 | MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats); | ||
2910 | MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan); | ||
2911 | MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan); | ||
2912 | MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs); | ||
2913 | MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request); | ||
2914 | MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc); | ||
2915 | |||
2916 | return 0; | ||
2917 | |||
2918 | err_out: | ||
2919 | printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); | ||
2920 | remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); | ||
2921 | remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); | ||
2922 | remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); | ||
2923 | remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); | ||
2924 | remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); | ||
2925 | remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); | ||
2926 | remove_proc_entry(devname, proc_root_ext4); | ||
2927 | sbi->s_mb_proc = NULL; | ||
2928 | |||
2929 | return -ENOMEM; | ||
2930 | } | ||
2931 | |||
2932 | static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) | ||
2933 | { | ||
2934 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2935 | char devname[64]; | ||
2936 | |||
2937 | if (sbi->s_mb_proc == NULL) | ||
2938 | return -EINVAL; | ||
2939 | |||
2940 | snprintf(devname, sizeof(devname) - 1, "%s", | ||
2941 | bdevname(sb->s_bdev, devname)); | ||
2942 | remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); | ||
2943 | remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); | ||
2944 | remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); | ||
2945 | remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); | ||
2946 | remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); | ||
2947 | remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); | ||
2948 | remove_proc_entry(devname, proc_root_ext4); | ||
2949 | |||
2950 | return 0; | ||
2951 | } | ||
2952 | |||
2953 | int __init init_ext4_mballoc(void) | ||
2954 | { | ||
2955 | ext4_pspace_cachep = | ||
2956 | kmem_cache_create("ext4_prealloc_space", | ||
2957 | sizeof(struct ext4_prealloc_space), | ||
2958 | 0, SLAB_RECLAIM_ACCOUNT, NULL); | ||
2959 | if (ext4_pspace_cachep == NULL) | ||
2960 | return -ENOMEM; | ||
2961 | |||
2962 | #ifdef CONFIG_PROC_FS | ||
2963 | proc_root_ext4 = proc_mkdir(EXT4_ROOT, proc_root_fs); | ||
2964 | if (proc_root_ext4 == NULL) | ||
2965 | printk(KERN_ERR "EXT4-fs: Unable to create %s\n", EXT4_ROOT); | ||
2966 | #endif | ||
2967 | |||
2968 | return 0; | ||
2969 | } | ||
2970 | |||
2971 | void exit_ext4_mballoc(void) | ||
2972 | { | ||
2973 | /* XXX: synchronize_rcu(); */ | ||
2974 | kmem_cache_destroy(ext4_pspace_cachep); | ||
2975 | #ifdef CONFIG_PROC_FS | ||
2976 | remove_proc_entry(EXT4_ROOT, proc_root_fs); | ||
2977 | #endif | ||
2978 | } | ||
2979 | |||
2980 | |||
2981 | /* | ||
2982 | * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps | ||
2983 | * Returns 0 if success or error code | ||
2984 | */ | ||
2985 | static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | ||
2986 | handle_t *handle) | ||
2987 | { | ||
2988 | struct buffer_head *bitmap_bh = NULL; | ||
2989 | struct ext4_super_block *es; | ||
2990 | struct ext4_group_desc *gdp; | ||
2991 | struct buffer_head *gdp_bh; | ||
2992 | struct ext4_sb_info *sbi; | ||
2993 | struct super_block *sb; | ||
2994 | ext4_fsblk_t block; | ||
2995 | int err; | ||
2996 | |||
2997 | BUG_ON(ac->ac_status != AC_STATUS_FOUND); | ||
2998 | BUG_ON(ac->ac_b_ex.fe_len <= 0); | ||
2999 | |||
3000 | sb = ac->ac_sb; | ||
3001 | sbi = EXT4_SB(sb); | ||
3002 | es = sbi->s_es; | ||
3003 | |||
3004 | ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, | ||
3005 | gdp->bg_free_blocks_count); | ||
3006 | |||
3007 | err = -EIO; | ||
3008 | bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); | ||
3009 | if (!bitmap_bh) | ||
3010 | goto out_err; | ||
3011 | |||
3012 | err = ext4_journal_get_write_access(handle, bitmap_bh); | ||
3013 | if (err) | ||
3014 | goto out_err; | ||
3015 | |||
3016 | err = -EIO; | ||
3017 | gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); | ||
3018 | if (!gdp) | ||
3019 | goto out_err; | ||
3020 | |||
3021 | err = ext4_journal_get_write_access(handle, gdp_bh); | ||
3022 | if (err) | ||
3023 | goto out_err; | ||
3024 | |||
3025 | block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) | ||
3026 | + ac->ac_b_ex.fe_start | ||
3027 | + le32_to_cpu(es->s_first_data_block); | ||
3028 | |||
3029 | if (block == ext4_block_bitmap(sb, gdp) || | ||
3030 | block == ext4_inode_bitmap(sb, gdp) || | ||
3031 | in_range(block, ext4_inode_table(sb, gdp), | ||
3032 | EXT4_SB(sb)->s_itb_per_group)) { | ||
3033 | |||
3034 | ext4_error(sb, __FUNCTION__, | ||
3035 | "Allocating block in system zone - block = %llu", | ||
3036 | block); | ||
3037 | } | ||
3038 | #ifdef AGGRESSIVE_CHECK | ||
3039 | { | ||
3040 | int i; | ||
3041 | for (i = 0; i < ac->ac_b_ex.fe_len; i++) { | ||
3042 | BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, | ||
3043 | bitmap_bh->b_data)); | ||
3044 | } | ||
3045 | } | ||
3046 | #endif | ||
3047 | mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, | ||
3048 | ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); | ||
3049 | |||
3050 | spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); | ||
3051 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
3052 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); | ||
3053 | gdp->bg_free_blocks_count = | ||
3054 | cpu_to_le16(ext4_free_blocks_after_init(sb, | ||
3055 | ac->ac_b_ex.fe_group, | ||
3056 | gdp)); | ||
3057 | } | ||
3058 | gdp->bg_free_blocks_count = | ||
3059 | cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) | ||
3060 | - ac->ac_b_ex.fe_len); | ||
3061 | gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); | ||
3062 | spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); | ||
3063 | percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); | ||
3064 | |||
3065 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); | ||
3066 | if (err) | ||
3067 | goto out_err; | ||
3068 | err = ext4_journal_dirty_metadata(handle, gdp_bh); | ||
3069 | |||
3070 | out_err: | ||
3071 | sb->s_dirt = 1; | ||
3072 | put_bh(bitmap_bh); | ||
3073 | return err; | ||
3074 | } | ||
3075 | |||
3076 | /* | ||
3077 | * here we normalize request for locality group | ||
3078 | * Group request are normalized to s_strip size if we set the same via mount | ||
3079 | * option. If not we set it to s_mb_group_prealloc which can be configured via | ||
3080 | * /proc/fs/ext4/<partition>/group_prealloc | ||
3081 | * | ||
3082 | * XXX: should we try to preallocate more than the group has now? | ||
3083 | */ | ||
3084 | static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) | ||
3085 | { | ||
3086 | struct super_block *sb = ac->ac_sb; | ||
3087 | struct ext4_locality_group *lg = ac->ac_lg; | ||
3088 | |||
3089 | BUG_ON(lg == NULL); | ||
3090 | if (EXT4_SB(sb)->s_stripe) | ||
3091 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; | ||
3092 | else | ||
3093 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; | ||
3094 | mb_debug("#%u: goal %lu blocks for locality group\n", | ||
3095 | current->pid, ac->ac_g_ex.fe_len); | ||
3096 | } | ||
3097 | |||
3098 | /* | ||
3099 | * Normalization means making request better in terms of | ||
3100 | * size and alignment | ||
3101 | */ | ||
3102 | static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, | ||
3103 | struct ext4_allocation_request *ar) | ||
3104 | { | ||
3105 | int bsbits, max; | ||
3106 | ext4_lblk_t end; | ||
3107 | struct list_head *cur; | ||
3108 | loff_t size, orig_size, start_off; | ||
3109 | ext4_lblk_t start, orig_start; | ||
3110 | struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); | ||
3111 | |||
3112 | /* do normalize only data requests, metadata requests | ||
3113 | do not need preallocation */ | ||
3114 | if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) | ||
3115 | return; | ||
3116 | |||
3117 | /* sometime caller may want exact blocks */ | ||
3118 | if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) | ||
3119 | return; | ||
3120 | |||
3121 | /* caller may indicate that preallocation isn't | ||
3122 | * required (it's a tail, for example) */ | ||
3123 | if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) | ||
3124 | return; | ||
3125 | |||
3126 | if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { | ||
3127 | ext4_mb_normalize_group_request(ac); | ||
3128 | return ; | ||
3129 | } | ||
3130 | |||
3131 | bsbits = ac->ac_sb->s_blocksize_bits; | ||
3132 | |||
3133 | /* first, let's learn actual file size | ||
3134 | * given current request is allocated */ | ||
3135 | size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; | ||
3136 | size = size << bsbits; | ||
3137 | if (size < i_size_read(ac->ac_inode)) | ||
3138 | size = i_size_read(ac->ac_inode); | ||
3139 | |||
3140 | /* max available blocks in a free group */ | ||
3141 | max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 - | ||
3142 | EXT4_SB(ac->ac_sb)->s_itb_per_group; | ||
3143 | |||
3144 | #define NRL_CHECK_SIZE(req, size, max,bits) \ | ||
3145 | (req <= (size) || max <= ((size) >> bits)) | ||
3146 | |||
3147 | /* first, try to predict filesize */ | ||
3148 | /* XXX: should this table be tunable? */ | ||
3149 | start_off = 0; | ||
3150 | if (size <= 16 * 1024) { | ||
3151 | size = 16 * 1024; | ||
3152 | } else if (size <= 32 * 1024) { | ||
3153 | size = 32 * 1024; | ||
3154 | } else if (size <= 64 * 1024) { | ||
3155 | size = 64 * 1024; | ||
3156 | } else if (size <= 128 * 1024) { | ||
3157 | size = 128 * 1024; | ||
3158 | } else if (size <= 256 * 1024) { | ||
3159 | size = 256 * 1024; | ||
3160 | } else if (size <= 512 * 1024) { | ||
3161 | size = 512 * 1024; | ||
3162 | } else if (size <= 1024 * 1024) { | ||
3163 | size = 1024 * 1024; | ||
3164 | } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) { | ||
3165 | start_off = ((loff_t)ac->ac_o_ex.fe_logical >> | ||
3166 | (20 - bsbits)) << 20; | ||
3167 | size = 1024 * 1024; | ||
3168 | } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) { | ||
3169 | start_off = ((loff_t)ac->ac_o_ex.fe_logical >> | ||
3170 | (22 - bsbits)) << 22; | ||
3171 | size = 4 * 1024 * 1024; | ||
3172 | } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, | ||
3173 | (8<<20)>>bsbits, max, bsbits)) { | ||
3174 | start_off = ((loff_t)ac->ac_o_ex.fe_logical >> | ||
3175 | (23 - bsbits)) << 23; | ||
3176 | size = 8 * 1024 * 1024; | ||
3177 | } else { | ||
3178 | start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; | ||
3179 | size = ac->ac_o_ex.fe_len << bsbits; | ||
3180 | } | ||
3181 | orig_size = size = size >> bsbits; | ||
3182 | orig_start = start = start_off >> bsbits; | ||
3183 | |||
3184 | /* don't cover already allocated blocks in selected range */ | ||
3185 | if (ar->pleft && start <= ar->lleft) { | ||
3186 | size -= ar->lleft + 1 - start; | ||
3187 | start = ar->lleft + 1; | ||
3188 | } | ||
3189 | if (ar->pright && start + size - 1 >= ar->lright) | ||
3190 | size -= start + size - ar->lright; | ||
3191 | |||
3192 | end = start + size; | ||
3193 | |||
3194 | /* check we don't cross already preallocated blocks */ | ||
3195 | rcu_read_lock(); | ||
3196 | list_for_each_rcu(cur, &ei->i_prealloc_list) { | ||
3197 | struct ext4_prealloc_space *pa; | ||
3198 | unsigned long pa_end; | ||
3199 | |||
3200 | pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); | ||
3201 | |||
3202 | if (pa->pa_deleted) | ||
3203 | continue; | ||
3204 | spin_lock(&pa->pa_lock); | ||
3205 | if (pa->pa_deleted) { | ||
3206 | spin_unlock(&pa->pa_lock); | ||
3207 | continue; | ||
3208 | } | ||
3209 | |||
3210 | pa_end = pa->pa_lstart + pa->pa_len; | ||
3211 | |||
3212 | /* PA must not overlap original request */ | ||
3213 | BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || | ||
3214 | ac->ac_o_ex.fe_logical < pa->pa_lstart)); | ||
3215 | |||
3216 | /* skip PA normalized request doesn't overlap with */ | ||
3217 | if (pa->pa_lstart >= end) { | ||
3218 | spin_unlock(&pa->pa_lock); | ||
3219 | continue; | ||
3220 | } | ||
3221 | if (pa_end <= start) { | ||
3222 | spin_unlock(&pa->pa_lock); | ||
3223 | continue; | ||
3224 | } | ||
3225 | BUG_ON(pa->pa_lstart <= start && pa_end >= end); | ||
3226 | |||
3227 | if (pa_end <= ac->ac_o_ex.fe_logical) { | ||
3228 | BUG_ON(pa_end < start); | ||
3229 | start = pa_end; | ||
3230 | } | ||
3231 | |||
3232 | if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { | ||
3233 | BUG_ON(pa->pa_lstart > end); | ||
3234 | end = pa->pa_lstart; | ||
3235 | } | ||
3236 | spin_unlock(&pa->pa_lock); | ||
3237 | } | ||
3238 | rcu_read_unlock(); | ||
3239 | size = end - start; | ||
3240 | |||
3241 | /* XXX: extra loop to check we really don't overlap preallocations */ | ||
3242 | rcu_read_lock(); | ||
3243 | list_for_each_rcu(cur, &ei->i_prealloc_list) { | ||
3244 | struct ext4_prealloc_space *pa; | ||
3245 | unsigned long pa_end; | ||
3246 | pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); | ||
3247 | spin_lock(&pa->pa_lock); | ||
3248 | if (pa->pa_deleted == 0) { | ||
3249 | pa_end = pa->pa_lstart + pa->pa_len; | ||
3250 | BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); | ||
3251 | } | ||
3252 | spin_unlock(&pa->pa_lock); | ||
3253 | } | ||
3254 | rcu_read_unlock(); | ||
3255 | |||
3256 | if (start + size <= ac->ac_o_ex.fe_logical && | ||
3257 | start > ac->ac_o_ex.fe_logical) { | ||
3258 | printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", | ||
3259 | (unsigned long) start, (unsigned long) size, | ||
3260 | (unsigned long) ac->ac_o_ex.fe_logical); | ||
3261 | } | ||
3262 | BUG_ON(start + size <= ac->ac_o_ex.fe_logical && | ||
3263 | start > ac->ac_o_ex.fe_logical); | ||
3264 | BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); | ||
3265 | |||
3266 | /* now prepare goal request */ | ||
3267 | |||
3268 | /* XXX: is it better to align blocks WRT to logical | ||
3269 | * placement or satisfy big request as is */ | ||
3270 | ac->ac_g_ex.fe_logical = start; | ||
3271 | ac->ac_g_ex.fe_len = size; | ||
3272 | |||
3273 | /* define goal start in order to merge */ | ||
3274 | if (ar->pright && (ar->lright == (start + size))) { | ||
3275 | /* merge to the right */ | ||
3276 | ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, | ||
3277 | &ac->ac_f_ex.fe_group, | ||
3278 | &ac->ac_f_ex.fe_start); | ||
3279 | ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; | ||
3280 | } | ||
3281 | if (ar->pleft && (ar->lleft + 1 == start)) { | ||
3282 | /* merge to the left */ | ||
3283 | ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, | ||
3284 | &ac->ac_f_ex.fe_group, | ||
3285 | &ac->ac_f_ex.fe_start); | ||
3286 | ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; | ||
3287 | } | ||
3288 | |||
3289 | mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, | ||
3290 | (unsigned) orig_size, (unsigned) start); | ||
3291 | } | ||
3292 | |||
3293 | static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) | ||
3294 | { | ||
3295 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); | ||
3296 | |||
3297 | if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { | ||
3298 | atomic_inc(&sbi->s_bal_reqs); | ||
3299 | atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); | ||
3300 | if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) | ||
3301 | atomic_inc(&sbi->s_bal_success); | ||
3302 | atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); | ||
3303 | if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && | ||
3304 | ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) | ||
3305 | atomic_inc(&sbi->s_bal_goals); | ||
3306 | if (ac->ac_found > sbi->s_mb_max_to_scan) | ||
3307 | atomic_inc(&sbi->s_bal_breaks); | ||
3308 | } | ||
3309 | |||
3310 | ext4_mb_store_history(ac); | ||
3311 | } | ||
3312 | |||
3313 | /* | ||
3314 | * use blocks preallocated to inode | ||
3315 | */ | ||
3316 | static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, | ||
3317 | struct ext4_prealloc_space *pa) | ||
3318 | { | ||
3319 | ext4_fsblk_t start; | ||
3320 | ext4_fsblk_t end; | ||
3321 | int len; | ||
3322 | |||
3323 | /* found preallocated blocks, use them */ | ||
3324 | start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); | ||
3325 | end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); | ||
3326 | len = end - start; | ||
3327 | ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, | ||
3328 | &ac->ac_b_ex.fe_start); | ||
3329 | ac->ac_b_ex.fe_len = len; | ||
3330 | ac->ac_status = AC_STATUS_FOUND; | ||
3331 | ac->ac_pa = pa; | ||
3332 | |||
3333 | BUG_ON(start < pa->pa_pstart); | ||
3334 | BUG_ON(start + len > pa->pa_pstart + pa->pa_len); | ||
3335 | BUG_ON(pa->pa_free < len); | ||
3336 | pa->pa_free -= len; | ||
3337 | |||
3338 | mb_debug("use %llu/%lu from inode pa %p\n", start, len, pa); | ||
3339 | } | ||
3340 | |||
3341 | /* | ||
3342 | * use blocks preallocated to locality group | ||
3343 | */ | ||
3344 | static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, | ||
3345 | struct ext4_prealloc_space *pa) | ||
3346 | { | ||
3347 | unsigned len = ac->ac_o_ex.fe_len; | ||
3348 | |||
3349 | ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, | ||
3350 | &ac->ac_b_ex.fe_group, | ||
3351 | &ac->ac_b_ex.fe_start); | ||
3352 | ac->ac_b_ex.fe_len = len; | ||
3353 | ac->ac_status = AC_STATUS_FOUND; | ||
3354 | ac->ac_pa = pa; | ||
3355 | |||
3356 | /* we don't correct pa_pstart or pa_plen here to avoid | ||
3357 | * possible race when tte group is being loaded concurrently | ||
3358 | * instead we correct pa later, after blocks are marked | ||
3359 | * in on-disk bitmap -- see ext4_mb_release_context() */ | ||
3360 | /* | ||
3361 | * FIXME!! but the other CPUs can look at this particular | ||
3362 | * pa and think that it have enought free blocks if we | ||
3363 | * don't update pa_free here right ? | ||
3364 | */ | ||
3365 | mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); | ||
3366 | } | ||
3367 | |||
3368 | /* | ||
3369 | * search goal blocks in preallocated space | ||
3370 | */ | ||
3371 | static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) | ||
3372 | { | ||
3373 | struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); | ||
3374 | struct ext4_locality_group *lg; | ||
3375 | struct ext4_prealloc_space *pa; | ||
3376 | struct list_head *cur; | ||
3377 | |||
3378 | /* only data can be preallocated */ | ||
3379 | if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) | ||
3380 | return 0; | ||
3381 | |||
3382 | /* first, try per-file preallocation */ | ||
3383 | rcu_read_lock(); | ||
3384 | list_for_each_rcu(cur, &ei->i_prealloc_list) { | ||
3385 | pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); | ||
3386 | |||
3387 | /* all fields in this condition don't change, | ||
3388 | * so we can skip locking for them */ | ||
3389 | if (ac->ac_o_ex.fe_logical < pa->pa_lstart || | ||
3390 | ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) | ||
3391 | continue; | ||
3392 | |||
3393 | /* found preallocated blocks, use them */ | ||
3394 | spin_lock(&pa->pa_lock); | ||
3395 | if (pa->pa_deleted == 0 && pa->pa_free) { | ||
3396 | atomic_inc(&pa->pa_count); | ||
3397 | ext4_mb_use_inode_pa(ac, pa); | ||
3398 | spin_unlock(&pa->pa_lock); | ||
3399 | ac->ac_criteria = 10; | ||
3400 | rcu_read_unlock(); | ||
3401 | return 1; | ||
3402 | } | ||
3403 | spin_unlock(&pa->pa_lock); | ||
3404 | } | ||
3405 | rcu_read_unlock(); | ||
3406 | |||
3407 | /* can we use group allocation? */ | ||
3408 | if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) | ||
3409 | return 0; | ||
3410 | |||
3411 | /* inode may have no locality group for some reason */ | ||
3412 | lg = ac->ac_lg; | ||
3413 | if (lg == NULL) | ||
3414 | return 0; | ||
3415 | |||
3416 | rcu_read_lock(); | ||
3417 | list_for_each_rcu(cur, &lg->lg_prealloc_list) { | ||
3418 | pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); | ||
3419 | spin_lock(&pa->pa_lock); | ||
3420 | if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { | ||
3421 | atomic_inc(&pa->pa_count); | ||
3422 | ext4_mb_use_group_pa(ac, pa); | ||
3423 | spin_unlock(&pa->pa_lock); | ||
3424 | ac->ac_criteria = 20; | ||
3425 | rcu_read_unlock(); | ||
3426 | return 1; | ||
3427 | } | ||
3428 | spin_unlock(&pa->pa_lock); | ||
3429 | } | ||
3430 | rcu_read_unlock(); | ||
3431 | |||
3432 | return 0; | ||
3433 | } | ||
3434 | |||
3435 | /* | ||
3436 | * the function goes through all preallocation in this group and marks them | ||
3437 | * used in in-core bitmap. buddy must be generated from this bitmap | ||
3438 | * Need to be called with ext4 group lock (ext4_lock_group) | ||
3439 | */ | ||
3440 | static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | ||
3441 | ext4_group_t group) | ||
3442 | { | ||
3443 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); | ||
3444 | struct ext4_prealloc_space *pa; | ||
3445 | struct list_head *cur; | ||
3446 | ext4_group_t groupnr; | ||
3447 | ext4_grpblk_t start; | ||
3448 | int preallocated = 0; | ||
3449 | int count = 0; | ||
3450 | int len; | ||
3451 | |||
3452 | /* all form of preallocation discards first load group, | ||
3453 | * so the only competing code is preallocation use. | ||
3454 | * we don't need any locking here | ||
3455 | * notice we do NOT ignore preallocations with pa_deleted | ||
3456 | * otherwise we could leave used blocks available for | ||
3457 | * allocation in buddy when concurrent ext4_mb_put_pa() | ||
3458 | * is dropping preallocation | ||
3459 | */ | ||
3460 | list_for_each(cur, &grp->bb_prealloc_list) { | ||
3461 | pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); | ||
3462 | spin_lock(&pa->pa_lock); | ||
3463 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, | ||
3464 | &groupnr, &start); | ||
3465 | len = pa->pa_len; | ||
3466 | spin_unlock(&pa->pa_lock); | ||
3467 | if (unlikely(len == 0)) | ||
3468 | continue; | ||
3469 | BUG_ON(groupnr != group); | ||
3470 | mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), | ||
3471 | bitmap, start, len); | ||
3472 | preallocated += len; | ||
3473 | count++; | ||
3474 | } | ||
3475 | mb_debug("prellocated %u for group %lu\n", preallocated, group); | ||
3476 | } | ||
3477 | |||
3478 | static void ext4_mb_pa_callback(struct rcu_head *head) | ||
3479 | { | ||
3480 | struct ext4_prealloc_space *pa; | ||
3481 | pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); | ||
3482 | kmem_cache_free(ext4_pspace_cachep, pa); | ||
3483 | } | ||
3484 | |||
3485 | /* | ||
3486 | * drops a reference to preallocated space descriptor | ||
3487 | * if this was the last reference and the space is consumed | ||
3488 | */ | ||
3489 | static void ext4_mb_put_pa(struct ext4_allocation_context *ac, | ||
3490 | struct super_block *sb, struct ext4_prealloc_space *pa) | ||
3491 | { | ||
3492 | unsigned long grp; | ||
3493 | |||
3494 | if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) | ||
3495 | return; | ||
3496 | |||
3497 | /* in this short window concurrent discard can set pa_deleted */ | ||
3498 | spin_lock(&pa->pa_lock); | ||
3499 | if (pa->pa_deleted == 1) { | ||
3500 | spin_unlock(&pa->pa_lock); | ||
3501 | return; | ||
3502 | } | ||
3503 | |||
3504 | pa->pa_deleted = 1; | ||
3505 | spin_unlock(&pa->pa_lock); | ||
3506 | |||
3507 | /* -1 is to protect from crossing allocation group */ | ||
3508 | ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL); | ||
3509 | |||
3510 | /* | ||
3511 | * possible race: | ||
3512 | * | ||
3513 | * P1 (buddy init) P2 (regular allocation) | ||
3514 | * find block B in PA | ||
3515 | * copy on-disk bitmap to buddy | ||
3516 | * mark B in on-disk bitmap | ||
3517 | * drop PA from group | ||
3518 | * mark all PAs in buddy | ||
3519 | * | ||
3520 | * thus, P1 initializes buddy with B available. to prevent this | ||
3521 | * we make "copy" and "mark all PAs" atomic and serialize "drop PA" | ||
3522 | * against that pair | ||
3523 | */ | ||
3524 | ext4_lock_group(sb, grp); | ||
3525 | list_del(&pa->pa_group_list); | ||
3526 | ext4_unlock_group(sb, grp); | ||
3527 | |||
3528 | spin_lock(pa->pa_obj_lock); | ||
3529 | list_del_rcu(&pa->pa_inode_list); | ||
3530 | spin_unlock(pa->pa_obj_lock); | ||
3531 | |||
3532 | call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); | ||
3533 | } | ||
3534 | |||
3535 | /* | ||
3536 | * creates new preallocated space for given inode | ||
3537 | */ | ||
3538 | static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) | ||
3539 | { | ||
3540 | struct super_block *sb = ac->ac_sb; | ||
3541 | struct ext4_prealloc_space *pa; | ||
3542 | struct ext4_group_info *grp; | ||
3543 | struct ext4_inode_info *ei; | ||
3544 | |||
3545 | /* preallocate only when found space is larger then requested */ | ||
3546 | BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); | ||
3547 | BUG_ON(ac->ac_status != AC_STATUS_FOUND); | ||
3548 | BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); | ||
3549 | |||
3550 | pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); | ||
3551 | if (pa == NULL) | ||
3552 | return -ENOMEM; | ||
3553 | |||
3554 | if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { | ||
3555 | int winl; | ||
3556 | int wins; | ||
3557 | int win; | ||
3558 | int offs; | ||
3559 | |||
3560 | /* we can't allocate as much as normalizer wants. | ||
3561 | * so, found space must get proper lstart | ||
3562 | * to cover original request */ | ||
3563 | BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); | ||
3564 | BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); | ||
3565 | |||
3566 | /* we're limited by original request in that | ||
3567 | * logical block must be covered any way | ||
3568 | * winl is window we can move our chunk within */ | ||
3569 | winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; | ||
3570 | |||
3571 | /* also, we should cover whole original request */ | ||
3572 | wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; | ||
3573 | |||
3574 | /* the smallest one defines real window */ | ||
3575 | win = min(winl, wins); | ||
3576 | |||
3577 | offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; | ||
3578 | if (offs && offs < win) | ||
3579 | win = offs; | ||
3580 | |||
3581 | ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; | ||
3582 | BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); | ||
3583 | BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); | ||
3584 | } | ||
3585 | |||
3586 | /* preallocation can change ac_b_ex, thus we store actually | ||
3587 | * allocated blocks for history */ | ||
3588 | ac->ac_f_ex = ac->ac_b_ex; | ||
3589 | |||
3590 | pa->pa_lstart = ac->ac_b_ex.fe_logical; | ||
3591 | pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); | ||
3592 | pa->pa_len = ac->ac_b_ex.fe_len; | ||
3593 | pa->pa_free = pa->pa_len; | ||
3594 | atomic_set(&pa->pa_count, 1); | ||
3595 | spin_lock_init(&pa->pa_lock); | ||
3596 | pa->pa_deleted = 0; | ||
3597 | pa->pa_linear = 0; | ||
3598 | |||
3599 | mb_debug("new inode pa %p: %llu/%u for %u\n", pa, | ||
3600 | pa->pa_pstart, pa->pa_len, pa->pa_lstart); | ||
3601 | |||
3602 | ext4_mb_use_inode_pa(ac, pa); | ||
3603 | atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); | ||
3604 | |||
3605 | ei = EXT4_I(ac->ac_inode); | ||
3606 | grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); | ||
3607 | |||
3608 | pa->pa_obj_lock = &ei->i_prealloc_lock; | ||
3609 | pa->pa_inode = ac->ac_inode; | ||
3610 | |||
3611 | ext4_lock_group(sb, ac->ac_b_ex.fe_group); | ||
3612 | list_add(&pa->pa_group_list, &grp->bb_prealloc_list); | ||
3613 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); | ||
3614 | |||
3615 | spin_lock(pa->pa_obj_lock); | ||
3616 | list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); | ||
3617 | spin_unlock(pa->pa_obj_lock); | ||
3618 | |||
3619 | return 0; | ||
3620 | } | ||
3621 | |||
3622 | /* | ||
3623 | * creates new preallocated space for locality group inodes belongs to | ||
3624 | */ | ||
3625 | static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac) | ||
3626 | { | ||
3627 | struct super_block *sb = ac->ac_sb; | ||
3628 | struct ext4_locality_group *lg; | ||
3629 | struct ext4_prealloc_space *pa; | ||
3630 | struct ext4_group_info *grp; | ||
3631 | |||
3632 | /* preallocate only when found space is larger then requested */ | ||
3633 | BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); | ||
3634 | BUG_ON(ac->ac_status != AC_STATUS_FOUND); | ||
3635 | BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); | ||
3636 | |||
3637 | BUG_ON(ext4_pspace_cachep == NULL); | ||
3638 | pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); | ||
3639 | if (pa == NULL) | ||
3640 | return -ENOMEM; | ||
3641 | |||
3642 | /* preallocation can change ac_b_ex, thus we store actually | ||
3643 | * allocated blocks for history */ | ||
3644 | ac->ac_f_ex = ac->ac_b_ex; | ||
3645 | |||
3646 | pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); | ||
3647 | pa->pa_lstart = pa->pa_pstart; | ||
3648 | pa->pa_len = ac->ac_b_ex.fe_len; | ||
3649 | pa->pa_free = pa->pa_len; | ||
3650 | atomic_set(&pa->pa_count, 1); | ||
3651 | spin_lock_init(&pa->pa_lock); | ||
3652 | pa->pa_deleted = 0; | ||
3653 | pa->pa_linear = 1; | ||
3654 | |||
3655 | mb_debug("new group pa %p: %llu/%u for %u\n", pa, | ||
3656 | pa->pa_pstart, pa->pa_len, pa->pa_lstart); | ||
3657 | |||
3658 | ext4_mb_use_group_pa(ac, pa); | ||
3659 | atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); | ||
3660 | |||
3661 | grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); | ||
3662 | lg = ac->ac_lg; | ||
3663 | BUG_ON(lg == NULL); | ||
3664 | |||
3665 | pa->pa_obj_lock = &lg->lg_prealloc_lock; | ||
3666 | pa->pa_inode = NULL; | ||
3667 | |||
3668 | ext4_lock_group(sb, ac->ac_b_ex.fe_group); | ||
3669 | list_add(&pa->pa_group_list, &grp->bb_prealloc_list); | ||
3670 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); | ||
3671 | |||
3672 | spin_lock(pa->pa_obj_lock); | ||
3673 | list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list); | ||
3674 | spin_unlock(pa->pa_obj_lock); | ||
3675 | |||
3676 | return 0; | ||
3677 | } | ||
3678 | |||
3679 | static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) | ||
3680 | { | ||
3681 | int err; | ||
3682 | |||
3683 | if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) | ||
3684 | err = ext4_mb_new_group_pa(ac); | ||
3685 | else | ||
3686 | err = ext4_mb_new_inode_pa(ac); | ||
3687 | return err; | ||
3688 | } | ||
3689 | |||
3690 | /* | ||
3691 | * finds all unused blocks in on-disk bitmap, frees them in | ||
3692 | * in-core bitmap and buddy. | ||
3693 | * @pa must be unlinked from inode and group lists, so that | ||
3694 | * nobody else can find/use it. | ||
3695 | * the caller MUST hold group/inode locks. | ||
3696 | * TODO: optimize the case when there are no in-core structures yet | ||
3697 | */ | ||
3698 | static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, | ||
3699 | struct buffer_head *bitmap_bh, | ||
3700 | struct ext4_prealloc_space *pa) | ||
3701 | { | ||
3702 | struct ext4_allocation_context ac; | ||
3703 | struct super_block *sb = e4b->bd_sb; | ||
3704 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
3705 | unsigned long end; | ||
3706 | unsigned long next; | ||
3707 | ext4_group_t group; | ||
3708 | ext4_grpblk_t bit; | ||
3709 | sector_t start; | ||
3710 | int err = 0; | ||
3711 | int free = 0; | ||
3712 | |||
3713 | BUG_ON(pa->pa_deleted == 0); | ||
3714 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); | ||
3715 | BUG_ON(group != e4b->bd_group && pa->pa_len != 0); | ||
3716 | end = bit + pa->pa_len; | ||
3717 | |||
3718 | ac.ac_sb = sb; | ||
3719 | ac.ac_inode = pa->pa_inode; | ||
3720 | ac.ac_op = EXT4_MB_HISTORY_DISCARD; | ||
3721 | |||
3722 | while (bit < end) { | ||
3723 | bit = ext4_find_next_zero_bit(bitmap_bh->b_data, end, bit); | ||
3724 | if (bit >= end) | ||
3725 | break; | ||
3726 | next = ext4_find_next_bit(bitmap_bh->b_data, end, bit); | ||
3727 | if (next > end) | ||
3728 | next = end; | ||
3729 | start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + | ||
3730 | le32_to_cpu(sbi->s_es->s_first_data_block); | ||
3731 | mb_debug(" free preallocated %u/%u in group %u\n", | ||
3732 | (unsigned) start, (unsigned) next - bit, | ||
3733 | (unsigned) group); | ||
3734 | free += next - bit; | ||
3735 | |||
3736 | ac.ac_b_ex.fe_group = group; | ||
3737 | ac.ac_b_ex.fe_start = bit; | ||
3738 | ac.ac_b_ex.fe_len = next - bit; | ||
3739 | ac.ac_b_ex.fe_logical = 0; | ||
3740 | ext4_mb_store_history(&ac); | ||
3741 | |||
3742 | mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); | ||
3743 | bit = next + 1; | ||
3744 | } | ||
3745 | if (free != pa->pa_free) { | ||
3746 | printk(KERN_ERR "pa %p: logic %lu, phys. %lu, len %lu\n", | ||
3747 | pa, (unsigned long) pa->pa_lstart, | ||
3748 | (unsigned long) pa->pa_pstart, | ||
3749 | (unsigned long) pa->pa_len); | ||
3750 | printk(KERN_ERR "free %u, pa_free %u\n", free, pa->pa_free); | ||
3751 | } | ||
3752 | BUG_ON(free != pa->pa_free); | ||
3753 | atomic_add(free, &sbi->s_mb_discarded); | ||
3754 | |||
3755 | return err; | ||
3756 | } | ||
3757 | |||
3758 | static int ext4_mb_release_group_pa(struct ext4_buddy *e4b, | ||
3759 | struct ext4_prealloc_space *pa) | ||
3760 | { | ||
3761 | struct ext4_allocation_context ac; | ||
3762 | struct super_block *sb = e4b->bd_sb; | ||
3763 | ext4_group_t group; | ||
3764 | ext4_grpblk_t bit; | ||
3765 | |||
3766 | ac.ac_op = EXT4_MB_HISTORY_DISCARD; | ||
3767 | |||
3768 | BUG_ON(pa->pa_deleted == 0); | ||
3769 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); | ||
3770 | BUG_ON(group != e4b->bd_group && pa->pa_len != 0); | ||
3771 | mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); | ||
3772 | atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); | ||
3773 | |||
3774 | ac.ac_sb = sb; | ||
3775 | ac.ac_inode = NULL; | ||
3776 | ac.ac_b_ex.fe_group = group; | ||
3777 | ac.ac_b_ex.fe_start = bit; | ||
3778 | ac.ac_b_ex.fe_len = pa->pa_len; | ||
3779 | ac.ac_b_ex.fe_logical = 0; | ||
3780 | ext4_mb_store_history(&ac); | ||
3781 | |||
3782 | return 0; | ||
3783 | } | ||
3784 | |||
3785 | /* | ||
3786 | * releases all preallocations in given group | ||
3787 | * | ||
3788 | * first, we need to decide discard policy: | ||
3789 | * - when do we discard | ||
3790 | * 1) ENOSPC | ||
3791 | * - how many do we discard | ||
3792 | * 1) how many requested | ||
3793 | */ | ||
3794 | static int ext4_mb_discard_group_preallocations(struct super_block *sb, | ||
3795 | ext4_group_t group, int needed) | ||
3796 | { | ||
3797 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); | ||
3798 | struct buffer_head *bitmap_bh = NULL; | ||
3799 | struct ext4_prealloc_space *pa, *tmp; | ||
3800 | struct list_head list; | ||
3801 | struct ext4_buddy e4b; | ||
3802 | int err; | ||
3803 | int busy = 0; | ||
3804 | int free = 0; | ||
3805 | |||
3806 | mb_debug("discard preallocation for group %lu\n", group); | ||
3807 | |||
3808 | if (list_empty(&grp->bb_prealloc_list)) | ||
3809 | return 0; | ||
3810 | |||
3811 | bitmap_bh = read_block_bitmap(sb, group); | ||
3812 | if (bitmap_bh == NULL) { | ||
3813 | /* error handling here */ | ||
3814 | ext4_mb_release_desc(&e4b); | ||
3815 | BUG_ON(bitmap_bh == NULL); | ||
3816 | } | ||
3817 | |||
3818 | err = ext4_mb_load_buddy(sb, group, &e4b); | ||
3819 | BUG_ON(err != 0); /* error handling here */ | ||
3820 | |||
3821 | if (needed == 0) | ||
3822 | needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; | ||
3823 | |||
3824 | grp = ext4_get_group_info(sb, group); | ||
3825 | INIT_LIST_HEAD(&list); | ||
3826 | |||
3827 | repeat: | ||
3828 | ext4_lock_group(sb, group); | ||
3829 | list_for_each_entry_safe(pa, tmp, | ||
3830 | &grp->bb_prealloc_list, pa_group_list) { | ||
3831 | spin_lock(&pa->pa_lock); | ||
3832 | if (atomic_read(&pa->pa_count)) { | ||
3833 | spin_unlock(&pa->pa_lock); | ||
3834 | busy = 1; | ||
3835 | continue; | ||
3836 | } | ||
3837 | if (pa->pa_deleted) { | ||
3838 | spin_unlock(&pa->pa_lock); | ||
3839 | continue; | ||
3840 | } | ||
3841 | |||
3842 | /* seems this one can be freed ... */ | ||
3843 | pa->pa_deleted = 1; | ||
3844 | |||
3845 | /* we can trust pa_free ... */ | ||
3846 | free += pa->pa_free; | ||
3847 | |||
3848 | spin_unlock(&pa->pa_lock); | ||
3849 | |||
3850 | list_del(&pa->pa_group_list); | ||
3851 | list_add(&pa->u.pa_tmp_list, &list); | ||
3852 | } | ||
3853 | |||
3854 | /* if we still need more blocks and some PAs were used, try again */ | ||
3855 | if (free < needed && busy) { | ||
3856 | busy = 0; | ||
3857 | ext4_unlock_group(sb, group); | ||
3858 | /* | ||
3859 | * Yield the CPU here so that we don't get soft lockup | ||
3860 | * in non preempt case. | ||
3861 | */ | ||
3862 | yield(); | ||
3863 | goto repeat; | ||
3864 | } | ||
3865 | |||
3866 | /* found anything to free? */ | ||
3867 | if (list_empty(&list)) { | ||
3868 | BUG_ON(free != 0); | ||
3869 | goto out; | ||
3870 | } | ||
3871 | |||
3872 | /* now free all selected PAs */ | ||
3873 | list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { | ||
3874 | |||
3875 | /* remove from object (inode or locality group) */ | ||
3876 | spin_lock(pa->pa_obj_lock); | ||
3877 | list_del_rcu(&pa->pa_inode_list); | ||
3878 | spin_unlock(pa->pa_obj_lock); | ||
3879 | |||
3880 | if (pa->pa_linear) | ||
3881 | ext4_mb_release_group_pa(&e4b, pa); | ||
3882 | else | ||
3883 | ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); | ||
3884 | |||
3885 | list_del(&pa->u.pa_tmp_list); | ||
3886 | call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); | ||
3887 | } | ||
3888 | |||
3889 | out: | ||
3890 | ext4_unlock_group(sb, group); | ||
3891 | ext4_mb_release_desc(&e4b); | ||
3892 | put_bh(bitmap_bh); | ||
3893 | return free; | ||
3894 | } | ||
3895 | |||
3896 | /* | ||
3897 | * releases all non-used preallocated blocks for given inode | ||
3898 | * | ||
3899 | * It's important to discard preallocations under i_data_sem | ||
3900 | * We don't want another block to be served from the prealloc | ||
3901 | * space when we are discarding the inode prealloc space. | ||
3902 | * | ||
3903 | * FIXME!! Make sure it is valid at all the call sites | ||
3904 | */ | ||
3905 | void ext4_mb_discard_inode_preallocations(struct inode *inode) | ||
3906 | { | ||
3907 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
3908 | struct super_block *sb = inode->i_sb; | ||
3909 | struct buffer_head *bitmap_bh = NULL; | ||
3910 | struct ext4_prealloc_space *pa, *tmp; | ||
3911 | ext4_group_t group = 0; | ||
3912 | struct list_head list; | ||
3913 | struct ext4_buddy e4b; | ||
3914 | int err; | ||
3915 | |||
3916 | if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) { | ||
3917 | /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ | ||
3918 | return; | ||
3919 | } | ||
3920 | |||
3921 | mb_debug("discard preallocation for inode %lu\n", inode->i_ino); | ||
3922 | |||
3923 | INIT_LIST_HEAD(&list); | ||
3924 | |||
3925 | repeat: | ||
3926 | /* first, collect all pa's in the inode */ | ||
3927 | spin_lock(&ei->i_prealloc_lock); | ||
3928 | while (!list_empty(&ei->i_prealloc_list)) { | ||
3929 | pa = list_entry(ei->i_prealloc_list.next, | ||
3930 | struct ext4_prealloc_space, pa_inode_list); | ||
3931 | BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); | ||
3932 | spin_lock(&pa->pa_lock); | ||
3933 | if (atomic_read(&pa->pa_count)) { | ||
3934 | /* this shouldn't happen often - nobody should | ||
3935 | * use preallocation while we're discarding it */ | ||
3936 | spin_unlock(&pa->pa_lock); | ||
3937 | spin_unlock(&ei->i_prealloc_lock); | ||
3938 | printk(KERN_ERR "uh-oh! used pa while discarding\n"); | ||
3939 | WARN_ON(1); | ||
3940 | schedule_timeout_uninterruptible(HZ); | ||
3941 | goto repeat; | ||
3942 | |||
3943 | } | ||
3944 | if (pa->pa_deleted == 0) { | ||
3945 | pa->pa_deleted = 1; | ||
3946 | spin_unlock(&pa->pa_lock); | ||
3947 | list_del_rcu(&pa->pa_inode_list); | ||
3948 | list_add(&pa->u.pa_tmp_list, &list); | ||
3949 | continue; | ||
3950 | } | ||
3951 | |||
3952 | /* someone is deleting pa right now */ | ||
3953 | spin_unlock(&pa->pa_lock); | ||
3954 | spin_unlock(&ei->i_prealloc_lock); | ||
3955 | |||
3956 | /* we have to wait here because pa_deleted | ||
3957 | * doesn't mean pa is already unlinked from | ||
3958 | * the list. as we might be called from | ||
3959 | * ->clear_inode() the inode will get freed | ||
3960 | * and concurrent thread which is unlinking | ||
3961 | * pa from inode's list may access already | ||
3962 | * freed memory, bad-bad-bad */ | ||
3963 | |||
3964 | /* XXX: if this happens too often, we can | ||
3965 | * add a flag to force wait only in case | ||
3966 | * of ->clear_inode(), but not in case of | ||
3967 | * regular truncate */ | ||
3968 | schedule_timeout_uninterruptible(HZ); | ||
3969 | goto repeat; | ||
3970 | } | ||
3971 | spin_unlock(&ei->i_prealloc_lock); | ||
3972 | |||
3973 | list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { | ||
3974 | BUG_ON(pa->pa_linear != 0); | ||
3975 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); | ||
3976 | |||
3977 | err = ext4_mb_load_buddy(sb, group, &e4b); | ||
3978 | BUG_ON(err != 0); /* error handling here */ | ||
3979 | |||
3980 | bitmap_bh = read_block_bitmap(sb, group); | ||
3981 | if (bitmap_bh == NULL) { | ||
3982 | /* error handling here */ | ||
3983 | ext4_mb_release_desc(&e4b); | ||
3984 | BUG_ON(bitmap_bh == NULL); | ||
3985 | } | ||
3986 | |||
3987 | ext4_lock_group(sb, group); | ||
3988 | list_del(&pa->pa_group_list); | ||
3989 | ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); | ||
3990 | ext4_unlock_group(sb, group); | ||
3991 | |||
3992 | ext4_mb_release_desc(&e4b); | ||
3993 | put_bh(bitmap_bh); | ||
3994 | |||
3995 | list_del(&pa->u.pa_tmp_list); | ||
3996 | call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); | ||
3997 | } | ||
3998 | } | ||
3999 | |||
4000 | /* | ||
4001 | * finds all preallocated spaces and return blocks being freed to them | ||
4002 | * if preallocated space becomes full (no block is used from the space) | ||
4003 | * then the function frees space in buddy | ||
4004 | * XXX: at the moment, truncate (which is the only way to free blocks) | ||
4005 | * discards all preallocations | ||
4006 | */ | ||
4007 | static void ext4_mb_return_to_preallocation(struct inode *inode, | ||
4008 | struct ext4_buddy *e4b, | ||
4009 | sector_t block, int count) | ||
4010 | { | ||
4011 | BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); | ||
4012 | } | ||
4013 | #ifdef MB_DEBUG | ||
4014 | static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | ||
4015 | { | ||
4016 | struct super_block *sb = ac->ac_sb; | ||
4017 | ext4_group_t i; | ||
4018 | |||
4019 | printk(KERN_ERR "EXT4-fs: Can't allocate:" | ||
4020 | " Allocation context details:\n"); | ||
4021 | printk(KERN_ERR "EXT4-fs: status %d flags %d\n", | ||
4022 | ac->ac_status, ac->ac_flags); | ||
4023 | printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " | ||
4024 | "best %lu/%lu/%lu@%lu cr %d\n", | ||
4025 | (unsigned long)ac->ac_o_ex.fe_group, | ||
4026 | (unsigned long)ac->ac_o_ex.fe_start, | ||
4027 | (unsigned long)ac->ac_o_ex.fe_len, | ||
4028 | (unsigned long)ac->ac_o_ex.fe_logical, | ||
4029 | (unsigned long)ac->ac_g_ex.fe_group, | ||
4030 | (unsigned long)ac->ac_g_ex.fe_start, | ||
4031 | (unsigned long)ac->ac_g_ex.fe_len, | ||
4032 | (unsigned long)ac->ac_g_ex.fe_logical, | ||
4033 | (unsigned long)ac->ac_b_ex.fe_group, | ||
4034 | (unsigned long)ac->ac_b_ex.fe_start, | ||
4035 | (unsigned long)ac->ac_b_ex.fe_len, | ||
4036 | (unsigned long)ac->ac_b_ex.fe_logical, | ||
4037 | (int)ac->ac_criteria); | ||
4038 | printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, | ||
4039 | ac->ac_found); | ||
4040 | printk(KERN_ERR "EXT4-fs: groups: \n"); | ||
4041 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { | ||
4042 | struct ext4_group_info *grp = ext4_get_group_info(sb, i); | ||
4043 | struct ext4_prealloc_space *pa; | ||
4044 | ext4_grpblk_t start; | ||
4045 | struct list_head *cur; | ||
4046 | ext4_lock_group(sb, i); | ||
4047 | list_for_each(cur, &grp->bb_prealloc_list) { | ||
4048 | pa = list_entry(cur, struct ext4_prealloc_space, | ||
4049 | pa_group_list); | ||
4050 | spin_lock(&pa->pa_lock); | ||
4051 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, | ||
4052 | NULL, &start); | ||
4053 | spin_unlock(&pa->pa_lock); | ||
4054 | printk(KERN_ERR "PA:%lu:%d:%u \n", i, | ||
4055 | start, pa->pa_len); | ||
4056 | } | ||
4057 | ext4_lock_group(sb, i); | ||
4058 | |||
4059 | if (grp->bb_free == 0) | ||
4060 | continue; | ||
4061 | printk(KERN_ERR "%lu: %d/%d \n", | ||
4062 | i, grp->bb_free, grp->bb_fragments); | ||
4063 | } | ||
4064 | printk(KERN_ERR "\n"); | ||
4065 | } | ||
4066 | #else | ||
4067 | static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) | ||
4068 | { | ||
4069 | return; | ||
4070 | } | ||
4071 | #endif | ||
4072 | |||
4073 | /* | ||
4074 | * We use locality group preallocation for small size file. The size of the | ||
4075 | * file is determined by the current size or the resulting size after | ||
4076 | * allocation which ever is larger | ||
4077 | * | ||
4078 | * One can tune this size via /proc/fs/ext4/<partition>/stream_req | ||
4079 | */ | ||
4080 | static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) | ||
4081 | { | ||
4082 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); | ||
4083 | int bsbits = ac->ac_sb->s_blocksize_bits; | ||
4084 | loff_t size, isize; | ||
4085 | |||
4086 | if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) | ||
4087 | return; | ||
4088 | |||
4089 | size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; | ||
4090 | isize = i_size_read(ac->ac_inode) >> bsbits; | ||
4091 | size = max(size, isize); | ||
4092 | |||
4093 | /* don't use group allocation for large files */ | ||
4094 | if (size >= sbi->s_mb_stream_request) | ||
4095 | return; | ||
4096 | |||
4097 | if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) | ||
4098 | return; | ||
4099 | |||
4100 | BUG_ON(ac->ac_lg != NULL); | ||
4101 | /* | ||
4102 | * locality group prealloc space are per cpu. The reason for having | ||
4103 | * per cpu locality group is to reduce the contention between block | ||
4104 | * request from multiple CPUs. | ||
4105 | */ | ||
4106 | ac->ac_lg = &sbi->s_locality_groups[get_cpu()]; | ||
4107 | put_cpu(); | ||
4108 | |||
4109 | /* we're going to use group allocation */ | ||
4110 | ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; | ||
4111 | |||
4112 | /* serialize all allocations in the group */ | ||
4113 | mutex_lock(&ac->ac_lg->lg_mutex); | ||
4114 | } | ||
4115 | |||
4116 | static int ext4_mb_initialize_context(struct ext4_allocation_context *ac, | ||
4117 | struct ext4_allocation_request *ar) | ||
4118 | { | ||
4119 | struct super_block *sb = ar->inode->i_sb; | ||
4120 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
4121 | struct ext4_super_block *es = sbi->s_es; | ||
4122 | ext4_group_t group; | ||
4123 | unsigned long len; | ||
4124 | unsigned long goal; | ||
4125 | ext4_grpblk_t block; | ||
4126 | |||
4127 | /* we can't allocate > group size */ | ||
4128 | len = ar->len; | ||
4129 | |||
4130 | /* just a dirty hack to filter too big requests */ | ||
4131 | if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) | ||
4132 | len = EXT4_BLOCKS_PER_GROUP(sb) - 10; | ||
4133 | |||
4134 | /* start searching from the goal */ | ||
4135 | goal = ar->goal; | ||
4136 | if (goal < le32_to_cpu(es->s_first_data_block) || | ||
4137 | goal >= ext4_blocks_count(es)) | ||
4138 | goal = le32_to_cpu(es->s_first_data_block); | ||
4139 | ext4_get_group_no_and_offset(sb, goal, &group, &block); | ||
4140 | |||
4141 | /* set up allocation goals */ | ||
4142 | ac->ac_b_ex.fe_logical = ar->logical; | ||
4143 | ac->ac_b_ex.fe_group = 0; | ||
4144 | ac->ac_b_ex.fe_start = 0; | ||
4145 | ac->ac_b_ex.fe_len = 0; | ||
4146 | ac->ac_status = AC_STATUS_CONTINUE; | ||
4147 | ac->ac_groups_scanned = 0; | ||
4148 | ac->ac_ex_scanned = 0; | ||
4149 | ac->ac_found = 0; | ||
4150 | ac->ac_sb = sb; | ||
4151 | ac->ac_inode = ar->inode; | ||
4152 | ac->ac_o_ex.fe_logical = ar->logical; | ||
4153 | ac->ac_o_ex.fe_group = group; | ||
4154 | ac->ac_o_ex.fe_start = block; | ||
4155 | ac->ac_o_ex.fe_len = len; | ||
4156 | ac->ac_g_ex.fe_logical = ar->logical; | ||
4157 | ac->ac_g_ex.fe_group = group; | ||
4158 | ac->ac_g_ex.fe_start = block; | ||
4159 | ac->ac_g_ex.fe_len = len; | ||
4160 | ac->ac_f_ex.fe_len = 0; | ||
4161 | ac->ac_flags = ar->flags; | ||
4162 | ac->ac_2order = 0; | ||
4163 | ac->ac_criteria = 0; | ||
4164 | ac->ac_pa = NULL; | ||
4165 | ac->ac_bitmap_page = NULL; | ||
4166 | ac->ac_buddy_page = NULL; | ||
4167 | ac->ac_lg = NULL; | ||
4168 | |||
4169 | /* we have to define context: we'll we work with a file or | ||
4170 | * locality group. this is a policy, actually */ | ||
4171 | ext4_mb_group_or_file(ac); | ||
4172 | |||
4173 | mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " | ||
4174 | "left: %u/%u, right %u/%u to %swritable\n", | ||
4175 | (unsigned) ar->len, (unsigned) ar->logical, | ||
4176 | (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, | ||
4177 | (unsigned) ar->lleft, (unsigned) ar->pleft, | ||
4178 | (unsigned) ar->lright, (unsigned) ar->pright, | ||
4179 | atomic_read(&ar->inode->i_writecount) ? "" : "non-"); | ||
4180 | return 0; | ||
4181 | |||
4182 | } | ||
4183 | |||
4184 | /* | ||
4185 | * release all resource we used in allocation | ||
4186 | */ | ||
4187 | static int ext4_mb_release_context(struct ext4_allocation_context *ac) | ||
4188 | { | ||
4189 | if (ac->ac_pa) { | ||
4190 | if (ac->ac_pa->pa_linear) { | ||
4191 | /* see comment in ext4_mb_use_group_pa() */ | ||
4192 | spin_lock(&ac->ac_pa->pa_lock); | ||
4193 | ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len; | ||
4194 | ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len; | ||
4195 | ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len; | ||
4196 | ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len; | ||
4197 | spin_unlock(&ac->ac_pa->pa_lock); | ||
4198 | } | ||
4199 | ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa); | ||
4200 | } | ||
4201 | if (ac->ac_bitmap_page) | ||
4202 | page_cache_release(ac->ac_bitmap_page); | ||
4203 | if (ac->ac_buddy_page) | ||
4204 | page_cache_release(ac->ac_buddy_page); | ||
4205 | if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) | ||
4206 | mutex_unlock(&ac->ac_lg->lg_mutex); | ||
4207 | ext4_mb_collect_stats(ac); | ||
4208 | return 0; | ||
4209 | } | ||
4210 | |||
4211 | static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) | ||
4212 | { | ||
4213 | ext4_group_t i; | ||
4214 | int ret; | ||
4215 | int freed = 0; | ||
4216 | |||
4217 | for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { | ||
4218 | ret = ext4_mb_discard_group_preallocations(sb, i, needed); | ||
4219 | freed += ret; | ||
4220 | needed -= ret; | ||
4221 | } | ||
4222 | |||
4223 | return freed; | ||
4224 | } | ||
4225 | |||
4226 | /* | ||
4227 | * Main entry point into mballoc to allocate blocks | ||
4228 | * it tries to use preallocation first, then falls back | ||
4229 | * to usual allocation | ||
4230 | */ | ||
4231 | ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | ||
4232 | struct ext4_allocation_request *ar, int *errp) | ||
4233 | { | ||
4234 | struct ext4_allocation_context ac; | ||
4235 | struct ext4_sb_info *sbi; | ||
4236 | struct super_block *sb; | ||
4237 | ext4_fsblk_t block = 0; | ||
4238 | int freed; | ||
4239 | int inquota; | ||
4240 | |||
4241 | sb = ar->inode->i_sb; | ||
4242 | sbi = EXT4_SB(sb); | ||
4243 | |||
4244 | if (!test_opt(sb, MBALLOC)) { | ||
4245 | block = ext4_new_blocks_old(handle, ar->inode, ar->goal, | ||
4246 | &(ar->len), errp); | ||
4247 | return block; | ||
4248 | } | ||
4249 | |||
4250 | while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { | ||
4251 | ar->flags |= EXT4_MB_HINT_NOPREALLOC; | ||
4252 | ar->len--; | ||
4253 | } | ||
4254 | if (ar->len == 0) { | ||
4255 | *errp = -EDQUOT; | ||
4256 | return 0; | ||
4257 | } | ||
4258 | inquota = ar->len; | ||
4259 | |||
4260 | ext4_mb_poll_new_transaction(sb, handle); | ||
4261 | |||
4262 | *errp = ext4_mb_initialize_context(&ac, ar); | ||
4263 | if (*errp) { | ||
4264 | ar->len = 0; | ||
4265 | goto out; | ||
4266 | } | ||
4267 | |||
4268 | ac.ac_op = EXT4_MB_HISTORY_PREALLOC; | ||
4269 | if (!ext4_mb_use_preallocated(&ac)) { | ||
4270 | |||
4271 | ac.ac_op = EXT4_MB_HISTORY_ALLOC; | ||
4272 | ext4_mb_normalize_request(&ac, ar); | ||
4273 | |||
4274 | repeat: | ||
4275 | /* allocate space in core */ | ||
4276 | ext4_mb_regular_allocator(&ac); | ||
4277 | |||
4278 | /* as we've just preallocated more space than | ||
4279 | * user requested orinally, we store allocated | ||
4280 | * space in a special descriptor */ | ||
4281 | if (ac.ac_status == AC_STATUS_FOUND && | ||
4282 | ac.ac_o_ex.fe_len < ac.ac_b_ex.fe_len) | ||
4283 | ext4_mb_new_preallocation(&ac); | ||
4284 | } | ||
4285 | |||
4286 | if (likely(ac.ac_status == AC_STATUS_FOUND)) { | ||
4287 | ext4_mb_mark_diskspace_used(&ac, handle); | ||
4288 | *errp = 0; | ||
4289 | block = ext4_grp_offs_to_block(sb, &ac.ac_b_ex); | ||
4290 | ar->len = ac.ac_b_ex.fe_len; | ||
4291 | } else { | ||
4292 | freed = ext4_mb_discard_preallocations(sb, ac.ac_o_ex.fe_len); | ||
4293 | if (freed) | ||
4294 | goto repeat; | ||
4295 | *errp = -ENOSPC; | ||
4296 | ac.ac_b_ex.fe_len = 0; | ||
4297 | ar->len = 0; | ||
4298 | ext4_mb_show_ac(&ac); | ||
4299 | } | ||
4300 | |||
4301 | ext4_mb_release_context(&ac); | ||
4302 | |||
4303 | out: | ||
4304 | if (ar->len < inquota) | ||
4305 | DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); | ||
4306 | |||
4307 | return block; | ||
4308 | } | ||
4309 | static void ext4_mb_poll_new_transaction(struct super_block *sb, | ||
4310 | handle_t *handle) | ||
4311 | { | ||
4312 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
4313 | |||
4314 | if (sbi->s_last_transaction == handle->h_transaction->t_tid) | ||
4315 | return; | ||
4316 | |||
4317 | /* new transaction! time to close last one and free blocks for | ||
4318 | * committed transaction. we know that only transaction can be | ||
4319 | * active, so previos transaction can be being logged and we | ||
4320 | * know that transaction before previous is known to be already | ||
4321 | * logged. this means that now we may free blocks freed in all | ||
4322 | * transactions before previous one. hope I'm clear enough ... */ | ||
4323 | |||
4324 | spin_lock(&sbi->s_md_lock); | ||
4325 | if (sbi->s_last_transaction != handle->h_transaction->t_tid) { | ||
4326 | mb_debug("new transaction %lu, old %lu\n", | ||
4327 | (unsigned long) handle->h_transaction->t_tid, | ||
4328 | (unsigned long) sbi->s_last_transaction); | ||
4329 | list_splice_init(&sbi->s_closed_transaction, | ||
4330 | &sbi->s_committed_transaction); | ||
4331 | list_splice_init(&sbi->s_active_transaction, | ||
4332 | &sbi->s_closed_transaction); | ||
4333 | sbi->s_last_transaction = handle->h_transaction->t_tid; | ||
4334 | } | ||
4335 | spin_unlock(&sbi->s_md_lock); | ||
4336 | |||
4337 | ext4_mb_free_committed_blocks(sb); | ||
4338 | } | ||
4339 | |||
4340 | static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | ||
4341 | ext4_group_t group, ext4_grpblk_t block, int count) | ||
4342 | { | ||
4343 | struct ext4_group_info *db = e4b->bd_info; | ||
4344 | struct super_block *sb = e4b->bd_sb; | ||
4345 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
4346 | struct ext4_free_metadata *md; | ||
4347 | int i; | ||
4348 | |||
4349 | BUG_ON(e4b->bd_bitmap_page == NULL); | ||
4350 | BUG_ON(e4b->bd_buddy_page == NULL); | ||
4351 | |||
4352 | ext4_lock_group(sb, group); | ||
4353 | for (i = 0; i < count; i++) { | ||
4354 | md = db->bb_md_cur; | ||
4355 | if (md && db->bb_tid != handle->h_transaction->t_tid) { | ||
4356 | db->bb_md_cur = NULL; | ||
4357 | md = NULL; | ||
4358 | } | ||
4359 | |||
4360 | if (md == NULL) { | ||
4361 | ext4_unlock_group(sb, group); | ||
4362 | md = kmalloc(sizeof(*md), GFP_NOFS); | ||
4363 | if (md == NULL) | ||
4364 | return -ENOMEM; | ||
4365 | md->num = 0; | ||
4366 | md->group = group; | ||
4367 | |||
4368 | ext4_lock_group(sb, group); | ||
4369 | if (db->bb_md_cur == NULL) { | ||
4370 | spin_lock(&sbi->s_md_lock); | ||
4371 | list_add(&md->list, &sbi->s_active_transaction); | ||
4372 | spin_unlock(&sbi->s_md_lock); | ||
4373 | /* protect buddy cache from being freed, | ||
4374 | * otherwise we'll refresh it from | ||
4375 | * on-disk bitmap and lose not-yet-available | ||
4376 | * blocks */ | ||
4377 | page_cache_get(e4b->bd_buddy_page); | ||
4378 | page_cache_get(e4b->bd_bitmap_page); | ||
4379 | db->bb_md_cur = md; | ||
4380 | db->bb_tid = handle->h_transaction->t_tid; | ||
4381 | mb_debug("new md 0x%p for group %lu\n", | ||
4382 | md, md->group); | ||
4383 | } else { | ||
4384 | kfree(md); | ||
4385 | md = db->bb_md_cur; | ||
4386 | } | ||
4387 | } | ||
4388 | |||
4389 | BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); | ||
4390 | md->blocks[md->num] = block + i; | ||
4391 | md->num++; | ||
4392 | if (md->num == EXT4_BB_MAX_BLOCKS) { | ||
4393 | /* no more space, put full container on a sb's list */ | ||
4394 | db->bb_md_cur = NULL; | ||
4395 | } | ||
4396 | } | ||
4397 | ext4_unlock_group(sb, group); | ||
4398 | return 0; | ||
4399 | } | ||
4400 | |||
4401 | /* | ||
4402 | * Main entry point into mballoc to free blocks | ||
4403 | */ | ||
4404 | void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, | ||
4405 | unsigned long block, unsigned long count, | ||
4406 | int metadata, unsigned long *freed) | ||
4407 | { | ||
4408 | struct buffer_head *bitmap_bh = 0; | ||
4409 | struct super_block *sb = inode->i_sb; | ||
4410 | struct ext4_allocation_context ac; | ||
4411 | struct ext4_group_desc *gdp; | ||
4412 | struct ext4_super_block *es; | ||
4413 | unsigned long overflow; | ||
4414 | ext4_grpblk_t bit; | ||
4415 | struct buffer_head *gd_bh; | ||
4416 | ext4_group_t block_group; | ||
4417 | struct ext4_sb_info *sbi; | ||
4418 | struct ext4_buddy e4b; | ||
4419 | int err = 0; | ||
4420 | int ret; | ||
4421 | |||
4422 | *freed = 0; | ||
4423 | |||
4424 | ext4_mb_poll_new_transaction(sb, handle); | ||
4425 | |||
4426 | sbi = EXT4_SB(sb); | ||
4427 | es = EXT4_SB(sb)->s_es; | ||
4428 | if (block < le32_to_cpu(es->s_first_data_block) || | ||
4429 | block + count < block || | ||
4430 | block + count > ext4_blocks_count(es)) { | ||
4431 | ext4_error(sb, __FUNCTION__, | ||
4432 | "Freeing blocks not in datazone - " | ||
4433 | "block = %lu, count = %lu", block, count); | ||
4434 | goto error_return; | ||
4435 | } | ||
4436 | |||
4437 | ext4_debug("freeing block %lu\n", block); | ||
4438 | |||
4439 | ac.ac_op = EXT4_MB_HISTORY_FREE; | ||
4440 | ac.ac_inode = inode; | ||
4441 | ac.ac_sb = sb; | ||
4442 | |||
4443 | do_more: | ||
4444 | overflow = 0; | ||
4445 | ext4_get_group_no_and_offset(sb, block, &block_group, &bit); | ||
4446 | |||
4447 | /* | ||
4448 | * Check to see if we are freeing blocks across a group | ||
4449 | * boundary. | ||
4450 | */ | ||
4451 | if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { | ||
4452 | overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); | ||
4453 | count -= overflow; | ||
4454 | } | ||
4455 | bitmap_bh = read_block_bitmap(sb, block_group); | ||
4456 | if (!bitmap_bh) | ||
4457 | goto error_return; | ||
4458 | gdp = ext4_get_group_desc(sb, block_group, &gd_bh); | ||
4459 | if (!gdp) | ||
4460 | goto error_return; | ||
4461 | |||
4462 | if (in_range(ext4_block_bitmap(sb, gdp), block, count) || | ||
4463 | in_range(ext4_inode_bitmap(sb, gdp), block, count) || | ||
4464 | in_range(block, ext4_inode_table(sb, gdp), | ||
4465 | EXT4_SB(sb)->s_itb_per_group) || | ||
4466 | in_range(block + count - 1, ext4_inode_table(sb, gdp), | ||
4467 | EXT4_SB(sb)->s_itb_per_group)) { | ||
4468 | |||
4469 | ext4_error(sb, __FUNCTION__, | ||
4470 | "Freeing blocks in system zone - " | ||
4471 | "Block = %lu, count = %lu", block, count); | ||
4472 | } | ||
4473 | |||
4474 | BUFFER_TRACE(bitmap_bh, "getting write access"); | ||
4475 | err = ext4_journal_get_write_access(handle, bitmap_bh); | ||
4476 | if (err) | ||
4477 | goto error_return; | ||
4478 | |||
4479 | /* | ||
4480 | * We are about to modify some metadata. Call the journal APIs | ||
4481 | * to unshare ->b_data if a currently-committing transaction is | ||
4482 | * using it | ||
4483 | */ | ||
4484 | BUFFER_TRACE(gd_bh, "get_write_access"); | ||
4485 | err = ext4_journal_get_write_access(handle, gd_bh); | ||
4486 | if (err) | ||
4487 | goto error_return; | ||
4488 | |||
4489 | err = ext4_mb_load_buddy(sb, block_group, &e4b); | ||
4490 | if (err) | ||
4491 | goto error_return; | ||
4492 | |||
4493 | #ifdef AGGRESSIVE_CHECK | ||
4494 | { | ||
4495 | int i; | ||
4496 | for (i = 0; i < count; i++) | ||
4497 | BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); | ||
4498 | } | ||
4499 | #endif | ||
4500 | mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, | ||
4501 | bit, count); | ||
4502 | |||
4503 | /* We dirtied the bitmap block */ | ||
4504 | BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); | ||
4505 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); | ||
4506 | |||
4507 | ac.ac_b_ex.fe_group = block_group; | ||
4508 | ac.ac_b_ex.fe_start = bit; | ||
4509 | ac.ac_b_ex.fe_len = count; | ||
4510 | ext4_mb_store_history(&ac); | ||
4511 | |||
4512 | if (metadata) { | ||
4513 | /* blocks being freed are metadata. these blocks shouldn't | ||
4514 | * be used until this transaction is committed */ | ||
4515 | ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); | ||
4516 | } else { | ||
4517 | ext4_lock_group(sb, block_group); | ||
4518 | err = mb_free_blocks(inode, &e4b, bit, count); | ||
4519 | ext4_mb_return_to_preallocation(inode, &e4b, block, count); | ||
4520 | ext4_unlock_group(sb, block_group); | ||
4521 | BUG_ON(err != 0); | ||
4522 | } | ||
4523 | |||
4524 | spin_lock(sb_bgl_lock(sbi, block_group)); | ||
4525 | gdp->bg_free_blocks_count = | ||
4526 | cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); | ||
4527 | gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); | ||
4528 | spin_unlock(sb_bgl_lock(sbi, block_group)); | ||
4529 | percpu_counter_add(&sbi->s_freeblocks_counter, count); | ||
4530 | |||
4531 | ext4_mb_release_desc(&e4b); | ||
4532 | |||
4533 | *freed += count; | ||
4534 | |||
4535 | /* And the group descriptor block */ | ||
4536 | BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); | ||
4537 | ret = ext4_journal_dirty_metadata(handle, gd_bh); | ||
4538 | if (!err) | ||
4539 | err = ret; | ||
4540 | |||
4541 | if (overflow && !err) { | ||
4542 | block += count; | ||
4543 | count = overflow; | ||
4544 | put_bh(bitmap_bh); | ||
4545 | goto do_more; | ||
4546 | } | ||
4547 | sb->s_dirt = 1; | ||
4548 | error_return: | ||
4549 | brelse(bitmap_bh); | ||
4550 | ext4_std_error(sb, err); | ||
4551 | return; | ||
4552 | } | ||
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c new file mode 100644 index 000000000000..3ebc2332f52e --- /dev/null +++ b/fs/ext4/migrate.c | |||
@@ -0,0 +1,560 @@ | |||
1 | /* | ||
2 | * Copyright IBM Corporation, 2007 | ||
3 | * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms of version 2.1 of the GNU Lesser General Public License | ||
7 | * as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, but | ||
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | #include <linux/module.h> | ||
16 | #include <linux/ext4_jbd2.h> | ||
17 | #include <linux/ext4_fs_extents.h> | ||
18 | |||
19 | /* | ||
20 | * The contiguous blocks details which can be | ||
21 | * represented by a single extent | ||
22 | */ | ||
23 | struct list_blocks_struct { | ||
24 | ext4_lblk_t first_block, last_block; | ||
25 | ext4_fsblk_t first_pblock, last_pblock; | ||
26 | }; | ||
27 | |||
28 | static int finish_range(handle_t *handle, struct inode *inode, | ||
29 | struct list_blocks_struct *lb) | ||
30 | |||
31 | { | ||
32 | int retval = 0, needed; | ||
33 | struct ext4_extent newext; | ||
34 | struct ext4_ext_path *path; | ||
35 | if (lb->first_pblock == 0) | ||
36 | return 0; | ||
37 | |||
38 | /* Add the extent to temp inode*/ | ||
39 | newext.ee_block = cpu_to_le32(lb->first_block); | ||
40 | newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); | ||
41 | ext4_ext_store_pblock(&newext, lb->first_pblock); | ||
42 | path = ext4_ext_find_extent(inode, lb->first_block, NULL); | ||
43 | |||
44 | if (IS_ERR(path)) { | ||
45 | retval = PTR_ERR(path); | ||
46 | goto err_out; | ||
47 | } | ||
48 | |||
49 | /* | ||
50 | * Calculate the credit needed to inserting this extent | ||
51 | * Since we are doing this in loop we may accumalate extra | ||
52 | * credit. But below we try to not accumalate too much | ||
53 | * of them by restarting the journal. | ||
54 | */ | ||
55 | needed = ext4_ext_calc_credits_for_insert(inode, path); | ||
56 | |||
57 | /* | ||
58 | * Make sure the credit we accumalated is not really high | ||
59 | */ | ||
60 | if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) { | ||
61 | retval = ext4_journal_restart(handle, needed); | ||
62 | if (retval) | ||
63 | goto err_out; | ||
64 | } | ||
65 | if (needed) { | ||
66 | retval = ext4_journal_extend(handle, needed); | ||
67 | if (retval != 0) { | ||
68 | /* | ||
69 | * IF not able to extend the journal restart the journal | ||
70 | */ | ||
71 | retval = ext4_journal_restart(handle, needed); | ||
72 | if (retval) | ||
73 | goto err_out; | ||
74 | } | ||
75 | } | ||
76 | retval = ext4_ext_insert_extent(handle, inode, path, &newext); | ||
77 | err_out: | ||
78 | lb->first_pblock = 0; | ||
79 | return retval; | ||
80 | } | ||
81 | |||
82 | static int update_extent_range(handle_t *handle, struct inode *inode, | ||
83 | ext4_fsblk_t pblock, ext4_lblk_t blk_num, | ||
84 | struct list_blocks_struct *lb) | ||
85 | { | ||
86 | int retval; | ||
87 | /* | ||
88 | * See if we can add on to the existing range (if it exists) | ||
89 | */ | ||
90 | if (lb->first_pblock && | ||
91 | (lb->last_pblock+1 == pblock) && | ||
92 | (lb->last_block+1 == blk_num)) { | ||
93 | lb->last_pblock = pblock; | ||
94 | lb->last_block = blk_num; | ||
95 | return 0; | ||
96 | } | ||
97 | /* | ||
98 | * Start a new range. | ||
99 | */ | ||
100 | retval = finish_range(handle, inode, lb); | ||
101 | lb->first_pblock = lb->last_pblock = pblock; | ||
102 | lb->first_block = lb->last_block = blk_num; | ||
103 | |||
104 | return retval; | ||
105 | } | ||
106 | |||
107 | static int update_ind_extent_range(handle_t *handle, struct inode *inode, | ||
108 | ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, | ||
109 | struct list_blocks_struct *lb) | ||
110 | { | ||
111 | struct buffer_head *bh; | ||
112 | __le32 *i_data; | ||
113 | int i, retval = 0; | ||
114 | ext4_lblk_t blk_count = *blk_nump; | ||
115 | unsigned long max_entries = inode->i_sb->s_blocksize >> 2; | ||
116 | |||
117 | if (!pblock) { | ||
118 | /* Only update the file block number */ | ||
119 | *blk_nump += max_entries; | ||
120 | return 0; | ||
121 | } | ||
122 | |||
123 | bh = sb_bread(inode->i_sb, pblock); | ||
124 | if (!bh) | ||
125 | return -EIO; | ||
126 | |||
127 | i_data = (__le32 *)bh->b_data; | ||
128 | for (i = 0; i < max_entries; i++, blk_count++) { | ||
129 | if (i_data[i]) { | ||
130 | retval = update_extent_range(handle, inode, | ||
131 | le32_to_cpu(i_data[i]), | ||
132 | blk_count, lb); | ||
133 | if (retval) | ||
134 | break; | ||
135 | } | ||
136 | } | ||
137 | |||
138 | /* Update the file block number */ | ||
139 | *blk_nump = blk_count; | ||
140 | put_bh(bh); | ||
141 | return retval; | ||
142 | |||
143 | } | ||
144 | |||
145 | static int update_dind_extent_range(handle_t *handle, struct inode *inode, | ||
146 | ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, | ||
147 | struct list_blocks_struct *lb) | ||
148 | { | ||
149 | struct buffer_head *bh; | ||
150 | __le32 *i_data; | ||
151 | int i, retval = 0; | ||
152 | ext4_lblk_t blk_count = *blk_nump; | ||
153 | unsigned long max_entries = inode->i_sb->s_blocksize >> 2; | ||
154 | |||
155 | if (!pblock) { | ||
156 | /* Only update the file block number */ | ||
157 | *blk_nump += max_entries * max_entries; | ||
158 | return 0; | ||
159 | } | ||
160 | bh = sb_bread(inode->i_sb, pblock); | ||
161 | if (!bh) | ||
162 | return -EIO; | ||
163 | |||
164 | i_data = (__le32 *)bh->b_data; | ||
165 | for (i = 0; i < max_entries; i++) { | ||
166 | if (i_data[i]) { | ||
167 | retval = update_ind_extent_range(handle, inode, | ||
168 | le32_to_cpu(i_data[i]), | ||
169 | &blk_count, lb); | ||
170 | if (retval) | ||
171 | break; | ||
172 | } else { | ||
173 | /* Only update the file block number */ | ||
174 | blk_count += max_entries; | ||
175 | } | ||
176 | } | ||
177 | |||
178 | /* Update the file block number */ | ||
179 | *blk_nump = blk_count; | ||
180 | put_bh(bh); | ||
181 | return retval; | ||
182 | |||
183 | } | ||
184 | |||
185 | static int update_tind_extent_range(handle_t *handle, struct inode *inode, | ||
186 | ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, | ||
187 | struct list_blocks_struct *lb) | ||
188 | { | ||
189 | struct buffer_head *bh; | ||
190 | __le32 *i_data; | ||
191 | int i, retval = 0; | ||
192 | ext4_lblk_t blk_count = *blk_nump; | ||
193 | unsigned long max_entries = inode->i_sb->s_blocksize >> 2; | ||
194 | |||
195 | if (!pblock) { | ||
196 | /* Only update the file block number */ | ||
197 | *blk_nump += max_entries * max_entries * max_entries; | ||
198 | return 0; | ||
199 | } | ||
200 | bh = sb_bread(inode->i_sb, pblock); | ||
201 | if (!bh) | ||
202 | return -EIO; | ||
203 | |||
204 | i_data = (__le32 *)bh->b_data; | ||
205 | for (i = 0; i < max_entries; i++) { | ||
206 | if (i_data[i]) { | ||
207 | retval = update_dind_extent_range(handle, inode, | ||
208 | le32_to_cpu(i_data[i]), | ||
209 | &blk_count, lb); | ||
210 | if (retval) | ||
211 | break; | ||
212 | } else | ||
213 | /* Only update the file block number */ | ||
214 | blk_count += max_entries * max_entries; | ||
215 | } | ||
216 | /* Update the file block number */ | ||
217 | *blk_nump = blk_count; | ||
218 | put_bh(bh); | ||
219 | return retval; | ||
220 | |||
221 | } | ||
222 | |||
223 | static int free_dind_blocks(handle_t *handle, | ||
224 | struct inode *inode, __le32 i_data) | ||
225 | { | ||
226 | int i; | ||
227 | __le32 *tmp_idata; | ||
228 | struct buffer_head *bh; | ||
229 | unsigned long max_entries = inode->i_sb->s_blocksize >> 2; | ||
230 | |||
231 | bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); | ||
232 | if (!bh) | ||
233 | return -EIO; | ||
234 | |||
235 | tmp_idata = (__le32 *)bh->b_data; | ||
236 | for (i = 0; i < max_entries; i++) { | ||
237 | if (tmp_idata[i]) | ||
238 | ext4_free_blocks(handle, inode, | ||
239 | le32_to_cpu(tmp_idata[i]), 1, 1); | ||
240 | } | ||
241 | put_bh(bh); | ||
242 | ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); | ||
243 | return 0; | ||
244 | } | ||
245 | |||
246 | static int free_tind_blocks(handle_t *handle, | ||
247 | struct inode *inode, __le32 i_data) | ||
248 | { | ||
249 | int i, retval = 0; | ||
250 | __le32 *tmp_idata; | ||
251 | struct buffer_head *bh; | ||
252 | unsigned long max_entries = inode->i_sb->s_blocksize >> 2; | ||
253 | |||
254 | bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); | ||
255 | if (!bh) | ||
256 | return -EIO; | ||
257 | |||
258 | tmp_idata = (__le32 *)bh->b_data; | ||
259 | for (i = 0; i < max_entries; i++) { | ||
260 | if (tmp_idata[i]) { | ||
261 | retval = free_dind_blocks(handle, | ||
262 | inode, tmp_idata[i]); | ||
263 | if (retval) { | ||
264 | put_bh(bh); | ||
265 | return retval; | ||
266 | } | ||
267 | } | ||
268 | } | ||
269 | put_bh(bh); | ||
270 | ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); | ||
271 | return 0; | ||
272 | } | ||
273 | |||
274 | static int free_ind_block(handle_t *handle, struct inode *inode) | ||
275 | { | ||
276 | int retval; | ||
277 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
278 | |||
279 | if (ei->i_data[EXT4_IND_BLOCK]) | ||
280 | ext4_free_blocks(handle, inode, | ||
281 | le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1, 1); | ||
282 | |||
283 | if (ei->i_data[EXT4_DIND_BLOCK]) { | ||
284 | retval = free_dind_blocks(handle, inode, | ||
285 | ei->i_data[EXT4_DIND_BLOCK]); | ||
286 | if (retval) | ||
287 | return retval; | ||
288 | } | ||
289 | |||
290 | if (ei->i_data[EXT4_TIND_BLOCK]) { | ||
291 | retval = free_tind_blocks(handle, inode, | ||
292 | ei->i_data[EXT4_TIND_BLOCK]); | ||
293 | if (retval) | ||
294 | return retval; | ||
295 | } | ||
296 | return 0; | ||
297 | } | ||
298 | |||
299 | static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, | ||
300 | struct inode *tmp_inode, int retval) | ||
301 | { | ||
302 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
303 | struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); | ||
304 | |||
305 | retval = free_ind_block(handle, inode); | ||
306 | if (retval) | ||
307 | goto err_out; | ||
308 | |||
309 | /* | ||
310 | * One credit accounted for writing the | ||
311 | * i_data field of the original inode | ||
312 | */ | ||
313 | retval = ext4_journal_extend(handle, 1); | ||
314 | if (retval != 0) { | ||
315 | retval = ext4_journal_restart(handle, 1); | ||
316 | if (retval) | ||
317 | goto err_out; | ||
318 | } | ||
319 | |||
320 | /* | ||
321 | * We have the extent map build with the tmp inode. | ||
322 | * Now copy the i_data across | ||
323 | */ | ||
324 | ei->i_flags |= EXT4_EXTENTS_FL; | ||
325 | memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); | ||
326 | |||
327 | /* | ||
328 | * Update i_blocks with the new blocks that got | ||
329 | * allocated while adding extents for extent index | ||
330 | * blocks. | ||
331 | * | ||
332 | * While converting to extents we need not | ||
333 | * update the orignal inode i_blocks for extent blocks | ||
334 | * via quota APIs. The quota update happened via tmp_inode already. | ||
335 | */ | ||
336 | spin_lock(&inode->i_lock); | ||
337 | inode->i_blocks += tmp_inode->i_blocks; | ||
338 | spin_unlock(&inode->i_lock); | ||
339 | |||
340 | ext4_mark_inode_dirty(handle, inode); | ||
341 | err_out: | ||
342 | return retval; | ||
343 | } | ||
344 | |||
345 | static int free_ext_idx(handle_t *handle, struct inode *inode, | ||
346 | struct ext4_extent_idx *ix) | ||
347 | { | ||
348 | int i, retval = 0; | ||
349 | ext4_fsblk_t block; | ||
350 | struct buffer_head *bh; | ||
351 | struct ext4_extent_header *eh; | ||
352 | |||
353 | block = idx_pblock(ix); | ||
354 | bh = sb_bread(inode->i_sb, block); | ||
355 | if (!bh) | ||
356 | return -EIO; | ||
357 | |||
358 | eh = (struct ext4_extent_header *)bh->b_data; | ||
359 | if (eh->eh_depth != 0) { | ||
360 | ix = EXT_FIRST_INDEX(eh); | ||
361 | for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { | ||
362 | retval = free_ext_idx(handle, inode, ix); | ||
363 | if (retval) | ||
364 | break; | ||
365 | } | ||
366 | } | ||
367 | put_bh(bh); | ||
368 | ext4_free_blocks(handle, inode, block, 1, 1); | ||
369 | return retval; | ||
370 | } | ||
371 | |||
372 | /* | ||
373 | * Free the extent meta data blocks only | ||
374 | */ | ||
375 | static int free_ext_block(handle_t *handle, struct inode *inode) | ||
376 | { | ||
377 | int i, retval = 0; | ||
378 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
379 | struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data; | ||
380 | struct ext4_extent_idx *ix; | ||
381 | if (eh->eh_depth == 0) | ||
382 | /* | ||
383 | * No extra blocks allocated for extent meta data | ||
384 | */ | ||
385 | return 0; | ||
386 | ix = EXT_FIRST_INDEX(eh); | ||
387 | for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { | ||
388 | retval = free_ext_idx(handle, inode, ix); | ||
389 | if (retval) | ||
390 | return retval; | ||
391 | } | ||
392 | return retval; | ||
393 | |||
394 | } | ||
395 | |||
396 | int ext4_ext_migrate(struct inode *inode, struct file *filp, | ||
397 | unsigned int cmd, unsigned long arg) | ||
398 | { | ||
399 | handle_t *handle; | ||
400 | int retval = 0, i; | ||
401 | __le32 *i_data; | ||
402 | ext4_lblk_t blk_count = 0; | ||
403 | struct ext4_inode_info *ei; | ||
404 | struct inode *tmp_inode = NULL; | ||
405 | struct list_blocks_struct lb; | ||
406 | unsigned long max_entries; | ||
407 | |||
408 | if (!test_opt(inode->i_sb, EXTENTS)) | ||
409 | /* | ||
410 | * if mounted with noextents we don't allow the migrate | ||
411 | */ | ||
412 | return -EINVAL; | ||
413 | |||
414 | if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) | ||
415 | return -EINVAL; | ||
416 | |||
417 | down_write(&EXT4_I(inode)->i_data_sem); | ||
418 | handle = ext4_journal_start(inode, | ||
419 | EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + | ||
420 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + | ||
421 | 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) | ||
422 | + 1); | ||
423 | if (IS_ERR(handle)) { | ||
424 | retval = PTR_ERR(handle); | ||
425 | goto err_out; | ||
426 | } | ||
427 | tmp_inode = ext4_new_inode(handle, | ||
428 | inode->i_sb->s_root->d_inode, | ||
429 | S_IFREG); | ||
430 | if (IS_ERR(tmp_inode)) { | ||
431 | retval = -ENOMEM; | ||
432 | ext4_journal_stop(handle); | ||
433 | tmp_inode = NULL; | ||
434 | goto err_out; | ||
435 | } | ||
436 | i_size_write(tmp_inode, i_size_read(inode)); | ||
437 | /* | ||
438 | * We don't want the inode to be reclaimed | ||
439 | * if we got interrupted in between. We have | ||
440 | * this tmp inode carrying reference to the | ||
441 | * data blocks of the original file. We set | ||
442 | * the i_nlink to zero at the last stage after | ||
443 | * switching the original file to extent format | ||
444 | */ | ||
445 | tmp_inode->i_nlink = 1; | ||
446 | |||
447 | ext4_ext_tree_init(handle, tmp_inode); | ||
448 | ext4_orphan_add(handle, tmp_inode); | ||
449 | ext4_journal_stop(handle); | ||
450 | |||
451 | ei = EXT4_I(inode); | ||
452 | i_data = ei->i_data; | ||
453 | memset(&lb, 0, sizeof(lb)); | ||
454 | |||
455 | /* 32 bit block address 4 bytes */ | ||
456 | max_entries = inode->i_sb->s_blocksize >> 2; | ||
457 | |||
458 | /* | ||
459 | * start with one credit accounted for | ||
460 | * superblock modification. | ||
461 | * | ||
462 | * For the tmp_inode we already have commited the | ||
463 | * trascation that created the inode. Later as and | ||
464 | * when we add extents we extent the journal | ||
465 | */ | ||
466 | handle = ext4_journal_start(inode, 1); | ||
467 | for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) { | ||
468 | if (i_data[i]) { | ||
469 | retval = update_extent_range(handle, tmp_inode, | ||
470 | le32_to_cpu(i_data[i]), | ||
471 | blk_count, &lb); | ||
472 | if (retval) | ||
473 | goto err_out; | ||
474 | } | ||
475 | } | ||
476 | if (i_data[EXT4_IND_BLOCK]) { | ||
477 | retval = update_ind_extent_range(handle, tmp_inode, | ||
478 | le32_to_cpu(i_data[EXT4_IND_BLOCK]), | ||
479 | &blk_count, &lb); | ||
480 | if (retval) | ||
481 | goto err_out; | ||
482 | } else | ||
483 | blk_count += max_entries; | ||
484 | if (i_data[EXT4_DIND_BLOCK]) { | ||
485 | retval = update_dind_extent_range(handle, tmp_inode, | ||
486 | le32_to_cpu(i_data[EXT4_DIND_BLOCK]), | ||
487 | &blk_count, &lb); | ||
488 | if (retval) | ||
489 | goto err_out; | ||
490 | } else | ||
491 | blk_count += max_entries * max_entries; | ||
492 | if (i_data[EXT4_TIND_BLOCK]) { | ||
493 | retval = update_tind_extent_range(handle, tmp_inode, | ||
494 | le32_to_cpu(i_data[EXT4_TIND_BLOCK]), | ||
495 | &blk_count, &lb); | ||
496 | if (retval) | ||
497 | goto err_out; | ||
498 | } | ||
499 | /* | ||
500 | * Build the last extent | ||
501 | */ | ||
502 | retval = finish_range(handle, tmp_inode, &lb); | ||
503 | err_out: | ||
504 | /* | ||
505 | * We are either freeing extent information or indirect | ||
506 | * blocks. During this we touch superblock, group descriptor | ||
507 | * and block bitmap. Later we mark the tmp_inode dirty | ||
508 | * via ext4_ext_tree_init. So allocate a credit of 4 | ||
509 | * We may update quota (user and group). | ||
510 | * | ||
511 | * FIXME!! we may be touching bitmaps in different block groups. | ||
512 | */ | ||
513 | if (ext4_journal_extend(handle, | ||
514 | 4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)) != 0) | ||
515 | ext4_journal_restart(handle, | ||
516 | 4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); | ||
517 | if (retval) | ||
518 | /* | ||
519 | * Failure case delete the extent information with the | ||
520 | * tmp_inode | ||
521 | */ | ||
522 | free_ext_block(handle, tmp_inode); | ||
523 | else | ||
524 | retval = ext4_ext_swap_inode_data(handle, inode, | ||
525 | tmp_inode, retval); | ||
526 | |||
527 | /* | ||
528 | * Mark the tmp_inode as of size zero | ||
529 | */ | ||
530 | i_size_write(tmp_inode, 0); | ||
531 | |||
532 | /* | ||
533 | * set the i_blocks count to zero | ||
534 | * so that the ext4_delete_inode does the | ||
535 | * right job | ||
536 | * | ||
537 | * We don't need to take the i_lock because | ||
538 | * the inode is not visible to user space. | ||
539 | */ | ||
540 | tmp_inode->i_blocks = 0; | ||
541 | |||
542 | /* Reset the extent details */ | ||
543 | ext4_ext_tree_init(handle, tmp_inode); | ||
544 | |||
545 | /* | ||
546 | * Set the i_nlink to zero so that | ||
547 | * generic_drop_inode really deletes the | ||
548 | * inode | ||
549 | */ | ||
550 | tmp_inode->i_nlink = 0; | ||
551 | |||
552 | ext4_journal_stop(handle); | ||
553 | |||
554 | up_write(&EXT4_I(inode)->i_data_sem); | ||
555 | |||
556 | if (tmp_inode) | ||
557 | iput(tmp_inode); | ||
558 | |||
559 | return retval; | ||
560 | } | ||
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 94ee6f315dc1..67b6d8a1ceff 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -51,7 +51,7 @@ | |||
51 | 51 | ||
52 | static struct buffer_head *ext4_append(handle_t *handle, | 52 | static struct buffer_head *ext4_append(handle_t *handle, |
53 | struct inode *inode, | 53 | struct inode *inode, |
54 | u32 *block, int *err) | 54 | ext4_lblk_t *block, int *err) |
55 | { | 55 | { |
56 | struct buffer_head *bh; | 56 | struct buffer_head *bh; |
57 | 57 | ||
@@ -144,8 +144,8 @@ struct dx_map_entry | |||
144 | u16 size; | 144 | u16 size; |
145 | }; | 145 | }; |
146 | 146 | ||
147 | static inline unsigned dx_get_block (struct dx_entry *entry); | 147 | static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); |
148 | static void dx_set_block (struct dx_entry *entry, unsigned value); | 148 | static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); |
149 | static inline unsigned dx_get_hash (struct dx_entry *entry); | 149 | static inline unsigned dx_get_hash (struct dx_entry *entry); |
150 | static void dx_set_hash (struct dx_entry *entry, unsigned value); | 150 | static void dx_set_hash (struct dx_entry *entry, unsigned value); |
151 | static unsigned dx_get_count (struct dx_entry *entries); | 151 | static unsigned dx_get_count (struct dx_entry *entries); |
@@ -166,7 +166,8 @@ static void dx_sort_map(struct dx_map_entry *map, unsigned count); | |||
166 | static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to, | 166 | static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to, |
167 | struct dx_map_entry *offsets, int count); | 167 | struct dx_map_entry *offsets, int count); |
168 | static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size); | 168 | static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size); |
169 | static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); | 169 | static void dx_insert_block(struct dx_frame *frame, |
170 | u32 hash, ext4_lblk_t block); | ||
170 | static int ext4_htree_next_block(struct inode *dir, __u32 hash, | 171 | static int ext4_htree_next_block(struct inode *dir, __u32 hash, |
171 | struct dx_frame *frame, | 172 | struct dx_frame *frame, |
172 | struct dx_frame *frames, | 173 | struct dx_frame *frames, |
@@ -181,12 +182,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | |||
181 | * Mask them off for now. | 182 | * Mask them off for now. |
182 | */ | 183 | */ |
183 | 184 | ||
184 | static inline unsigned dx_get_block (struct dx_entry *entry) | 185 | static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) |
185 | { | 186 | { |
186 | return le32_to_cpu(entry->block) & 0x00ffffff; | 187 | return le32_to_cpu(entry->block) & 0x00ffffff; |
187 | } | 188 | } |
188 | 189 | ||
189 | static inline void dx_set_block (struct dx_entry *entry, unsigned value) | 190 | static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) |
190 | { | 191 | { |
191 | entry->block = cpu_to_le32(value); | 192 | entry->block = cpu_to_le32(value); |
192 | } | 193 | } |
@@ -243,8 +244,8 @@ static void dx_show_index (char * label, struct dx_entry *entries) | |||
243 | int i, n = dx_get_count (entries); | 244 | int i, n = dx_get_count (entries); |
244 | printk("%s index ", label); | 245 | printk("%s index ", label); |
245 | for (i = 0; i < n; i++) { | 246 | for (i = 0; i < n; i++) { |
246 | printk("%x->%u ", i? dx_get_hash(entries + i) : | 247 | printk("%x->%lu ", i? dx_get_hash(entries + i) : |
247 | 0, dx_get_block(entries + i)); | 248 | 0, (unsigned long)dx_get_block(entries + i)); |
248 | } | 249 | } |
249 | printk("\n"); | 250 | printk("\n"); |
250 | } | 251 | } |
@@ -280,7 +281,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent | |||
280 | space += EXT4_DIR_REC_LEN(de->name_len); | 281 | space += EXT4_DIR_REC_LEN(de->name_len); |
281 | names++; | 282 | names++; |
282 | } | 283 | } |
283 | de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); | 284 | de = ext4_next_entry(de); |
284 | } | 285 | } |
285 | printk("(%i)\n", names); | 286 | printk("(%i)\n", names); |
286 | return (struct stats) { names, space, 1 }; | 287 | return (struct stats) { names, space, 1 }; |
@@ -297,7 +298,8 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, | |||
297 | printk("%i indexed blocks...\n", count); | 298 | printk("%i indexed blocks...\n", count); |
298 | for (i = 0; i < count; i++, entries++) | 299 | for (i = 0; i < count; i++, entries++) |
299 | { | 300 | { |
300 | u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; | 301 | ext4_lblk_t block = dx_get_block(entries); |
302 | ext4_lblk_t hash = i ? dx_get_hash(entries): 0; | ||
301 | u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; | 303 | u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; |
302 | struct stats stats; | 304 | struct stats stats; |
303 | printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); | 305 | printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); |
@@ -551,7 +553,8 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, | |||
551 | */ | 553 | */ |
552 | static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p) | 554 | static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p) |
553 | { | 555 | { |
554 | return (struct ext4_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len)); | 556 | return (struct ext4_dir_entry_2 *)((char *)p + |
557 | ext4_rec_len_from_disk(p->rec_len)); | ||
555 | } | 558 | } |
556 | 559 | ||
557 | /* | 560 | /* |
@@ -560,7 +563,7 @@ static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 * | |||
560 | * into the tree. If there is an error it is returned in err. | 563 | * into the tree. If there is an error it is returned in err. |
561 | */ | 564 | */ |
562 | static int htree_dirblock_to_tree(struct file *dir_file, | 565 | static int htree_dirblock_to_tree(struct file *dir_file, |
563 | struct inode *dir, int block, | 566 | struct inode *dir, ext4_lblk_t block, |
564 | struct dx_hash_info *hinfo, | 567 | struct dx_hash_info *hinfo, |
565 | __u32 start_hash, __u32 start_minor_hash) | 568 | __u32 start_hash, __u32 start_minor_hash) |
566 | { | 569 | { |
@@ -568,7 +571,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, | |||
568 | struct ext4_dir_entry_2 *de, *top; | 571 | struct ext4_dir_entry_2 *de, *top; |
569 | int err, count = 0; | 572 | int err, count = 0; |
570 | 573 | ||
571 | dxtrace(printk("In htree dirblock_to_tree: block %d\n", block)); | 574 | dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", |
575 | (unsigned long)block)); | ||
572 | if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) | 576 | if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) |
573 | return err; | 577 | return err; |
574 | 578 | ||
@@ -620,9 +624,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, | |||
620 | struct ext4_dir_entry_2 *de; | 624 | struct ext4_dir_entry_2 *de; |
621 | struct dx_frame frames[2], *frame; | 625 | struct dx_frame frames[2], *frame; |
622 | struct inode *dir; | 626 | struct inode *dir; |
623 | int block, err; | 627 | ext4_lblk_t block; |
624 | int count = 0; | 628 | int count = 0; |
625 | int ret; | 629 | int ret, err; |
626 | __u32 hashval; | 630 | __u32 hashval; |
627 | 631 | ||
628 | dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, | 632 | dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, |
@@ -720,7 +724,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size, | |||
720 | cond_resched(); | 724 | cond_resched(); |
721 | } | 725 | } |
722 | /* XXX: do we need to check rec_len == 0 case? -Chris */ | 726 | /* XXX: do we need to check rec_len == 0 case? -Chris */ |
723 | de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); | 727 | de = ext4_next_entry(de); |
724 | } | 728 | } |
725 | return count; | 729 | return count; |
726 | } | 730 | } |
@@ -752,7 +756,7 @@ static void dx_sort_map (struct dx_map_entry *map, unsigned count) | |||
752 | } while(more); | 756 | } while(more); |
753 | } | 757 | } |
754 | 758 | ||
755 | static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) | 759 | static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) |
756 | { | 760 | { |
757 | struct dx_entry *entries = frame->entries; | 761 | struct dx_entry *entries = frame->entries; |
758 | struct dx_entry *old = frame->at, *new = old + 1; | 762 | struct dx_entry *old = frame->at, *new = old + 1; |
@@ -820,7 +824,7 @@ static inline int search_dirblock(struct buffer_head * bh, | |||
820 | return 1; | 824 | return 1; |
821 | } | 825 | } |
822 | /* prevent looping on a bad block */ | 826 | /* prevent looping on a bad block */ |
823 | de_len = le16_to_cpu(de->rec_len); | 827 | de_len = ext4_rec_len_from_disk(de->rec_len); |
824 | if (de_len <= 0) | 828 | if (de_len <= 0) |
825 | return -1; | 829 | return -1; |
826 | offset += de_len; | 830 | offset += de_len; |
@@ -847,23 +851,20 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry, | |||
847 | struct super_block * sb; | 851 | struct super_block * sb; |
848 | struct buffer_head * bh_use[NAMEI_RA_SIZE]; | 852 | struct buffer_head * bh_use[NAMEI_RA_SIZE]; |
849 | struct buffer_head * bh, *ret = NULL; | 853 | struct buffer_head * bh, *ret = NULL; |
850 | unsigned long start, block, b; | 854 | ext4_lblk_t start, block, b; |
851 | int ra_max = 0; /* Number of bh's in the readahead | 855 | int ra_max = 0; /* Number of bh's in the readahead |
852 | buffer, bh_use[] */ | 856 | buffer, bh_use[] */ |
853 | int ra_ptr = 0; /* Current index into readahead | 857 | int ra_ptr = 0; /* Current index into readahead |
854 | buffer */ | 858 | buffer */ |
855 | int num = 0; | 859 | int num = 0; |
856 | int nblocks, i, err; | 860 | ext4_lblk_t nblocks; |
861 | int i, err; | ||
857 | struct inode *dir = dentry->d_parent->d_inode; | 862 | struct inode *dir = dentry->d_parent->d_inode; |
858 | int namelen; | 863 | int namelen; |
859 | const u8 *name; | ||
860 | unsigned blocksize; | ||
861 | 864 | ||
862 | *res_dir = NULL; | 865 | *res_dir = NULL; |
863 | sb = dir->i_sb; | 866 | sb = dir->i_sb; |
864 | blocksize = sb->s_blocksize; | ||
865 | namelen = dentry->d_name.len; | 867 | namelen = dentry->d_name.len; |
866 | name = dentry->d_name.name; | ||
867 | if (namelen > EXT4_NAME_LEN) | 868 | if (namelen > EXT4_NAME_LEN) |
868 | return NULL; | 869 | return NULL; |
869 | if (is_dx(dir)) { | 870 | if (is_dx(dir)) { |
@@ -914,7 +915,8 @@ restart: | |||
914 | if (!buffer_uptodate(bh)) { | 915 | if (!buffer_uptodate(bh)) { |
915 | /* read error, skip block & hope for the best */ | 916 | /* read error, skip block & hope for the best */ |
916 | ext4_error(sb, __FUNCTION__, "reading directory #%lu " | 917 | ext4_error(sb, __FUNCTION__, "reading directory #%lu " |
917 | "offset %lu", dir->i_ino, block); | 918 | "offset %lu", dir->i_ino, |
919 | (unsigned long)block); | ||
918 | brelse(bh); | 920 | brelse(bh); |
919 | goto next; | 921 | goto next; |
920 | } | 922 | } |
@@ -961,7 +963,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, | |||
961 | struct dx_frame frames[2], *frame; | 963 | struct dx_frame frames[2], *frame; |
962 | struct ext4_dir_entry_2 *de, *top; | 964 | struct ext4_dir_entry_2 *de, *top; |
963 | struct buffer_head *bh; | 965 | struct buffer_head *bh; |
964 | unsigned long block; | 966 | ext4_lblk_t block; |
965 | int retval; | 967 | int retval; |
966 | int namelen = dentry->d_name.len; | 968 | int namelen = dentry->d_name.len; |
967 | const u8 *name = dentry->d_name.name; | 969 | const u8 *name = dentry->d_name.name; |
@@ -1128,7 +1130,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) | |||
1128 | rec_len = EXT4_DIR_REC_LEN(de->name_len); | 1130 | rec_len = EXT4_DIR_REC_LEN(de->name_len); |
1129 | memcpy (to, de, rec_len); | 1131 | memcpy (to, de, rec_len); |
1130 | ((struct ext4_dir_entry_2 *) to)->rec_len = | 1132 | ((struct ext4_dir_entry_2 *) to)->rec_len = |
1131 | cpu_to_le16(rec_len); | 1133 | ext4_rec_len_to_disk(rec_len); |
1132 | de->inode = 0; | 1134 | de->inode = 0; |
1133 | map++; | 1135 | map++; |
1134 | to += rec_len; | 1136 | to += rec_len; |
@@ -1147,13 +1149,12 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size) | |||
1147 | 1149 | ||
1148 | prev = to = de; | 1150 | prev = to = de; |
1149 | while ((char*)de < base + size) { | 1151 | while ((char*)de < base + size) { |
1150 | next = (struct ext4_dir_entry_2 *) ((char *) de + | 1152 | next = ext4_next_entry(de); |
1151 | le16_to_cpu(de->rec_len)); | ||
1152 | if (de->inode && de->name_len) { | 1153 | if (de->inode && de->name_len) { |
1153 | rec_len = EXT4_DIR_REC_LEN(de->name_len); | 1154 | rec_len = EXT4_DIR_REC_LEN(de->name_len); |
1154 | if (de > to) | 1155 | if (de > to) |
1155 | memmove(to, de, rec_len); | 1156 | memmove(to, de, rec_len); |
1156 | to->rec_len = cpu_to_le16(rec_len); | 1157 | to->rec_len = ext4_rec_len_to_disk(rec_len); |
1157 | prev = to; | 1158 | prev = to; |
1158 | to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); | 1159 | to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); |
1159 | } | 1160 | } |
@@ -1174,7 +1175,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, | |||
1174 | unsigned blocksize = dir->i_sb->s_blocksize; | 1175 | unsigned blocksize = dir->i_sb->s_blocksize; |
1175 | unsigned count, continued; | 1176 | unsigned count, continued; |
1176 | struct buffer_head *bh2; | 1177 | struct buffer_head *bh2; |
1177 | u32 newblock; | 1178 | ext4_lblk_t newblock; |
1178 | u32 hash2; | 1179 | u32 hash2; |
1179 | struct dx_map_entry *map; | 1180 | struct dx_map_entry *map; |
1180 | char *data1 = (*bh)->b_data, *data2; | 1181 | char *data1 = (*bh)->b_data, *data2; |
@@ -1221,14 +1222,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, | |||
1221 | split = count - move; | 1222 | split = count - move; |
1222 | hash2 = map[split].hash; | 1223 | hash2 = map[split].hash; |
1223 | continued = hash2 == map[split - 1].hash; | 1224 | continued = hash2 == map[split - 1].hash; |
1224 | dxtrace(printk("Split block %i at %x, %i/%i\n", | 1225 | dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", |
1225 | dx_get_block(frame->at), hash2, split, count-split)); | 1226 | (unsigned long)dx_get_block(frame->at), |
1227 | hash2, split, count-split)); | ||
1226 | 1228 | ||
1227 | /* Fancy dance to stay within two buffers */ | 1229 | /* Fancy dance to stay within two buffers */ |
1228 | de2 = dx_move_dirents(data1, data2, map + split, count - split); | 1230 | de2 = dx_move_dirents(data1, data2, map + split, count - split); |
1229 | de = dx_pack_dirents(data1,blocksize); | 1231 | de = dx_pack_dirents(data1,blocksize); |
1230 | de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); | 1232 | de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); |
1231 | de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); | 1233 | de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); |
1232 | dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); | 1234 | dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); |
1233 | dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); | 1235 | dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); |
1234 | 1236 | ||
@@ -1297,7 +1299,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, | |||
1297 | return -EEXIST; | 1299 | return -EEXIST; |
1298 | } | 1300 | } |
1299 | nlen = EXT4_DIR_REC_LEN(de->name_len); | 1301 | nlen = EXT4_DIR_REC_LEN(de->name_len); |
1300 | rlen = le16_to_cpu(de->rec_len); | 1302 | rlen = ext4_rec_len_from_disk(de->rec_len); |
1301 | if ((de->inode? rlen - nlen: rlen) >= reclen) | 1303 | if ((de->inode? rlen - nlen: rlen) >= reclen) |
1302 | break; | 1304 | break; |
1303 | de = (struct ext4_dir_entry_2 *)((char *)de + rlen); | 1305 | de = (struct ext4_dir_entry_2 *)((char *)de + rlen); |
@@ -1316,11 +1318,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, | |||
1316 | 1318 | ||
1317 | /* By now the buffer is marked for journaling */ | 1319 | /* By now the buffer is marked for journaling */ |
1318 | nlen = EXT4_DIR_REC_LEN(de->name_len); | 1320 | nlen = EXT4_DIR_REC_LEN(de->name_len); |
1319 | rlen = le16_to_cpu(de->rec_len); | 1321 | rlen = ext4_rec_len_from_disk(de->rec_len); |
1320 | if (de->inode) { | 1322 | if (de->inode) { |
1321 | struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); | 1323 | struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); |
1322 | de1->rec_len = cpu_to_le16(rlen - nlen); | 1324 | de1->rec_len = ext4_rec_len_to_disk(rlen - nlen); |
1323 | de->rec_len = cpu_to_le16(nlen); | 1325 | de->rec_len = ext4_rec_len_to_disk(nlen); |
1324 | de = de1; | 1326 | de = de1; |
1325 | } | 1327 | } |
1326 | de->file_type = EXT4_FT_UNKNOWN; | 1328 | de->file_type = EXT4_FT_UNKNOWN; |
@@ -1374,7 +1376,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, | |||
1374 | int retval; | 1376 | int retval; |
1375 | unsigned blocksize; | 1377 | unsigned blocksize; |
1376 | struct dx_hash_info hinfo; | 1378 | struct dx_hash_info hinfo; |
1377 | u32 block; | 1379 | ext4_lblk_t block; |
1378 | struct fake_dirent *fde; | 1380 | struct fake_dirent *fde; |
1379 | 1381 | ||
1380 | blocksize = dir->i_sb->s_blocksize; | 1382 | blocksize = dir->i_sb->s_blocksize; |
@@ -1397,17 +1399,18 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, | |||
1397 | 1399 | ||
1398 | /* The 0th block becomes the root, move the dirents out */ | 1400 | /* The 0th block becomes the root, move the dirents out */ |
1399 | fde = &root->dotdot; | 1401 | fde = &root->dotdot; |
1400 | de = (struct ext4_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len)); | 1402 | de = (struct ext4_dir_entry_2 *)((char *)fde + |
1403 | ext4_rec_len_from_disk(fde->rec_len)); | ||
1401 | len = ((char *) root) + blocksize - (char *) de; | 1404 | len = ((char *) root) + blocksize - (char *) de; |
1402 | memcpy (data1, de, len); | 1405 | memcpy (data1, de, len); |
1403 | de = (struct ext4_dir_entry_2 *) data1; | 1406 | de = (struct ext4_dir_entry_2 *) data1; |
1404 | top = data1 + len; | 1407 | top = data1 + len; |
1405 | while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top) | 1408 | while ((char *)(de2 = ext4_next_entry(de)) < top) |
1406 | de = de2; | 1409 | de = de2; |
1407 | de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); | 1410 | de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); |
1408 | /* Initialize the root; the dot dirents already exist */ | 1411 | /* Initialize the root; the dot dirents already exist */ |
1409 | de = (struct ext4_dir_entry_2 *) (&root->dotdot); | 1412 | de = (struct ext4_dir_entry_2 *) (&root->dotdot); |
1410 | de->rec_len = cpu_to_le16(blocksize - EXT4_DIR_REC_LEN(2)); | 1413 | de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2)); |
1411 | memset (&root->info, 0, sizeof(root->info)); | 1414 | memset (&root->info, 0, sizeof(root->info)); |
1412 | root->info.info_length = sizeof(root->info); | 1415 | root->info.info_length = sizeof(root->info); |
1413 | root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; | 1416 | root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; |
@@ -1454,7 +1457,7 @@ static int ext4_add_entry (handle_t *handle, struct dentry *dentry, | |||
1454 | int retval; | 1457 | int retval; |
1455 | int dx_fallback=0; | 1458 | int dx_fallback=0; |
1456 | unsigned blocksize; | 1459 | unsigned blocksize; |
1457 | u32 block, blocks; | 1460 | ext4_lblk_t block, blocks; |
1458 | 1461 | ||
1459 | sb = dir->i_sb; | 1462 | sb = dir->i_sb; |
1460 | blocksize = sb->s_blocksize; | 1463 | blocksize = sb->s_blocksize; |
@@ -1487,7 +1490,7 @@ static int ext4_add_entry (handle_t *handle, struct dentry *dentry, | |||
1487 | return retval; | 1490 | return retval; |
1488 | de = (struct ext4_dir_entry_2 *) bh->b_data; | 1491 | de = (struct ext4_dir_entry_2 *) bh->b_data; |
1489 | de->inode = 0; | 1492 | de->inode = 0; |
1490 | de->rec_len = cpu_to_le16(blocksize); | 1493 | de->rec_len = ext4_rec_len_to_disk(blocksize); |
1491 | return add_dirent_to_buf(handle, dentry, inode, de, bh); | 1494 | return add_dirent_to_buf(handle, dentry, inode, de, bh); |
1492 | } | 1495 | } |
1493 | 1496 | ||
@@ -1531,7 +1534,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | |||
1531 | dx_get_count(entries), dx_get_limit(entries))); | 1534 | dx_get_count(entries), dx_get_limit(entries))); |
1532 | /* Need to split index? */ | 1535 | /* Need to split index? */ |
1533 | if (dx_get_count(entries) == dx_get_limit(entries)) { | 1536 | if (dx_get_count(entries) == dx_get_limit(entries)) { |
1534 | u32 newblock; | 1537 | ext4_lblk_t newblock; |
1535 | unsigned icount = dx_get_count(entries); | 1538 | unsigned icount = dx_get_count(entries); |
1536 | int levels = frame - frames; | 1539 | int levels = frame - frames; |
1537 | struct dx_entry *entries2; | 1540 | struct dx_entry *entries2; |
@@ -1550,7 +1553,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | |||
1550 | goto cleanup; | 1553 | goto cleanup; |
1551 | node2 = (struct dx_node *)(bh2->b_data); | 1554 | node2 = (struct dx_node *)(bh2->b_data); |
1552 | entries2 = node2->entries; | 1555 | entries2 = node2->entries; |
1553 | node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); | 1556 | node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize); |
1554 | node2->fake.inode = 0; | 1557 | node2->fake.inode = 0; |
1555 | BUFFER_TRACE(frame->bh, "get_write_access"); | 1558 | BUFFER_TRACE(frame->bh, "get_write_access"); |
1556 | err = ext4_journal_get_write_access(handle, frame->bh); | 1559 | err = ext4_journal_get_write_access(handle, frame->bh); |
@@ -1648,9 +1651,9 @@ static int ext4_delete_entry (handle_t *handle, | |||
1648 | BUFFER_TRACE(bh, "get_write_access"); | 1651 | BUFFER_TRACE(bh, "get_write_access"); |
1649 | ext4_journal_get_write_access(handle, bh); | 1652 | ext4_journal_get_write_access(handle, bh); |
1650 | if (pde) | 1653 | if (pde) |
1651 | pde->rec_len = | 1654 | pde->rec_len = ext4_rec_len_to_disk( |
1652 | cpu_to_le16(le16_to_cpu(pde->rec_len) + | 1655 | ext4_rec_len_from_disk(pde->rec_len) + |
1653 | le16_to_cpu(de->rec_len)); | 1656 | ext4_rec_len_from_disk(de->rec_len)); |
1654 | else | 1657 | else |
1655 | de->inode = 0; | 1658 | de->inode = 0; |
1656 | dir->i_version++; | 1659 | dir->i_version++; |
@@ -1658,10 +1661,9 @@ static int ext4_delete_entry (handle_t *handle, | |||
1658 | ext4_journal_dirty_metadata(handle, bh); | 1661 | ext4_journal_dirty_metadata(handle, bh); |
1659 | return 0; | 1662 | return 0; |
1660 | } | 1663 | } |
1661 | i += le16_to_cpu(de->rec_len); | 1664 | i += ext4_rec_len_from_disk(de->rec_len); |
1662 | pde = de; | 1665 | pde = de; |
1663 | de = (struct ext4_dir_entry_2 *) | 1666 | de = ext4_next_entry(de); |
1664 | ((char *) de + le16_to_cpu(de->rec_len)); | ||
1665 | } | 1667 | } |
1666 | return -ENOENT; | 1668 | return -ENOENT; |
1667 | } | 1669 | } |
@@ -1824,13 +1826,13 @@ retry: | |||
1824 | de = (struct ext4_dir_entry_2 *) dir_block->b_data; | 1826 | de = (struct ext4_dir_entry_2 *) dir_block->b_data; |
1825 | de->inode = cpu_to_le32(inode->i_ino); | 1827 | de->inode = cpu_to_le32(inode->i_ino); |
1826 | de->name_len = 1; | 1828 | de->name_len = 1; |
1827 | de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de->name_len)); | 1829 | de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); |
1828 | strcpy (de->name, "."); | 1830 | strcpy (de->name, "."); |
1829 | ext4_set_de_type(dir->i_sb, de, S_IFDIR); | 1831 | ext4_set_de_type(dir->i_sb, de, S_IFDIR); |
1830 | de = (struct ext4_dir_entry_2 *) | 1832 | de = ext4_next_entry(de); |
1831 | ((char *) de + le16_to_cpu(de->rec_len)); | ||
1832 | de->inode = cpu_to_le32(dir->i_ino); | 1833 | de->inode = cpu_to_le32(dir->i_ino); |
1833 | de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT4_DIR_REC_LEN(1)); | 1834 | de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - |
1835 | EXT4_DIR_REC_LEN(1)); | ||
1834 | de->name_len = 2; | 1836 | de->name_len = 2; |
1835 | strcpy (de->name, ".."); | 1837 | strcpy (de->name, ".."); |
1836 | ext4_set_de_type(dir->i_sb, de, S_IFDIR); | 1838 | ext4_set_de_type(dir->i_sb, de, S_IFDIR); |
@@ -1882,8 +1884,7 @@ static int empty_dir (struct inode * inode) | |||
1882 | return 1; | 1884 | return 1; |
1883 | } | 1885 | } |
1884 | de = (struct ext4_dir_entry_2 *) bh->b_data; | 1886 | de = (struct ext4_dir_entry_2 *) bh->b_data; |
1885 | de1 = (struct ext4_dir_entry_2 *) | 1887 | de1 = ext4_next_entry(de); |
1886 | ((char *) de + le16_to_cpu(de->rec_len)); | ||
1887 | if (le32_to_cpu(de->inode) != inode->i_ino || | 1888 | if (le32_to_cpu(de->inode) != inode->i_ino || |
1888 | !le32_to_cpu(de1->inode) || | 1889 | !le32_to_cpu(de1->inode) || |
1889 | strcmp (".", de->name) || | 1890 | strcmp (".", de->name) || |
@@ -1894,9 +1895,9 @@ static int empty_dir (struct inode * inode) | |||
1894 | brelse (bh); | 1895 | brelse (bh); |
1895 | return 1; | 1896 | return 1; |
1896 | } | 1897 | } |
1897 | offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); | 1898 | offset = ext4_rec_len_from_disk(de->rec_len) + |
1898 | de = (struct ext4_dir_entry_2 *) | 1899 | ext4_rec_len_from_disk(de1->rec_len); |
1899 | ((char *) de1 + le16_to_cpu(de1->rec_len)); | 1900 | de = ext4_next_entry(de1); |
1900 | while (offset < inode->i_size ) { | 1901 | while (offset < inode->i_size ) { |
1901 | if (!bh || | 1902 | if (!bh || |
1902 | (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { | 1903 | (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { |
@@ -1925,9 +1926,8 @@ static int empty_dir (struct inode * inode) | |||
1925 | brelse (bh); | 1926 | brelse (bh); |
1926 | return 0; | 1927 | return 0; |
1927 | } | 1928 | } |
1928 | offset += le16_to_cpu(de->rec_len); | 1929 | offset += ext4_rec_len_from_disk(de->rec_len); |
1929 | de = (struct ext4_dir_entry_2 *) | 1930 | de = ext4_next_entry(de); |
1930 | ((char *) de + le16_to_cpu(de->rec_len)); | ||
1931 | } | 1931 | } |
1932 | brelse (bh); | 1932 | brelse (bh); |
1933 | return 1; | 1933 | return 1; |
@@ -2282,8 +2282,7 @@ retry: | |||
2282 | } | 2282 | } |
2283 | 2283 | ||
2284 | #define PARENT_INO(buffer) \ | 2284 | #define PARENT_INO(buffer) \ |
2285 | ((struct ext4_dir_entry_2 *) ((char *) buffer + \ | 2285 | (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode) |
2286 | le16_to_cpu(((struct ext4_dir_entry_2 *) buffer)->rec_len)))->inode | ||
2287 | 2286 | ||
2288 | /* | 2287 | /* |
2289 | * Anybody can rename anything with this: the permission checks are left to the | 2288 | * Anybody can rename anything with this: the permission checks are left to the |
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index bd8a52bb3999..4fbba60816f4 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
@@ -28,7 +28,7 @@ static int verify_group_input(struct super_block *sb, | |||
28 | struct ext4_super_block *es = sbi->s_es; | 28 | struct ext4_super_block *es = sbi->s_es; |
29 | ext4_fsblk_t start = ext4_blocks_count(es); | 29 | ext4_fsblk_t start = ext4_blocks_count(es); |
30 | ext4_fsblk_t end = start + input->blocks_count; | 30 | ext4_fsblk_t end = start + input->blocks_count; |
31 | unsigned group = input->group; | 31 | ext4_group_t group = input->group; |
32 | ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; | 32 | ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; |
33 | unsigned overhead = ext4_bg_has_super(sb, group) ? | 33 | unsigned overhead = ext4_bg_has_super(sb, group) ? |
34 | (1 + ext4_bg_num_gdb(sb, group) + | 34 | (1 + ext4_bg_num_gdb(sb, group) + |
@@ -206,7 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
206 | } | 206 | } |
207 | 207 | ||
208 | if (ext4_bg_has_super(sb, input->group)) { | 208 | if (ext4_bg_has_super(sb, input->group)) { |
209 | ext4_debug("mark backup superblock %#04lx (+0)\n", start); | 209 | ext4_debug("mark backup superblock %#04llx (+0)\n", start); |
210 | ext4_set_bit(0, bh->b_data); | 210 | ext4_set_bit(0, bh->b_data); |
211 | } | 211 | } |
212 | 212 | ||
@@ -215,7 +215,7 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
215 | i < gdblocks; i++, block++, bit++) { | 215 | i < gdblocks; i++, block++, bit++) { |
216 | struct buffer_head *gdb; | 216 | struct buffer_head *gdb; |
217 | 217 | ||
218 | ext4_debug("update backup group %#04lx (+%d)\n", block, bit); | 218 | ext4_debug("update backup group %#04llx (+%d)\n", block, bit); |
219 | 219 | ||
220 | if ((err = extend_or_restart_transaction(handle, 1, bh))) | 220 | if ((err = extend_or_restart_transaction(handle, 1, bh))) |
221 | goto exit_bh; | 221 | goto exit_bh; |
@@ -243,7 +243,7 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
243 | i < reserved_gdb; i++, block++, bit++) { | 243 | i < reserved_gdb; i++, block++, bit++) { |
244 | struct buffer_head *gdb; | 244 | struct buffer_head *gdb; |
245 | 245 | ||
246 | ext4_debug("clear reserved block %#04lx (+%d)\n", block, bit); | 246 | ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit); |
247 | 247 | ||
248 | if ((err = extend_or_restart_transaction(handle, 1, bh))) | 248 | if ((err = extend_or_restart_transaction(handle, 1, bh))) |
249 | goto exit_bh; | 249 | goto exit_bh; |
@@ -256,10 +256,10 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
256 | ext4_set_bit(bit, bh->b_data); | 256 | ext4_set_bit(bit, bh->b_data); |
257 | brelse(gdb); | 257 | brelse(gdb); |
258 | } | 258 | } |
259 | ext4_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap, | 259 | ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, |
260 | input->block_bitmap - start); | 260 | input->block_bitmap - start); |
261 | ext4_set_bit(input->block_bitmap - start, bh->b_data); | 261 | ext4_set_bit(input->block_bitmap - start, bh->b_data); |
262 | ext4_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap, | 262 | ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, |
263 | input->inode_bitmap - start); | 263 | input->inode_bitmap - start); |
264 | ext4_set_bit(input->inode_bitmap - start, bh->b_data); | 264 | ext4_set_bit(input->inode_bitmap - start, bh->b_data); |
265 | 265 | ||
@@ -268,7 +268,7 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
268 | i < sbi->s_itb_per_group; i++, bit++, block++) { | 268 | i < sbi->s_itb_per_group; i++, bit++, block++) { |
269 | struct buffer_head *it; | 269 | struct buffer_head *it; |
270 | 270 | ||
271 | ext4_debug("clear inode block %#04lx (+%d)\n", block, bit); | 271 | ext4_debug("clear inode block %#04llx (+%d)\n", block, bit); |
272 | 272 | ||
273 | if ((err = extend_or_restart_transaction(handle, 1, bh))) | 273 | if ((err = extend_or_restart_transaction(handle, 1, bh))) |
274 | goto exit_bh; | 274 | goto exit_bh; |
@@ -291,7 +291,7 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
291 | brelse(bh); | 291 | brelse(bh); |
292 | 292 | ||
293 | /* Mark unused entries in inode bitmap used */ | 293 | /* Mark unused entries in inode bitmap used */ |
294 | ext4_debug("clear inode bitmap %#04x (+%ld)\n", | 294 | ext4_debug("clear inode bitmap %#04llx (+%llu)\n", |
295 | input->inode_bitmap, input->inode_bitmap - start); | 295 | input->inode_bitmap, input->inode_bitmap - start); |
296 | if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { | 296 | if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { |
297 | err = PTR_ERR(bh); | 297 | err = PTR_ERR(bh); |
@@ -357,7 +357,7 @@ static int verify_reserved_gdb(struct super_block *sb, | |||
357 | struct buffer_head *primary) | 357 | struct buffer_head *primary) |
358 | { | 358 | { |
359 | const ext4_fsblk_t blk = primary->b_blocknr; | 359 | const ext4_fsblk_t blk = primary->b_blocknr; |
360 | const unsigned long end = EXT4_SB(sb)->s_groups_count; | 360 | const ext4_group_t end = EXT4_SB(sb)->s_groups_count; |
361 | unsigned three = 1; | 361 | unsigned three = 1; |
362 | unsigned five = 5; | 362 | unsigned five = 5; |
363 | unsigned seven = 7; | 363 | unsigned seven = 7; |
@@ -656,12 +656,12 @@ static void update_backups(struct super_block *sb, | |||
656 | int blk_off, char *data, int size) | 656 | int blk_off, char *data, int size) |
657 | { | 657 | { |
658 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 658 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
659 | const unsigned long last = sbi->s_groups_count; | 659 | const ext4_group_t last = sbi->s_groups_count; |
660 | const int bpg = EXT4_BLOCKS_PER_GROUP(sb); | 660 | const int bpg = EXT4_BLOCKS_PER_GROUP(sb); |
661 | unsigned three = 1; | 661 | unsigned three = 1; |
662 | unsigned five = 5; | 662 | unsigned five = 5; |
663 | unsigned seven = 7; | 663 | unsigned seven = 7; |
664 | unsigned group; | 664 | ext4_group_t group; |
665 | int rest = sb->s_blocksize - size; | 665 | int rest = sb->s_blocksize - size; |
666 | handle_t *handle; | 666 | handle_t *handle; |
667 | int err = 0, err2; | 667 | int err = 0, err2; |
@@ -716,7 +716,7 @@ static void update_backups(struct super_block *sb, | |||
716 | exit_err: | 716 | exit_err: |
717 | if (err) { | 717 | if (err) { |
718 | ext4_warning(sb, __FUNCTION__, | 718 | ext4_warning(sb, __FUNCTION__, |
719 | "can't update backup for group %d (err %d), " | 719 | "can't update backup for group %lu (err %d), " |
720 | "forcing fsck on next reboot", group, err); | 720 | "forcing fsck on next reboot", group, err); |
721 | sbi->s_mount_state &= ~EXT4_VALID_FS; | 721 | sbi->s_mount_state &= ~EXT4_VALID_FS; |
722 | sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); | 722 | sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); |
@@ -952,7 +952,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
952 | ext4_fsblk_t n_blocks_count) | 952 | ext4_fsblk_t n_blocks_count) |
953 | { | 953 | { |
954 | ext4_fsblk_t o_blocks_count; | 954 | ext4_fsblk_t o_blocks_count; |
955 | unsigned long o_groups_count; | 955 | ext4_group_t o_groups_count; |
956 | ext4_grpblk_t last; | 956 | ext4_grpblk_t last; |
957 | ext4_grpblk_t add; | 957 | ext4_grpblk_t add; |
958 | struct buffer_head * bh; | 958 | struct buffer_head * bh; |
@@ -1054,7 +1054,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
1054 | ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); | 1054 | ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); |
1055 | sb->s_dirt = 1; | 1055 | sb->s_dirt = 1; |
1056 | unlock_super(sb); | 1056 | unlock_super(sb); |
1057 | ext4_debug("freeing blocks %lu through %llu\n", o_blocks_count, | 1057 | ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, |
1058 | o_blocks_count + add); | 1058 | o_blocks_count + add); |
1059 | ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); | 1059 | ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); |
1060 | ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, | 1060 | ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1ca0f546c466..055a0cd0168e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -373,6 +373,66 @@ void ext4_update_dynamic_rev(struct super_block *sb) | |||
373 | */ | 373 | */ |
374 | } | 374 | } |
375 | 375 | ||
376 | int ext4_update_compat_feature(handle_t *handle, | ||
377 | struct super_block *sb, __u32 compat) | ||
378 | { | ||
379 | int err = 0; | ||
380 | if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) { | ||
381 | err = ext4_journal_get_write_access(handle, | ||
382 | EXT4_SB(sb)->s_sbh); | ||
383 | if (err) | ||
384 | return err; | ||
385 | EXT4_SET_COMPAT_FEATURE(sb, compat); | ||
386 | sb->s_dirt = 1; | ||
387 | handle->h_sync = 1; | ||
388 | BUFFER_TRACE(EXT4_SB(sb)->s_sbh, | ||
389 | "call ext4_journal_dirty_met adata"); | ||
390 | err = ext4_journal_dirty_metadata(handle, | ||
391 | EXT4_SB(sb)->s_sbh); | ||
392 | } | ||
393 | return err; | ||
394 | } | ||
395 | |||
396 | int ext4_update_rocompat_feature(handle_t *handle, | ||
397 | struct super_block *sb, __u32 rocompat) | ||
398 | { | ||
399 | int err = 0; | ||
400 | if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) { | ||
401 | err = ext4_journal_get_write_access(handle, | ||
402 | EXT4_SB(sb)->s_sbh); | ||
403 | if (err) | ||
404 | return err; | ||
405 | EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat); | ||
406 | sb->s_dirt = 1; | ||
407 | handle->h_sync = 1; | ||
408 | BUFFER_TRACE(EXT4_SB(sb)->s_sbh, | ||
409 | "call ext4_journal_dirty_met adata"); | ||
410 | err = ext4_journal_dirty_metadata(handle, | ||
411 | EXT4_SB(sb)->s_sbh); | ||
412 | } | ||
413 | return err; | ||
414 | } | ||
415 | |||
416 | int ext4_update_incompat_feature(handle_t *handle, | ||
417 | struct super_block *sb, __u32 incompat) | ||
418 | { | ||
419 | int err = 0; | ||
420 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) { | ||
421 | err = ext4_journal_get_write_access(handle, | ||
422 | EXT4_SB(sb)->s_sbh); | ||
423 | if (err) | ||
424 | return err; | ||
425 | EXT4_SET_INCOMPAT_FEATURE(sb, incompat); | ||
426 | sb->s_dirt = 1; | ||
427 | handle->h_sync = 1; | ||
428 | BUFFER_TRACE(EXT4_SB(sb)->s_sbh, | ||
429 | "call ext4_journal_dirty_met adata"); | ||
430 | err = ext4_journal_dirty_metadata(handle, | ||
431 | EXT4_SB(sb)->s_sbh); | ||
432 | } | ||
433 | return err; | ||
434 | } | ||
435 | |||
376 | /* | 436 | /* |
377 | * Open the external journal device | 437 | * Open the external journal device |
378 | */ | 438 | */ |
@@ -443,6 +503,7 @@ static void ext4_put_super (struct super_block * sb) | |||
443 | struct ext4_super_block *es = sbi->s_es; | 503 | struct ext4_super_block *es = sbi->s_es; |
444 | int i; | 504 | int i; |
445 | 505 | ||
506 | ext4_mb_release(sb); | ||
446 | ext4_ext_release(sb); | 507 | ext4_ext_release(sb); |
447 | ext4_xattr_put_super(sb); | 508 | ext4_xattr_put_super(sb); |
448 | jbd2_journal_destroy(sbi->s_journal); | 509 | jbd2_journal_destroy(sbi->s_journal); |
@@ -509,6 +570,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
509 | ei->i_block_alloc_info = NULL; | 570 | ei->i_block_alloc_info = NULL; |
510 | ei->vfs_inode.i_version = 1; | 571 | ei->vfs_inode.i_version = 1; |
511 | memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); | 572 | memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); |
573 | INIT_LIST_HEAD(&ei->i_prealloc_list); | ||
574 | spin_lock_init(&ei->i_prealloc_lock); | ||
512 | return &ei->vfs_inode; | 575 | return &ei->vfs_inode; |
513 | } | 576 | } |
514 | 577 | ||
@@ -533,7 +596,7 @@ static void init_once(struct kmem_cache *cachep, void *foo) | |||
533 | #ifdef CONFIG_EXT4DEV_FS_XATTR | 596 | #ifdef CONFIG_EXT4DEV_FS_XATTR |
534 | init_rwsem(&ei->xattr_sem); | 597 | init_rwsem(&ei->xattr_sem); |
535 | #endif | 598 | #endif |
536 | mutex_init(&ei->truncate_mutex); | 599 | init_rwsem(&ei->i_data_sem); |
537 | inode_init_once(&ei->vfs_inode); | 600 | inode_init_once(&ei->vfs_inode); |
538 | } | 601 | } |
539 | 602 | ||
@@ -605,18 +668,20 @@ static inline void ext4_show_quota_options(struct seq_file *seq, struct super_bl | |||
605 | */ | 668 | */ |
606 | static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | 669 | static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) |
607 | { | 670 | { |
671 | int def_errors; | ||
672 | unsigned long def_mount_opts; | ||
608 | struct super_block *sb = vfs->mnt_sb; | 673 | struct super_block *sb = vfs->mnt_sb; |
609 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 674 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
610 | struct ext4_super_block *es = sbi->s_es; | 675 | struct ext4_super_block *es = sbi->s_es; |
611 | unsigned long def_mount_opts; | ||
612 | 676 | ||
613 | def_mount_opts = le32_to_cpu(es->s_default_mount_opts); | 677 | def_mount_opts = le32_to_cpu(es->s_default_mount_opts); |
678 | def_errors = le16_to_cpu(es->s_errors); | ||
614 | 679 | ||
615 | if (sbi->s_sb_block != 1) | 680 | if (sbi->s_sb_block != 1) |
616 | seq_printf(seq, ",sb=%llu", sbi->s_sb_block); | 681 | seq_printf(seq, ",sb=%llu", sbi->s_sb_block); |
617 | if (test_opt(sb, MINIX_DF)) | 682 | if (test_opt(sb, MINIX_DF)) |
618 | seq_puts(seq, ",minixdf"); | 683 | seq_puts(seq, ",minixdf"); |
619 | if (test_opt(sb, GRPID)) | 684 | if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS)) |
620 | seq_puts(seq, ",grpid"); | 685 | seq_puts(seq, ",grpid"); |
621 | if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS)) | 686 | if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS)) |
622 | seq_puts(seq, ",nogrpid"); | 687 | seq_puts(seq, ",nogrpid"); |
@@ -628,34 +693,33 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
628 | le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) { | 693 | le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) { |
629 | seq_printf(seq, ",resgid=%u", sbi->s_resgid); | 694 | seq_printf(seq, ",resgid=%u", sbi->s_resgid); |
630 | } | 695 | } |
631 | if (test_opt(sb, ERRORS_CONT)) { | 696 | if (test_opt(sb, ERRORS_RO)) { |
632 | int def_errors = le16_to_cpu(es->s_errors); | ||
633 | |||
634 | if (def_errors == EXT4_ERRORS_PANIC || | 697 | if (def_errors == EXT4_ERRORS_PANIC || |
635 | def_errors == EXT4_ERRORS_RO) { | 698 | def_errors == EXT4_ERRORS_CONTINUE) { |
636 | seq_puts(seq, ",errors=continue"); | 699 | seq_puts(seq, ",errors=remount-ro"); |
637 | } | 700 | } |
638 | } | 701 | } |
639 | if (test_opt(sb, ERRORS_RO)) | 702 | if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) |
640 | seq_puts(seq, ",errors=remount-ro"); | 703 | seq_puts(seq, ",errors=continue"); |
641 | if (test_opt(sb, ERRORS_PANIC)) | 704 | if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) |
642 | seq_puts(seq, ",errors=panic"); | 705 | seq_puts(seq, ",errors=panic"); |
643 | if (test_opt(sb, NO_UID32)) | 706 | if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16)) |
644 | seq_puts(seq, ",nouid32"); | 707 | seq_puts(seq, ",nouid32"); |
645 | if (test_opt(sb, DEBUG)) | 708 | if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) |
646 | seq_puts(seq, ",debug"); | 709 | seq_puts(seq, ",debug"); |
647 | if (test_opt(sb, OLDALLOC)) | 710 | if (test_opt(sb, OLDALLOC)) |
648 | seq_puts(seq, ",oldalloc"); | 711 | seq_puts(seq, ",oldalloc"); |
649 | #ifdef CONFIG_EXT4_FS_XATTR | 712 | #ifdef CONFIG_EXT4DEV_FS_XATTR |
650 | if (test_opt(sb, XATTR_USER)) | 713 | if (test_opt(sb, XATTR_USER) && |
714 | !(def_mount_opts & EXT4_DEFM_XATTR_USER)) | ||
651 | seq_puts(seq, ",user_xattr"); | 715 | seq_puts(seq, ",user_xattr"); |
652 | if (!test_opt(sb, XATTR_USER) && | 716 | if (!test_opt(sb, XATTR_USER) && |
653 | (def_mount_opts & EXT4_DEFM_XATTR_USER)) { | 717 | (def_mount_opts & EXT4_DEFM_XATTR_USER)) { |
654 | seq_puts(seq, ",nouser_xattr"); | 718 | seq_puts(seq, ",nouser_xattr"); |
655 | } | 719 | } |
656 | #endif | 720 | #endif |
657 | #ifdef CONFIG_EXT4_FS_POSIX_ACL | 721 | #ifdef CONFIG_EXT4DEV_FS_POSIX_ACL |
658 | if (test_opt(sb, POSIX_ACL)) | 722 | if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) |
659 | seq_puts(seq, ",acl"); | 723 | seq_puts(seq, ",acl"); |
660 | if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) | 724 | if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) |
661 | seq_puts(seq, ",noacl"); | 725 | seq_puts(seq, ",noacl"); |
@@ -672,7 +736,17 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
672 | seq_puts(seq, ",nobh"); | 736 | seq_puts(seq, ",nobh"); |
673 | if (!test_opt(sb, EXTENTS)) | 737 | if (!test_opt(sb, EXTENTS)) |
674 | seq_puts(seq, ",noextents"); | 738 | seq_puts(seq, ",noextents"); |
739 | if (!test_opt(sb, MBALLOC)) | ||
740 | seq_puts(seq, ",nomballoc"); | ||
741 | if (test_opt(sb, I_VERSION)) | ||
742 | seq_puts(seq, ",i_version"); | ||
675 | 743 | ||
744 | if (sbi->s_stripe) | ||
745 | seq_printf(seq, ",stripe=%lu", sbi->s_stripe); | ||
746 | /* | ||
747 | * journal mode get enabled in different ways | ||
748 | * So just print the value even if we didn't specify it | ||
749 | */ | ||
676 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) | 750 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) |
677 | seq_puts(seq, ",data=journal"); | 751 | seq_puts(seq, ",data=journal"); |
678 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) | 752 | else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) |
@@ -681,7 +755,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
681 | seq_puts(seq, ",data=writeback"); | 755 | seq_puts(seq, ",data=writeback"); |
682 | 756 | ||
683 | ext4_show_quota_options(seq, sb); | 757 | ext4_show_quota_options(seq, sb); |
684 | |||
685 | return 0; | 758 | return 0; |
686 | } | 759 | } |
687 | 760 | ||
@@ -809,11 +882,13 @@ enum { | |||
809 | Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, | 882 | Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, |
810 | Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, | 883 | Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, |
811 | Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, | 884 | Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, |
885 | Opt_journal_checksum, Opt_journal_async_commit, | ||
812 | Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, | 886 | Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, |
813 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, | 887 | Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, |
814 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, | 888 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, |
815 | Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, | 889 | Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, |
816 | Opt_grpquota, Opt_extents, Opt_noextents, | 890 | Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, |
891 | Opt_mballoc, Opt_nomballoc, Opt_stripe, | ||
817 | }; | 892 | }; |
818 | 893 | ||
819 | static match_table_t tokens = { | 894 | static match_table_t tokens = { |
@@ -848,6 +923,8 @@ static match_table_t tokens = { | |||
848 | {Opt_journal_update, "journal=update"}, | 923 | {Opt_journal_update, "journal=update"}, |
849 | {Opt_journal_inum, "journal=%u"}, | 924 | {Opt_journal_inum, "journal=%u"}, |
850 | {Opt_journal_dev, "journal_dev=%u"}, | 925 | {Opt_journal_dev, "journal_dev=%u"}, |
926 | {Opt_journal_checksum, "journal_checksum"}, | ||
927 | {Opt_journal_async_commit, "journal_async_commit"}, | ||
851 | {Opt_abort, "abort"}, | 928 | {Opt_abort, "abort"}, |
852 | {Opt_data_journal, "data=journal"}, | 929 | {Opt_data_journal, "data=journal"}, |
853 | {Opt_data_ordered, "data=ordered"}, | 930 | {Opt_data_ordered, "data=ordered"}, |
@@ -865,6 +942,10 @@ static match_table_t tokens = { | |||
865 | {Opt_barrier, "barrier=%u"}, | 942 | {Opt_barrier, "barrier=%u"}, |
866 | {Opt_extents, "extents"}, | 943 | {Opt_extents, "extents"}, |
867 | {Opt_noextents, "noextents"}, | 944 | {Opt_noextents, "noextents"}, |
945 | {Opt_i_version, "i_version"}, | ||
946 | {Opt_mballoc, "mballoc"}, | ||
947 | {Opt_nomballoc, "nomballoc"}, | ||
948 | {Opt_stripe, "stripe=%u"}, | ||
868 | {Opt_err, NULL}, | 949 | {Opt_err, NULL}, |
869 | {Opt_resize, "resize"}, | 950 | {Opt_resize, "resize"}, |
870 | }; | 951 | }; |
@@ -1035,6 +1116,13 @@ static int parse_options (char *options, struct super_block *sb, | |||
1035 | return 0; | 1116 | return 0; |
1036 | *journal_devnum = option; | 1117 | *journal_devnum = option; |
1037 | break; | 1118 | break; |
1119 | case Opt_journal_checksum: | ||
1120 | set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); | ||
1121 | break; | ||
1122 | case Opt_journal_async_commit: | ||
1123 | set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); | ||
1124 | set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); | ||
1125 | break; | ||
1038 | case Opt_noload: | 1126 | case Opt_noload: |
1039 | set_opt (sbi->s_mount_opt, NOLOAD); | 1127 | set_opt (sbi->s_mount_opt, NOLOAD); |
1040 | break; | 1128 | break; |
@@ -1203,6 +1291,23 @@ clear_qf_name: | |||
1203 | case Opt_noextents: | 1291 | case Opt_noextents: |
1204 | clear_opt (sbi->s_mount_opt, EXTENTS); | 1292 | clear_opt (sbi->s_mount_opt, EXTENTS); |
1205 | break; | 1293 | break; |
1294 | case Opt_i_version: | ||
1295 | set_opt(sbi->s_mount_opt, I_VERSION); | ||
1296 | sb->s_flags |= MS_I_VERSION; | ||
1297 | break; | ||
1298 | case Opt_mballoc: | ||
1299 | set_opt(sbi->s_mount_opt, MBALLOC); | ||
1300 | break; | ||
1301 | case Opt_nomballoc: | ||
1302 | clear_opt(sbi->s_mount_opt, MBALLOC); | ||
1303 | break; | ||
1304 | case Opt_stripe: | ||
1305 | if (match_int(&args[0], &option)) | ||
1306 | return 0; | ||
1307 | if (option < 0) | ||
1308 | return 0; | ||
1309 | sbi->s_stripe = option; | ||
1310 | break; | ||
1206 | default: | 1311 | default: |
1207 | printk (KERN_ERR | 1312 | printk (KERN_ERR |
1208 | "EXT4-fs: Unrecognized mount option \"%s\" " | 1313 | "EXT4-fs: Unrecognized mount option \"%s\" " |
@@ -1364,7 +1469,7 @@ static int ext4_check_descriptors (struct super_block * sb) | |||
1364 | struct ext4_group_desc * gdp = NULL; | 1469 | struct ext4_group_desc * gdp = NULL; |
1365 | int desc_block = 0; | 1470 | int desc_block = 0; |
1366 | int flexbg_flag = 0; | 1471 | int flexbg_flag = 0; |
1367 | int i; | 1472 | ext4_group_t i; |
1368 | 1473 | ||
1369 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) | 1474 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) |
1370 | flexbg_flag = 1; | 1475 | flexbg_flag = 1; |
@@ -1386,7 +1491,7 @@ static int ext4_check_descriptors (struct super_block * sb) | |||
1386 | if (block_bitmap < first_block || block_bitmap > last_block) | 1491 | if (block_bitmap < first_block || block_bitmap > last_block) |
1387 | { | 1492 | { |
1388 | ext4_error (sb, "ext4_check_descriptors", | 1493 | ext4_error (sb, "ext4_check_descriptors", |
1389 | "Block bitmap for group %d" | 1494 | "Block bitmap for group %lu" |
1390 | " not in group (block %llu)!", | 1495 | " not in group (block %llu)!", |
1391 | i, block_bitmap); | 1496 | i, block_bitmap); |
1392 | return 0; | 1497 | return 0; |
@@ -1395,7 +1500,7 @@ static int ext4_check_descriptors (struct super_block * sb) | |||
1395 | if (inode_bitmap < first_block || inode_bitmap > last_block) | 1500 | if (inode_bitmap < first_block || inode_bitmap > last_block) |
1396 | { | 1501 | { |
1397 | ext4_error (sb, "ext4_check_descriptors", | 1502 | ext4_error (sb, "ext4_check_descriptors", |
1398 | "Inode bitmap for group %d" | 1503 | "Inode bitmap for group %lu" |
1399 | " not in group (block %llu)!", | 1504 | " not in group (block %llu)!", |
1400 | i, inode_bitmap); | 1505 | i, inode_bitmap); |
1401 | return 0; | 1506 | return 0; |
@@ -1405,17 +1510,16 @@ static int ext4_check_descriptors (struct super_block * sb) | |||
1405 | inode_table + sbi->s_itb_per_group - 1 > last_block) | 1510 | inode_table + sbi->s_itb_per_group - 1 > last_block) |
1406 | { | 1511 | { |
1407 | ext4_error (sb, "ext4_check_descriptors", | 1512 | ext4_error (sb, "ext4_check_descriptors", |
1408 | "Inode table for group %d" | 1513 | "Inode table for group %lu" |
1409 | " not in group (block %llu)!", | 1514 | " not in group (block %llu)!", |
1410 | i, inode_table); | 1515 | i, inode_table); |
1411 | return 0; | 1516 | return 0; |
1412 | } | 1517 | } |
1413 | if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { | 1518 | if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { |
1414 | ext4_error(sb, __FUNCTION__, | 1519 | ext4_error(sb, __FUNCTION__, |
1415 | "Checksum for group %d failed (%u!=%u)\n", i, | 1520 | "Checksum for group %lu failed (%u!=%u)\n", |
1416 | le16_to_cpu(ext4_group_desc_csum(sbi, i, | 1521 | i, le16_to_cpu(ext4_group_desc_csum(sbi, i, |
1417 | gdp)), | 1522 | gdp)), le16_to_cpu(gdp->bg_checksum)); |
1418 | le16_to_cpu(gdp->bg_checksum)); | ||
1419 | return 0; | 1523 | return 0; |
1420 | } | 1524 | } |
1421 | if (!flexbg_flag) | 1525 | if (!flexbg_flag) |
@@ -1429,7 +1533,6 @@ static int ext4_check_descriptors (struct super_block * sb) | |||
1429 | return 1; | 1533 | return 1; |
1430 | } | 1534 | } |
1431 | 1535 | ||
1432 | |||
1433 | /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at | 1536 | /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at |
1434 | * the superblock) which were deleted from all directories, but held open by | 1537 | * the superblock) which were deleted from all directories, but held open by |
1435 | * a process at the time of a crash. We walk the list and try to delete these | 1538 | * a process at the time of a crash. We walk the list and try to delete these |
@@ -1542,20 +1645,95 @@ static void ext4_orphan_cleanup (struct super_block * sb, | |||
1542 | #endif | 1645 | #endif |
1543 | sb->s_flags = s_flags; /* Restore MS_RDONLY status */ | 1646 | sb->s_flags = s_flags; /* Restore MS_RDONLY status */ |
1544 | } | 1647 | } |
1648 | /* | ||
1649 | * Maximal extent format file size. | ||
1650 | * Resulting logical blkno at s_maxbytes must fit in our on-disk | ||
1651 | * extent format containers, within a sector_t, and within i_blocks | ||
1652 | * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, | ||
1653 | * so that won't be a limiting factor. | ||
1654 | * | ||
1655 | * Note, this does *not* consider any metadata overhead for vfs i_blocks. | ||
1656 | */ | ||
1657 | static loff_t ext4_max_size(int blkbits) | ||
1658 | { | ||
1659 | loff_t res; | ||
1660 | loff_t upper_limit = MAX_LFS_FILESIZE; | ||
1661 | |||
1662 | /* small i_blocks in vfs inode? */ | ||
1663 | if (sizeof(blkcnt_t) < sizeof(u64)) { | ||
1664 | /* | ||
1665 | * CONFIG_LSF is not enabled implies the inode | ||
1666 | * i_block represent total blocks in 512 bytes | ||
1667 | * 32 == size of vfs inode i_blocks * 8 | ||
1668 | */ | ||
1669 | upper_limit = (1LL << 32) - 1; | ||
1670 | |||
1671 | /* total blocks in file system block size */ | ||
1672 | upper_limit >>= (blkbits - 9); | ||
1673 | upper_limit <<= blkbits; | ||
1674 | } | ||
1675 | |||
1676 | /* 32-bit extent-start container, ee_block */ | ||
1677 | res = 1LL << 32; | ||
1678 | res <<= blkbits; | ||
1679 | res -= 1; | ||
1680 | |||
1681 | /* Sanity check against vm- & vfs- imposed limits */ | ||
1682 | if (res > upper_limit) | ||
1683 | res = upper_limit; | ||
1684 | |||
1685 | return res; | ||
1686 | } | ||
1545 | 1687 | ||
1546 | /* | 1688 | /* |
1547 | * Maximal file size. There is a direct, and {,double-,triple-}indirect | 1689 | * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect |
1548 | * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. | 1690 | * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. |
1549 | * We need to be 1 filesystem block less than the 2^32 sector limit. | 1691 | * We need to be 1 filesystem block less than the 2^48 sector limit. |
1550 | */ | 1692 | */ |
1551 | static loff_t ext4_max_size(int bits) | 1693 | static loff_t ext4_max_bitmap_size(int bits) |
1552 | { | 1694 | { |
1553 | loff_t res = EXT4_NDIR_BLOCKS; | 1695 | loff_t res = EXT4_NDIR_BLOCKS; |
1554 | /* This constant is calculated to be the largest file size for a | 1696 | int meta_blocks; |
1555 | * dense, 4k-blocksize file such that the total number of | 1697 | loff_t upper_limit; |
1698 | /* This is calculated to be the largest file size for a | ||
1699 | * dense, bitmapped file such that the total number of | ||
1556 | * sectors in the file, including data and all indirect blocks, | 1700 | * sectors in the file, including data and all indirect blocks, |
1557 | * does not exceed 2^32. */ | 1701 | * does not exceed 2^48 -1 |
1558 | const loff_t upper_limit = 0x1ff7fffd000LL; | 1702 | * __u32 i_blocks_lo and _u16 i_blocks_high representing the |
1703 | * total number of 512 bytes blocks of the file | ||
1704 | */ | ||
1705 | |||
1706 | if (sizeof(blkcnt_t) < sizeof(u64)) { | ||
1707 | /* | ||
1708 | * CONFIG_LSF is not enabled implies the inode | ||
1709 | * i_block represent total blocks in 512 bytes | ||
1710 | * 32 == size of vfs inode i_blocks * 8 | ||
1711 | */ | ||
1712 | upper_limit = (1LL << 32) - 1; | ||
1713 | |||
1714 | /* total blocks in file system block size */ | ||
1715 | upper_limit >>= (bits - 9); | ||
1716 | |||
1717 | } else { | ||
1718 | /* | ||
1719 | * We use 48 bit ext4_inode i_blocks | ||
1720 | * With EXT4_HUGE_FILE_FL set the i_blocks | ||
1721 | * represent total number of blocks in | ||
1722 | * file system block size | ||
1723 | */ | ||
1724 | upper_limit = (1LL << 48) - 1; | ||
1725 | |||
1726 | } | ||
1727 | |||
1728 | /* indirect blocks */ | ||
1729 | meta_blocks = 1; | ||
1730 | /* double indirect blocks */ | ||
1731 | meta_blocks += 1 + (1LL << (bits-2)); | ||
1732 | /* tripple indirect blocks */ | ||
1733 | meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); | ||
1734 | |||
1735 | upper_limit -= meta_blocks; | ||
1736 | upper_limit <<= bits; | ||
1559 | 1737 | ||
1560 | res += 1LL << (bits-2); | 1738 | res += 1LL << (bits-2); |
1561 | res += 1LL << (2*(bits-2)); | 1739 | res += 1LL << (2*(bits-2)); |
@@ -1563,6 +1741,10 @@ static loff_t ext4_max_size(int bits) | |||
1563 | res <<= bits; | 1741 | res <<= bits; |
1564 | if (res > upper_limit) | 1742 | if (res > upper_limit) |
1565 | res = upper_limit; | 1743 | res = upper_limit; |
1744 | |||
1745 | if (res > MAX_LFS_FILESIZE) | ||
1746 | res = MAX_LFS_FILESIZE; | ||
1747 | |||
1566 | return res; | 1748 | return res; |
1567 | } | 1749 | } |
1568 | 1750 | ||
@@ -1570,7 +1752,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb, | |||
1570 | ext4_fsblk_t logical_sb_block, int nr) | 1752 | ext4_fsblk_t logical_sb_block, int nr) |
1571 | { | 1753 | { |
1572 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1754 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1573 | unsigned long bg, first_meta_bg; | 1755 | ext4_group_t bg, first_meta_bg; |
1574 | int has_super = 0; | 1756 | int has_super = 0; |
1575 | 1757 | ||
1576 | first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); | 1758 | first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); |
@@ -1584,8 +1766,39 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb, | |||
1584 | return (has_super + ext4_group_first_block_no(sb, bg)); | 1766 | return (has_super + ext4_group_first_block_no(sb, bg)); |
1585 | } | 1767 | } |
1586 | 1768 | ||
1769 | /** | ||
1770 | * ext4_get_stripe_size: Get the stripe size. | ||
1771 | * @sbi: In memory super block info | ||
1772 | * | ||
1773 | * If we have specified it via mount option, then | ||
1774 | * use the mount option value. If the value specified at mount time is | ||
1775 | * greater than the blocks per group use the super block value. | ||
1776 | * If the super block value is greater than blocks per group return 0. | ||
1777 | * Allocator needs it be less than blocks per group. | ||
1778 | * | ||
1779 | */ | ||
1780 | static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) | ||
1781 | { | ||
1782 | unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); | ||
1783 | unsigned long stripe_width = | ||
1784 | le32_to_cpu(sbi->s_es->s_raid_stripe_width); | ||
1785 | |||
1786 | if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) | ||
1787 | return sbi->s_stripe; | ||
1788 | |||
1789 | if (stripe_width <= sbi->s_blocks_per_group) | ||
1790 | return stripe_width; | ||
1791 | |||
1792 | if (stride <= sbi->s_blocks_per_group) | ||
1793 | return stride; | ||
1794 | |||
1795 | return 0; | ||
1796 | } | ||
1587 | 1797 | ||
1588 | static int ext4_fill_super (struct super_block *sb, void *data, int silent) | 1798 | static int ext4_fill_super (struct super_block *sb, void *data, int silent) |
1799 | __releases(kernel_sem) | ||
1800 | __acquires(kernel_sem) | ||
1801 | |||
1589 | { | 1802 | { |
1590 | struct buffer_head * bh; | 1803 | struct buffer_head * bh; |
1591 | struct ext4_super_block *es = NULL; | 1804 | struct ext4_super_block *es = NULL; |
@@ -1599,7 +1812,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
1599 | unsigned long def_mount_opts; | 1812 | unsigned long def_mount_opts; |
1600 | struct inode *root; | 1813 | struct inode *root; |
1601 | int blocksize; | 1814 | int blocksize; |
1602 | int hblock; | ||
1603 | int db_count; | 1815 | int db_count; |
1604 | int i; | 1816 | int i; |
1605 | int needs_recovery; | 1817 | int needs_recovery; |
@@ -1624,6 +1836,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
1624 | goto out_fail; | 1836 | goto out_fail; |
1625 | } | 1837 | } |
1626 | 1838 | ||
1839 | if (!sb_set_blocksize(sb, blocksize)) { | ||
1840 | printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize); | ||
1841 | goto out_fail; | ||
1842 | } | ||
1843 | |||
1627 | /* | 1844 | /* |
1628 | * The ext4 superblock will not be buffer aligned for other than 1kB | 1845 | * The ext4 superblock will not be buffer aligned for other than 1kB |
1629 | * block sizes. We need to calculate the offset from buffer start. | 1846 | * block sizes. We need to calculate the offset from buffer start. |
@@ -1674,10 +1891,10 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
1674 | 1891 | ||
1675 | if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) | 1892 | if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) |
1676 | set_opt(sbi->s_mount_opt, ERRORS_PANIC); | 1893 | set_opt(sbi->s_mount_opt, ERRORS_PANIC); |
1677 | else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_RO) | 1894 | else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) |
1678 | set_opt(sbi->s_mount_opt, ERRORS_RO); | ||
1679 | else | ||
1680 | set_opt(sbi->s_mount_opt, ERRORS_CONT); | 1895 | set_opt(sbi->s_mount_opt, ERRORS_CONT); |
1896 | else | ||
1897 | set_opt(sbi->s_mount_opt, ERRORS_RO); | ||
1681 | 1898 | ||
1682 | sbi->s_resuid = le16_to_cpu(es->s_def_resuid); | 1899 | sbi->s_resuid = le16_to_cpu(es->s_def_resuid); |
1683 | sbi->s_resgid = le16_to_cpu(es->s_def_resgid); | 1900 | sbi->s_resgid = le16_to_cpu(es->s_def_resgid); |
@@ -1689,6 +1906,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
1689 | * User -o noextents to turn it off | 1906 | * User -o noextents to turn it off |
1690 | */ | 1907 | */ |
1691 | set_opt(sbi->s_mount_opt, EXTENTS); | 1908 | set_opt(sbi->s_mount_opt, EXTENTS); |
1909 | /* | ||
1910 | * turn on mballoc feature by default in ext4 filesystem | ||
1911 | * User -o nomballoc to turn it off | ||
1912 | */ | ||
1913 | set_opt(sbi->s_mount_opt, MBALLOC); | ||
1692 | 1914 | ||
1693 | if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, | 1915 | if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, |
1694 | NULL, 0)) | 1916 | NULL, 0)) |
@@ -1723,6 +1945,19 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
1723 | sb->s_id, le32_to_cpu(features)); | 1945 | sb->s_id, le32_to_cpu(features)); |
1724 | goto failed_mount; | 1946 | goto failed_mount; |
1725 | } | 1947 | } |
1948 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { | ||
1949 | /* | ||
1950 | * Large file size enabled file system can only be | ||
1951 | * mount if kernel is build with CONFIG_LSF | ||
1952 | */ | ||
1953 | if (sizeof(root->i_blocks) < sizeof(u64) && | ||
1954 | !(sb->s_flags & MS_RDONLY)) { | ||
1955 | printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge " | ||
1956 | "files cannot be mounted read-write " | ||
1957 | "without CONFIG_LSF.\n", sb->s_id); | ||
1958 | goto failed_mount; | ||
1959 | } | ||
1960 | } | ||
1726 | blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); | 1961 | blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); |
1727 | 1962 | ||
1728 | if (blocksize < EXT4_MIN_BLOCK_SIZE || | 1963 | if (blocksize < EXT4_MIN_BLOCK_SIZE || |
@@ -1733,20 +1968,16 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
1733 | goto failed_mount; | 1968 | goto failed_mount; |
1734 | } | 1969 | } |
1735 | 1970 | ||
1736 | hblock = bdev_hardsect_size(sb->s_bdev); | ||
1737 | if (sb->s_blocksize != blocksize) { | 1971 | if (sb->s_blocksize != blocksize) { |
1738 | /* | 1972 | |
1739 | * Make sure the blocksize for the filesystem is larger | 1973 | /* Validate the filesystem blocksize */ |
1740 | * than the hardware sectorsize for the machine. | 1974 | if (!sb_set_blocksize(sb, blocksize)) { |
1741 | */ | 1975 | printk(KERN_ERR "EXT4-fs: bad block size %d.\n", |
1742 | if (blocksize < hblock) { | 1976 | blocksize); |
1743 | printk(KERN_ERR "EXT4-fs: blocksize %d too small for " | ||
1744 | "device blocksize %d.\n", blocksize, hblock); | ||
1745 | goto failed_mount; | 1977 | goto failed_mount; |
1746 | } | 1978 | } |
1747 | 1979 | ||
1748 | brelse (bh); | 1980 | brelse (bh); |
1749 | sb_set_blocksize(sb, blocksize); | ||
1750 | logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; | 1981 | logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; |
1751 | offset = do_div(logical_sb_block, blocksize); | 1982 | offset = do_div(logical_sb_block, blocksize); |
1752 | bh = sb_bread(sb, logical_sb_block); | 1983 | bh = sb_bread(sb, logical_sb_block); |
@@ -1764,6 +1995,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
1764 | } | 1995 | } |
1765 | } | 1996 | } |
1766 | 1997 | ||
1998 | sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits); | ||
1767 | sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits); | 1999 | sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits); |
1768 | 2000 | ||
1769 | if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { | 2001 | if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { |
@@ -1838,6 +2070,17 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
1838 | 2070 | ||
1839 | if (EXT4_BLOCKS_PER_GROUP(sb) == 0) | 2071 | if (EXT4_BLOCKS_PER_GROUP(sb) == 0) |
1840 | goto cantfind_ext4; | 2072 | goto cantfind_ext4; |
2073 | |||
2074 | /* ensure blocks_count calculation below doesn't sign-extend */ | ||
2075 | if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) < | ||
2076 | le32_to_cpu(es->s_first_data_block) + 1) { | ||
2077 | printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, " | ||
2078 | "first data block %u, blocks per group %lu\n", | ||
2079 | ext4_blocks_count(es), | ||
2080 | le32_to_cpu(es->s_first_data_block), | ||
2081 | EXT4_BLOCKS_PER_GROUP(sb)); | ||
2082 | goto failed_mount; | ||
2083 | } | ||
1841 | blocks_count = (ext4_blocks_count(es) - | 2084 | blocks_count = (ext4_blocks_count(es) - |
1842 | le32_to_cpu(es->s_first_data_block) + | 2085 | le32_to_cpu(es->s_first_data_block) + |
1843 | EXT4_BLOCKS_PER_GROUP(sb) - 1); | 2086 | EXT4_BLOCKS_PER_GROUP(sb) - 1); |
@@ -1900,6 +2143,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
1900 | sbi->s_rsv_window_head.rsv_goal_size = 0; | 2143 | sbi->s_rsv_window_head.rsv_goal_size = 0; |
1901 | ext4_rsv_window_add(sb, &sbi->s_rsv_window_head); | 2144 | ext4_rsv_window_add(sb, &sbi->s_rsv_window_head); |
1902 | 2145 | ||
2146 | sbi->s_stripe = ext4_get_stripe_size(sbi); | ||
2147 | |||
1903 | /* | 2148 | /* |
1904 | * set up enough so that it can read an inode | 2149 | * set up enough so that it can read an inode |
1905 | */ | 2150 | */ |
@@ -1944,6 +2189,21 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
1944 | goto failed_mount4; | 2189 | goto failed_mount4; |
1945 | } | 2190 | } |
1946 | 2191 | ||
2192 | if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { | ||
2193 | jbd2_journal_set_features(sbi->s_journal, | ||
2194 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, | ||
2195 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); | ||
2196 | } else if (test_opt(sb, JOURNAL_CHECKSUM)) { | ||
2197 | jbd2_journal_set_features(sbi->s_journal, | ||
2198 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); | ||
2199 | jbd2_journal_clear_features(sbi->s_journal, 0, 0, | ||
2200 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); | ||
2201 | } else { | ||
2202 | jbd2_journal_clear_features(sbi->s_journal, | ||
2203 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, | ||
2204 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); | ||
2205 | } | ||
2206 | |||
1947 | /* We have now updated the journal if required, so we can | 2207 | /* We have now updated the journal if required, so we can |
1948 | * validate the data journaling mode. */ | 2208 | * validate the data journaling mode. */ |
1949 | switch (test_opt(sb, DATA_FLAGS)) { | 2209 | switch (test_opt(sb, DATA_FLAGS)) { |
@@ -2044,6 +2304,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
2044 | "writeback"); | 2304 | "writeback"); |
2045 | 2305 | ||
2046 | ext4_ext_init(sb); | 2306 | ext4_ext_init(sb); |
2307 | ext4_mb_init(sb, needs_recovery); | ||
2047 | 2308 | ||
2048 | lock_kernel(); | 2309 | lock_kernel(); |
2049 | return 0; | 2310 | return 0; |
@@ -2673,7 +2934,7 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf) | |||
2673 | if (test_opt(sb, MINIX_DF)) { | 2934 | if (test_opt(sb, MINIX_DF)) { |
2674 | sbi->s_overhead_last = 0; | 2935 | sbi->s_overhead_last = 0; |
2675 | } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { | 2936 | } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { |
2676 | unsigned long ngroups = sbi->s_groups_count, i; | 2937 | ext4_group_t ngroups = sbi->s_groups_count, i; |
2677 | ext4_fsblk_t overhead = 0; | 2938 | ext4_fsblk_t overhead = 0; |
2678 | smp_rmb(); | 2939 | smp_rmb(); |
2679 | 2940 | ||
@@ -2909,7 +3170,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, | |||
2909 | size_t len, loff_t off) | 3170 | size_t len, loff_t off) |
2910 | { | 3171 | { |
2911 | struct inode *inode = sb_dqopt(sb)->files[type]; | 3172 | struct inode *inode = sb_dqopt(sb)->files[type]; |
2912 | sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); | 3173 | ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); |
2913 | int err = 0; | 3174 | int err = 0; |
2914 | int offset = off & (sb->s_blocksize - 1); | 3175 | int offset = off & (sb->s_blocksize - 1); |
2915 | int tocopy; | 3176 | int tocopy; |
@@ -2947,7 +3208,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, | |||
2947 | const char *data, size_t len, loff_t off) | 3208 | const char *data, size_t len, loff_t off) |
2948 | { | 3209 | { |
2949 | struct inode *inode = sb_dqopt(sb)->files[type]; | 3210 | struct inode *inode = sb_dqopt(sb)->files[type]; |
2950 | sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); | 3211 | ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); |
2951 | int err = 0; | 3212 | int err = 0; |
2952 | int offset = off & (sb->s_blocksize - 1); | 3213 | int offset = off & (sb->s_blocksize - 1); |
2953 | int tocopy; | 3214 | int tocopy; |
@@ -3002,7 +3263,6 @@ out: | |||
3002 | i_size_write(inode, off+len-towrite); | 3263 | i_size_write(inode, off+len-towrite); |
3003 | EXT4_I(inode)->i_disksize = inode->i_size; | 3264 | EXT4_I(inode)->i_disksize = inode->i_size; |
3004 | } | 3265 | } |
3005 | inode->i_version++; | ||
3006 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 3266 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
3007 | ext4_mark_inode_dirty(handle, inode); | 3267 | ext4_mark_inode_dirty(handle, inode); |
3008 | mutex_unlock(&inode->i_mutex); | 3268 | mutex_unlock(&inode->i_mutex); |
@@ -3027,9 +3287,15 @@ static struct file_system_type ext4dev_fs_type = { | |||
3027 | 3287 | ||
3028 | static int __init init_ext4_fs(void) | 3288 | static int __init init_ext4_fs(void) |
3029 | { | 3289 | { |
3030 | int err = init_ext4_xattr(); | 3290 | int err; |
3291 | |||
3292 | err = init_ext4_mballoc(); | ||
3031 | if (err) | 3293 | if (err) |
3032 | return err; | 3294 | return err; |
3295 | |||
3296 | err = init_ext4_xattr(); | ||
3297 | if (err) | ||
3298 | goto out2; | ||
3033 | err = init_inodecache(); | 3299 | err = init_inodecache(); |
3034 | if (err) | 3300 | if (err) |
3035 | goto out1; | 3301 | goto out1; |
@@ -3041,6 +3307,8 @@ out: | |||
3041 | destroy_inodecache(); | 3307 | destroy_inodecache(); |
3042 | out1: | 3308 | out1: |
3043 | exit_ext4_xattr(); | 3309 | exit_ext4_xattr(); |
3310 | out2: | ||
3311 | exit_ext4_mballoc(); | ||
3044 | return err; | 3312 | return err; |
3045 | } | 3313 | } |
3046 | 3314 | ||
@@ -3049,6 +3317,7 @@ static void __exit exit_ext4_fs(void) | |||
3049 | unregister_filesystem(&ext4dev_fs_type); | 3317 | unregister_filesystem(&ext4dev_fs_type); |
3050 | destroy_inodecache(); | 3318 | destroy_inodecache(); |
3051 | exit_ext4_xattr(); | 3319 | exit_ext4_xattr(); |
3320 | exit_ext4_mballoc(); | ||
3052 | } | 3321 | } |
3053 | 3322 | ||
3054 | MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); | 3323 | MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); |
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 86387302c2a9..d7962139c010 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
@@ -480,7 +480,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, | |||
480 | ea_bdebug(bh, "refcount now=0; freeing"); | 480 | ea_bdebug(bh, "refcount now=0; freeing"); |
481 | if (ce) | 481 | if (ce) |
482 | mb_cache_entry_free(ce); | 482 | mb_cache_entry_free(ce); |
483 | ext4_free_blocks(handle, inode, bh->b_blocknr, 1); | 483 | ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1); |
484 | get_bh(bh); | 484 | get_bh(bh); |
485 | ext4_forget(handle, 1, inode, bh, bh->b_blocknr); | 485 | ext4_forget(handle, 1, inode, bh, bh->b_blocknr); |
486 | } else { | 486 | } else { |
@@ -821,7 +821,7 @@ inserted: | |||
821 | new_bh = sb_getblk(sb, block); | 821 | new_bh = sb_getblk(sb, block); |
822 | if (!new_bh) { | 822 | if (!new_bh) { |
823 | getblk_failed: | 823 | getblk_failed: |
824 | ext4_free_blocks(handle, inode, block, 1); | 824 | ext4_free_blocks(handle, inode, block, 1, 1); |
825 | error = -EIO; | 825 | error = -EIO; |
826 | goto cleanup; | 826 | goto cleanup; |
827 | } | 827 | } |
diff --git a/fs/inode.c b/fs/inode.c index ed35383d0b6c..276ffd6b6fdd 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -1276,6 +1276,11 @@ void file_update_time(struct file *file) | |||
1276 | sync_it = 1; | 1276 | sync_it = 1; |
1277 | } | 1277 | } |
1278 | 1278 | ||
1279 | if (IS_I_VERSION(inode)) { | ||
1280 | inode_inc_iversion(inode); | ||
1281 | sync_it = 1; | ||
1282 | } | ||
1283 | |||
1279 | if (sync_it) | 1284 | if (sync_it) |
1280 | mark_inode_dirty_sync(inode); | 1285 | mark_inode_dirty_sync(inode); |
1281 | } | 1286 | } |
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 3fccde7ba008..1b7f282c1ae9 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c | |||
@@ -232,7 +232,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) | |||
232 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it | 232 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it |
233 | */ | 233 | */ |
234 | static int __process_buffer(journal_t *journal, struct journal_head *jh, | 234 | static int __process_buffer(journal_t *journal, struct journal_head *jh, |
235 | struct buffer_head **bhs, int *batch_count) | 235 | struct buffer_head **bhs, int *batch_count, |
236 | transaction_t *transaction) | ||
236 | { | 237 | { |
237 | struct buffer_head *bh = jh2bh(jh); | 238 | struct buffer_head *bh = jh2bh(jh); |
238 | int ret = 0; | 239 | int ret = 0; |
@@ -250,6 +251,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, | |||
250 | transaction_t *t = jh->b_transaction; | 251 | transaction_t *t = jh->b_transaction; |
251 | tid_t tid = t->t_tid; | 252 | tid_t tid = t->t_tid; |
252 | 253 | ||
254 | transaction->t_chp_stats.cs_forced_to_close++; | ||
253 | spin_unlock(&journal->j_list_lock); | 255 | spin_unlock(&journal->j_list_lock); |
254 | jbd_unlock_bh_state(bh); | 256 | jbd_unlock_bh_state(bh); |
255 | jbd2_log_start_commit(journal, tid); | 257 | jbd2_log_start_commit(journal, tid); |
@@ -279,6 +281,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, | |||
279 | bhs[*batch_count] = bh; | 281 | bhs[*batch_count] = bh; |
280 | __buffer_relink_io(jh); | 282 | __buffer_relink_io(jh); |
281 | jbd_unlock_bh_state(bh); | 283 | jbd_unlock_bh_state(bh); |
284 | transaction->t_chp_stats.cs_written++; | ||
282 | (*batch_count)++; | 285 | (*batch_count)++; |
283 | if (*batch_count == NR_BATCH) { | 286 | if (*batch_count == NR_BATCH) { |
284 | spin_unlock(&journal->j_list_lock); | 287 | spin_unlock(&journal->j_list_lock); |
@@ -322,6 +325,8 @@ int jbd2_log_do_checkpoint(journal_t *journal) | |||
322 | if (!journal->j_checkpoint_transactions) | 325 | if (!journal->j_checkpoint_transactions) |
323 | goto out; | 326 | goto out; |
324 | transaction = journal->j_checkpoint_transactions; | 327 | transaction = journal->j_checkpoint_transactions; |
328 | if (transaction->t_chp_stats.cs_chp_time == 0) | ||
329 | transaction->t_chp_stats.cs_chp_time = jiffies; | ||
325 | this_tid = transaction->t_tid; | 330 | this_tid = transaction->t_tid; |
326 | restart: | 331 | restart: |
327 | /* | 332 | /* |
@@ -346,7 +351,8 @@ restart: | |||
346 | retry = 1; | 351 | retry = 1; |
347 | break; | 352 | break; |
348 | } | 353 | } |
349 | retry = __process_buffer(journal, jh, bhs,&batch_count); | 354 | retry = __process_buffer(journal, jh, bhs, &batch_count, |
355 | transaction); | ||
350 | if (!retry && lock_need_resched(&journal->j_list_lock)){ | 356 | if (!retry && lock_need_resched(&journal->j_list_lock)){ |
351 | spin_unlock(&journal->j_list_lock); | 357 | spin_unlock(&journal->j_list_lock); |
352 | retry = 1; | 358 | retry = 1; |
@@ -602,15 +608,15 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) | |||
602 | 608 | ||
603 | /* | 609 | /* |
604 | * There is one special case to worry about: if we have just pulled the | 610 | * There is one special case to worry about: if we have just pulled the |
605 | * buffer off a committing transaction's forget list, then even if the | 611 | * buffer off a running or committing transaction's checkpoing list, |
606 | * checkpoint list is empty, the transaction obviously cannot be | 612 | * then even if the checkpoint list is empty, the transaction obviously |
607 | * dropped! | 613 | * cannot be dropped! |
608 | * | 614 | * |
609 | * The locking here around j_committing_transaction is a bit sleazy. | 615 | * The locking here around t_state is a bit sleazy. |
610 | * See the comment at the end of jbd2_journal_commit_transaction(). | 616 | * See the comment at the end of jbd2_journal_commit_transaction(). |
611 | */ | 617 | */ |
612 | if (transaction == journal->j_committing_transaction) { | 618 | if (transaction->t_state != T_FINISHED) { |
613 | JBUFFER_TRACE(jh, "belongs to committing transaction"); | 619 | JBUFFER_TRACE(jh, "belongs to running/committing transaction"); |
614 | goto out; | 620 | goto out; |
615 | } | 621 | } |
616 | 622 | ||
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6986f334c643..da8d0eb3b7b9 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -20,6 +20,8 @@ | |||
20 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/jiffies.h> | ||
24 | #include <linux/crc32.h> | ||
23 | 25 | ||
24 | /* | 26 | /* |
25 | * Default IO end handler for temporary BJ_IO buffer_heads. | 27 | * Default IO end handler for temporary BJ_IO buffer_heads. |
@@ -92,19 +94,23 @@ static int inverted_lock(journal_t *journal, struct buffer_head *bh) | |||
92 | return 1; | 94 | return 1; |
93 | } | 95 | } |
94 | 96 | ||
95 | /* Done it all: now write the commit record. We should have | 97 | /* |
98 | * Done it all: now submit the commit record. We should have | ||
96 | * cleaned up our previous buffers by now, so if we are in abort | 99 | * cleaned up our previous buffers by now, so if we are in abort |
97 | * mode we can now just skip the rest of the journal write | 100 | * mode we can now just skip the rest of the journal write |
98 | * entirely. | 101 | * entirely. |
99 | * | 102 | * |
100 | * Returns 1 if the journal needs to be aborted or 0 on success | 103 | * Returns 1 if the journal needs to be aborted or 0 on success |
101 | */ | 104 | */ |
102 | static int journal_write_commit_record(journal_t *journal, | 105 | static int journal_submit_commit_record(journal_t *journal, |
103 | transaction_t *commit_transaction) | 106 | transaction_t *commit_transaction, |
107 | struct buffer_head **cbh, | ||
108 | __u32 crc32_sum) | ||
104 | { | 109 | { |
105 | struct journal_head *descriptor; | 110 | struct journal_head *descriptor; |
111 | struct commit_header *tmp; | ||
106 | struct buffer_head *bh; | 112 | struct buffer_head *bh; |
107 | int i, ret; | 113 | int ret; |
108 | int barrier_done = 0; | 114 | int barrier_done = 0; |
109 | 115 | ||
110 | if (is_journal_aborted(journal)) | 116 | if (is_journal_aborted(journal)) |
@@ -116,21 +122,33 @@ static int journal_write_commit_record(journal_t *journal, | |||
116 | 122 | ||
117 | bh = jh2bh(descriptor); | 123 | bh = jh2bh(descriptor); |
118 | 124 | ||
119 | /* AKPM: buglet - add `i' to tmp! */ | 125 | tmp = (struct commit_header *)bh->b_data; |
120 | for (i = 0; i < bh->b_size; i += 512) { | 126 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
121 | journal_header_t *tmp = (journal_header_t*)bh->b_data; | 127 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); |
122 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 128 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
123 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); | 129 | |
124 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 130 | if (JBD2_HAS_COMPAT_FEATURE(journal, |
131 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | ||
132 | tmp->h_chksum_type = JBD2_CRC32_CHKSUM; | ||
133 | tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; | ||
134 | tmp->h_chksum[0] = cpu_to_be32(crc32_sum); | ||
125 | } | 135 | } |
126 | 136 | ||
127 | JBUFFER_TRACE(descriptor, "write commit block"); | 137 | JBUFFER_TRACE(descriptor, "submit commit block"); |
138 | lock_buffer(bh); | ||
139 | |||
128 | set_buffer_dirty(bh); | 140 | set_buffer_dirty(bh); |
129 | if (journal->j_flags & JBD2_BARRIER) { | 141 | set_buffer_uptodate(bh); |
142 | bh->b_end_io = journal_end_buffer_io_sync; | ||
143 | |||
144 | if (journal->j_flags & JBD2_BARRIER && | ||
145 | !JBD2_HAS_COMPAT_FEATURE(journal, | ||
146 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | ||
130 | set_buffer_ordered(bh); | 147 | set_buffer_ordered(bh); |
131 | barrier_done = 1; | 148 | barrier_done = 1; |
132 | } | 149 | } |
133 | ret = sync_dirty_buffer(bh); | 150 | ret = submit_bh(WRITE, bh); |
151 | |||
134 | /* is it possible for another commit to fail at roughly | 152 | /* is it possible for another commit to fail at roughly |
135 | * the same time as this one? If so, we don't want to | 153 | * the same time as this one? If so, we don't want to |
136 | * trust the barrier flag in the super, but instead want | 154 | * trust the barrier flag in the super, but instead want |
@@ -151,14 +169,72 @@ static int journal_write_commit_record(journal_t *journal, | |||
151 | clear_buffer_ordered(bh); | 169 | clear_buffer_ordered(bh); |
152 | set_buffer_uptodate(bh); | 170 | set_buffer_uptodate(bh); |
153 | set_buffer_dirty(bh); | 171 | set_buffer_dirty(bh); |
154 | ret = sync_dirty_buffer(bh); | 172 | ret = submit_bh(WRITE, bh); |
155 | } | 173 | } |
156 | put_bh(bh); /* One for getblk() */ | 174 | *cbh = bh; |
157 | jbd2_journal_put_journal_head(descriptor); | 175 | return ret; |
176 | } | ||
177 | |||
178 | /* | ||
179 | * This function along with journal_submit_commit_record | ||
180 | * allows to write the commit record asynchronously. | ||
181 | */ | ||
182 | static int journal_wait_on_commit_record(struct buffer_head *bh) | ||
183 | { | ||
184 | int ret = 0; | ||
185 | |||
186 | clear_buffer_dirty(bh); | ||
187 | wait_on_buffer(bh); | ||
188 | |||
189 | if (unlikely(!buffer_uptodate(bh))) | ||
190 | ret = -EIO; | ||
191 | put_bh(bh); /* One for getblk() */ | ||
192 | jbd2_journal_put_journal_head(bh2jh(bh)); | ||
158 | 193 | ||
159 | return (ret == -EIO); | 194 | return ret; |
160 | } | 195 | } |
161 | 196 | ||
197 | /* | ||
198 | * Wait for all submitted IO to complete. | ||
199 | */ | ||
200 | static int journal_wait_on_locked_list(journal_t *journal, | ||
201 | transaction_t *commit_transaction) | ||
202 | { | ||
203 | int ret = 0; | ||
204 | struct journal_head *jh; | ||
205 | |||
206 | while (commit_transaction->t_locked_list) { | ||
207 | struct buffer_head *bh; | ||
208 | |||
209 | jh = commit_transaction->t_locked_list->b_tprev; | ||
210 | bh = jh2bh(jh); | ||
211 | get_bh(bh); | ||
212 | if (buffer_locked(bh)) { | ||
213 | spin_unlock(&journal->j_list_lock); | ||
214 | wait_on_buffer(bh); | ||
215 | if (unlikely(!buffer_uptodate(bh))) | ||
216 | ret = -EIO; | ||
217 | spin_lock(&journal->j_list_lock); | ||
218 | } | ||
219 | if (!inverted_lock(journal, bh)) { | ||
220 | put_bh(bh); | ||
221 | spin_lock(&journal->j_list_lock); | ||
222 | continue; | ||
223 | } | ||
224 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { | ||
225 | __jbd2_journal_unfile_buffer(jh); | ||
226 | jbd_unlock_bh_state(bh); | ||
227 | jbd2_journal_remove_journal_head(bh); | ||
228 | put_bh(bh); | ||
229 | } else { | ||
230 | jbd_unlock_bh_state(bh); | ||
231 | } | ||
232 | put_bh(bh); | ||
233 | cond_resched_lock(&journal->j_list_lock); | ||
234 | } | ||
235 | return ret; | ||
236 | } | ||
237 | |||
162 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) | 238 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) |
163 | { | 239 | { |
164 | int i; | 240 | int i; |
@@ -274,7 +350,21 @@ write_out_data: | |||
274 | journal_do_submit_data(wbuf, bufs); | 350 | journal_do_submit_data(wbuf, bufs); |
275 | } | 351 | } |
276 | 352 | ||
277 | static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag, | 353 | static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) |
354 | { | ||
355 | struct page *page = bh->b_page; | ||
356 | char *addr; | ||
357 | __u32 checksum; | ||
358 | |||
359 | addr = kmap_atomic(page, KM_USER0); | ||
360 | checksum = crc32_be(crc32_sum, | ||
361 | (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); | ||
362 | kunmap_atomic(addr, KM_USER0); | ||
363 | |||
364 | return checksum; | ||
365 | } | ||
366 | |||
367 | static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, | ||
278 | unsigned long long block) | 368 | unsigned long long block) |
279 | { | 369 | { |
280 | tag->t_blocknr = cpu_to_be32(block & (u32)~0); | 370 | tag->t_blocknr = cpu_to_be32(block & (u32)~0); |
@@ -290,6 +380,7 @@ static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag, | |||
290 | */ | 380 | */ |
291 | void jbd2_journal_commit_transaction(journal_t *journal) | 381 | void jbd2_journal_commit_transaction(journal_t *journal) |
292 | { | 382 | { |
383 | struct transaction_stats_s stats; | ||
293 | transaction_t *commit_transaction; | 384 | transaction_t *commit_transaction; |
294 | struct journal_head *jh, *new_jh, *descriptor; | 385 | struct journal_head *jh, *new_jh, *descriptor; |
295 | struct buffer_head **wbuf = journal->j_wbuf; | 386 | struct buffer_head **wbuf = journal->j_wbuf; |
@@ -305,6 +396,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
305 | int tag_flag; | 396 | int tag_flag; |
306 | int i; | 397 | int i; |
307 | int tag_bytes = journal_tag_bytes(journal); | 398 | int tag_bytes = journal_tag_bytes(journal); |
399 | struct buffer_head *cbh = NULL; /* For transactional checksums */ | ||
400 | __u32 crc32_sum = ~0; | ||
308 | 401 | ||
309 | /* | 402 | /* |
310 | * First job: lock down the current transaction and wait for | 403 | * First job: lock down the current transaction and wait for |
@@ -337,6 +430,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
337 | spin_lock(&journal->j_state_lock); | 430 | spin_lock(&journal->j_state_lock); |
338 | commit_transaction->t_state = T_LOCKED; | 431 | commit_transaction->t_state = T_LOCKED; |
339 | 432 | ||
433 | stats.u.run.rs_wait = commit_transaction->t_max_wait; | ||
434 | stats.u.run.rs_locked = jiffies; | ||
435 | stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, | ||
436 | stats.u.run.rs_locked); | ||
437 | |||
340 | spin_lock(&commit_transaction->t_handle_lock); | 438 | spin_lock(&commit_transaction->t_handle_lock); |
341 | while (commit_transaction->t_updates) { | 439 | while (commit_transaction->t_updates) { |
342 | DEFINE_WAIT(wait); | 440 | DEFINE_WAIT(wait); |
@@ -407,6 +505,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
407 | */ | 505 | */ |
408 | jbd2_journal_switch_revoke_table(journal); | 506 | jbd2_journal_switch_revoke_table(journal); |
409 | 507 | ||
508 | stats.u.run.rs_flushing = jiffies; | ||
509 | stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, | ||
510 | stats.u.run.rs_flushing); | ||
511 | |||
410 | commit_transaction->t_state = T_FLUSH; | 512 | commit_transaction->t_state = T_FLUSH; |
411 | journal->j_committing_transaction = commit_transaction; | 513 | journal->j_committing_transaction = commit_transaction; |
412 | journal->j_running_transaction = NULL; | 514 | journal->j_running_transaction = NULL; |
@@ -440,38 +542,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
440 | journal_submit_data_buffers(journal, commit_transaction); | 542 | journal_submit_data_buffers(journal, commit_transaction); |
441 | 543 | ||
442 | /* | 544 | /* |
443 | * Wait for all previously submitted IO to complete. | 545 | * Wait for all previously submitted IO to complete if commit |
546 | * record is to be written synchronously. | ||
444 | */ | 547 | */ |
445 | spin_lock(&journal->j_list_lock); | 548 | spin_lock(&journal->j_list_lock); |
446 | while (commit_transaction->t_locked_list) { | 549 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
447 | struct buffer_head *bh; | 550 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) |
551 | err = journal_wait_on_locked_list(journal, | ||
552 | commit_transaction); | ||
448 | 553 | ||
449 | jh = commit_transaction->t_locked_list->b_tprev; | ||
450 | bh = jh2bh(jh); | ||
451 | get_bh(bh); | ||
452 | if (buffer_locked(bh)) { | ||
453 | spin_unlock(&journal->j_list_lock); | ||
454 | wait_on_buffer(bh); | ||
455 | if (unlikely(!buffer_uptodate(bh))) | ||
456 | err = -EIO; | ||
457 | spin_lock(&journal->j_list_lock); | ||
458 | } | ||
459 | if (!inverted_lock(journal, bh)) { | ||
460 | put_bh(bh); | ||
461 | spin_lock(&journal->j_list_lock); | ||
462 | continue; | ||
463 | } | ||
464 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { | ||
465 | __jbd2_journal_unfile_buffer(jh); | ||
466 | jbd_unlock_bh_state(bh); | ||
467 | jbd2_journal_remove_journal_head(bh); | ||
468 | put_bh(bh); | ||
469 | } else { | ||
470 | jbd_unlock_bh_state(bh); | ||
471 | } | ||
472 | put_bh(bh); | ||
473 | cond_resched_lock(&journal->j_list_lock); | ||
474 | } | ||
475 | spin_unlock(&journal->j_list_lock); | 554 | spin_unlock(&journal->j_list_lock); |
476 | 555 | ||
477 | if (err) | 556 | if (err) |
@@ -498,6 +577,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
498 | */ | 577 | */ |
499 | commit_transaction->t_state = T_COMMIT; | 578 | commit_transaction->t_state = T_COMMIT; |
500 | 579 | ||
580 | stats.u.run.rs_logging = jiffies; | ||
581 | stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, | ||
582 | stats.u.run.rs_logging); | ||
583 | stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; | ||
584 | stats.u.run.rs_blocks_logged = 0; | ||
585 | |||
501 | descriptor = NULL; | 586 | descriptor = NULL; |
502 | bufs = 0; | 587 | bufs = 0; |
503 | while (commit_transaction->t_buffers) { | 588 | while (commit_transaction->t_buffers) { |
@@ -639,6 +724,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
639 | start_journal_io: | 724 | start_journal_io: |
640 | for (i = 0; i < bufs; i++) { | 725 | for (i = 0; i < bufs; i++) { |
641 | struct buffer_head *bh = wbuf[i]; | 726 | struct buffer_head *bh = wbuf[i]; |
727 | /* | ||
728 | * Compute checksum. | ||
729 | */ | ||
730 | if (JBD2_HAS_COMPAT_FEATURE(journal, | ||
731 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | ||
732 | crc32_sum = | ||
733 | jbd2_checksum_data(crc32_sum, bh); | ||
734 | } | ||
735 | |||
642 | lock_buffer(bh); | 736 | lock_buffer(bh); |
643 | clear_buffer_dirty(bh); | 737 | clear_buffer_dirty(bh); |
644 | set_buffer_uptodate(bh); | 738 | set_buffer_uptodate(bh); |
@@ -646,6 +740,7 @@ start_journal_io: | |||
646 | submit_bh(WRITE, bh); | 740 | submit_bh(WRITE, bh); |
647 | } | 741 | } |
648 | cond_resched(); | 742 | cond_resched(); |
743 | stats.u.run.rs_blocks_logged += bufs; | ||
649 | 744 | ||
650 | /* Force a new descriptor to be generated next | 745 | /* Force a new descriptor to be generated next |
651 | time round the loop. */ | 746 | time round the loop. */ |
@@ -654,6 +749,23 @@ start_journal_io: | |||
654 | } | 749 | } |
655 | } | 750 | } |
656 | 751 | ||
752 | /* Done it all: now write the commit record asynchronously. */ | ||
753 | |||
754 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, | ||
755 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | ||
756 | err = journal_submit_commit_record(journal, commit_transaction, | ||
757 | &cbh, crc32_sum); | ||
758 | if (err) | ||
759 | __jbd2_journal_abort_hard(journal); | ||
760 | |||
761 | spin_lock(&journal->j_list_lock); | ||
762 | err = journal_wait_on_locked_list(journal, | ||
763 | commit_transaction); | ||
764 | spin_unlock(&journal->j_list_lock); | ||
765 | if (err) | ||
766 | __jbd2_journal_abort_hard(journal); | ||
767 | } | ||
768 | |||
657 | /* Lo and behold: we have just managed to send a transaction to | 769 | /* Lo and behold: we have just managed to send a transaction to |
658 | the log. Before we can commit it, wait for the IO so far to | 770 | the log. Before we can commit it, wait for the IO so far to |
659 | complete. Control buffers being written are on the | 771 | complete. Control buffers being written are on the |
@@ -753,8 +865,14 @@ wait_for_iobuf: | |||
753 | 865 | ||
754 | jbd_debug(3, "JBD: commit phase 6\n"); | 866 | jbd_debug(3, "JBD: commit phase 6\n"); |
755 | 867 | ||
756 | if (journal_write_commit_record(journal, commit_transaction)) | 868 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
757 | err = -EIO; | 869 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
870 | err = journal_submit_commit_record(journal, commit_transaction, | ||
871 | &cbh, crc32_sum); | ||
872 | if (err) | ||
873 | __jbd2_journal_abort_hard(journal); | ||
874 | } | ||
875 | err = journal_wait_on_commit_record(cbh); | ||
758 | 876 | ||
759 | if (err) | 877 | if (err) |
760 | jbd2_journal_abort(journal, err); | 878 | jbd2_journal_abort(journal, err); |
@@ -816,6 +934,7 @@ restart_loop: | |||
816 | cp_transaction = jh->b_cp_transaction; | 934 | cp_transaction = jh->b_cp_transaction; |
817 | if (cp_transaction) { | 935 | if (cp_transaction) { |
818 | JBUFFER_TRACE(jh, "remove from old cp transaction"); | 936 | JBUFFER_TRACE(jh, "remove from old cp transaction"); |
937 | cp_transaction->t_chp_stats.cs_dropped++; | ||
819 | __jbd2_journal_remove_checkpoint(jh); | 938 | __jbd2_journal_remove_checkpoint(jh); |
820 | } | 939 | } |
821 | 940 | ||
@@ -867,10 +986,10 @@ restart_loop: | |||
867 | } | 986 | } |
868 | spin_unlock(&journal->j_list_lock); | 987 | spin_unlock(&journal->j_list_lock); |
869 | /* | 988 | /* |
870 | * This is a bit sleazy. We borrow j_list_lock to protect | 989 | * This is a bit sleazy. We use j_list_lock to protect transition |
871 | * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint. | 990 | * of a transaction into T_FINISHED state and calling |
872 | * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but | 991 | * __jbd2_journal_drop_transaction(). Otherwise we could race with |
873 | * it's a bit hassle to hold that across __jbd2_journal_remove_checkpoint | 992 | * other checkpointing code processing the transaction... |
874 | */ | 993 | */ |
875 | spin_lock(&journal->j_state_lock); | 994 | spin_lock(&journal->j_state_lock); |
876 | spin_lock(&journal->j_list_lock); | 995 | spin_lock(&journal->j_list_lock); |
@@ -890,6 +1009,36 @@ restart_loop: | |||
890 | 1009 | ||
891 | J_ASSERT(commit_transaction->t_state == T_COMMIT); | 1010 | J_ASSERT(commit_transaction->t_state == T_COMMIT); |
892 | 1011 | ||
1012 | commit_transaction->t_start = jiffies; | ||
1013 | stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, | ||
1014 | commit_transaction->t_start); | ||
1015 | |||
1016 | /* | ||
1017 | * File the transaction for history | ||
1018 | */ | ||
1019 | stats.ts_type = JBD2_STATS_RUN; | ||
1020 | stats.ts_tid = commit_transaction->t_tid; | ||
1021 | stats.u.run.rs_handle_count = commit_transaction->t_handle_count; | ||
1022 | spin_lock(&journal->j_history_lock); | ||
1023 | memcpy(journal->j_history + journal->j_history_cur, &stats, | ||
1024 | sizeof(stats)); | ||
1025 | if (++journal->j_history_cur == journal->j_history_max) | ||
1026 | journal->j_history_cur = 0; | ||
1027 | |||
1028 | /* | ||
1029 | * Calculate overall stats | ||
1030 | */ | ||
1031 | journal->j_stats.ts_tid++; | ||
1032 | journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; | ||
1033 | journal->j_stats.u.run.rs_running += stats.u.run.rs_running; | ||
1034 | journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; | ||
1035 | journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; | ||
1036 | journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; | ||
1037 | journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; | ||
1038 | journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; | ||
1039 | journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; | ||
1040 | spin_unlock(&journal->j_history_lock); | ||
1041 | |||
893 | commit_transaction->t_state = T_FINISHED; | 1042 | commit_transaction->t_state = T_FINISHED; |
894 | J_ASSERT(commit_transaction == journal->j_committing_transaction); | 1043 | J_ASSERT(commit_transaction == journal->j_committing_transaction); |
895 | journal->j_commit_sequence = commit_transaction->t_tid; | 1044 | journal->j_commit_sequence = commit_transaction->t_tid; |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 6ddc5531587c..96ba846992e9 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/poison.h> | 36 | #include <linux/poison.h> |
37 | #include <linux/proc_fs.h> | 37 | #include <linux/proc_fs.h> |
38 | #include <linux/debugfs.h> | 38 | #include <linux/debugfs.h> |
39 | #include <linux/seq_file.h> | ||
39 | 40 | ||
40 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
41 | #include <asm/page.h> | 42 | #include <asm/page.h> |
@@ -640,6 +641,312 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) | |||
640 | return jbd2_journal_add_journal_head(bh); | 641 | return jbd2_journal_add_journal_head(bh); |
641 | } | 642 | } |
642 | 643 | ||
644 | struct jbd2_stats_proc_session { | ||
645 | journal_t *journal; | ||
646 | struct transaction_stats_s *stats; | ||
647 | int start; | ||
648 | int max; | ||
649 | }; | ||
650 | |||
651 | static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s, | ||
652 | struct transaction_stats_s *ts, | ||
653 | int first) | ||
654 | { | ||
655 | if (ts == s->stats + s->max) | ||
656 | ts = s->stats; | ||
657 | if (!first && ts == s->stats + s->start) | ||
658 | return NULL; | ||
659 | while (ts->ts_type == 0) { | ||
660 | ts++; | ||
661 | if (ts == s->stats + s->max) | ||
662 | ts = s->stats; | ||
663 | if (ts == s->stats + s->start) | ||
664 | return NULL; | ||
665 | } | ||
666 | return ts; | ||
667 | |||
668 | } | ||
669 | |||
670 | static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos) | ||
671 | { | ||
672 | struct jbd2_stats_proc_session *s = seq->private; | ||
673 | struct transaction_stats_s *ts; | ||
674 | int l = *pos; | ||
675 | |||
676 | if (l == 0) | ||
677 | return SEQ_START_TOKEN; | ||
678 | ts = jbd2_history_skip_empty(s, s->stats + s->start, 1); | ||
679 | if (!ts) | ||
680 | return NULL; | ||
681 | l--; | ||
682 | while (l) { | ||
683 | ts = jbd2_history_skip_empty(s, ++ts, 0); | ||
684 | if (!ts) | ||
685 | break; | ||
686 | l--; | ||
687 | } | ||
688 | return ts; | ||
689 | } | ||
690 | |||
691 | static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) | ||
692 | { | ||
693 | struct jbd2_stats_proc_session *s = seq->private; | ||
694 | struct transaction_stats_s *ts = v; | ||
695 | |||
696 | ++*pos; | ||
697 | if (v == SEQ_START_TOKEN) | ||
698 | return jbd2_history_skip_empty(s, s->stats + s->start, 1); | ||
699 | else | ||
700 | return jbd2_history_skip_empty(s, ++ts, 0); | ||
701 | } | ||
702 | |||
703 | static int jbd2_seq_history_show(struct seq_file *seq, void *v) | ||
704 | { | ||
705 | struct transaction_stats_s *ts = v; | ||
706 | if (v == SEQ_START_TOKEN) { | ||
707 | seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s " | ||
708 | "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid", | ||
709 | "wait", "run", "lock", "flush", "log", "hndls", | ||
710 | "block", "inlog", "ctime", "write", "drop", | ||
711 | "close"); | ||
712 | return 0; | ||
713 | } | ||
714 | if (ts->ts_type == JBD2_STATS_RUN) | ||
715 | seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u " | ||
716 | "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid, | ||
717 | jiffies_to_msecs(ts->u.run.rs_wait), | ||
718 | jiffies_to_msecs(ts->u.run.rs_running), | ||
719 | jiffies_to_msecs(ts->u.run.rs_locked), | ||
720 | jiffies_to_msecs(ts->u.run.rs_flushing), | ||
721 | jiffies_to_msecs(ts->u.run.rs_logging), | ||
722 | ts->u.run.rs_handle_count, | ||
723 | ts->u.run.rs_blocks, | ||
724 | ts->u.run.rs_blocks_logged); | ||
725 | else if (ts->ts_type == JBD2_STATS_CHECKPOINT) | ||
726 | seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n", | ||
727 | "C", ts->ts_tid, " ", | ||
728 | jiffies_to_msecs(ts->u.chp.cs_chp_time), | ||
729 | ts->u.chp.cs_written, ts->u.chp.cs_dropped, | ||
730 | ts->u.chp.cs_forced_to_close); | ||
731 | else | ||
732 | J_ASSERT(0); | ||
733 | return 0; | ||
734 | } | ||
735 | |||
736 | static void jbd2_seq_history_stop(struct seq_file *seq, void *v) | ||
737 | { | ||
738 | } | ||
739 | |||
740 | static struct seq_operations jbd2_seq_history_ops = { | ||
741 | .start = jbd2_seq_history_start, | ||
742 | .next = jbd2_seq_history_next, | ||
743 | .stop = jbd2_seq_history_stop, | ||
744 | .show = jbd2_seq_history_show, | ||
745 | }; | ||
746 | |||
747 | static int jbd2_seq_history_open(struct inode *inode, struct file *file) | ||
748 | { | ||
749 | journal_t *journal = PDE(inode)->data; | ||
750 | struct jbd2_stats_proc_session *s; | ||
751 | int rc, size; | ||
752 | |||
753 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
754 | if (s == NULL) | ||
755 | return -ENOMEM; | ||
756 | size = sizeof(struct transaction_stats_s) * journal->j_history_max; | ||
757 | s->stats = kmalloc(size, GFP_KERNEL); | ||
758 | if (s->stats == NULL) { | ||
759 | kfree(s); | ||
760 | return -ENOMEM; | ||
761 | } | ||
762 | spin_lock(&journal->j_history_lock); | ||
763 | memcpy(s->stats, journal->j_history, size); | ||
764 | s->max = journal->j_history_max; | ||
765 | s->start = journal->j_history_cur % s->max; | ||
766 | spin_unlock(&journal->j_history_lock); | ||
767 | |||
768 | rc = seq_open(file, &jbd2_seq_history_ops); | ||
769 | if (rc == 0) { | ||
770 | struct seq_file *m = file->private_data; | ||
771 | m->private = s; | ||
772 | } else { | ||
773 | kfree(s->stats); | ||
774 | kfree(s); | ||
775 | } | ||
776 | return rc; | ||
777 | |||
778 | } | ||
779 | |||
780 | static int jbd2_seq_history_release(struct inode *inode, struct file *file) | ||
781 | { | ||
782 | struct seq_file *seq = file->private_data; | ||
783 | struct jbd2_stats_proc_session *s = seq->private; | ||
784 | |||
785 | kfree(s->stats); | ||
786 | kfree(s); | ||
787 | return seq_release(inode, file); | ||
788 | } | ||
789 | |||
790 | static struct file_operations jbd2_seq_history_fops = { | ||
791 | .owner = THIS_MODULE, | ||
792 | .open = jbd2_seq_history_open, | ||
793 | .read = seq_read, | ||
794 | .llseek = seq_lseek, | ||
795 | .release = jbd2_seq_history_release, | ||
796 | }; | ||
797 | |||
798 | static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) | ||
799 | { | ||
800 | return *pos ? NULL : SEQ_START_TOKEN; | ||
801 | } | ||
802 | |||
803 | static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) | ||
804 | { | ||
805 | return NULL; | ||
806 | } | ||
807 | |||
808 | static int jbd2_seq_info_show(struct seq_file *seq, void *v) | ||
809 | { | ||
810 | struct jbd2_stats_proc_session *s = seq->private; | ||
811 | |||
812 | if (v != SEQ_START_TOKEN) | ||
813 | return 0; | ||
814 | seq_printf(seq, "%lu transaction, each upto %u blocks\n", | ||
815 | s->stats->ts_tid, | ||
816 | s->journal->j_max_transaction_buffers); | ||
817 | if (s->stats->ts_tid == 0) | ||
818 | return 0; | ||
819 | seq_printf(seq, "average: \n %ums waiting for transaction\n", | ||
820 | jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid)); | ||
821 | seq_printf(seq, " %ums running transaction\n", | ||
822 | jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid)); | ||
823 | seq_printf(seq, " %ums transaction was being locked\n", | ||
824 | jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid)); | ||
825 | seq_printf(seq, " %ums flushing data (in ordered mode)\n", | ||
826 | jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); | ||
827 | seq_printf(seq, " %ums logging transaction\n", | ||
828 | jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); | ||
829 | seq_printf(seq, " %lu handles per transaction\n", | ||
830 | s->stats->u.run.rs_handle_count / s->stats->ts_tid); | ||
831 | seq_printf(seq, " %lu blocks per transaction\n", | ||
832 | s->stats->u.run.rs_blocks / s->stats->ts_tid); | ||
833 | seq_printf(seq, " %lu logged blocks per transaction\n", | ||
834 | s->stats->u.run.rs_blocks_logged / s->stats->ts_tid); | ||
835 | return 0; | ||
836 | } | ||
837 | |||
838 | static void jbd2_seq_info_stop(struct seq_file *seq, void *v) | ||
839 | { | ||
840 | } | ||
841 | |||
842 | static struct seq_operations jbd2_seq_info_ops = { | ||
843 | .start = jbd2_seq_info_start, | ||
844 | .next = jbd2_seq_info_next, | ||
845 | .stop = jbd2_seq_info_stop, | ||
846 | .show = jbd2_seq_info_show, | ||
847 | }; | ||
848 | |||
849 | static int jbd2_seq_info_open(struct inode *inode, struct file *file) | ||
850 | { | ||
851 | journal_t *journal = PDE(inode)->data; | ||
852 | struct jbd2_stats_proc_session *s; | ||
853 | int rc, size; | ||
854 | |||
855 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
856 | if (s == NULL) | ||
857 | return -ENOMEM; | ||
858 | size = sizeof(struct transaction_stats_s); | ||
859 | s->stats = kmalloc(size, GFP_KERNEL); | ||
860 | if (s->stats == NULL) { | ||
861 | kfree(s); | ||
862 | return -ENOMEM; | ||
863 | } | ||
864 | spin_lock(&journal->j_history_lock); | ||
865 | memcpy(s->stats, &journal->j_stats, size); | ||
866 | s->journal = journal; | ||
867 | spin_unlock(&journal->j_history_lock); | ||
868 | |||
869 | rc = seq_open(file, &jbd2_seq_info_ops); | ||
870 | if (rc == 0) { | ||
871 | struct seq_file *m = file->private_data; | ||
872 | m->private = s; | ||
873 | } else { | ||
874 | kfree(s->stats); | ||
875 | kfree(s); | ||
876 | } | ||
877 | return rc; | ||
878 | |||
879 | } | ||
880 | |||
881 | static int jbd2_seq_info_release(struct inode *inode, struct file *file) | ||
882 | { | ||
883 | struct seq_file *seq = file->private_data; | ||
884 | struct jbd2_stats_proc_session *s = seq->private; | ||
885 | kfree(s->stats); | ||
886 | kfree(s); | ||
887 | return seq_release(inode, file); | ||
888 | } | ||
889 | |||
890 | static struct file_operations jbd2_seq_info_fops = { | ||
891 | .owner = THIS_MODULE, | ||
892 | .open = jbd2_seq_info_open, | ||
893 | .read = seq_read, | ||
894 | .llseek = seq_lseek, | ||
895 | .release = jbd2_seq_info_release, | ||
896 | }; | ||
897 | |||
898 | static struct proc_dir_entry *proc_jbd2_stats; | ||
899 | |||
900 | static void jbd2_stats_proc_init(journal_t *journal) | ||
901 | { | ||
902 | char name[BDEVNAME_SIZE]; | ||
903 | |||
904 | snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name)); | ||
905 | journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats); | ||
906 | if (journal->j_proc_entry) { | ||
907 | struct proc_dir_entry *p; | ||
908 | p = create_proc_entry("history", S_IRUGO, | ||
909 | journal->j_proc_entry); | ||
910 | if (p) { | ||
911 | p->proc_fops = &jbd2_seq_history_fops; | ||
912 | p->data = journal; | ||
913 | p = create_proc_entry("info", S_IRUGO, | ||
914 | journal->j_proc_entry); | ||
915 | if (p) { | ||
916 | p->proc_fops = &jbd2_seq_info_fops; | ||
917 | p->data = journal; | ||
918 | } | ||
919 | } | ||
920 | } | ||
921 | } | ||
922 | |||
923 | static void jbd2_stats_proc_exit(journal_t *journal) | ||
924 | { | ||
925 | char name[BDEVNAME_SIZE]; | ||
926 | |||
927 | snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name)); | ||
928 | remove_proc_entry("info", journal->j_proc_entry); | ||
929 | remove_proc_entry("history", journal->j_proc_entry); | ||
930 | remove_proc_entry(name, proc_jbd2_stats); | ||
931 | } | ||
932 | |||
933 | static void journal_init_stats(journal_t *journal) | ||
934 | { | ||
935 | int size; | ||
936 | |||
937 | if (!proc_jbd2_stats) | ||
938 | return; | ||
939 | |||
940 | journal->j_history_max = 100; | ||
941 | size = sizeof(struct transaction_stats_s) * journal->j_history_max; | ||
942 | journal->j_history = kzalloc(size, GFP_KERNEL); | ||
943 | if (!journal->j_history) { | ||
944 | journal->j_history_max = 0; | ||
945 | return; | ||
946 | } | ||
947 | spin_lock_init(&journal->j_history_lock); | ||
948 | } | ||
949 | |||
643 | /* | 950 | /* |
644 | * Management for journal control blocks: functions to create and | 951 | * Management for journal control blocks: functions to create and |
645 | * destroy journal_t structures, and to initialise and read existing | 952 | * destroy journal_t structures, and to initialise and read existing |
@@ -681,6 +988,9 @@ static journal_t * journal_init_common (void) | |||
681 | kfree(journal); | 988 | kfree(journal); |
682 | goto fail; | 989 | goto fail; |
683 | } | 990 | } |
991 | |||
992 | journal_init_stats(journal); | ||
993 | |||
684 | return journal; | 994 | return journal; |
685 | fail: | 995 | fail: |
686 | return NULL; | 996 | return NULL; |
@@ -735,6 +1045,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, | |||
735 | journal->j_fs_dev = fs_dev; | 1045 | journal->j_fs_dev = fs_dev; |
736 | journal->j_blk_offset = start; | 1046 | journal->j_blk_offset = start; |
737 | journal->j_maxlen = len; | 1047 | journal->j_maxlen = len; |
1048 | jbd2_stats_proc_init(journal); | ||
738 | 1049 | ||
739 | bh = __getblk(journal->j_dev, start, journal->j_blocksize); | 1050 | bh = __getblk(journal->j_dev, start, journal->j_blocksize); |
740 | J_ASSERT(bh != NULL); | 1051 | J_ASSERT(bh != NULL); |
@@ -773,6 +1084,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) | |||
773 | 1084 | ||
774 | journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; | 1085 | journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; |
775 | journal->j_blocksize = inode->i_sb->s_blocksize; | 1086 | journal->j_blocksize = inode->i_sb->s_blocksize; |
1087 | jbd2_stats_proc_init(journal); | ||
776 | 1088 | ||
777 | /* journal descriptor can store up to n blocks -bzzz */ | 1089 | /* journal descriptor can store up to n blocks -bzzz */ |
778 | n = journal->j_blocksize / sizeof(journal_block_tag_t); | 1090 | n = journal->j_blocksize / sizeof(journal_block_tag_t); |
@@ -1153,6 +1465,8 @@ void jbd2_journal_destroy(journal_t *journal) | |||
1153 | brelse(journal->j_sb_buffer); | 1465 | brelse(journal->j_sb_buffer); |
1154 | } | 1466 | } |
1155 | 1467 | ||
1468 | if (journal->j_proc_entry) | ||
1469 | jbd2_stats_proc_exit(journal); | ||
1156 | if (journal->j_inode) | 1470 | if (journal->j_inode) |
1157 | iput(journal->j_inode); | 1471 | iput(journal->j_inode); |
1158 | if (journal->j_revoke) | 1472 | if (journal->j_revoke) |
@@ -1264,6 +1578,32 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, | |||
1264 | return 1; | 1578 | return 1; |
1265 | } | 1579 | } |
1266 | 1580 | ||
1581 | /* | ||
1582 | * jbd2_journal_clear_features () - Clear a given journal feature in the | ||
1583 | * superblock | ||
1584 | * @journal: Journal to act on. | ||
1585 | * @compat: bitmask of compatible features | ||
1586 | * @ro: bitmask of features that force read-only mount | ||
1587 | * @incompat: bitmask of incompatible features | ||
1588 | * | ||
1589 | * Clear a given journal feature as present on the | ||
1590 | * superblock. | ||
1591 | */ | ||
1592 | void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, | ||
1593 | unsigned long ro, unsigned long incompat) | ||
1594 | { | ||
1595 | journal_superblock_t *sb; | ||
1596 | |||
1597 | jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", | ||
1598 | compat, ro, incompat); | ||
1599 | |||
1600 | sb = journal->j_superblock; | ||
1601 | |||
1602 | sb->s_feature_compat &= ~cpu_to_be32(compat); | ||
1603 | sb->s_feature_ro_compat &= ~cpu_to_be32(ro); | ||
1604 | sb->s_feature_incompat &= ~cpu_to_be32(incompat); | ||
1605 | } | ||
1606 | EXPORT_SYMBOL(jbd2_journal_clear_features); | ||
1267 | 1607 | ||
1268 | /** | 1608 | /** |
1269 | * int jbd2_journal_update_format () - Update on-disk journal structure. | 1609 | * int jbd2_journal_update_format () - Update on-disk journal structure. |
@@ -1633,7 +1973,7 @@ static int journal_init_jbd2_journal_head_cache(void) | |||
1633 | jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", | 1973 | jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", |
1634 | sizeof(struct journal_head), | 1974 | sizeof(struct journal_head), |
1635 | 0, /* offset */ | 1975 | 0, /* offset */ |
1636 | 0, /* flags */ | 1976 | SLAB_TEMPORARY, /* flags */ |
1637 | NULL); /* ctor */ | 1977 | NULL); /* ctor */ |
1638 | retval = 0; | 1978 | retval = 0; |
1639 | if (jbd2_journal_head_cache == 0) { | 1979 | if (jbd2_journal_head_cache == 0) { |
@@ -1900,6 +2240,28 @@ static void __exit jbd2_remove_debugfs_entry(void) | |||
1900 | 2240 | ||
1901 | #endif | 2241 | #endif |
1902 | 2242 | ||
2243 | #ifdef CONFIG_PROC_FS | ||
2244 | |||
2245 | #define JBD2_STATS_PROC_NAME "fs/jbd2" | ||
2246 | |||
2247 | static void __init jbd2_create_jbd_stats_proc_entry(void) | ||
2248 | { | ||
2249 | proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL); | ||
2250 | } | ||
2251 | |||
2252 | static void __exit jbd2_remove_jbd_stats_proc_entry(void) | ||
2253 | { | ||
2254 | if (proc_jbd2_stats) | ||
2255 | remove_proc_entry(JBD2_STATS_PROC_NAME, NULL); | ||
2256 | } | ||
2257 | |||
2258 | #else | ||
2259 | |||
2260 | #define jbd2_create_jbd_stats_proc_entry() do {} while (0) | ||
2261 | #define jbd2_remove_jbd_stats_proc_entry() do {} while (0) | ||
2262 | |||
2263 | #endif | ||
2264 | |||
1903 | struct kmem_cache *jbd2_handle_cache; | 2265 | struct kmem_cache *jbd2_handle_cache; |
1904 | 2266 | ||
1905 | static int __init journal_init_handle_cache(void) | 2267 | static int __init journal_init_handle_cache(void) |
@@ -1907,7 +2269,7 @@ static int __init journal_init_handle_cache(void) | |||
1907 | jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", | 2269 | jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", |
1908 | sizeof(handle_t), | 2270 | sizeof(handle_t), |
1909 | 0, /* offset */ | 2271 | 0, /* offset */ |
1910 | 0, /* flags */ | 2272 | SLAB_TEMPORARY, /* flags */ |
1911 | NULL); /* ctor */ | 2273 | NULL); /* ctor */ |
1912 | if (jbd2_handle_cache == NULL) { | 2274 | if (jbd2_handle_cache == NULL) { |
1913 | printk(KERN_EMERG "JBD: failed to create handle cache\n"); | 2275 | printk(KERN_EMERG "JBD: failed to create handle cache\n"); |
@@ -1955,6 +2317,7 @@ static int __init journal_init(void) | |||
1955 | if (ret != 0) | 2317 | if (ret != 0) |
1956 | jbd2_journal_destroy_caches(); | 2318 | jbd2_journal_destroy_caches(); |
1957 | jbd2_create_debugfs_entry(); | 2319 | jbd2_create_debugfs_entry(); |
2320 | jbd2_create_jbd_stats_proc_entry(); | ||
1958 | return ret; | 2321 | return ret; |
1959 | } | 2322 | } |
1960 | 2323 | ||
@@ -1966,6 +2329,7 @@ static void __exit journal_exit(void) | |||
1966 | printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); | 2329 | printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); |
1967 | #endif | 2330 | #endif |
1968 | jbd2_remove_debugfs_entry(); | 2331 | jbd2_remove_debugfs_entry(); |
2332 | jbd2_remove_jbd_stats_proc_entry(); | ||
1969 | jbd2_journal_destroy_caches(); | 2333 | jbd2_journal_destroy_caches(); |
1970 | } | 2334 | } |
1971 | 2335 | ||
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index d0ce627539ef..921680663fa2 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/jbd2.h> | 21 | #include <linux/jbd2.h> |
22 | #include <linux/errno.h> | 22 | #include <linux/errno.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/crc32.h> | ||
24 | #endif | 25 | #endif |
25 | 26 | ||
26 | /* | 27 | /* |
@@ -316,6 +317,37 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag | |||
316 | return block; | 317 | return block; |
317 | } | 318 | } |
318 | 319 | ||
320 | /* | ||
321 | * calc_chksums calculates the checksums for the blocks described in the | ||
322 | * descriptor block. | ||
323 | */ | ||
324 | static int calc_chksums(journal_t *journal, struct buffer_head *bh, | ||
325 | unsigned long *next_log_block, __u32 *crc32_sum) | ||
326 | { | ||
327 | int i, num_blks, err; | ||
328 | unsigned long io_block; | ||
329 | struct buffer_head *obh; | ||
330 | |||
331 | num_blks = count_tags(journal, bh); | ||
332 | /* Calculate checksum of the descriptor block. */ | ||
333 | *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size); | ||
334 | |||
335 | for (i = 0; i < num_blks; i++) { | ||
336 | io_block = (*next_log_block)++; | ||
337 | wrap(journal, *next_log_block); | ||
338 | err = jread(&obh, journal, io_block); | ||
339 | if (err) { | ||
340 | printk(KERN_ERR "JBD: IO error %d recovering block " | ||
341 | "%lu in log\n", err, io_block); | ||
342 | return 1; | ||
343 | } else { | ||
344 | *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, | ||
345 | obh->b_size); | ||
346 | } | ||
347 | } | ||
348 | return 0; | ||
349 | } | ||
350 | |||
319 | static int do_one_pass(journal_t *journal, | 351 | static int do_one_pass(journal_t *journal, |
320 | struct recovery_info *info, enum passtype pass) | 352 | struct recovery_info *info, enum passtype pass) |
321 | { | 353 | { |
@@ -328,6 +360,7 @@ static int do_one_pass(journal_t *journal, | |||
328 | unsigned int sequence; | 360 | unsigned int sequence; |
329 | int blocktype; | 361 | int blocktype; |
330 | int tag_bytes = journal_tag_bytes(journal); | 362 | int tag_bytes = journal_tag_bytes(journal); |
363 | __u32 crc32_sum = ~0; /* Transactional Checksums */ | ||
331 | 364 | ||
332 | /* Precompute the maximum metadata descriptors in a descriptor block */ | 365 | /* Precompute the maximum metadata descriptors in a descriptor block */ |
333 | int MAX_BLOCKS_PER_DESC; | 366 | int MAX_BLOCKS_PER_DESC; |
@@ -419,12 +452,26 @@ static int do_one_pass(journal_t *journal, | |||
419 | switch(blocktype) { | 452 | switch(blocktype) { |
420 | case JBD2_DESCRIPTOR_BLOCK: | 453 | case JBD2_DESCRIPTOR_BLOCK: |
421 | /* If it is a valid descriptor block, replay it | 454 | /* If it is a valid descriptor block, replay it |
422 | * in pass REPLAY; otherwise, just skip over the | 455 | * in pass REPLAY; if journal_checksums enabled, then |
423 | * blocks it describes. */ | 456 | * calculate checksums in PASS_SCAN, otherwise, |
457 | * just skip over the blocks it describes. */ | ||
424 | if (pass != PASS_REPLAY) { | 458 | if (pass != PASS_REPLAY) { |
459 | if (pass == PASS_SCAN && | ||
460 | JBD2_HAS_COMPAT_FEATURE(journal, | ||
461 | JBD2_FEATURE_COMPAT_CHECKSUM) && | ||
462 | !info->end_transaction) { | ||
463 | if (calc_chksums(journal, bh, | ||
464 | &next_log_block, | ||
465 | &crc32_sum)) { | ||
466 | put_bh(bh); | ||
467 | break; | ||
468 | } | ||
469 | put_bh(bh); | ||
470 | continue; | ||
471 | } | ||
425 | next_log_block += count_tags(journal, bh); | 472 | next_log_block += count_tags(journal, bh); |
426 | wrap(journal, next_log_block); | 473 | wrap(journal, next_log_block); |
427 | brelse(bh); | 474 | put_bh(bh); |
428 | continue; | 475 | continue; |
429 | } | 476 | } |
430 | 477 | ||
@@ -516,9 +563,96 @@ static int do_one_pass(journal_t *journal, | |||
516 | continue; | 563 | continue; |
517 | 564 | ||
518 | case JBD2_COMMIT_BLOCK: | 565 | case JBD2_COMMIT_BLOCK: |
519 | /* Found an expected commit block: not much to | 566 | /* How to differentiate between interrupted commit |
520 | * do other than move on to the next sequence | 567 | * and journal corruption ? |
568 | * | ||
569 | * {nth transaction} | ||
570 | * Checksum Verification Failed | ||
571 | * | | ||
572 | * ____________________ | ||
573 | * | | | ||
574 | * async_commit sync_commit | ||
575 | * | | | ||
576 | * | GO TO NEXT "Journal Corruption" | ||
577 | * | TRANSACTION | ||
578 | * | | ||
579 | * {(n+1)th transanction} | ||
580 | * | | ||
581 | * _______|______________ | ||
582 | * | | | ||
583 | * Commit block found Commit block not found | ||
584 | * | | | ||
585 | * "Journal Corruption" | | ||
586 | * _____________|_________ | ||
587 | * | | | ||
588 | * nth trans corrupt OR nth trans | ||
589 | * and (n+1)th interrupted interrupted | ||
590 | * before commit block | ||
591 | * could reach the disk. | ||
592 | * (Cannot find the difference in above | ||
593 | * mentioned conditions. Hence assume | ||
594 | * "Interrupted Commit".) | ||
595 | */ | ||
596 | |||
597 | /* Found an expected commit block: if checksums | ||
598 | * are present verify them in PASS_SCAN; else not | ||
599 | * much to do other than move on to the next sequence | ||
521 | * number. */ | 600 | * number. */ |
601 | if (pass == PASS_SCAN && | ||
602 | JBD2_HAS_COMPAT_FEATURE(journal, | ||
603 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | ||
604 | int chksum_err, chksum_seen; | ||
605 | struct commit_header *cbh = | ||
606 | (struct commit_header *)bh->b_data; | ||
607 | unsigned found_chksum = | ||
608 | be32_to_cpu(cbh->h_chksum[0]); | ||
609 | |||
610 | chksum_err = chksum_seen = 0; | ||
611 | |||
612 | if (info->end_transaction) { | ||
613 | printk(KERN_ERR "JBD: Transaction %u " | ||
614 | "found to be corrupt.\n", | ||
615 | next_commit_ID - 1); | ||
616 | brelse(bh); | ||
617 | break; | ||
618 | } | ||
619 | |||
620 | if (crc32_sum == found_chksum && | ||
621 | cbh->h_chksum_type == JBD2_CRC32_CHKSUM && | ||
622 | cbh->h_chksum_size == | ||
623 | JBD2_CRC32_CHKSUM_SIZE) | ||
624 | chksum_seen = 1; | ||
625 | else if (!(cbh->h_chksum_type == 0 && | ||
626 | cbh->h_chksum_size == 0 && | ||
627 | found_chksum == 0 && | ||
628 | !chksum_seen)) | ||
629 | /* | ||
630 | * If fs is mounted using an old kernel and then | ||
631 | * kernel with journal_chksum is used then we | ||
632 | * get a situation where the journal flag has | ||
633 | * checksum flag set but checksums are not | ||
634 | * present i.e chksum = 0, in the individual | ||
635 | * commit blocks. | ||
636 | * Hence to avoid checksum failures, in this | ||
637 | * situation, this extra check is added. | ||
638 | */ | ||
639 | chksum_err = 1; | ||
640 | |||
641 | if (chksum_err) { | ||
642 | info->end_transaction = next_commit_ID; | ||
643 | |||
644 | if (!JBD2_HAS_COMPAT_FEATURE(journal, | ||
645 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){ | ||
646 | printk(KERN_ERR | ||
647 | "JBD: Transaction %u " | ||
648 | "found to be corrupt.\n", | ||
649 | next_commit_ID); | ||
650 | brelse(bh); | ||
651 | break; | ||
652 | } | ||
653 | } | ||
654 | crc32_sum = ~0; | ||
655 | } | ||
522 | brelse(bh); | 656 | brelse(bh); |
523 | next_commit_ID++; | 657 | next_commit_ID++; |
524 | continue; | 658 | continue; |
@@ -554,9 +688,10 @@ static int do_one_pass(journal_t *journal, | |||
554 | * transaction marks the end of the valid log. | 688 | * transaction marks the end of the valid log. |
555 | */ | 689 | */ |
556 | 690 | ||
557 | if (pass == PASS_SCAN) | 691 | if (pass == PASS_SCAN) { |
558 | info->end_transaction = next_commit_ID; | 692 | if (!info->end_transaction) |
559 | else { | 693 | info->end_transaction = next_commit_ID; |
694 | } else { | ||
560 | /* It's really bad news if different passes end up at | 695 | /* It's really bad news if different passes end up at |
561 | * different places (but possible due to IO errors). */ | 696 | * different places (but possible due to IO errors). */ |
562 | if (info->end_transaction != next_commit_ID) { | 697 | if (info->end_transaction != next_commit_ID) { |
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 3595fd432d5b..df36f42e19e1 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c | |||
@@ -171,13 +171,15 @@ int __init jbd2_journal_init_revoke_caches(void) | |||
171 | { | 171 | { |
172 | jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", | 172 | jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", |
173 | sizeof(struct jbd2_revoke_record_s), | 173 | sizeof(struct jbd2_revoke_record_s), |
174 | 0, SLAB_HWCACHE_ALIGN, NULL); | 174 | 0, |
175 | SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, | ||
176 | NULL); | ||
175 | if (jbd2_revoke_record_cache == 0) | 177 | if (jbd2_revoke_record_cache == 0) |
176 | return -ENOMEM; | 178 | return -ENOMEM; |
177 | 179 | ||
178 | jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", | 180 | jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", |
179 | sizeof(struct jbd2_revoke_table_s), | 181 | sizeof(struct jbd2_revoke_table_s), |
180 | 0, 0, NULL); | 182 | 0, SLAB_TEMPORARY, NULL); |
181 | if (jbd2_revoke_table_cache == 0) { | 183 | if (jbd2_revoke_table_cache == 0) { |
182 | kmem_cache_destroy(jbd2_revoke_record_cache); | 184 | kmem_cache_destroy(jbd2_revoke_record_cache); |
183 | jbd2_revoke_record_cache = NULL; | 185 | jbd2_revoke_record_cache = NULL; |
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b1fcf2b3dca3..b9b0b6f899b9 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -54,11 +54,13 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) | |||
54 | spin_lock_init(&transaction->t_handle_lock); | 54 | spin_lock_init(&transaction->t_handle_lock); |
55 | 55 | ||
56 | /* Set up the commit timer for the new transaction. */ | 56 | /* Set up the commit timer for the new transaction. */ |
57 | journal->j_commit_timer.expires = transaction->t_expires; | 57 | journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); |
58 | add_timer(&journal->j_commit_timer); | 58 | add_timer(&journal->j_commit_timer); |
59 | 59 | ||
60 | J_ASSERT(journal->j_running_transaction == NULL); | 60 | J_ASSERT(journal->j_running_transaction == NULL); |
61 | journal->j_running_transaction = transaction; | 61 | journal->j_running_transaction = transaction; |
62 | transaction->t_max_wait = 0; | ||
63 | transaction->t_start = jiffies; | ||
62 | 64 | ||
63 | return transaction; | 65 | return transaction; |
64 | } | 66 | } |
@@ -85,6 +87,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle) | |||
85 | int nblocks = handle->h_buffer_credits; | 87 | int nblocks = handle->h_buffer_credits; |
86 | transaction_t *new_transaction = NULL; | 88 | transaction_t *new_transaction = NULL; |
87 | int ret = 0; | 89 | int ret = 0; |
90 | unsigned long ts = jiffies; | ||
88 | 91 | ||
89 | if (nblocks > journal->j_max_transaction_buffers) { | 92 | if (nblocks > journal->j_max_transaction_buffers) { |
90 | printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", | 93 | printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", |
@@ -217,6 +220,12 @@ repeat_locked: | |||
217 | /* OK, account for the buffers that this operation expects to | 220 | /* OK, account for the buffers that this operation expects to |
218 | * use and add the handle to the running transaction. */ | 221 | * use and add the handle to the running transaction. */ |
219 | 222 | ||
223 | if (time_after(transaction->t_start, ts)) { | ||
224 | ts = jbd2_time_diff(ts, transaction->t_start); | ||
225 | if (ts > transaction->t_max_wait) | ||
226 | transaction->t_max_wait = ts; | ||
227 | } | ||
228 | |||
220 | handle->h_transaction = transaction; | 229 | handle->h_transaction = transaction; |
221 | transaction->t_outstanding_credits += nblocks; | 230 | transaction->t_outstanding_credits += nblocks; |
222 | transaction->t_updates++; | 231 | transaction->t_updates++; |
@@ -232,6 +241,8 @@ out: | |||
232 | return ret; | 241 | return ret; |
233 | } | 242 | } |
234 | 243 | ||
244 | static struct lock_class_key jbd2_handle_key; | ||
245 | |||
235 | /* Allocate a new handle. This should probably be in a slab... */ | 246 | /* Allocate a new handle. This should probably be in a slab... */ |
236 | static handle_t *new_handle(int nblocks) | 247 | static handle_t *new_handle(int nblocks) |
237 | { | 248 | { |
@@ -242,6 +253,9 @@ static handle_t *new_handle(int nblocks) | |||
242 | handle->h_buffer_credits = nblocks; | 253 | handle->h_buffer_credits = nblocks; |
243 | handle->h_ref = 1; | 254 | handle->h_ref = 1; |
244 | 255 | ||
256 | lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle", | ||
257 | &jbd2_handle_key, 0); | ||
258 | |||
245 | return handle; | 259 | return handle; |
246 | } | 260 | } |
247 | 261 | ||
@@ -284,7 +298,11 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) | |||
284 | jbd2_free_handle(handle); | 298 | jbd2_free_handle(handle); |
285 | current->journal_info = NULL; | 299 | current->journal_info = NULL; |
286 | handle = ERR_PTR(err); | 300 | handle = ERR_PTR(err); |
301 | goto out; | ||
287 | } | 302 | } |
303 | |||
304 | lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_); | ||
305 | out: | ||
288 | return handle; | 306 | return handle; |
289 | } | 307 | } |
290 | 308 | ||
@@ -1164,7 +1182,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) | |||
1164 | } | 1182 | } |
1165 | 1183 | ||
1166 | /* That test should have eliminated the following case: */ | 1184 | /* That test should have eliminated the following case: */ |
1167 | J_ASSERT_JH(jh, jh->b_frozen_data == 0); | 1185 | J_ASSERT_JH(jh, jh->b_frozen_data == NULL); |
1168 | 1186 | ||
1169 | JBUFFER_TRACE(jh, "file as BJ_Metadata"); | 1187 | JBUFFER_TRACE(jh, "file as BJ_Metadata"); |
1170 | spin_lock(&journal->j_list_lock); | 1188 | spin_lock(&journal->j_list_lock); |
@@ -1410,6 +1428,8 @@ int jbd2_journal_stop(handle_t *handle) | |||
1410 | spin_unlock(&journal->j_state_lock); | 1428 | spin_unlock(&journal->j_state_lock); |
1411 | } | 1429 | } |
1412 | 1430 | ||
1431 | lock_release(&handle->h_lockdep_map, 1, _THIS_IP_); | ||
1432 | |||
1413 | jbd2_free_handle(handle); | 1433 | jbd2_free_handle(handle); |
1414 | return err; | 1434 | return err; |
1415 | } | 1435 | } |
@@ -1512,7 +1532,7 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) | |||
1512 | 1532 | ||
1513 | J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); | 1533 | J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); |
1514 | if (jh->b_jlist != BJ_None) | 1534 | if (jh->b_jlist != BJ_None) |
1515 | J_ASSERT_JH(jh, transaction != 0); | 1535 | J_ASSERT_JH(jh, transaction != NULL); |
1516 | 1536 | ||
1517 | switch (jh->b_jlist) { | 1537 | switch (jh->b_jlist) { |
1518 | case BJ_None: | 1538 | case BJ_None: |
@@ -1581,11 +1601,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) | |||
1581 | if (buffer_locked(bh) || buffer_dirty(bh)) | 1601 | if (buffer_locked(bh) || buffer_dirty(bh)) |
1582 | goto out; | 1602 | goto out; |
1583 | 1603 | ||
1584 | if (jh->b_next_transaction != 0) | 1604 | if (jh->b_next_transaction != NULL) |
1585 | goto out; | 1605 | goto out; |
1586 | 1606 | ||
1587 | spin_lock(&journal->j_list_lock); | 1607 | spin_lock(&journal->j_list_lock); |
1588 | if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) { | 1608 | if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { |
1589 | if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { | 1609 | if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { |
1590 | /* A written-back ordered data buffer */ | 1610 | /* A written-back ordered data buffer */ |
1591 | JBUFFER_TRACE(jh, "release data"); | 1611 | JBUFFER_TRACE(jh, "release data"); |
@@ -1593,7 +1613,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) | |||
1593 | jbd2_journal_remove_journal_head(bh); | 1613 | jbd2_journal_remove_journal_head(bh); |
1594 | __brelse(bh); | 1614 | __brelse(bh); |
1595 | } | 1615 | } |
1596 | } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) { | 1616 | } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { |
1597 | /* written-back checkpointed metadata buffer */ | 1617 | /* written-back checkpointed metadata buffer */ |
1598 | if (jh->b_jlist == BJ_None) { | 1618 | if (jh->b_jlist == BJ_None) { |
1599 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | 1619 | JBUFFER_TRACE(jh, "remove from checkpoint list"); |
@@ -1953,7 +1973,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, | |||
1953 | 1973 | ||
1954 | J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); | 1974 | J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); |
1955 | J_ASSERT_JH(jh, jh->b_transaction == transaction || | 1975 | J_ASSERT_JH(jh, jh->b_transaction == transaction || |
1956 | jh->b_transaction == 0); | 1976 | jh->b_transaction == NULL); |
1957 | 1977 | ||
1958 | if (jh->b_transaction && jh->b_jlist == jlist) | 1978 | if (jh->b_transaction && jh->b_jlist == jlist) |
1959 | return; | 1979 | return; |
diff --git a/fs/read_write.c b/fs/read_write.c index c4d3d17923f1..1c177f29e1b7 100644 --- a/fs/read_write.c +++ b/fs/read_write.c | |||
@@ -446,6 +446,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) | |||
446 | } | 446 | } |
447 | return seg; | 447 | return seg; |
448 | } | 448 | } |
449 | EXPORT_SYMBOL(iov_shorten); | ||
449 | 450 | ||
450 | ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, | 451 | ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, |
451 | unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) | 452 | unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) |
diff --git a/include/asm-arm/bitops.h b/include/asm-arm/bitops.h index 47a6b086eee2..5c60bfc1a84d 100644 --- a/include/asm-arm/bitops.h +++ b/include/asm-arm/bitops.h | |||
@@ -310,6 +310,8 @@ static inline int constant_fls(int x) | |||
310 | _find_first_zero_bit_le(p,sz) | 310 | _find_first_zero_bit_le(p,sz) |
311 | #define ext2_find_next_zero_bit(p,sz,off) \ | 311 | #define ext2_find_next_zero_bit(p,sz,off) \ |
312 | _find_next_zero_bit_le(p,sz,off) | 312 | _find_next_zero_bit_le(p,sz,off) |
313 | #define ext2_find_next_bit(p, sz, off) \ | ||
314 | _find_next_bit_le(p, sz, off) | ||
313 | 315 | ||
314 | /* | 316 | /* |
315 | * Minix is defined to use little-endian byte ordering. | 317 | * Minix is defined to use little-endian byte ordering. |
diff --git a/include/asm-generic/bitops/ext2-non-atomic.h b/include/asm-generic/bitops/ext2-non-atomic.h index 1697404afa05..63cf822431a2 100644 --- a/include/asm-generic/bitops/ext2-non-atomic.h +++ b/include/asm-generic/bitops/ext2-non-atomic.h | |||
@@ -14,5 +14,7 @@ | |||
14 | generic_find_first_zero_le_bit((unsigned long *)(addr), (size)) | 14 | generic_find_first_zero_le_bit((unsigned long *)(addr), (size)) |
15 | #define ext2_find_next_zero_bit(addr, size, off) \ | 15 | #define ext2_find_next_zero_bit(addr, size, off) \ |
16 | generic_find_next_zero_le_bit((unsigned long *)(addr), (size), (off)) | 16 | generic_find_next_zero_le_bit((unsigned long *)(addr), (size), (off)) |
17 | #define ext2_find_next_bit(addr, size, off) \ | ||
18 | generic_find_next_le_bit((unsigned long *)(addr), (size), (off)) | ||
17 | 19 | ||
18 | #endif /* _ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_ */ | 20 | #endif /* _ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_ */ |
diff --git a/include/asm-generic/bitops/le.h b/include/asm-generic/bitops/le.h index b9c7e5d2d2ad..80e3bf13b2b9 100644 --- a/include/asm-generic/bitops/le.h +++ b/include/asm-generic/bitops/le.h | |||
@@ -20,6 +20,8 @@ | |||
20 | #define generic___test_and_clear_le_bit(nr, addr) __test_and_clear_bit(nr, addr) | 20 | #define generic___test_and_clear_le_bit(nr, addr) __test_and_clear_bit(nr, addr) |
21 | 21 | ||
22 | #define generic_find_next_zero_le_bit(addr, size, offset) find_next_zero_bit(addr, size, offset) | 22 | #define generic_find_next_zero_le_bit(addr, size, offset) find_next_zero_bit(addr, size, offset) |
23 | #define generic_find_next_le_bit(addr, size, offset) \ | ||
24 | find_next_bit(addr, size, offset) | ||
23 | 25 | ||
24 | #elif defined(__BIG_ENDIAN) | 26 | #elif defined(__BIG_ENDIAN) |
25 | 27 | ||
@@ -42,6 +44,8 @@ | |||
42 | 44 | ||
43 | extern unsigned long generic_find_next_zero_le_bit(const unsigned long *addr, | 45 | extern unsigned long generic_find_next_zero_le_bit(const unsigned long *addr, |
44 | unsigned long size, unsigned long offset); | 46 | unsigned long size, unsigned long offset); |
47 | extern unsigned long generic_find_next_le_bit(const unsigned long *addr, | ||
48 | unsigned long size, unsigned long offset); | ||
45 | 49 | ||
46 | #else | 50 | #else |
47 | #error "Please fix <asm/byteorder.h>" | 51 | #error "Please fix <asm/byteorder.h>" |
diff --git a/include/asm-m68k/bitops.h b/include/asm-m68k/bitops.h index 2976b5d68e96..83d1f286230b 100644 --- a/include/asm-m68k/bitops.h +++ b/include/asm-m68k/bitops.h | |||
@@ -410,6 +410,8 @@ static inline int ext2_find_next_zero_bit(const void *vaddr, unsigned size, | |||
410 | res = ext2_find_first_zero_bit (p, size - 32 * (p - addr)); | 410 | res = ext2_find_first_zero_bit (p, size - 32 * (p - addr)); |
411 | return (p - addr) * 32 + res; | 411 | return (p - addr) * 32 + res; |
412 | } | 412 | } |
413 | #define ext2_find_next_bit(addr, size, off) \ | ||
414 | generic_find_next_le_bit((unsigned long *)(addr), (size), (off)) | ||
413 | 415 | ||
414 | #endif /* __KERNEL__ */ | 416 | #endif /* __KERNEL__ */ |
415 | 417 | ||
diff --git a/include/asm-m68knommu/bitops.h b/include/asm-m68knommu/bitops.h index f8dfb7ba2e25..f43afe1fc3b3 100644 --- a/include/asm-m68knommu/bitops.h +++ b/include/asm-m68knommu/bitops.h | |||
@@ -294,6 +294,8 @@ found_middle: | |||
294 | return result + ffz(__swab32(tmp)); | 294 | return result + ffz(__swab32(tmp)); |
295 | } | 295 | } |
296 | 296 | ||
297 | #define ext2_find_next_bit(addr, size, off) \ | ||
298 | generic_find_next_le_bit((unsigned long *)(addr), (size), (off)) | ||
297 | #include <asm-generic/bitops/minix.h> | 299 | #include <asm-generic/bitops/minix.h> |
298 | 300 | ||
299 | #endif /* __KERNEL__ */ | 301 | #endif /* __KERNEL__ */ |
diff --git a/include/asm-powerpc/bitops.h b/include/asm-powerpc/bitops.h index 733b4af7f4f1..220d9a781ab9 100644 --- a/include/asm-powerpc/bitops.h +++ b/include/asm-powerpc/bitops.h | |||
@@ -359,6 +359,8 @@ static __inline__ int test_le_bit(unsigned long nr, | |||
359 | unsigned long generic_find_next_zero_le_bit(const unsigned long *addr, | 359 | unsigned long generic_find_next_zero_le_bit(const unsigned long *addr, |
360 | unsigned long size, unsigned long offset); | 360 | unsigned long size, unsigned long offset); |
361 | 361 | ||
362 | unsigned long generic_find_next_le_bit(const unsigned long *addr, | ||
363 | unsigned long size, unsigned long offset); | ||
362 | /* Bitmap functions for the ext2 filesystem */ | 364 | /* Bitmap functions for the ext2 filesystem */ |
363 | 365 | ||
364 | #define ext2_set_bit(nr,addr) \ | 366 | #define ext2_set_bit(nr,addr) \ |
@@ -378,6 +380,8 @@ unsigned long generic_find_next_zero_le_bit(const unsigned long *addr, | |||
378 | #define ext2_find_next_zero_bit(addr, size, off) \ | 380 | #define ext2_find_next_zero_bit(addr, size, off) \ |
379 | generic_find_next_zero_le_bit((unsigned long*)addr, size, off) | 381 | generic_find_next_zero_le_bit((unsigned long*)addr, size, off) |
380 | 382 | ||
383 | #define ext2_find_next_bit(addr, size, off) \ | ||
384 | generic_find_next_le_bit((unsigned long *)addr, size, off) | ||
381 | /* Bitmap functions for the minix filesystem. */ | 385 | /* Bitmap functions for the minix filesystem. */ |
382 | 386 | ||
383 | #define minix_test_and_set_bit(nr,addr) \ | 387 | #define minix_test_and_set_bit(nr,addr) \ |
diff --git a/include/asm-s390/bitops.h b/include/asm-s390/bitops.h index 34d9a6357c38..dba6fecad0be 100644 --- a/include/asm-s390/bitops.h +++ b/include/asm-s390/bitops.h | |||
@@ -772,6 +772,8 @@ static inline int sched_find_first_bit(unsigned long *b) | |||
772 | test_and_clear_bit((nr)^(__BITOPS_WORDSIZE - 8), (unsigned long *)addr) | 772 | test_and_clear_bit((nr)^(__BITOPS_WORDSIZE - 8), (unsigned long *)addr) |
773 | #define ext2_test_bit(nr, addr) \ | 773 | #define ext2_test_bit(nr, addr) \ |
774 | test_bit((nr)^(__BITOPS_WORDSIZE - 8), (unsigned long *)addr) | 774 | test_bit((nr)^(__BITOPS_WORDSIZE - 8), (unsigned long *)addr) |
775 | #define ext2_find_next_bit(addr, size, off) \ | ||
776 | generic_find_next_le_bit((unsigned long *)(addr), (size), (off)) | ||
775 | 777 | ||
776 | #ifndef __s390x__ | 778 | #ifndef __s390x__ |
777 | 779 | ||
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index da0d83fbadc0..e98801f06dcc 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h | |||
@@ -192,6 +192,8 @@ int sync_dirty_buffer(struct buffer_head *bh); | |||
192 | int submit_bh(int, struct buffer_head *); | 192 | int submit_bh(int, struct buffer_head *); |
193 | void write_boundary_block(struct block_device *bdev, | 193 | void write_boundary_block(struct block_device *bdev, |
194 | sector_t bblock, unsigned blocksize); | 194 | sector_t bblock, unsigned blocksize); |
195 | int bh_uptodate_or_lock(struct buffer_head *bh); | ||
196 | int bh_submit_read(struct buffer_head *bh); | ||
195 | 197 | ||
196 | extern int buffer_heads_over_limit; | 198 | extern int buffer_heads_over_limit; |
197 | 199 | ||
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h index 97dd409d5f4a..1852313fc7c7 100644 --- a/include/linux/ext4_fs.h +++ b/include/linux/ext4_fs.h | |||
@@ -20,6 +20,8 @@ | |||
20 | #include <linux/blkdev.h> | 20 | #include <linux/blkdev.h> |
21 | #include <linux/magic.h> | 21 | #include <linux/magic.h> |
22 | 22 | ||
23 | #include <linux/ext4_fs_i.h> | ||
24 | |||
23 | /* | 25 | /* |
24 | * The second extended filesystem constants/structures | 26 | * The second extended filesystem constants/structures |
25 | */ | 27 | */ |
@@ -51,6 +53,50 @@ | |||
51 | #define ext4_debug(f, a...) do {} while (0) | 53 | #define ext4_debug(f, a...) do {} while (0) |
52 | #endif | 54 | #endif |
53 | 55 | ||
56 | #define EXT4_MULTIBLOCK_ALLOCATOR 1 | ||
57 | |||
58 | /* prefer goal again. length */ | ||
59 | #define EXT4_MB_HINT_MERGE 1 | ||
60 | /* blocks already reserved */ | ||
61 | #define EXT4_MB_HINT_RESERVED 2 | ||
62 | /* metadata is being allocated */ | ||
63 | #define EXT4_MB_HINT_METADATA 4 | ||
64 | /* first blocks in the file */ | ||
65 | #define EXT4_MB_HINT_FIRST 8 | ||
66 | /* search for the best chunk */ | ||
67 | #define EXT4_MB_HINT_BEST 16 | ||
68 | /* data is being allocated */ | ||
69 | #define EXT4_MB_HINT_DATA 32 | ||
70 | /* don't preallocate (for tails) */ | ||
71 | #define EXT4_MB_HINT_NOPREALLOC 64 | ||
72 | /* allocate for locality group */ | ||
73 | #define EXT4_MB_HINT_GROUP_ALLOC 128 | ||
74 | /* allocate goal blocks or none */ | ||
75 | #define EXT4_MB_HINT_GOAL_ONLY 256 | ||
76 | /* goal is meaningful */ | ||
77 | #define EXT4_MB_HINT_TRY_GOAL 512 | ||
78 | |||
79 | struct ext4_allocation_request { | ||
80 | /* target inode for block we're allocating */ | ||
81 | struct inode *inode; | ||
82 | /* logical block in target inode */ | ||
83 | ext4_lblk_t logical; | ||
84 | /* phys. target (a hint) */ | ||
85 | ext4_fsblk_t goal; | ||
86 | /* the closest logical allocated block to the left */ | ||
87 | ext4_lblk_t lleft; | ||
88 | /* phys. block for ^^^ */ | ||
89 | ext4_fsblk_t pleft; | ||
90 | /* the closest logical allocated block to the right */ | ||
91 | ext4_lblk_t lright; | ||
92 | /* phys. block for ^^^ */ | ||
93 | ext4_fsblk_t pright; | ||
94 | /* how many blocks we want to allocate */ | ||
95 | unsigned long len; | ||
96 | /* flags. see above EXT4_MB_HINT_* */ | ||
97 | unsigned long flags; | ||
98 | }; | ||
99 | |||
54 | /* | 100 | /* |
55 | * Special inodes numbers | 101 | * Special inodes numbers |
56 | */ | 102 | */ |
@@ -73,8 +119,8 @@ | |||
73 | * Macro-instructions used to manage several block sizes | 119 | * Macro-instructions used to manage several block sizes |
74 | */ | 120 | */ |
75 | #define EXT4_MIN_BLOCK_SIZE 1024 | 121 | #define EXT4_MIN_BLOCK_SIZE 1024 |
76 | #define EXT4_MAX_BLOCK_SIZE 4096 | 122 | #define EXT4_MAX_BLOCK_SIZE 65536 |
77 | #define EXT4_MIN_BLOCK_LOG_SIZE 10 | 123 | #define EXT4_MIN_BLOCK_LOG_SIZE 10 |
78 | #ifdef __KERNEL__ | 124 | #ifdef __KERNEL__ |
79 | # define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) | 125 | # define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) |
80 | #else | 126 | #else |
@@ -118,6 +164,11 @@ struct ext4_group_desc | |||
118 | __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ | 164 | __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ |
119 | __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ | 165 | __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ |
120 | __le32 bg_inode_table_hi; /* Inodes table block MSB */ | 166 | __le32 bg_inode_table_hi; /* Inodes table block MSB */ |
167 | __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ | ||
168 | __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ | ||
169 | __le16 bg_used_dirs_count_hi; /* Directories count MSB */ | ||
170 | __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ | ||
171 | __u32 bg_reserved2[3]; | ||
121 | }; | 172 | }; |
122 | 173 | ||
123 | #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ | 174 | #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ |
@@ -178,8 +229,9 @@ struct ext4_group_desc | |||
178 | #define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ | 229 | #define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ |
179 | #define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ | 230 | #define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ |
180 | #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ | 231 | #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ |
181 | #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ | 232 | #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ |
182 | #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ | 233 | #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ |
234 | #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ | ||
183 | 235 | ||
184 | #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ | 236 | #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ |
185 | #define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ | 237 | #define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ |
@@ -237,6 +289,7 @@ struct ext4_new_group_data { | |||
237 | #endif | 289 | #endif |
238 | #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) | 290 | #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) |
239 | #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) | 291 | #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) |
292 | #define EXT4_IOC_MIGRATE _IO('f', 7) | ||
240 | 293 | ||
241 | /* | 294 | /* |
242 | * ioctl commands in 32 bit emulation | 295 | * ioctl commands in 32 bit emulation |
@@ -275,18 +328,18 @@ struct ext4_mount_options { | |||
275 | struct ext4_inode { | 328 | struct ext4_inode { |
276 | __le16 i_mode; /* File mode */ | 329 | __le16 i_mode; /* File mode */ |
277 | __le16 i_uid; /* Low 16 bits of Owner Uid */ | 330 | __le16 i_uid; /* Low 16 bits of Owner Uid */ |
278 | __le32 i_size; /* Size in bytes */ | 331 | __le32 i_size_lo; /* Size in bytes */ |
279 | __le32 i_atime; /* Access time */ | 332 | __le32 i_atime; /* Access time */ |
280 | __le32 i_ctime; /* Inode Change time */ | 333 | __le32 i_ctime; /* Inode Change time */ |
281 | __le32 i_mtime; /* Modification time */ | 334 | __le32 i_mtime; /* Modification time */ |
282 | __le32 i_dtime; /* Deletion Time */ | 335 | __le32 i_dtime; /* Deletion Time */ |
283 | __le16 i_gid; /* Low 16 bits of Group Id */ | 336 | __le16 i_gid; /* Low 16 bits of Group Id */ |
284 | __le16 i_links_count; /* Links count */ | 337 | __le16 i_links_count; /* Links count */ |
285 | __le32 i_blocks; /* Blocks count */ | 338 | __le32 i_blocks_lo; /* Blocks count */ |
286 | __le32 i_flags; /* File flags */ | 339 | __le32 i_flags; /* File flags */ |
287 | union { | 340 | union { |
288 | struct { | 341 | struct { |
289 | __u32 l_i_reserved1; | 342 | __le32 l_i_version; |
290 | } linux1; | 343 | } linux1; |
291 | struct { | 344 | struct { |
292 | __u32 h_i_translator; | 345 | __u32 h_i_translator; |
@@ -297,12 +350,12 @@ struct ext4_inode { | |||
297 | } osd1; /* OS dependent 1 */ | 350 | } osd1; /* OS dependent 1 */ |
298 | __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ | 351 | __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ |
299 | __le32 i_generation; /* File version (for NFS) */ | 352 | __le32 i_generation; /* File version (for NFS) */ |
300 | __le32 i_file_acl; /* File ACL */ | 353 | __le32 i_file_acl_lo; /* File ACL */ |
301 | __le32 i_dir_acl; /* Directory ACL */ | 354 | __le32 i_size_high; |
302 | __le32 i_obso_faddr; /* Obsoleted fragment address */ | 355 | __le32 i_obso_faddr; /* Obsoleted fragment address */ |
303 | union { | 356 | union { |
304 | struct { | 357 | struct { |
305 | __le16 l_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ | 358 | __le16 l_i_blocks_high; /* were l_i_reserved1 */ |
306 | __le16 l_i_file_acl_high; | 359 | __le16 l_i_file_acl_high; |
307 | __le16 l_i_uid_high; /* these 2 fields */ | 360 | __le16 l_i_uid_high; /* these 2 fields */ |
308 | __le16 l_i_gid_high; /* were reserved2[0] */ | 361 | __le16 l_i_gid_high; /* were reserved2[0] */ |
@@ -328,9 +381,9 @@ struct ext4_inode { | |||
328 | __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ | 381 | __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ |
329 | __le32 i_crtime; /* File Creation time */ | 382 | __le32 i_crtime; /* File Creation time */ |
330 | __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ | 383 | __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ |
384 | __le32 i_version_hi; /* high 32 bits for 64-bit version */ | ||
331 | }; | 385 | }; |
332 | 386 | ||
333 | #define i_size_high i_dir_acl | ||
334 | 387 | ||
335 | #define EXT4_EPOCH_BITS 2 | 388 | #define EXT4_EPOCH_BITS 2 |
336 | #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) | 389 | #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) |
@@ -402,9 +455,12 @@ do { \ | |||
402 | raw_inode->xtime ## _extra); \ | 455 | raw_inode->xtime ## _extra); \ |
403 | } while (0) | 456 | } while (0) |
404 | 457 | ||
458 | #define i_disk_version osd1.linux1.l_i_version | ||
459 | |||
405 | #if defined(__KERNEL__) || defined(__linux__) | 460 | #if defined(__KERNEL__) || defined(__linux__) |
406 | #define i_reserved1 osd1.linux1.l_i_reserved1 | 461 | #define i_reserved1 osd1.linux1.l_i_reserved1 |
407 | #define i_file_acl_high osd2.linux2.l_i_file_acl_high | 462 | #define i_file_acl_high osd2.linux2.l_i_file_acl_high |
463 | #define i_blocks_high osd2.linux2.l_i_blocks_high | ||
408 | #define i_uid_low i_uid | 464 | #define i_uid_low i_uid |
409 | #define i_gid_low i_gid | 465 | #define i_gid_low i_gid |
410 | #define i_uid_high osd2.linux2.l_i_uid_high | 466 | #define i_uid_high osd2.linux2.l_i_uid_high |
@@ -461,7 +517,10 @@ do { \ | |||
461 | #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ | 517 | #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ |
462 | #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ | 518 | #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ |
463 | #define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ | 519 | #define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ |
464 | 520 | #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ | |
521 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ | ||
522 | #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ | ||
523 | #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ | ||
465 | /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ | 524 | /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ |
466 | #ifndef _LINUX_EXT2_FS_H | 525 | #ifndef _LINUX_EXT2_FS_H |
467 | #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt | 526 | #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt |
@@ -481,6 +540,7 @@ do { \ | |||
481 | #define ext4_test_bit ext2_test_bit | 540 | #define ext4_test_bit ext2_test_bit |
482 | #define ext4_find_first_zero_bit ext2_find_first_zero_bit | 541 | #define ext4_find_first_zero_bit ext2_find_first_zero_bit |
483 | #define ext4_find_next_zero_bit ext2_find_next_zero_bit | 542 | #define ext4_find_next_zero_bit ext2_find_next_zero_bit |
543 | #define ext4_find_next_bit ext2_find_next_bit | ||
484 | 544 | ||
485 | /* | 545 | /* |
486 | * Maximal mount counts between two filesystem checks | 546 | * Maximal mount counts between two filesystem checks |
@@ -671,6 +731,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) | |||
671 | #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 | 731 | #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 |
672 | #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 | 732 | #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 |
673 | #define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 | 733 | #define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 |
734 | #define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 | ||
674 | #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 | 735 | #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 |
675 | #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 | 736 | #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 |
676 | #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 | 737 | #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 |
@@ -682,6 +743,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) | |||
682 | #define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 | 743 | #define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 |
683 | #define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ | 744 | #define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ |
684 | #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 | 745 | #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 |
746 | #define EXT4_FEATURE_INCOMPAT_MMP 0x0100 | ||
685 | #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 | 747 | #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 |
686 | 748 | ||
687 | #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR | 749 | #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR |
@@ -696,7 +758,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) | |||
696 | EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ | 758 | EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ |
697 | EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ | 759 | EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ |
698 | EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ | 760 | EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ |
699 | EXT4_FEATURE_RO_COMPAT_BTREE_DIR) | 761 | EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ |
762 | EXT4_FEATURE_RO_COMPAT_HUGE_FILE) | ||
700 | 763 | ||
701 | /* | 764 | /* |
702 | * Default values for user and/or group using reserved blocks | 765 | * Default values for user and/or group using reserved blocks |
@@ -767,6 +830,26 @@ struct ext4_dir_entry_2 { | |||
767 | #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) | 830 | #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) |
768 | #define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ | 831 | #define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ |
769 | ~EXT4_DIR_ROUND) | 832 | ~EXT4_DIR_ROUND) |
833 | #define EXT4_MAX_REC_LEN ((1<<16)-1) | ||
834 | |||
835 | static inline unsigned ext4_rec_len_from_disk(__le16 dlen) | ||
836 | { | ||
837 | unsigned len = le16_to_cpu(dlen); | ||
838 | |||
839 | if (len == EXT4_MAX_REC_LEN) | ||
840 | return 1 << 16; | ||
841 | return len; | ||
842 | } | ||
843 | |||
844 | static inline __le16 ext4_rec_len_to_disk(unsigned len) | ||
845 | { | ||
846 | if (len == (1 << 16)) | ||
847 | return cpu_to_le16(EXT4_MAX_REC_LEN); | ||
848 | else if (len > (1 << 16)) | ||
849 | BUG(); | ||
850 | return cpu_to_le16(len); | ||
851 | } | ||
852 | |||
770 | /* | 853 | /* |
771 | * Hash Tree Directory indexing | 854 | * Hash Tree Directory indexing |
772 | * (c) Daniel Phillips, 2001 | 855 | * (c) Daniel Phillips, 2001 |
@@ -810,7 +893,7 @@ struct ext4_iloc | |||
810 | { | 893 | { |
811 | struct buffer_head *bh; | 894 | struct buffer_head *bh; |
812 | unsigned long offset; | 895 | unsigned long offset; |
813 | unsigned long block_group; | 896 | ext4_group_t block_group; |
814 | }; | 897 | }; |
815 | 898 | ||
816 | static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) | 899 | static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) |
@@ -835,7 +918,7 @@ struct dir_private_info { | |||
835 | 918 | ||
836 | /* calculate the first block number of the group */ | 919 | /* calculate the first block number of the group */ |
837 | static inline ext4_fsblk_t | 920 | static inline ext4_fsblk_t |
838 | ext4_group_first_block_no(struct super_block *sb, unsigned long group_no) | 921 | ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) |
839 | { | 922 | { |
840 | return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + | 923 | return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + |
841 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | 924 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); |
@@ -866,21 +949,24 @@ extern unsigned int ext4_block_group(struct super_block *sb, | |||
866 | ext4_fsblk_t blocknr); | 949 | ext4_fsblk_t blocknr); |
867 | extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, | 950 | extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, |
868 | ext4_fsblk_t blocknr); | 951 | ext4_fsblk_t blocknr); |
869 | extern int ext4_bg_has_super(struct super_block *sb, int group); | 952 | extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); |
870 | extern unsigned long ext4_bg_num_gdb(struct super_block *sb, int group); | 953 | extern unsigned long ext4_bg_num_gdb(struct super_block *sb, |
954 | ext4_group_t group); | ||
871 | extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode, | 955 | extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode, |
872 | ext4_fsblk_t goal, int *errp); | 956 | ext4_fsblk_t goal, int *errp); |
873 | extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode, | 957 | extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode, |
874 | ext4_fsblk_t goal, unsigned long *count, int *errp); | 958 | ext4_fsblk_t goal, unsigned long *count, int *errp); |
959 | extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, | ||
960 | ext4_fsblk_t goal, unsigned long *count, int *errp); | ||
875 | extern void ext4_free_blocks (handle_t *handle, struct inode *inode, | 961 | extern void ext4_free_blocks (handle_t *handle, struct inode *inode, |
876 | ext4_fsblk_t block, unsigned long count); | 962 | ext4_fsblk_t block, unsigned long count, int metadata); |
877 | extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, | 963 | extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, |
878 | ext4_fsblk_t block, unsigned long count, | 964 | ext4_fsblk_t block, unsigned long count, |
879 | unsigned long *pdquot_freed_blocks); | 965 | unsigned long *pdquot_freed_blocks); |
880 | extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *); | 966 | extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *); |
881 | extern void ext4_check_blocks_bitmap (struct super_block *); | 967 | extern void ext4_check_blocks_bitmap (struct super_block *); |
882 | extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, | 968 | extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, |
883 | unsigned int block_group, | 969 | ext4_group_t block_group, |
884 | struct buffer_head ** bh); | 970 | struct buffer_head ** bh); |
885 | extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); | 971 | extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); |
886 | extern void ext4_init_block_alloc_info(struct inode *); | 972 | extern void ext4_init_block_alloc_info(struct inode *); |
@@ -911,15 +997,32 @@ extern unsigned long ext4_count_dirs (struct super_block *); | |||
911 | extern void ext4_check_inodes_bitmap (struct super_block *); | 997 | extern void ext4_check_inodes_bitmap (struct super_block *); |
912 | extern unsigned long ext4_count_free (struct buffer_head *, unsigned); | 998 | extern unsigned long ext4_count_free (struct buffer_head *, unsigned); |
913 | 999 | ||
1000 | /* mballoc.c */ | ||
1001 | extern long ext4_mb_stats; | ||
1002 | extern long ext4_mb_max_to_scan; | ||
1003 | extern int ext4_mb_init(struct super_block *, int); | ||
1004 | extern int ext4_mb_release(struct super_block *); | ||
1005 | extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, | ||
1006 | struct ext4_allocation_request *, int *); | ||
1007 | extern int ext4_mb_reserve_blocks(struct super_block *, int); | ||
1008 | extern void ext4_mb_discard_inode_preallocations(struct inode *); | ||
1009 | extern int __init init_ext4_mballoc(void); | ||
1010 | extern void exit_ext4_mballoc(void); | ||
1011 | extern void ext4_mb_free_blocks(handle_t *, struct inode *, | ||
1012 | unsigned long, unsigned long, int, unsigned long *); | ||
1013 | |||
914 | 1014 | ||
915 | /* inode.c */ | 1015 | /* inode.c */ |
916 | int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, | 1016 | int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, |
917 | struct buffer_head *bh, ext4_fsblk_t blocknr); | 1017 | struct buffer_head *bh, ext4_fsblk_t blocknr); |
918 | struct buffer_head * ext4_getblk (handle_t *, struct inode *, long, int, int *); | 1018 | struct buffer_head *ext4_getblk(handle_t *, struct inode *, |
919 | struct buffer_head * ext4_bread (handle_t *, struct inode *, int, int, int *); | 1019 | ext4_lblk_t, int, int *); |
1020 | struct buffer_head *ext4_bread(handle_t *, struct inode *, | ||
1021 | ext4_lblk_t, int, int *); | ||
920 | int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | 1022 | int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, |
921 | sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, | 1023 | ext4_lblk_t iblock, unsigned long maxblocks, |
922 | int create, int extend_disksize); | 1024 | struct buffer_head *bh_result, |
1025 | int create, int extend_disksize); | ||
923 | 1026 | ||
924 | extern void ext4_read_inode (struct inode *); | 1027 | extern void ext4_read_inode (struct inode *); |
925 | extern int ext4_write_inode (struct inode *, int); | 1028 | extern int ext4_write_inode (struct inode *, int); |
@@ -943,6 +1046,9 @@ extern int ext4_ioctl (struct inode *, struct file *, unsigned int, | |||
943 | unsigned long); | 1046 | unsigned long); |
944 | extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long); | 1047 | extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long); |
945 | 1048 | ||
1049 | /* migrate.c */ | ||
1050 | extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int, | ||
1051 | unsigned long); | ||
946 | /* namei.c */ | 1052 | /* namei.c */ |
947 | extern int ext4_orphan_add(handle_t *, struct inode *); | 1053 | extern int ext4_orphan_add(handle_t *, struct inode *); |
948 | extern int ext4_orphan_del(handle_t *, struct inode *); | 1054 | extern int ext4_orphan_del(handle_t *, struct inode *); |
@@ -965,6 +1071,12 @@ extern void ext4_abort (struct super_block *, const char *, const char *, ...) | |||
965 | extern void ext4_warning (struct super_block *, const char *, const char *, ...) | 1071 | extern void ext4_warning (struct super_block *, const char *, const char *, ...) |
966 | __attribute__ ((format (printf, 3, 4))); | 1072 | __attribute__ ((format (printf, 3, 4))); |
967 | extern void ext4_update_dynamic_rev (struct super_block *sb); | 1073 | extern void ext4_update_dynamic_rev (struct super_block *sb); |
1074 | extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, | ||
1075 | __u32 compat); | ||
1076 | extern int ext4_update_rocompat_feature(handle_t *handle, | ||
1077 | struct super_block *sb, __u32 rocompat); | ||
1078 | extern int ext4_update_incompat_feature(handle_t *handle, | ||
1079 | struct super_block *sb, __u32 incompat); | ||
968 | extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, | 1080 | extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, |
969 | struct ext4_group_desc *bg); | 1081 | struct ext4_group_desc *bg); |
970 | extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, | 1082 | extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, |
@@ -1017,6 +1129,29 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, | |||
1017 | es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); | 1129 | es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); |
1018 | } | 1130 | } |
1019 | 1131 | ||
1132 | static inline loff_t ext4_isize(struct ext4_inode *raw_inode) | ||
1133 | { | ||
1134 | return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | | ||
1135 | le32_to_cpu(raw_inode->i_size_lo); | ||
1136 | } | ||
1137 | |||
1138 | static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) | ||
1139 | { | ||
1140 | raw_inode->i_size_lo = cpu_to_le32(i_size); | ||
1141 | raw_inode->i_size_high = cpu_to_le32(i_size >> 32); | ||
1142 | } | ||
1143 | |||
1144 | static inline | ||
1145 | struct ext4_group_info *ext4_get_group_info(struct super_block *sb, | ||
1146 | ext4_group_t group) | ||
1147 | { | ||
1148 | struct ext4_group_info ***grp_info; | ||
1149 | long indexv, indexh; | ||
1150 | grp_info = EXT4_SB(sb)->s_group_info; | ||
1151 | indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); | ||
1152 | indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); | ||
1153 | return grp_info[indexv][indexh]; | ||
1154 | } | ||
1020 | 1155 | ||
1021 | 1156 | ||
1022 | #define ext4_std_error(sb, errno) \ | 1157 | #define ext4_std_error(sb, errno) \ |
@@ -1048,7 +1183,7 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations; | |||
1048 | extern int ext4_ext_tree_init(handle_t *handle, struct inode *); | 1183 | extern int ext4_ext_tree_init(handle_t *handle, struct inode *); |
1049 | extern int ext4_ext_writepage_trans_blocks(struct inode *, int); | 1184 | extern int ext4_ext_writepage_trans_blocks(struct inode *, int); |
1050 | extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | 1185 | extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, |
1051 | ext4_fsblk_t iblock, | 1186 | ext4_lblk_t iblock, |
1052 | unsigned long max_blocks, struct buffer_head *bh_result, | 1187 | unsigned long max_blocks, struct buffer_head *bh_result, |
1053 | int create, int extend_disksize); | 1188 | int create, int extend_disksize); |
1054 | extern void ext4_ext_truncate(struct inode *, struct page *); | 1189 | extern void ext4_ext_truncate(struct inode *, struct page *); |
@@ -1056,19 +1191,10 @@ extern void ext4_ext_init(struct super_block *); | |||
1056 | extern void ext4_ext_release(struct super_block *); | 1191 | extern void ext4_ext_release(struct super_block *); |
1057 | extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, | 1192 | extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, |
1058 | loff_t len); | 1193 | loff_t len); |
1059 | static inline int | 1194 | extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, |
1060 | ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | 1195 | sector_t block, unsigned long max_blocks, |
1061 | unsigned long max_blocks, struct buffer_head *bh, | 1196 | struct buffer_head *bh, int create, |
1062 | int create, int extend_disksize) | 1197 | int extend_disksize); |
1063 | { | ||
1064 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | ||
1065 | return ext4_ext_get_blocks(handle, inode, block, max_blocks, | ||
1066 | bh, create, extend_disksize); | ||
1067 | return ext4_get_blocks_handle(handle, inode, block, max_blocks, bh, | ||
1068 | create, extend_disksize); | ||
1069 | } | ||
1070 | |||
1071 | |||
1072 | #endif /* __KERNEL__ */ | 1198 | #endif /* __KERNEL__ */ |
1073 | 1199 | ||
1074 | #endif /* _LINUX_EXT4_FS_H */ | 1200 | #endif /* _LINUX_EXT4_FS_H */ |
diff --git a/include/linux/ext4_fs_extents.h b/include/linux/ext4_fs_extents.h index d2045a26195d..697da4bce6c5 100644 --- a/include/linux/ext4_fs_extents.h +++ b/include/linux/ext4_fs_extents.h | |||
@@ -124,20 +124,6 @@ struct ext4_ext_path { | |||
124 | #define EXT4_EXT_CACHE_GAP 1 | 124 | #define EXT4_EXT_CACHE_GAP 1 |
125 | #define EXT4_EXT_CACHE_EXTENT 2 | 125 | #define EXT4_EXT_CACHE_EXTENT 2 |
126 | 126 | ||
127 | /* | ||
128 | * to be called by ext4_ext_walk_space() | ||
129 | * negative retcode - error | ||
130 | * positive retcode - signal for ext4_ext_walk_space(), see below | ||
131 | * callback must return valid extent (passed or newly created) | ||
132 | */ | ||
133 | typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, | ||
134 | struct ext4_ext_cache *, | ||
135 | void *); | ||
136 | |||
137 | #define EXT_CONTINUE 0 | ||
138 | #define EXT_BREAK 1 | ||
139 | #define EXT_REPEAT 2 | ||
140 | |||
141 | 127 | ||
142 | #define EXT_MAX_BLOCK 0xffffffff | 128 | #define EXT_MAX_BLOCK 0xffffffff |
143 | 129 | ||
@@ -226,6 +212,8 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) | |||
226 | (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); | 212 | (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); |
227 | } | 213 | } |
228 | 214 | ||
215 | extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); | ||
216 | extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); | ||
229 | extern int ext4_extent_tree_init(handle_t *, struct inode *); | 217 | extern int ext4_extent_tree_init(handle_t *, struct inode *); |
230 | extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); | 218 | extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); |
231 | extern int ext4_ext_try_to_merge(struct inode *inode, | 219 | extern int ext4_ext_try_to_merge(struct inode *inode, |
@@ -233,8 +221,11 @@ extern int ext4_ext_try_to_merge(struct inode *inode, | |||
233 | struct ext4_extent *); | 221 | struct ext4_extent *); |
234 | extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); | 222 | extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); |
235 | extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); | 223 | extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); |
236 | extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *); | 224 | extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, |
237 | extern struct ext4_ext_path * ext4_ext_find_extent(struct inode *, int, struct ext4_ext_path *); | 225 | struct ext4_ext_path *); |
238 | 226 | extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, | |
227 | ext4_lblk_t *, ext4_fsblk_t *); | ||
228 | extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, | ||
229 | ext4_lblk_t *, ext4_fsblk_t *); | ||
239 | #endif /* _LINUX_EXT4_EXTENTS */ | 230 | #endif /* _LINUX_EXT4_EXTENTS */ |
240 | 231 | ||
diff --git a/include/linux/ext4_fs_i.h b/include/linux/ext4_fs_i.h index 86ddfe2089f3..d5508d3cf290 100644 --- a/include/linux/ext4_fs_i.h +++ b/include/linux/ext4_fs_i.h | |||
@@ -27,6 +27,12 @@ typedef int ext4_grpblk_t; | |||
27 | /* data type for filesystem-wide blocks number */ | 27 | /* data type for filesystem-wide blocks number */ |
28 | typedef unsigned long long ext4_fsblk_t; | 28 | typedef unsigned long long ext4_fsblk_t; |
29 | 29 | ||
30 | /* data type for file logical block number */ | ||
31 | typedef __u32 ext4_lblk_t; | ||
32 | |||
33 | /* data type for block group number */ | ||
34 | typedef unsigned long ext4_group_t; | ||
35 | |||
30 | struct ext4_reserve_window { | 36 | struct ext4_reserve_window { |
31 | ext4_fsblk_t _rsv_start; /* First byte reserved */ | 37 | ext4_fsblk_t _rsv_start; /* First byte reserved */ |
32 | ext4_fsblk_t _rsv_end; /* Last byte reserved or 0 */ | 38 | ext4_fsblk_t _rsv_end; /* Last byte reserved or 0 */ |
@@ -48,7 +54,7 @@ struct ext4_block_alloc_info { | |||
48 | * most-recently-allocated block in this file. | 54 | * most-recently-allocated block in this file. |
49 | * We use this for detecting linearly ascending allocation requests. | 55 | * We use this for detecting linearly ascending allocation requests. |
50 | */ | 56 | */ |
51 | __u32 last_alloc_logical_block; | 57 | ext4_lblk_t last_alloc_logical_block; |
52 | /* | 58 | /* |
53 | * Was i_next_alloc_goal in ext4_inode_info | 59 | * Was i_next_alloc_goal in ext4_inode_info |
54 | * is the *physical* companion to i_next_alloc_block. | 60 | * is the *physical* companion to i_next_alloc_block. |
@@ -67,7 +73,7 @@ struct ext4_block_alloc_info { | |||
67 | */ | 73 | */ |
68 | struct ext4_ext_cache { | 74 | struct ext4_ext_cache { |
69 | ext4_fsblk_t ec_start; | 75 | ext4_fsblk_t ec_start; |
70 | __u32 ec_block; | 76 | ext4_lblk_t ec_block; |
71 | __u32 ec_len; /* must be 32bit to return holes */ | 77 | __u32 ec_len; /* must be 32bit to return holes */ |
72 | __u32 ec_type; | 78 | __u32 ec_type; |
73 | }; | 79 | }; |
@@ -79,7 +85,6 @@ struct ext4_inode_info { | |||
79 | __le32 i_data[15]; /* unconverted */ | 85 | __le32 i_data[15]; /* unconverted */ |
80 | __u32 i_flags; | 86 | __u32 i_flags; |
81 | ext4_fsblk_t i_file_acl; | 87 | ext4_fsblk_t i_file_acl; |
82 | __u32 i_dir_acl; | ||
83 | __u32 i_dtime; | 88 | __u32 i_dtime; |
84 | 89 | ||
85 | /* | 90 | /* |
@@ -89,13 +94,13 @@ struct ext4_inode_info { | |||
89 | * place a file's data blocks near its inode block, and new inodes | 94 | * place a file's data blocks near its inode block, and new inodes |
90 | * near to their parent directory's inode. | 95 | * near to their parent directory's inode. |
91 | */ | 96 | */ |
92 | __u32 i_block_group; | 97 | ext4_group_t i_block_group; |
93 | __u32 i_state; /* Dynamic state flags for ext4 */ | 98 | __u32 i_state; /* Dynamic state flags for ext4 */ |
94 | 99 | ||
95 | /* block reservation info */ | 100 | /* block reservation info */ |
96 | struct ext4_block_alloc_info *i_block_alloc_info; | 101 | struct ext4_block_alloc_info *i_block_alloc_info; |
97 | 102 | ||
98 | __u32 i_dir_start_lookup; | 103 | ext4_lblk_t i_dir_start_lookup; |
99 | #ifdef CONFIG_EXT4DEV_FS_XATTR | 104 | #ifdef CONFIG_EXT4DEV_FS_XATTR |
100 | /* | 105 | /* |
101 | * Extended attributes can be read independently of the main file | 106 | * Extended attributes can be read independently of the main file |
@@ -134,16 +139,16 @@ struct ext4_inode_info { | |||
134 | __u16 i_extra_isize; | 139 | __u16 i_extra_isize; |
135 | 140 | ||
136 | /* | 141 | /* |
137 | * truncate_mutex is for serialising ext4_truncate() against | 142 | * i_data_sem is for serialising ext4_truncate() against |
138 | * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's | 143 | * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's |
139 | * data tree are chopped off during truncate. We can't do that in | 144 | * data tree are chopped off during truncate. We can't do that in |
140 | * ext4 because whenever we perform intermediate commits during | 145 | * ext4 because whenever we perform intermediate commits during |
141 | * truncate, the inode and all the metadata blocks *must* be in a | 146 | * truncate, the inode and all the metadata blocks *must* be in a |
142 | * consistent state which allows truncation of the orphans to restart | 147 | * consistent state which allows truncation of the orphans to restart |
143 | * during recovery. Hence we must fix the get_block-vs-truncate race | 148 | * during recovery. Hence we must fix the get_block-vs-truncate race |
144 | * by other means, so we have truncate_mutex. | 149 | * by other means, so we have i_data_sem. |
145 | */ | 150 | */ |
146 | struct mutex truncate_mutex; | 151 | struct rw_semaphore i_data_sem; |
147 | struct inode vfs_inode; | 152 | struct inode vfs_inode; |
148 | 153 | ||
149 | unsigned long i_ext_generation; | 154 | unsigned long i_ext_generation; |
@@ -153,6 +158,10 @@ struct ext4_inode_info { | |||
153 | * struct timespec i_{a,c,m}time in the generic inode. | 158 | * struct timespec i_{a,c,m}time in the generic inode. |
154 | */ | 159 | */ |
155 | struct timespec i_crtime; | 160 | struct timespec i_crtime; |
161 | |||
162 | /* mballoc */ | ||
163 | struct list_head i_prealloc_list; | ||
164 | spinlock_t i_prealloc_lock; | ||
156 | }; | 165 | }; |
157 | 166 | ||
158 | #endif /* _LINUX_EXT4_FS_I */ | 167 | #endif /* _LINUX_EXT4_FS_I */ |
diff --git a/include/linux/ext4_fs_sb.h b/include/linux/ext4_fs_sb.h index b40e827cd495..abaae2c8cccf 100644 --- a/include/linux/ext4_fs_sb.h +++ b/include/linux/ext4_fs_sb.h | |||
@@ -35,9 +35,10 @@ struct ext4_sb_info { | |||
35 | unsigned long s_itb_per_group; /* Number of inode table blocks per group */ | 35 | unsigned long s_itb_per_group; /* Number of inode table blocks per group */ |
36 | unsigned long s_gdb_count; /* Number of group descriptor blocks */ | 36 | unsigned long s_gdb_count; /* Number of group descriptor blocks */ |
37 | unsigned long s_desc_per_block; /* Number of group descriptors per block */ | 37 | unsigned long s_desc_per_block; /* Number of group descriptors per block */ |
38 | unsigned long s_groups_count; /* Number of groups in the fs */ | 38 | ext4_group_t s_groups_count; /* Number of groups in the fs */ |
39 | unsigned long s_overhead_last; /* Last calculated overhead */ | 39 | unsigned long s_overhead_last; /* Last calculated overhead */ |
40 | unsigned long s_blocks_last; /* Last seen block count */ | 40 | unsigned long s_blocks_last; /* Last seen block count */ |
41 | loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ | ||
41 | struct buffer_head * s_sbh; /* Buffer containing the super block */ | 42 | struct buffer_head * s_sbh; /* Buffer containing the super block */ |
42 | struct ext4_super_block * s_es; /* Pointer to the super block in the buffer */ | 43 | struct ext4_super_block * s_es; /* Pointer to the super block in the buffer */ |
43 | struct buffer_head ** s_group_desc; | 44 | struct buffer_head ** s_group_desc; |
@@ -90,6 +91,58 @@ struct ext4_sb_info { | |||
90 | unsigned long s_ext_blocks; | 91 | unsigned long s_ext_blocks; |
91 | unsigned long s_ext_extents; | 92 | unsigned long s_ext_extents; |
92 | #endif | 93 | #endif |
94 | |||
95 | /* for buddy allocator */ | ||
96 | struct ext4_group_info ***s_group_info; | ||
97 | struct inode *s_buddy_cache; | ||
98 | long s_blocks_reserved; | ||
99 | spinlock_t s_reserve_lock; | ||
100 | struct list_head s_active_transaction; | ||
101 | struct list_head s_closed_transaction; | ||
102 | struct list_head s_committed_transaction; | ||
103 | spinlock_t s_md_lock; | ||
104 | tid_t s_last_transaction; | ||
105 | unsigned short *s_mb_offsets, *s_mb_maxs; | ||
106 | |||
107 | /* tunables */ | ||
108 | unsigned long s_stripe; | ||
109 | unsigned long s_mb_stream_request; | ||
110 | unsigned long s_mb_max_to_scan; | ||
111 | unsigned long s_mb_min_to_scan; | ||
112 | unsigned long s_mb_stats; | ||
113 | unsigned long s_mb_order2_reqs; | ||
114 | unsigned long s_mb_group_prealloc; | ||
115 | /* where last allocation was done - for stream allocation */ | ||
116 | unsigned long s_mb_last_group; | ||
117 | unsigned long s_mb_last_start; | ||
118 | |||
119 | /* history to debug policy */ | ||
120 | struct ext4_mb_history *s_mb_history; | ||
121 | int s_mb_history_cur; | ||
122 | int s_mb_history_max; | ||
123 | int s_mb_history_num; | ||
124 | struct proc_dir_entry *s_mb_proc; | ||
125 | spinlock_t s_mb_history_lock; | ||
126 | int s_mb_history_filter; | ||
127 | |||
128 | /* stats for buddy allocator */ | ||
129 | spinlock_t s_mb_pa_lock; | ||
130 | atomic_t s_bal_reqs; /* number of reqs with len > 1 */ | ||
131 | atomic_t s_bal_success; /* we found long enough chunks */ | ||
132 | atomic_t s_bal_allocated; /* in blocks */ | ||
133 | atomic_t s_bal_ex_scanned; /* total extents scanned */ | ||
134 | atomic_t s_bal_goals; /* goal hits */ | ||
135 | atomic_t s_bal_breaks; /* too long searches */ | ||
136 | atomic_t s_bal_2orders; /* 2^order hits */ | ||
137 | spinlock_t s_bal_lock; | ||
138 | unsigned long s_mb_buddies_generated; | ||
139 | unsigned long long s_mb_generation_time; | ||
140 | atomic_t s_mb_lost_chunks; | ||
141 | atomic_t s_mb_preallocated; | ||
142 | atomic_t s_mb_discarded; | ||
143 | |||
144 | /* locality groups */ | ||
145 | struct ext4_locality_group *s_locality_groups; | ||
93 | }; | 146 | }; |
94 | 147 | ||
95 | #endif /* _LINUX_EXT4_FS_SB */ | 148 | #endif /* _LINUX_EXT4_FS_SB */ |
diff --git a/include/linux/fs.h b/include/linux/fs.h index 21398a5d688d..a516b6716870 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -124,6 +124,7 @@ extern int dir_notify_enable; | |||
124 | #define MS_SHARED (1<<20) /* change to shared */ | 124 | #define MS_SHARED (1<<20) /* change to shared */ |
125 | #define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ | 125 | #define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ |
126 | #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ | 126 | #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ |
127 | #define MS_I_VERSION (1<<23) /* Update inode I_version field */ | ||
127 | #define MS_ACTIVE (1<<30) | 128 | #define MS_ACTIVE (1<<30) |
128 | #define MS_NOUSER (1<<31) | 129 | #define MS_NOUSER (1<<31) |
129 | 130 | ||
@@ -173,6 +174,7 @@ extern int dir_notify_enable; | |||
173 | ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) | 174 | ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) |
174 | #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) | 175 | #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) |
175 | #define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME) | 176 | #define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME) |
177 | #define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION) | ||
176 | 178 | ||
177 | #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) | 179 | #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) |
178 | #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) | 180 | #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) |
@@ -599,7 +601,7 @@ struct inode { | |||
599 | uid_t i_uid; | 601 | uid_t i_uid; |
600 | gid_t i_gid; | 602 | gid_t i_gid; |
601 | dev_t i_rdev; | 603 | dev_t i_rdev; |
602 | unsigned long i_version; | 604 | u64 i_version; |
603 | loff_t i_size; | 605 | loff_t i_size; |
604 | #ifdef __NEED_I_SIZE_ORDERED | 606 | #ifdef __NEED_I_SIZE_ORDERED |
605 | seqcount_t i_size_seqcount; | 607 | seqcount_t i_size_seqcount; |
@@ -1394,6 +1396,21 @@ static inline void inode_dec_link_count(struct inode *inode) | |||
1394 | mark_inode_dirty(inode); | 1396 | mark_inode_dirty(inode); |
1395 | } | 1397 | } |
1396 | 1398 | ||
1399 | /** | ||
1400 | * inode_inc_iversion - increments i_version | ||
1401 | * @inode: inode that need to be updated | ||
1402 | * | ||
1403 | * Every time the inode is modified, the i_version field will be incremented. | ||
1404 | * The filesystem has to be mounted with i_version flag | ||
1405 | */ | ||
1406 | |||
1407 | static inline void inode_inc_iversion(struct inode *inode) | ||
1408 | { | ||
1409 | spin_lock(&inode->i_lock); | ||
1410 | inode->i_version++; | ||
1411 | spin_unlock(&inode->i_lock); | ||
1412 | } | ||
1413 | |||
1397 | extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry); | 1414 | extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry); |
1398 | static inline void file_accessed(struct file *file) | 1415 | static inline void file_accessed(struct file *file) |
1399 | { | 1416 | { |
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 06ef11457051..2cbf6fdb1799 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h | |||
@@ -149,6 +149,28 @@ typedef struct journal_header_s | |||
149 | __be32 h_sequence; | 149 | __be32 h_sequence; |
150 | } journal_header_t; | 150 | } journal_header_t; |
151 | 151 | ||
152 | /* | ||
153 | * Checksum types. | ||
154 | */ | ||
155 | #define JBD2_CRC32_CHKSUM 1 | ||
156 | #define JBD2_MD5_CHKSUM 2 | ||
157 | #define JBD2_SHA1_CHKSUM 3 | ||
158 | |||
159 | #define JBD2_CRC32_CHKSUM_SIZE 4 | ||
160 | |||
161 | #define JBD2_CHECKSUM_BYTES (32 / sizeof(u32)) | ||
162 | /* | ||
163 | * Commit block header for storing transactional checksums: | ||
164 | */ | ||
165 | struct commit_header { | ||
166 | __be32 h_magic; | ||
167 | __be32 h_blocktype; | ||
168 | __be32 h_sequence; | ||
169 | unsigned char h_chksum_type; | ||
170 | unsigned char h_chksum_size; | ||
171 | unsigned char h_padding[2]; | ||
172 | __be32 h_chksum[JBD2_CHECKSUM_BYTES]; | ||
173 | }; | ||
152 | 174 | ||
153 | /* | 175 | /* |
154 | * The block tag: used to describe a single buffer in the journal. | 176 | * The block tag: used to describe a single buffer in the journal. |
@@ -242,31 +264,25 @@ typedef struct journal_superblock_s | |||
242 | ((j)->j_format_version >= 2 && \ | 264 | ((j)->j_format_version >= 2 && \ |
243 | ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask)))) | 265 | ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask)))) |
244 | 266 | ||
245 | #define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 | 267 | #define JBD2_FEATURE_COMPAT_CHECKSUM 0x00000001 |
246 | #define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 | 268 | |
269 | #define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 | ||
270 | #define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 | ||
271 | #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 | ||
247 | 272 | ||
248 | /* Features known to this kernel version: */ | 273 | /* Features known to this kernel version: */ |
249 | #define JBD2_KNOWN_COMPAT_FEATURES 0 | 274 | #define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM |
250 | #define JBD2_KNOWN_ROCOMPAT_FEATURES 0 | 275 | #define JBD2_KNOWN_ROCOMPAT_FEATURES 0 |
251 | #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ | 276 | #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ |
252 | JBD2_FEATURE_INCOMPAT_64BIT) | 277 | JBD2_FEATURE_INCOMPAT_64BIT | \ |
278 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) | ||
253 | 279 | ||
254 | #ifdef __KERNEL__ | 280 | #ifdef __KERNEL__ |
255 | 281 | ||
256 | #include <linux/fs.h> | 282 | #include <linux/fs.h> |
257 | #include <linux/sched.h> | 283 | #include <linux/sched.h> |
258 | 284 | ||
259 | #define JBD2_ASSERTIONS | 285 | #define J_ASSERT(assert) BUG_ON(!(assert)) |
260 | #ifdef JBD2_ASSERTIONS | ||
261 | #define J_ASSERT(assert) \ | ||
262 | do { \ | ||
263 | if (!(assert)) { \ | ||
264 | printk (KERN_EMERG \ | ||
265 | "Assertion failure in %s() at %s:%d: \"%s\"\n", \ | ||
266 | __FUNCTION__, __FILE__, __LINE__, # assert); \ | ||
267 | BUG(); \ | ||
268 | } \ | ||
269 | } while (0) | ||
270 | 286 | ||
271 | #if defined(CONFIG_BUFFER_DEBUG) | 287 | #if defined(CONFIG_BUFFER_DEBUG) |
272 | void buffer_assertion_failure(struct buffer_head *bh); | 288 | void buffer_assertion_failure(struct buffer_head *bh); |
@@ -282,10 +298,6 @@ void buffer_assertion_failure(struct buffer_head *bh); | |||
282 | #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) | 298 | #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) |
283 | #endif | 299 | #endif |
284 | 300 | ||
285 | #else | ||
286 | #define J_ASSERT(assert) do { } while (0) | ||
287 | #endif /* JBD2_ASSERTIONS */ | ||
288 | |||
289 | #if defined(JBD2_PARANOID_IOFAIL) | 301 | #if defined(JBD2_PARANOID_IOFAIL) |
290 | #define J_EXPECT(expr, why...) J_ASSERT(expr) | 302 | #define J_EXPECT(expr, why...) J_ASSERT(expr) |
291 | #define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr) | 303 | #define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr) |
@@ -406,9 +418,23 @@ struct handle_s | |||
406 | unsigned int h_sync: 1; /* sync-on-close */ | 418 | unsigned int h_sync: 1; /* sync-on-close */ |
407 | unsigned int h_jdata: 1; /* force data journaling */ | 419 | unsigned int h_jdata: 1; /* force data journaling */ |
408 | unsigned int h_aborted: 1; /* fatal error on handle */ | 420 | unsigned int h_aborted: 1; /* fatal error on handle */ |
421 | |||
422 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
423 | struct lockdep_map h_lockdep_map; | ||
424 | #endif | ||
409 | }; | 425 | }; |
410 | 426 | ||
411 | 427 | ||
428 | /* | ||
429 | * Some stats for checkpoint phase | ||
430 | */ | ||
431 | struct transaction_chp_stats_s { | ||
432 | unsigned long cs_chp_time; | ||
433 | unsigned long cs_forced_to_close; | ||
434 | unsigned long cs_written; | ||
435 | unsigned long cs_dropped; | ||
436 | }; | ||
437 | |||
412 | /* The transaction_t type is the guts of the journaling mechanism. It | 438 | /* The transaction_t type is the guts of the journaling mechanism. It |
413 | * tracks a compound transaction through its various states: | 439 | * tracks a compound transaction through its various states: |
414 | * | 440 | * |
@@ -456,6 +482,8 @@ struct transaction_s | |||
456 | /* | 482 | /* |
457 | * Transaction's current state | 483 | * Transaction's current state |
458 | * [no locking - only kjournald2 alters this] | 484 | * [no locking - only kjournald2 alters this] |
485 | * [j_list_lock] guards transition of a transaction into T_FINISHED | ||
486 | * state and subsequent call of __jbd2_journal_drop_transaction() | ||
459 | * FIXME: needs barriers | 487 | * FIXME: needs barriers |
460 | * KLUDGE: [use j_state_lock] | 488 | * KLUDGE: [use j_state_lock] |
461 | */ | 489 | */ |
@@ -544,6 +572,21 @@ struct transaction_s | |||
544 | spinlock_t t_handle_lock; | 572 | spinlock_t t_handle_lock; |
545 | 573 | ||
546 | /* | 574 | /* |
575 | * Longest time some handle had to wait for running transaction | ||
576 | */ | ||
577 | unsigned long t_max_wait; | ||
578 | |||
579 | /* | ||
580 | * When transaction started | ||
581 | */ | ||
582 | unsigned long t_start; | ||
583 | |||
584 | /* | ||
585 | * Checkpointing stats [j_checkpoint_sem] | ||
586 | */ | ||
587 | struct transaction_chp_stats_s t_chp_stats; | ||
588 | |||
589 | /* | ||
547 | * Number of outstanding updates running on this transaction | 590 | * Number of outstanding updates running on this transaction |
548 | * [t_handle_lock] | 591 | * [t_handle_lock] |
549 | */ | 592 | */ |
@@ -574,6 +617,39 @@ struct transaction_s | |||
574 | 617 | ||
575 | }; | 618 | }; |
576 | 619 | ||
620 | struct transaction_run_stats_s { | ||
621 | unsigned long rs_wait; | ||
622 | unsigned long rs_running; | ||
623 | unsigned long rs_locked; | ||
624 | unsigned long rs_flushing; | ||
625 | unsigned long rs_logging; | ||
626 | |||
627 | unsigned long rs_handle_count; | ||
628 | unsigned long rs_blocks; | ||
629 | unsigned long rs_blocks_logged; | ||
630 | }; | ||
631 | |||
632 | struct transaction_stats_s { | ||
633 | int ts_type; | ||
634 | unsigned long ts_tid; | ||
635 | union { | ||
636 | struct transaction_run_stats_s run; | ||
637 | struct transaction_chp_stats_s chp; | ||
638 | } u; | ||
639 | }; | ||
640 | |||
641 | #define JBD2_STATS_RUN 1 | ||
642 | #define JBD2_STATS_CHECKPOINT 2 | ||
643 | |||
644 | static inline unsigned long | ||
645 | jbd2_time_diff(unsigned long start, unsigned long end) | ||
646 | { | ||
647 | if (end >= start) | ||
648 | return end - start; | ||
649 | |||
650 | return end + (MAX_JIFFY_OFFSET - start); | ||
651 | } | ||
652 | |||
577 | /** | 653 | /** |
578 | * struct journal_s - The journal_s type is the concrete type associated with | 654 | * struct journal_s - The journal_s type is the concrete type associated with |
579 | * journal_t. | 655 | * journal_t. |
@@ -635,6 +711,12 @@ struct transaction_s | |||
635 | * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the | 711 | * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the |
636 | * number that will fit in j_blocksize | 712 | * number that will fit in j_blocksize |
637 | * @j_last_sync_writer: most recent pid which did a synchronous write | 713 | * @j_last_sync_writer: most recent pid which did a synchronous write |
714 | * @j_history: Buffer storing the transactions statistics history | ||
715 | * @j_history_max: Maximum number of transactions in the statistics history | ||
716 | * @j_history_cur: Current number of transactions in the statistics history | ||
717 | * @j_history_lock: Protect the transactions statistics history | ||
718 | * @j_proc_entry: procfs entry for the jbd statistics directory | ||
719 | * @j_stats: Overall statistics | ||
638 | * @j_private: An opaque pointer to fs-private information. | 720 | * @j_private: An opaque pointer to fs-private information. |
639 | */ | 721 | */ |
640 | 722 | ||
@@ -827,6 +909,19 @@ struct journal_s | |||
827 | pid_t j_last_sync_writer; | 909 | pid_t j_last_sync_writer; |
828 | 910 | ||
829 | /* | 911 | /* |
912 | * Journal statistics | ||
913 | */ | ||
914 | struct transaction_stats_s *j_history; | ||
915 | int j_history_max; | ||
916 | int j_history_cur; | ||
917 | /* | ||
918 | * Protect the transactions statistics history | ||
919 | */ | ||
920 | spinlock_t j_history_lock; | ||
921 | struct proc_dir_entry *j_proc_entry; | ||
922 | struct transaction_stats_s j_stats; | ||
923 | |||
924 | /* | ||
830 | * An opaque pointer to fs-private information. ext3 puts its | 925 | * An opaque pointer to fs-private information. ext3 puts its |
831 | * superblock pointer here | 926 | * superblock pointer here |
832 | */ | 927 | */ |
@@ -932,6 +1027,8 @@ extern int jbd2_journal_check_available_features | |||
932 | (journal_t *, unsigned long, unsigned long, unsigned long); | 1027 | (journal_t *, unsigned long, unsigned long, unsigned long); |
933 | extern int jbd2_journal_set_features | 1028 | extern int jbd2_journal_set_features |
934 | (journal_t *, unsigned long, unsigned long, unsigned long); | 1029 | (journal_t *, unsigned long, unsigned long, unsigned long); |
1030 | extern void jbd2_journal_clear_features | ||
1031 | (journal_t *, unsigned long, unsigned long, unsigned long); | ||
935 | extern int jbd2_journal_create (journal_t *); | 1032 | extern int jbd2_journal_create (journal_t *); |
936 | extern int jbd2_journal_load (journal_t *journal); | 1033 | extern int jbd2_journal_load (journal_t *journal); |
937 | extern void jbd2_journal_destroy (journal_t *); | 1034 | extern void jbd2_journal_destroy (journal_t *); |
diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c index bda0d71a2514..78ccd73a8841 100644 --- a/lib/find_next_bit.c +++ b/lib/find_next_bit.c | |||
@@ -178,4 +178,47 @@ found_middle_swap: | |||
178 | 178 | ||
179 | EXPORT_SYMBOL(generic_find_next_zero_le_bit); | 179 | EXPORT_SYMBOL(generic_find_next_zero_le_bit); |
180 | 180 | ||
181 | unsigned long generic_find_next_le_bit(const unsigned long *addr, unsigned | ||
182 | long size, unsigned long offset) | ||
183 | { | ||
184 | const unsigned long *p = addr + BITOP_WORD(offset); | ||
185 | unsigned long result = offset & ~(BITS_PER_LONG - 1); | ||
186 | unsigned long tmp; | ||
187 | |||
188 | if (offset >= size) | ||
189 | return size; | ||
190 | size -= result; | ||
191 | offset &= (BITS_PER_LONG - 1UL); | ||
192 | if (offset) { | ||
193 | tmp = ext2_swabp(p++); | ||
194 | tmp &= (~0UL << offset); | ||
195 | if (size < BITS_PER_LONG) | ||
196 | goto found_first; | ||
197 | if (tmp) | ||
198 | goto found_middle; | ||
199 | size -= BITS_PER_LONG; | ||
200 | result += BITS_PER_LONG; | ||
201 | } | ||
202 | |||
203 | while (size & ~(BITS_PER_LONG - 1)) { | ||
204 | tmp = *(p++); | ||
205 | if (tmp) | ||
206 | goto found_middle_swap; | ||
207 | result += BITS_PER_LONG; | ||
208 | size -= BITS_PER_LONG; | ||
209 | } | ||
210 | if (!size) | ||
211 | return result; | ||
212 | tmp = ext2_swabp(p); | ||
213 | found_first: | ||
214 | tmp &= (~0UL >> (BITS_PER_LONG - size)); | ||
215 | if (tmp == 0UL) /* Are any bits set? */ | ||
216 | return result + size; /* Nope. */ | ||
217 | found_middle: | ||
218 | return result + __ffs(tmp); | ||
219 | |||
220 | found_middle_swap: | ||
221 | return result + __ffs(ext2_swab(tmp)); | ||
222 | } | ||
223 | EXPORT_SYMBOL(generic_find_next_le_bit); | ||
181 | #endif /* __BIG_ENDIAN */ | 224 | #endif /* __BIG_ENDIAN */ |