Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/acl.c          2
-rw-r--r--  fs/ext4/balloc.c       5
-rw-r--r--  fs/ext4/ext4.h        12
-rw-r--r--  fs/ext4/ext4_jbd2.h   11
-rw-r--r--  fs/ext4/extents.c    223
-rw-r--r--  fs/ext4/fsync.c       33
-rw-r--r--  fs/ext4/ialloc.c       8
-rw-r--r--  fs/ext4/inode.c      467
-rw-r--r--  fs/ext4/ioctl.c       15
-rw-r--r--  fs/ext4/mballoc.c     36
-rw-r--r--  fs/ext4/mballoc.h      2
-rw-r--r--  fs/ext4/migrate.c     12
-rw-r--r--  fs/ext4/namei.c       13
-rw-r--r--  fs/ext4/page-io.c     16
-rw-r--r--  fs/ext4/resize.c      12
-rw-r--r--  fs/ext4/super.c      126
-rw-r--r--  fs/ext4/xattr.c        4
17 files changed, 579 insertions(+), 418 deletions(-)
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index e0270d1f8d82..21eacd7b7d79 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -433,7 +433,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
 		return -EINVAL;
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 
 	if (value) {
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index adf96b822781..1c67139ad4b4 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -21,6 +21,8 @@
 #include "ext4_jbd2.h"
 #include "mballoc.h"
 
+#include <trace/events/ext4.h>
+
 /*
  * balloc.c contains the blocks allocation and deallocation routines
  */
@@ -342,6 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	 * We do it here so the bitmap uptodate bit
 	 * get set with buffer lock held.
 	 */
+	trace_ext4_read_block_bitmap_load(sb, block_group);
 	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
@@ -544,7 +547,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 *
 * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
 * it is profitable to retry the operation, this function will wait
- * for the current or commiting transaction to complete, and then
+ * for the current or committing transaction to complete, and then
 * return TRUE.
 *
 * if the total number of retries exceed three times, return FALSE.
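
For reference, a minimal sketch of the caller-side pattern this comment
describes (illustrative, not part of this patch; the allocating operation and
the retries counter are assumptions):

	int retries = 0;
	int ret;
retry:
	ret = do_allocating_operation(inode);	/* hypothetical callee */
	if (ret == -ENOSPC &&
	    ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
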
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3aa0b72b3b94..4daaf2b753f4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -923,14 +923,14 @@ struct ext4_inode_info {
 #define test_opt2(sb, opt)		(EXT4_SB(sb)->s_mount_opt2 & \
 					 EXT4_MOUNT2_##opt)
 
-#define ext4_set_bit			ext2_set_bit
+#define ext4_set_bit			__test_and_set_bit_le
 #define ext4_set_bit_atomic		ext2_set_bit_atomic
-#define ext4_clear_bit			ext2_clear_bit
+#define ext4_clear_bit			__test_and_clear_bit_le
 #define ext4_clear_bit_atomic		ext2_clear_bit_atomic
-#define ext4_test_bit			ext2_test_bit
-#define ext4_find_first_zero_bit	ext2_find_first_zero_bit
-#define ext4_find_next_zero_bit		ext2_find_next_zero_bit
-#define ext4_find_next_bit		ext2_find_next_bit
+#define ext4_test_bit			test_bit_le
+#define ext4_find_first_zero_bit	find_first_zero_bit_le
+#define ext4_find_next_zero_bit		find_next_zero_bit_le
+#define ext4_find_next_bit		find_next_bit_le
 
 /*
  * Maximal mount counts between two filesystem checks
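
ext4's on-disk bitmaps are little-endian regardless of host byte order, which
is why the macros above can map directly to the generic *_le bitops instead of
the ext2 wrappers. A sketch of typical usage against a bitmap buffer
(illustrative; bh and goal are assumptions):

	/* Find and claim a free bit in a group's block bitmap. */
	ext4_grpblk_t bit = ext4_find_next_zero_bit(bh->b_data,
				EXT4_BLOCKS_PER_GROUP(sb), goal);
	if (bit < EXT4_BLOCKS_PER_GROUP(sb) &&
	    !ext4_set_bit(bit, bh->b_data))
		/* old value was 0, so this caller owns the bit */
		;
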
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d8b992e658c1..d0f53538a57f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -86,8 +86,8 @@
 
 #ifdef CONFIG_QUOTA
 /* Amount of blocks needed for quota update - we know that the structure was
- * allocated so we need to update only inode+data */
-#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
+ * allocated so we need to update only data block */
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
 /* Amount of blocks needed for quota insert/delete - we do some block writes
  * but inode, sb and group updates are done only once */
 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
@@ -202,13 +202,6 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
 	return 1;
 }
 
-static inline void ext4_journal_release_buffer(handle_t *handle,
-						struct buffer_head *bh)
-{
-	if (ext4_handle_valid(handle))
-		jbd2_journal_release_buffer(handle, bh);
-}
-
 static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 {
 	return ext4_journal_start_sb(inode->i_sb, nblocks);
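
Halving EXT4_QUOTA_TRANS_BLOCKS() shrinks every journal handle that reserves
quota credits, since the quota structure is known to be allocated and only its
data block needs updating. A sketch of how the macro feeds a credit estimate
(illustrative; the exact formula varies by call site):

	/* Quota credits stack on top of the base data credits. */
	int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
		      EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
	handle_t *handle = ext4_journal_start(inode, credits);
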
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7516fb9c0bd5..4890d6f3ad15 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,6 +44,8 @@
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
+#include <trace/events/ext4.h>
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -664,6 +666,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		if (unlikely(!bh))
 			goto err;
 		if (!bh_uptodate_or_lock(bh)) {
+			trace_ext4_ext_load_extent(inode, block,
+						path[ppos].p_block);
 			if (bh_submit_read(bh) < 0) {
 				put_bh(bh);
 				goto err;
@@ -1034,7 +1038,7 @@ cleanup:
 	for (i = 0; i < depth; i++) {
 		if (!ablocks[i])
 			continue;
-		ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
+		ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
 				 EXT4_FREE_BLOCKS_METADATA);
 	}
 }
@@ -1725,7 +1729,7 @@ repeat:
 		BUG_ON(npath->p_depth != path->p_depth);
 		eh = npath[depth].p_hdr;
 		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
-			ext_debug("next leaf isnt full(%d)\n",
+			ext_debug("next leaf isn't full(%d)\n",
 				  le16_to_cpu(eh->eh_entries));
 			path = npath;
 			goto repeat;
@@ -2059,7 +2063,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 	if (err)
 		return err;
 	ext_debug("index is empty, remove it, free block %llu\n", leaf);
-	ext4_free_blocks(handle, inode, 0, leaf, 1,
+	ext4_free_blocks(handle, inode, NULL, leaf, 1,
 			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
 	return err;
 }
@@ -2156,7 +2160,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		start = ext4_ext_pblock(ex) + ee_len - num;
 		ext_debug("free last %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, 0, start, num, flags);
+		ext4_free_blocks(handle, inode, NULL, start, num, flags);
 	} else if (from == le32_to_cpu(ex->ee_block)
 		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2529,7 +2533,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
- * extent into multiple extents (upto three - one initialized and two
+ * extent into multiple extents (up to three - one initialized and two
  * uninitialized).
  * There are three possibilities:
  *   a> There is no split required: Entire extent should be initialized
@@ -3108,14 +3112,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
 {
 	int i, depth;
 	struct ext4_extent_header *eh;
-	struct ext4_extent *ex, *last_ex;
+	struct ext4_extent *last_ex;
 
 	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
 		return 0;
 
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
-	ex = path[depth].p_ext;
 
 	if (unlikely(!eh->eh_entries)) {
 		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
@@ -3171,7 +3174,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 						path, flags);
 		/*
 		 * Flag the inode(non aio case) or end_io struct (aio case)
-		 * that this IO needs to convertion to written when IO is
+		 * that this IO needs to conversion to written when IO is
 		 * completed
 		 */
 		if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
@@ -3295,9 +3298,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			struct ext4_map_blocks *map, int flags)
 {
 	struct ext4_ext_path *path = NULL;
-	struct ext4_extent_header *eh;
 	struct ext4_extent newex, *ex;
-	ext4_fsblk_t newblock;
+	ext4_fsblk_t newblock = 0;
 	int err = 0, depth, ret;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
@@ -3305,6 +3307,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
 	ext_debug("blocks %u/%u requested for inode %lu\n",
 		  map->m_lblk, map->m_len, inode->i_ino);
+	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 
 	/* check in cache */
 	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
@@ -3352,7 +3355,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		err = -EIO;
 		goto out2;
 	}
-	eh = path[depth].p_hdr;
 
 	ex = path[depth].p_ext;
 	if (ex) {
@@ -3458,10 +3460,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		ext4_ext_mark_uninitialized(&newex);
 		/*
 		 * io_end structure was created for every IO write to an
-		 * uninitialized extent. To avoid unecessary conversion,
+		 * uninitialized extent. To avoid unnecessary conversion,
 		 * here we flag the IO that really needs the conversion.
 		 * For non asycn direct IO case, flag the inode state
-		 * that we need to perform convertion when IO is done.
+		 * that we need to perform conversion when IO is done.
 		 */
 		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
 			if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
@@ -3485,7 +3487,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		/* not a good idea to call discard here directly,
 		 * but otherwise we'd need to call it every free() */
 		ext4_discard_preallocations(inode);
-		ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
+		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
 				 ext4_ext_get_actual_len(&newex), 0);
 		goto out2;
 	}
@@ -3525,6 +3527,8 @@ out2:
 		ext4_ext_drop_refs(path);
 		kfree(path);
 	}
+	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
+		newblock, map->m_len, err ? err : allocated);
 	return err ? err : allocated;
 }
 
@@ -3658,6 +3662,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EOPNOTSUPP;
 
+	trace_ext4_fallocate_enter(inode, offset, len, mode);
 	map.m_lblk = offset >> blkbits;
 	/*
 	 * We can't just convert len to max_blocks because
@@ -3673,6 +3678,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	ret = inode_newsize_ok(inode, (len + offset));
 	if (ret) {
 		mutex_unlock(&inode->i_mutex);
+		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
 		return ret;
 	}
 retry:
@@ -3717,6 +3723,8 @@ retry:
 		goto retry;
 	}
 	mutex_unlock(&inode->i_mutex);
+	trace_ext4_fallocate_exit(inode, offset, max_blocks,
+				ret > 0 ? ret2 : ret);
 	return ret > 0 ? ret2 : ret;
 }
 
@@ -3775,6 +3783,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 	}
 	return ret > 0 ? ret2 : ret;
 }
+
 /*
  * Callback function called for each extent to gather FIEMAP information.
  */
@@ -3782,38 +3791,162 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 			struct ext4_ext_cache *newex, struct ext4_extent *ex,
 		       void *data)
 {
-	struct fiemap_extent_info *fieinfo = data;
-	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
 	__u64	logical;
 	__u64	physical;
 	__u64	length;
+	loff_t	size;
 	__u32	flags = 0;
-	int	error;
+	int	ret = 0;
+	struct fiemap_extent_info *fieinfo = data;
+	unsigned char blksize_bits;
 
-	logical = (__u64)newex->ec_block << blksize_bits;
+	blksize_bits = inode->i_sb->s_blocksize_bits;
+	logical = (__u64)newex->ec_block << blksize_bits;
 
 	if (newex->ec_start == 0) {
-		pgoff_t offset;
-		struct page *page;
+		/*
+		 * No extent in extent-tree contains block @newex->ec_start,
+		 * then the block may stay in 1)a hole or 2)delayed-extent.
+		 *
+		 * Holes or delayed-extents are processed as follows.
+		 * 1. lookup dirty pages with specified range in pagecache.
+		 *    If no page is got, then there is no delayed-extent and
+		 *    return with EXT_CONTINUE.
+		 * 2. find the 1st mapped buffer,
+		 * 3. check if the mapped buffer is both in the request range
+		 *    and a delayed buffer. If not, there is no delayed-extent,
+		 *    then return.
+		 * 4. a delayed-extent is found, the extent will be collected.
+		 */
+		ext4_lblk_t	end = 0;
+		pgoff_t		last_offset;
+		pgoff_t		offset;
+		pgoff_t		index;
+		struct page	**pages = NULL;
 		struct buffer_head *bh = NULL;
+		struct buffer_head *head = NULL;
+		unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
+
+		pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		if (pages == NULL)
+			return -ENOMEM;
 
 		offset = logical >> PAGE_SHIFT;
-		page = find_get_page(inode->i_mapping, offset);
-		if (!page || !page_has_buffers(page))
-			return EXT_CONTINUE;
+repeat:
+		last_offset = offset;
+		head = NULL;
+		ret = find_get_pages_tag(inode->i_mapping, &offset,
+					PAGECACHE_TAG_DIRTY, nr_pages, pages);
+
+		if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
+			/* First time, try to find a mapped buffer. */
+			if (ret == 0) {
+out:
+				for (index = 0; index < ret; index++)
+					page_cache_release(pages[index]);
+				/* just a hole. */
+				kfree(pages);
+				return EXT_CONTINUE;
+			}
 
-		bh = page_buffers(page);
+			/* Try to find the 1st mapped buffer. */
+			end = ((__u64)pages[0]->index << PAGE_SHIFT) >>
+				  blksize_bits;
+			if (!page_has_buffers(pages[0]))
+				goto out;
+			head = page_buffers(pages[0]);
+			if (!head)
+				goto out;
 
-		if (!bh)
-			return EXT_CONTINUE;
+			bh = head;
+			do {
+				if (buffer_mapped(bh)) {
+					/* get the 1st mapped buffer. */
+					if (end > newex->ec_block +
+						newex->ec_len)
+						/* The buffer is out of
+						 * the request range.
+						 */
+						goto out;
+					goto found_mapped_buffer;
+				}
+				bh = bh->b_this_page;
+				end++;
+			} while (bh != head);
 
-		if (buffer_delay(bh)) {
-			flags |= FIEMAP_EXTENT_DELALLOC;
-			page_cache_release(page);
+			/* No mapped buffer found. */
+			goto out;
 		} else {
-			page_cache_release(page);
-			return EXT_CONTINUE;
+			/*Find contiguous delayed buffers. */
+			if (ret > 0 && pages[0]->index == last_offset)
+				head = page_buffers(pages[0]);
+			bh = head;
 		}
+
+found_mapped_buffer:
+		if (bh != NULL && buffer_delay(bh)) {
+			/* 1st or contiguous delayed buffer found. */
+			if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
+				/*
+				 * 1st delayed buffer found, record
+				 * the start of extent.
+				 */
+				flags |= FIEMAP_EXTENT_DELALLOC;
+				newex->ec_block = end;
+				logical = (__u64)end << blksize_bits;
+			}
+			/* Find contiguous delayed buffers. */
+			do {
+				if (!buffer_delay(bh))
+					goto found_delayed_extent;
+				bh = bh->b_this_page;
+				end++;
+			} while (bh != head);
+
+			for (index = 1; index < ret; index++) {
+				if (!page_has_buffers(pages[index])) {
+					bh = NULL;
+					break;
+				}
+				head = page_buffers(pages[index]);
+				if (!head) {
+					bh = NULL;
+					break;
+				}
+				if (pages[index]->index !=
+					pages[0]->index + index) {
+					/* Blocks are not contiguous. */
+					bh = NULL;
+					break;
+				}
+				bh = head;
+				do {
+					if (!buffer_delay(bh))
+						/* Delayed-extent ends. */
+						goto found_delayed_extent;
+					bh = bh->b_this_page;
+					end++;
+				} while (bh != head);
+			}
+		} else if (!(flags & FIEMAP_EXTENT_DELALLOC))
+			/* a hole found. */
+			goto out;
+
+found_delayed_extent:
+		newex->ec_len = min(end - newex->ec_block,
+						(ext4_lblk_t)EXT_INIT_MAX_LEN);
+		if (ret == nr_pages && bh != NULL &&
+			newex->ec_len < EXT_INIT_MAX_LEN &&
+			buffer_delay(bh)) {
+			/* Have not collected an extent and continue. */
+			for (index = 0; index < ret; index++)
+				page_cache_release(pages[index]);
+			goto repeat;
+		}
+
+		for (index = 0; index < ret; index++)
+			page_cache_release(pages[index]);
+		kfree(pages);
 	}
 
 	physical = (__u64)newex->ec_start << blksize_bits;
@@ -3822,32 +3955,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 	if (ex && ext4_ext_is_uninitialized(ex))
 		flags |= FIEMAP_EXTENT_UNWRITTEN;
 
-	/*
-	 * If this extent reaches EXT_MAX_BLOCK, it must be last.
-	 *
-	 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
-	 * this also indicates no more allocated blocks.
-	 *
-	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
-	 */
-	if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
-	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
-		loff_t size = i_size_read(inode);
-		loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
-
+	size = i_size_read(inode);
+	if (logical + length >= size)
 		flags |= FIEMAP_EXTENT_LAST;
-		if ((flags & FIEMAP_EXTENT_DELALLOC) &&
-		    logical+length > size)
-			length = (size - logical + bs - 1) & ~(bs-1);
-	}
 
-	error = fiemap_fill_next_extent(fieinfo, logical, physical,
+	ret = fiemap_fill_next_extent(fieinfo, logical, physical,
 					length, flags);
-	if (error < 0)
-		return error;
-	if (error == 1)
+	if (ret < 0)
+		return ret;
+	if (ret == 1)
 		return EXT_BREAK;
-
 	return EXT_CONTINUE;
 }
 
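With the callback rewritten as above, FIEMAP on ext4 reports delayed-allocation
ranges (dirty pages that have no physical block yet) with
FIEMAP_EXTENT_DELALLOC set. A self-contained userspace sketch that exercises
this (illustrative; the 32-extent buffer is an arbitrary choice):

	#include <stdio.h>
	#include <stdlib.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <linux/fiemap.h>

	int main(int argc, char **argv)
	{
		struct fiemap *fm;
		unsigned int i;
		int fd;

		if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		/* header plus room for 32 extent records */
		fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
		if (!fm)
			return 1;
		fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
		fm->fm_extent_count = 32;
		if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
			return 1;
		for (i = 0; i < fm->fm_mapped_extents; i++)
			printf("logical %llu len %llu%s\n",
			       (unsigned long long)fm->fm_extents[i].fe_logical,
			       (unsigned long long)fm->fm_extents[i].fe_length,
			       (fm->fm_extents[i].fe_flags & FIEMAP_EXTENT_DELALLOC)
					? " (delalloc)" : "");
		close(fd);
		free(fm);
		return 0;
	}
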
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 7829b287822a..e9473cbe80df 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -101,7 +101,7 @@ extern int ext4_flush_completed_IO(struct inode *inode)
 	 * to the work-to-be schedule is freed.
 	 *
 	 * Thus we need to keep the io structure still valid here after
-	 * convertion finished. The io structure has a flag to
+	 * conversion finished. The io structure has a flag to
 	 * avoid double converting from both fsync and background work
 	 * queue work.
 	 */
@@ -125,9 +125,11 @@ extern int ext4_flush_completed_IO(struct inode *inode)
  * the parent directory's parent as well, and so on recursively, if
  * they are also freshly created.
  */
-static void ext4_sync_parent(struct inode *inode)
+static int ext4_sync_parent(struct inode *inode)
 {
+	struct writeback_control wbc;
 	struct dentry *dentry = NULL;
+	int ret = 0;
 
 	while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
 		ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
@@ -136,8 +138,17 @@ static void ext4_sync_parent(struct inode *inode)
 		if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
 			break;
 		inode = dentry->d_parent->d_inode;
-		sync_mapping_buffers(inode->i_mapping);
+		ret = sync_mapping_buffers(inode->i_mapping);
+		if (ret)
+			break;
+		memset(&wbc, 0, sizeof(wbc));
+		wbc.sync_mode = WB_SYNC_ALL;
+		wbc.nr_to_write = 0;         /* only write out the inode */
+		ret = sync_inode(inode, &wbc);
+		if (ret)
+			break;
 	}
+	return ret;
 }
 
 /*
@@ -164,20 +175,20 @@ int ext4_sync_file(struct file *file, int datasync)
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
-	trace_ext4_sync_file(file, datasync);
+	trace_ext4_sync_file_enter(file, datasync);
 
 	if (inode->i_sb->s_flags & MS_RDONLY)
 		return 0;
 
 	ret = ext4_flush_completed_IO(inode);
 	if (ret < 0)
-		return ret;
+		goto out;
 
 	if (!journal) {
 		ret = generic_file_fsync(file, datasync);
 		if (!ret && !list_empty(&inode->i_dentry))
-			ext4_sync_parent(inode);
-		return ret;
+			ret = ext4_sync_parent(inode);
+		goto out;
 	}
 
 	/*
@@ -194,8 +205,10 @@ int ext4_sync_file(struct file *file, int datasync)
 	 * (they were dirtied by commit). But that's OK - the blocks are
 	 * safe in-journal, which is all fsync() needs to ensure.
 	 */
-	if (ext4_should_journal_data(inode))
-		return ext4_force_commit(inode->i_sb);
+	if (ext4_should_journal_data(inode)) {
+		ret = ext4_force_commit(inode->i_sb);
+		goto out;
+	}
 
 	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
 	if (jbd2_log_start_commit(journal, commit_tid)) {
@@ -215,5 +228,7 @@ int ext4_sync_file(struct file *file, int datasync)
 		ret = jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ out:
+	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
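
ext4_sync_parent() now propagates errors and writes the parent inode itself on
the no-journal path for freshly created files. Portable applications still do
the parent flush explicitly; a minimal userspace sketch of that classic pattern
(illustrative):

	#include <fcntl.h>
	#include <unistd.h>

	/* Durably create a file: fsync the file, then its directory. */
	static int create_durable(const char *dir, const char *path)
	{
		int fd, dfd, ret;

		fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
		if (fd < 0)
			return -1;
		/* ... write the file's contents here ... */
		ret = fsync(fd);		/* flush data and inode */
		close(fd);
		if (ret)
			return -1;

		dfd = open(dir, O_RDONLY | O_DIRECTORY);
		if (dfd < 0)
			return -1;
		ret = fsync(dfd);		/* flush the directory entry */
		close(dfd);
		return ret;
	}
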
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 78b79e1bd7ed..21bb2f61e502 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -152,6 +152,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	 * We do it here so the bitmap uptodate bit
 	 * get set with buffer lock held.
 	 */
+	trace_ext4_load_inode_bitmap(sb, block_group);
 	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
@@ -649,7 +650,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 		*group = parent_group + flex_size;
 		if (*group > ngroups)
 			*group = 0;
-		return find_group_orlov(sb, parent, group, mode, 0);
+		return find_group_orlov(sb, parent, group, mode, NULL);
 	}
 
 	/*
@@ -1054,6 +1055,11 @@ got:
 		}
 	}
 
+	if (ext4_handle_valid(handle)) {
+		ei->i_sync_tid = handle->h_transaction->t_tid;
+		ei->i_datasync_tid = handle->h_transaction->t_tid;
+	}
+
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext4_std_error(sb, err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9f7f9e49914f..f2fa5e8a582c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -173,7 +173,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
 	BUG_ON(EXT4_JOURNAL(inode) == NULL);
 	jbd_debug(2, "restarting handle %p\n", handle);
 	up_write(&EXT4_I(inode)->i_data_sem);
-	ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+	ret = ext4_journal_restart(handle, nblocks);
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_discard_preallocations(inode);
 
@@ -720,7 +720,7 @@ allocated:
 	return ret;
 failed_out:
 	for (i = 0; i < index; i++)
-		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
+		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
 	return ret;
 }
 
@@ -823,20 +823,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 	return err;
 failed:
 	/* Allocation failed, free what we already allocated */
-	ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
+	ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
 	for (i = 1; i <= n ; i++) {
 		/*
 		 * branch[i].bh is newly allocated, so there is no
 		 * need to revoke the block, which is why we don't
 		 * need to set EXT4_FREE_BLOCKS_METADATA.
 		 */
-		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
 				 EXT4_FREE_BLOCKS_FORGET);
 	}
 	for (i = n+1; i < indirect_blks; i++)
-		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
+		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
 
-	ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
+	ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
 
 	return err;
 }
@@ -924,7 +924,7 @@ err_out:
 		ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
 				 EXT4_FREE_BLOCKS_FORGET);
 	}
-	ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
 			 blks, 0);
 
 	return err;
@@ -973,6 +973,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 	int count = 0;
 	ext4_fsblk_t first_block = 0;
 
+	trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
 	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
 	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
@@ -1058,6 +1059,8 @@ cleanup:
 		partial--;
 	}
 out:
+	trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
+				map->m_pblk, map->m_len, err);
 	return err;
 }
 
@@ -2060,7 +2063,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
-			int commit_write = 0, redirty_page = 0;
+			int commit_write = 0, skip_page = 0;
 			struct page *page = pvec.pages[i];
 
 			index = page->index;
@@ -2086,14 +2089,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 			 * If the page does not have buffers (for
 			 * whatever reason), try to create them using
 			 * __block_write_begin.  If this fails,
-			 * redirty the page and move on.
+			 * skip the page and move on.
 			 */
 			if (!page_has_buffers(page)) {
 				if (__block_write_begin(page, 0, len,
 						noalloc_get_block_write)) {
-				redirty_page:
-					redirty_page_for_writepage(mpd->wbc,
-								   page);
+				skip_page:
 					unlock_page(page);
 					continue;
 				}
@@ -2104,7 +2105,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 			block_start = 0;
 			do {
 				if (!bh)
-					goto redirty_page;
+					goto skip_page;
 				if (map && (cur_logical >= map->m_lblk) &&
 				    (cur_logical <= (map->m_lblk +
 						     (map->m_len - 1)))) {
@@ -2120,22 +2121,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 					clear_buffer_unwritten(bh);
 				}
 
-				/* redirty page if block allocation undone */
+				/* skip page if block allocation undone */
 				if (buffer_delay(bh) || buffer_unwritten(bh))
-					redirty_page = 1;
+					skip_page = 1;
 				bh = bh->b_this_page;
 				block_start += bh->b_size;
 				cur_logical++;
 				pblock++;
 			} while (bh != page_bufs);
 
-			if (redirty_page)
-				goto redirty_page;
+			if (skip_page)
+				goto skip_page;
 
 			if (commit_write)
 				/* mark the buffer_heads as dirty & uptodate */
 				block_commit_write(page, 0, len);
 
+			clear_page_dirty_for_io(page);
 			/*
 			 * Delalloc doesn't support data journalling,
 			 * but eventually maybe we'll lift this
@@ -2165,8 +2167,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 	return ret;
 }
 
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
-					sector_t logical, long blk_cnt)
+static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
 {
 	int nr_pages, i;
 	pgoff_t index, end;
@@ -2174,9 +2175,8 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
 
-	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	end   = (logical + blk_cnt - 1) >>
-				(PAGE_CACHE_SHIFT - inode->i_blkbits);
+	index = mpd->first_page;
+	end   = mpd->next_page - 1;
 	while (index <= end) {
 		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
 		if (nr_pages == 0)
@@ -2279,9 +2279,8 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 		err = blks;
 		/*
 		 * If get block returns EAGAIN or ENOSPC and there
-		 * appears to be free blocks we will call
-		 * ext4_writepage() for all of the pages which will
-		 * just redirty the pages.
+		 * appears to be free blocks we will just let
+		 * mpage_da_submit_io() unlock all of the pages.
 		 */
 		if (err == -EAGAIN)
 			goto submit_io;
@@ -2312,8 +2311,10 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 			ext4_print_free_blocks(mpd->inode);
 		}
 		/* invalidate all the pages */
-		ext4_da_block_invalidatepages(mpd, next,
-				mpd->b_size >> mpd->inode->i_blkbits);
+		ext4_da_block_invalidatepages(mpd);
+
+		/* Mark this page range as having been completed */
+		mpd->io_done = 1;
 		return;
 	}
 	BUG_ON(blks == 0);
@@ -2438,102 +2439,6 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
- * __mpage_da_writepage - finds extent of pages and blocks
- *
- * @page: page to consider
- * @wbc: not used, we just follow rules
- * @data: context
- *
- * The function finds extents of pages and scan them for all blocks.
- */
-static int __mpage_da_writepage(struct page *page,
-				struct writeback_control *wbc,
-				struct mpage_da_data *mpd)
-{
-	struct inode *inode = mpd->inode;
-	struct buffer_head *bh, *head;
-	sector_t logical;
-
-	/*
-	 * Can we merge this page to current extent?
-	 */
-	if (mpd->next_page != page->index) {
-		/*
-		 * Nope, we can't. So, we map non-allocated blocks
-		 * and start IO on them
-		 */
-		if (mpd->next_page != mpd->first_page) {
-			mpage_da_map_and_submit(mpd);
-			/*
-			 * skip rest of the page in the page_vec
-			 */
-			redirty_page_for_writepage(wbc, page);
-			unlock_page(page);
-			return MPAGE_DA_EXTENT_TAIL;
-		}
-
-		/*
-		 * Start next extent of pages ...
-		 */
-		mpd->first_page = page->index;
-
-		/*
-		 * ... and blocks
-		 */
-		mpd->b_size = 0;
-		mpd->b_state = 0;
-		mpd->b_blocknr = 0;
-	}
-
-	mpd->next_page = page->index + 1;
-	logical = (sector_t) page->index <<
-		  (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-	if (!page_has_buffers(page)) {
-		mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
-				       (1 << BH_Dirty) | (1 << BH_Uptodate));
-		if (mpd->io_done)
-			return MPAGE_DA_EXTENT_TAIL;
-	} else {
-		/*
-		 * Page with regular buffer heads, just add all dirty ones
-		 */
-		head = page_buffers(page);
-		bh = head;
-		do {
-			BUG_ON(buffer_locked(bh));
-			/*
-			 * We need to try to allocate
-			 * unmapped blocks in the same page.
-			 * Otherwise we won't make progress
-			 * with the page in ext4_writepage
-			 */
-			if (ext4_bh_delay_or_unwritten(NULL, bh)) {
-				mpage_add_bh_to_extent(mpd, logical,
-						       bh->b_size,
-						       bh->b_state);
-				if (mpd->io_done)
-					return MPAGE_DA_EXTENT_TAIL;
-			} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
-				/*
-				 * mapped dirty buffer. We need to update
-				 * the b_state because we look at
-				 * b_state in mpage_da_map_blocks. We don't
-				 * update b_size because if we find an
-				 * unmapped buffer_head later we need to
-				 * use the b_state flag of that buffer_head.
-				 */
-				if (mpd->b_size == 0)
-					mpd->b_state = bh->b_state & BH_FLAGS;
-			}
-			logical++;
-		} while ((bh = bh->b_this_page) != head);
-	}
-
-	return 0;
-}
-
-/*
  * This is a special get_blocks_t callback which is used by
  * ext4_da_write_begin().  It will either return mapped block or
  * reserve space for a single block.
@@ -2684,7 +2589,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
  * because we should have holes filled from ext4_page_mkwrite(). We even don't
  * need to file the inode to the transaction's list in ordered mode because if
  * we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), noone guarantees in which
+ * we are writing back data modified via mmap(), no one guarantees in which
  * transaction the data will hit the disk. In case we are journaling data, we
  * cannot start transaction directly because transaction start ranks above page
  * lock so we have to do some magic.
@@ -2786,7 +2691,7 @@ static int ext4_writepage(struct page *page,
 
 /*
  * This is called via ext4_da_writepages() to
- * calulate the total number of credits to reserve to fit
+ * calculate the total number of credits to reserve to fit
  * a single extent allocation into a single transaction,
  * ext4_da_writpeages() will loop calling this before
  * the block allocation.
@@ -2811,27 +2716,27 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
 
 /*
  * write_cache_pages_da - walk the list of dirty pages of the given
- * address space and call the callback function (which usually writes
- * the pages).
- *
- * This is a forked version of write_cache_pages().  Differences:
- *	Range cyclic is ignored.
- *	no_nrwrite_index_update is always presumed true
+ * address space and accumulate pages that need writing, and call
+ * mpage_da_map_and_submit to map a single contiguous memory region
+ * and then write them.
  */
 static int write_cache_pages_da(struct address_space *mapping,
 				struct writeback_control *wbc,
 				struct mpage_da_data *mpd,
 				pgoff_t *done_index)
 {
-	int ret = 0;
-	int done = 0;
+	struct buffer_head	*bh, *head;
+	struct inode		*inode = mapping->host;
 	struct pagevec		pvec;
-	unsigned		nr_pages;
-	pgoff_t			index;
-	pgoff_t			end;		/* Inclusive */
+	unsigned int		nr_pages;
+	sector_t		logical;
+	pgoff_t			index, end;
 	long			nr_to_write = wbc->nr_to_write;
-	int			tag;
+	int			i, tag, ret = 0;
 
+	memset(mpd, 0, sizeof(struct mpage_da_data));
+	mpd->wbc = wbc;
+	mpd->inode = inode;
 	pagevec_init(&pvec, 0);
 	index = wbc->range_start >> PAGE_CACHE_SHIFT;
 	end = wbc->range_end >> PAGE_CACHE_SHIFT;
@@ -2842,13 +2747,11 @@ static int write_cache_pages_da(struct address_space *mapping,
 		tag = PAGECACHE_TAG_DIRTY;
 
 	*done_index = index;
-	while (!done && (index <= end)) {
-		int i;
-
+	while (index <= end) {
 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
 			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
-			break;
+			return 0;
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
@@ -2860,60 +2763,100 @@ static int write_cache_pages_da(struct address_space *mapping,
 			 * mapping. However, page->index will not change
 			 * because we have a reference on the page.
 			 */
-			if (page->index > end) {
-				done = 1;
-				break;
-			}
+			if (page->index > end)
+				goto out;
 
 			*done_index = page->index + 1;
 
+			/*
+			 * If we can't merge this page, and we have
+			 * accumulated an contiguous region, write it
+			 */
+			if ((mpd->next_page != page->index) &&
+			    (mpd->next_page != mpd->first_page)) {
+				mpage_da_map_and_submit(mpd);
+				goto ret_extent_tail;
+			}
+
 			lock_page(page);
 
 			/*
-			 * Page truncated or invalidated. We can freely skip it
-			 * then, even for data integrity operations: the page
-			 * has disappeared concurrently, so there could be no
-			 * real expectation of this data interity operation
-			 * even if there is now a new, dirty page at the same
-			 * pagecache address.
+			 * If the page is no longer dirty, or its
+			 * mapping no longer corresponds to inode we
+			 * are writing (which means it has been
+			 * truncated or invalidated), or the page is
+			 * already under writeback and we are not
+			 * doing a data integrity writeback, skip the page
 			 */
-			if (unlikely(page->mapping != mapping)) {
-continue_unlock:
+			if (!PageDirty(page) ||
+			    (PageWriteback(page) &&
+			     (wbc->sync_mode == WB_SYNC_NONE)) ||
+			    unlikely(page->mapping != mapping)) {
 				unlock_page(page);
 				continue;
 			}
 
-			if (!PageDirty(page)) {
-				/* someone wrote it for us */
-				goto continue_unlock;
-			}
-
-			if (PageWriteback(page)) {
-				if (wbc->sync_mode != WB_SYNC_NONE)
-					wait_on_page_writeback(page);
-				else
-					goto continue_unlock;
-			}
+			if (PageWriteback(page))
+				wait_on_page_writeback(page);
 
 			BUG_ON(PageWriteback(page));
-			if (!clear_page_dirty_for_io(page))
-				goto continue_unlock;
 
-			ret = __mpage_da_writepage(page, wbc, mpd);
-			if (unlikely(ret)) {
-				if (ret == AOP_WRITEPAGE_ACTIVATE) {
-					unlock_page(page);
-					ret = 0;
-				} else {
-					done = 1;
-					break;
-				}
+			if (mpd->next_page != page->index)
+				mpd->first_page = page->index;
+			mpd->next_page = page->index + 1;
+			logical = (sector_t) page->index <<
+				(PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+			if (!page_has_buffers(page)) {
+				mpage_add_bh_to_extent(mpd, logical,
+						       PAGE_CACHE_SIZE,
+						       (1 << BH_Dirty) | (1 << BH_Uptodate));
+				if (mpd->io_done)
+					goto ret_extent_tail;
+			} else {
+				/*
+				 * Page with regular buffer heads,
+				 * just add all dirty ones
+				 */
+				head = page_buffers(page);
+				bh = head;
+				do {
+					BUG_ON(buffer_locked(bh));
+					/*
+					 * We need to try to allocate
+					 * unmapped blocks in the same page.
+					 * Otherwise we won't make progress
+					 * with the page in ext4_writepage
+					 */
+					if (ext4_bh_delay_or_unwritten(NULL, bh)) {
+						mpage_add_bh_to_extent(mpd, logical,
+								       bh->b_size,
+								       bh->b_state);
+						if (mpd->io_done)
+							goto ret_extent_tail;
+					} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
+						/*
+						 * mapped dirty buffer. We need
+						 * to update the b_state
+						 * because we look at b_state
+						 * in mpage_da_map_blocks. We
+						 * don't update b_size because
+						 * if we find an unmapped
+						 * buffer_head later we need to
+						 * use the b_state flag of that
+						 * buffer_head.
+						 */
+						if (mpd->b_size == 0)
+							mpd->b_state = bh->b_state & BH_FLAGS;
+					}
+					logical++;
+				} while ((bh = bh->b_this_page) != head);
 			}
 
 			if (nr_to_write > 0) {
 				nr_to_write--;
 				if (nr_to_write == 0 &&
-				    wbc->sync_mode == WB_SYNC_NONE) {
+				    wbc->sync_mode == WB_SYNC_NONE)
 					/*
 					 * We stop writing back only if we are
 					 * not doing integrity sync. In case of
@@ -2924,14 +2867,18 @@ continue_unlock:
 					 * pages, but have not synced all of the
 					 * old dirty pages.
 					 */
-					done = 1;
-					break;
-				}
+					goto out;
 			}
 		}
 		pagevec_release(&pvec);
 		cond_resched();
 	}
+	return 0;
+ret_extent_tail:
+	ret = MPAGE_DA_EXTENT_TAIL;
+out:
+	pagevec_release(&pvec);
+	cond_resched();
 	return ret;
 }
 
@@ -2945,7 +2892,6 @@ static int ext4_da_writepages(struct address_space *mapping,
 	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
 	int pages_written = 0;
-	long pages_skipped;
 	unsigned int max_pages;
 	int range_cyclic, cycled = 1, io_done = 0;
 	int needed_blocks, ret = 0;
@@ -3028,11 +2974,6 @@ static int ext4_da_writepages(struct address_space *mapping,
 		wbc->nr_to_write = desired_nr_to_write;
 	}
 
-	mpd.wbc = wbc;
-	mpd.inode = mapping->host;
-
-	pages_skipped = wbc->pages_skipped;
-
 retry:
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		tag_pages_for_writeback(mapping, index, end);
@@ -3059,22 +3000,10 @@ retry:
 		}
 
 		/*
-		 * Now call __mpage_da_writepage to find the next
+		 * Now call write_cache_pages_da() to find the next
 		 * contiguous region of logical blocks that need
-		 * blocks to be allocated by ext4.  We don't actually
-		 * submit the blocks for I/O here, even though
-		 * write_cache_pages thinks it will, and will set the
-		 * pages as clean for write before calling
-		 * __mpage_da_writepage().
+		 * blocks to be allocated by ext4 and submit them.
 		 */
-		mpd.b_size = 0;
-		mpd.b_state = 0;
-		mpd.b_blocknr = 0;
-		mpd.first_page = 0;
-		mpd.next_page = 0;
-		mpd.io_done = 0;
-		mpd.pages_written = 0;
-		mpd.retval = 0;
 		ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
 		/*
 		 * If we have a contiguous extent of pages and we
@@ -3096,7 +3025,6 @@ retry:
 			 * and try again
 			 */
 			jbd2_journal_force_commit_nested(sbi->s_journal);
-			wbc->pages_skipped = pages_skipped;
 			ret = 0;
 		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
 			/*
@@ -3104,7 +3032,6 @@ retry:
 			 * rest of the pages
 			 */
 			pages_written += mpd.pages_written;
-			wbc->pages_skipped = pages_skipped;
 			ret = 0;
 			io_done = 1;
 		} else if (wbc->nr_to_write)
@@ -3122,11 +3049,6 @@ retry:
 		wbc->range_end  = mapping->writeback_index - 1;
 		goto retry;
 	}
-	if (pages_skipped != wbc->pages_skipped)
-		ext4_msg(inode->i_sb, KERN_CRIT,
-			 "This should not happen leaving %s "
-			 "with nr_to_write = %ld ret = %d",
-			 __func__, wbc->nr_to_write, ret);
 
 	/* Update index */
 	wbc->range_cyclic = range_cyclic;
@@ -3383,7 +3305,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
 	 * the pages by calling redirty_page_for_writepage() but that
 	 * would be ugly in the extreme.  So instead we would need to
 	 * replicate parts of the code in the above functions,
-	 * simplifying them becuase we wouldn't actually intend to
+	 * simplifying them because we wouldn't actually intend to
 	 * write out the pages, but rather only collect contiguous
 	 * logical block extents, call the multi-block allocator, and
 	 * then update the buffer heads with the block allocations.
@@ -3460,6 +3382,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 
 static int ext4_readpage(struct file *file, struct page *page)
 {
+	trace_ext4_readpage(page);
 	return mpage_readpage(page, ext4_get_block);
 }
 
@@ -3494,6 +3417,8 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
 {
 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 
+	trace_ext4_invalidatepage(page, offset);
+
 	/*
 	 * free any io_end structure allocated for buffers to be discarded
 	 */
@@ -3515,6 +3440,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 {
 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 
+	trace_ext4_releasepage(page);
+
 	WARN_ON(PageChecked(page));
 	if (!page_has_buffers(page))
 		return 0;
@@ -3768,7 +3695,7 @@ retry:
 *
 * The unwrritten extents will be converted to written when DIO is completed.
 * For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the convertion
+ * set up an end_io call back function, which will do the conversion
 * when async direct IO completed.
 *
 * If the O_DIRECT write will extend the file then add this inode to the
@@ -3791,7 +3718,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3791 * We could direct write to holes and fallocate. 3718 * We could direct write to holes and fallocate.
3792 * 3719 *
3793 * Allocated blocks to fill the hole are marked as uninitialized 3720 * Allocated blocks to fill the hole are marked as uninitialized
3794 * to prevent paralel buffered read to expose the stale data 3721 * to prevent parallel buffered read to expose the stale data
3795 * before DIO complete the data IO. 3722 * before DIO complete the data IO.
3796 * 3723 *
3797 * As to previously fallocated extents, ext4 get_block 3724 * As to previously fallocated extents, ext4 get_block
@@ -3852,7 +3779,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3852 int err; 3779 int err;
3853 /* 3780 /*
3854 * for non AIO case, since the IO is already 3781 * for non AIO case, since the IO is already
3855 * completed, we could do the convertion right here 3782 * completed, we could do the conversion right here
3856 */ 3783 */
3857 err = ext4_convert_unwritten_extents(inode, 3784 err = ext4_convert_unwritten_extents(inode,
3858 offset, ret); 3785 offset, ret);
@@ -3873,11 +3800,16 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3873{ 3800{
3874 struct file *file = iocb->ki_filp; 3801 struct file *file = iocb->ki_filp;
3875 struct inode *inode = file->f_mapping->host; 3802 struct inode *inode = file->f_mapping->host;
3803 ssize_t ret;
3876 3804
3805 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
3877 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3806 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3878 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3807 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3879 3808 else
3880 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3809 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3810 trace_ext4_direct_IO_exit(inode, offset,
3811 iov_length(iov, nr_segs), rw, ret);
3812 return ret;
3881} 3813}
3882 3814
3883/* 3815/*
@@ -3903,7 +3835,6 @@ static const struct address_space_operations ext4_ordered_aops = {
3903 .readpage = ext4_readpage, 3835 .readpage = ext4_readpage,
3904 .readpages = ext4_readpages, 3836 .readpages = ext4_readpages,
3905 .writepage = ext4_writepage, 3837 .writepage = ext4_writepage,
3906 .sync_page = block_sync_page,
3907 .write_begin = ext4_write_begin, 3838 .write_begin = ext4_write_begin,
3908 .write_end = ext4_ordered_write_end, 3839 .write_end = ext4_ordered_write_end,
3909 .bmap = ext4_bmap, 3840 .bmap = ext4_bmap,
@@ -3919,7 +3850,6 @@ static const struct address_space_operations ext4_writeback_aops = {
3919 .readpage = ext4_readpage, 3850 .readpage = ext4_readpage,
3920 .readpages = ext4_readpages, 3851 .readpages = ext4_readpages,
3921 .writepage = ext4_writepage, 3852 .writepage = ext4_writepage,
3922 .sync_page = block_sync_page,
3923 .write_begin = ext4_write_begin, 3853 .write_begin = ext4_write_begin,
3924 .write_end = ext4_writeback_write_end, 3854 .write_end = ext4_writeback_write_end,
3925 .bmap = ext4_bmap, 3855 .bmap = ext4_bmap,
@@ -3935,7 +3865,6 @@ static const struct address_space_operations ext4_journalled_aops = {
3935 .readpage = ext4_readpage, 3865 .readpage = ext4_readpage,
3936 .readpages = ext4_readpages, 3866 .readpages = ext4_readpages,
3937 .writepage = ext4_writepage, 3867 .writepage = ext4_writepage,
3938 .sync_page = block_sync_page,
3939 .write_begin = ext4_write_begin, 3868 .write_begin = ext4_write_begin,
3940 .write_end = ext4_journalled_write_end, 3869 .write_end = ext4_journalled_write_end,
3941 .set_page_dirty = ext4_journalled_set_page_dirty, 3870 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3951,7 +3880,6 @@ static const struct address_space_operations ext4_da_aops = {
3951 .readpages = ext4_readpages, 3880 .readpages = ext4_readpages,
3952 .writepage = ext4_writepage, 3881 .writepage = ext4_writepage,
3953 .writepages = ext4_da_writepages, 3882 .writepages = ext4_da_writepages,
3954 .sync_page = block_sync_page,
3955 .write_begin = ext4_da_write_begin, 3883 .write_begin = ext4_da_write_begin,
3956 .write_end = ext4_da_write_end, 3884 .write_end = ext4_da_write_end,
3957 .bmap = ext4_bmap, 3885 .bmap = ext4_bmap,
@@ -4098,7 +4026,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
4098 * 4026 *
4099 * When we do truncate() we may have to clean the ends of several 4027 * When we do truncate() we may have to clean the ends of several
4100 * indirect blocks but leave the blocks themselves alive. Block is 4028 * indirect blocks but leave the blocks themselves alive. Block is
4101 * partially truncated if some data below the new i_size is refered 4029 * partially truncated if some data below the new i_size is referred
4102 * from it (and it is on the path to the first completely truncated 4030 * from it (and it is on the path to the first completely truncated
4103 * data block, indeed). We have to free the top of that path along 4031 * data block, indeed). We have to free the top of that path along
4104 * with everything to the right of the path. Since no allocation 4032 * with everything to the right of the path. Since no allocation
@@ -4177,6 +4105,9 @@ no_top:
4177 * 4105 *
4178 * We release `count' blocks on disk, but (last - first) may be greater 4106 * We release `count' blocks on disk, but (last - first) may be greater
4179 * than `count' because there can be holes in there. 4107 * than `count' because there can be holes in there.
4108 *
4109 * Return 0 on success, 1 on invalid block range
4110 * and < 0 on fatal error.
4180 */ 4111 */
4181static int ext4_clear_blocks(handle_t *handle, struct inode *inode, 4112static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4182 struct buffer_head *bh, 4113 struct buffer_head *bh,
@@ -4203,33 +4134,32 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4203 if (bh) { 4134 if (bh) {
4204 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4135 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4205 err = ext4_handle_dirty_metadata(handle, inode, bh); 4136 err = ext4_handle_dirty_metadata(handle, inode, bh);
4206 if (unlikely(err)) { 4137 if (unlikely(err))
4207 ext4_std_error(inode->i_sb, err); 4138 goto out_err;
4208 return 1;
4209 }
4210 } 4139 }
4211 err = ext4_mark_inode_dirty(handle, inode); 4140 err = ext4_mark_inode_dirty(handle, inode);
4212 if (unlikely(err)) { 4141 if (unlikely(err))
4213 ext4_std_error(inode->i_sb, err); 4142 goto out_err;
4214 return 1;
4215 }
4216 err = ext4_truncate_restart_trans(handle, inode, 4143 err = ext4_truncate_restart_trans(handle, inode,
4217 blocks_for_truncate(inode)); 4144 blocks_for_truncate(inode));
4218 if (unlikely(err)) { 4145 if (unlikely(err))
4219 ext4_std_error(inode->i_sb, err); 4146 goto out_err;
4220 return 1;
4221 }
4222 if (bh) { 4147 if (bh) {
4223 BUFFER_TRACE(bh, "retaking write access"); 4148 BUFFER_TRACE(bh, "retaking write access");
4224 ext4_journal_get_write_access(handle, bh); 4149 err = ext4_journal_get_write_access(handle, bh);
4150 if (unlikely(err))
4151 goto out_err;
4225 } 4152 }
4226 } 4153 }
4227 4154
4228 for (p = first; p < last; p++) 4155 for (p = first; p < last; p++)
4229 *p = 0; 4156 *p = 0;
4230 4157
4231 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); 4158 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4232 return 0; 4159 return 0;
4160out_err:
4161 ext4_std_error(inode->i_sb, err);
4162 return err;
4233} 4163}
4234 4164
4235/** 4165/**
@@ -4240,7 +4170,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4240 * @first: array of block numbers 4170 * @first: array of block numbers
4241 * @last: points immediately past the end of array 4171 * @last: points immediately past the end of array
4242 * 4172 *
4243 * We are freeing all blocks refered from that array (numbers are stored as 4173 * We are freeing all blocks referred from that array (numbers are stored as
4244 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 4174 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4245 * 4175 *
4246 * We accumulate contiguous runs of blocks to free. Conveniently, if these 4176 * We accumulate contiguous runs of blocks to free. Conveniently, if these
@@ -4263,7 +4193,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4263 ext4_fsblk_t nr; /* Current block # */ 4193 ext4_fsblk_t nr; /* Current block # */
4264 __le32 *p; /* Pointer into inode/ind 4194 __le32 *p; /* Pointer into inode/ind
4265 for current block */ 4195 for current block */
4266 int err; 4196 int err = 0;
4267 4197
4268 if (this_bh) { /* For indirect block */ 4198 if (this_bh) { /* For indirect block */
4269 BUFFER_TRACE(this_bh, "get_write_access"); 4199 BUFFER_TRACE(this_bh, "get_write_access");
@@ -4285,9 +4215,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4285 } else if (nr == block_to_free + count) { 4215 } else if (nr == block_to_free + count) {
4286 count++; 4216 count++;
4287 } else { 4217 } else {
4288 if (ext4_clear_blocks(handle, inode, this_bh, 4218 err = ext4_clear_blocks(handle, inode, this_bh,
4289 block_to_free, count, 4219 block_to_free, count,
4290 block_to_free_p, p)) 4220 block_to_free_p, p);
4221 if (err)
4291 break; 4222 break;
4292 block_to_free = nr; 4223 block_to_free = nr;
4293 block_to_free_p = p; 4224 block_to_free_p = p;
@@ -4296,9 +4227,12 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4296 } 4227 }
4297 } 4228 }
4298 4229
4299 if (count > 0) 4230 if (!err && count > 0)
4300 ext4_clear_blocks(handle, inode, this_bh, block_to_free, 4231 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4301 count, block_to_free_p, p); 4232 count, block_to_free_p, p);
4233 if (err < 0)
4234 /* fatal error */
4235 return;
4302 4236
4303 if (this_bh) { 4237 if (this_bh) {
4304 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 4238 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
@@ -4328,7 +4262,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4328 * @last: pointer immediately past the end of array 4262 * @last: pointer immediately past the end of array
4329 * @depth: depth of the branches to free 4263 * @depth: depth of the branches to free
4330 * 4264 *
4331 * We are freeing all blocks refered from these branches (numbers are 4265 * We are freeing all blocks referred from these branches (numbers are
4332 * stored as little-endian 32-bit) and updating @inode->i_blocks 4266 * stored as little-endian 32-bit) and updating @inode->i_blocks
4333 * appropriately. 4267 * appropriately.
4334 */ 4268 */
@@ -4416,7 +4350,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4416 * transaction where the data blocks are 4350 * transaction where the data blocks are
4417 * actually freed. 4351 * actually freed.
4418 */ 4352 */
4419 ext4_free_blocks(handle, inode, 0, nr, 1, 4353 ext4_free_blocks(handle, inode, NULL, nr, 1,
4420 EXT4_FREE_BLOCKS_METADATA| 4354 EXT4_FREE_BLOCKS_METADATA|
4421 EXT4_FREE_BLOCKS_FORGET); 4355 EXT4_FREE_BLOCKS_FORGET);
4422 4356
@@ -4496,10 +4430,12 @@ void ext4_truncate(struct inode *inode)
4496 Indirect chain[4]; 4430 Indirect chain[4];
4497 Indirect *partial; 4431 Indirect *partial;
4498 __le32 nr = 0; 4432 __le32 nr = 0;
4499 int n; 4433 int n = 0;
4500 ext4_lblk_t last_block; 4434 ext4_lblk_t last_block, max_block;
4501 unsigned blocksize = inode->i_sb->s_blocksize; 4435 unsigned blocksize = inode->i_sb->s_blocksize;
4502 4436
4437 trace_ext4_truncate_enter(inode);
4438
4503 if (!ext4_can_truncate(inode)) 4439 if (!ext4_can_truncate(inode))
4504 return; 4440 return;
4505 4441
@@ -4510,6 +4446,7 @@ void ext4_truncate(struct inode *inode)
4510 4446
4511 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4447 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4512 ext4_ext_truncate(inode); 4448 ext4_ext_truncate(inode);
4449 trace_ext4_truncate_exit(inode);
4513 return; 4450 return;
4514 } 4451 }
4515 4452
@@ -4519,14 +4456,18 @@ void ext4_truncate(struct inode *inode)
4519 4456
4520 last_block = (inode->i_size + blocksize-1) 4457 last_block = (inode->i_size + blocksize-1)
4521 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4458 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4459 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4460 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4522 4461
4523 if (inode->i_size & (blocksize - 1)) 4462 if (inode->i_size & (blocksize - 1))
4524 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 4463 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4525 goto out_stop; 4464 goto out_stop;
4526 4465
4527 n = ext4_block_to_path(inode, last_block, offsets, NULL); 4466 if (last_block != max_block) {
4528 if (n == 0) 4467 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4529 goto out_stop; /* error */ 4468 if (n == 0)
4469 goto out_stop; /* error */
4470 }
4530 4471
4531 /* 4472 /*
4532 * OK. This truncate is going to happen. We add the inode to the 4473 * OK. This truncate is going to happen. We add the inode to the
@@ -4557,7 +4498,13 @@ void ext4_truncate(struct inode *inode)
4557 */ 4498 */
4558 ei->i_disksize = inode->i_size; 4499 ei->i_disksize = inode->i_size;
4559 4500
4560 if (n == 1) { /* direct blocks */ 4501 if (last_block == max_block) {
4502 /*
4503 * It is unnecessary to free any data blocks if last_block is
4504 * equal to the indirect block limit.
4505 */
4506 goto out_unlock;
4507 } else if (n == 1) { /* direct blocks */
4561 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 4508 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4562 i_data + EXT4_NDIR_BLOCKS); 4509 i_data + EXT4_NDIR_BLOCKS);
4563 goto do_indirects; 4510 goto do_indirects;
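
A note on the max_block test above: last_block and max_block are both byte sizes rounded up to block counts, so when the file size already sits at the indirect-addressing limit there is nothing past last_block left to free. A minimal user-space sketch of that rounding, assuming 4KiB blocks and a made-up maxbytes cap (neither value comes from this patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long blocksize = 4096;	/* assumed 4KiB blocks */
		unsigned bits = 12;			/* log2(blocksize) */
		unsigned long long i_size = 10000;	/* example file size */
		unsigned long long maxbytes = 0x40000000ULL; /* assumed cap */

		/* same round-up as last_block/max_block in ext4_truncate() */
		unsigned long long last_block = (i_size + blocksize - 1) >> bits;
		unsigned long long max_block = (maxbytes + blocksize - 1) >> bits;

		/* when last_block == max_block the data-freeing walk is skipped */
		printf("last_block=%llu max_block=%llu\n", last_block, max_block);
		return 0;
	}
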
@@ -4617,6 +4564,7 @@ do_indirects:
4617 ; 4564 ;
4618 } 4565 }
4619 4566
4567out_unlock:
4620 up_write(&ei->i_data_sem); 4568 up_write(&ei->i_data_sem);
4621 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4569 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4622 ext4_mark_inode_dirty(handle, inode); 4570 ext4_mark_inode_dirty(handle, inode);
@@ -4639,6 +4587,7 @@ out_stop:
4639 ext4_orphan_del(handle, inode); 4587 ext4_orphan_del(handle, inode);
4640 4588
4641 ext4_journal_stop(handle); 4589 ext4_journal_stop(handle);
4590 trace_ext4_truncate_exit(inode);
4642} 4591}
4643 4592
4644/* 4593/*
@@ -4770,6 +4719,7 @@ make_io:
4770 * has in-inode xattrs, or we don't have this inode in memory. 4719 * has in-inode xattrs, or we don't have this inode in memory.
4771 * Read the block from disk. 4720 * Read the block from disk.
4772 */ 4721 */
4722 trace_ext4_load_inode(inode);
4773 get_bh(bh); 4723 get_bh(bh);
4774 bh->b_end_io = end_buffer_read_sync; 4724 bh->b_end_io = end_buffer_read_sync;
4775 submit_bh(READ_META, bh); 4725 submit_bh(READ_META, bh);
@@ -4875,7 +4825,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4875 return inode; 4825 return inode;
4876 4826
4877 ei = EXT4_I(inode); 4827 ei = EXT4_I(inode);
4878 iloc.bh = 0; 4828 iloc.bh = NULL;
4879 4829
4880 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4830 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4881 if (ret < 0) 4831 if (ret < 0)
@@ -5460,13 +5410,12 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5460 /* if nrblocks are contiguous */ 5410 /* if nrblocks are contiguous */
5461 if (chunk) { 5411 if (chunk) {
5462 /* 5412 /*
5463 * With N contiguous data blocks, it need at most 5413 * With N contiguous data blocks, we need at most
5464 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks 5414 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5465 * 2 dindirect blocks 5415 * 2 dindirect blocks, and 1 tindirect block
5466 * 1 tindirect block
5467 */ 5416 */
5468 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); 5417 return DIV_ROUND_UP(nrblocks,
5469 return indirects + 3; 5418 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5470 } 5419 }
5471 /* 5420 /*
5472 * if nrblocks are not contiguous, worst case, each block touches 5421 * if nrblocks are not contiguous, worst case, each block touches
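
The DIV_ROUND_UP() rewrite above keeps the same worst-case accounting: roughly one indirect block per EXT4_ADDR_PER_BLOCK data blocks, with the +4 covering one extra indirect block from misalignment plus two dindirect blocks and one tindirect block. A hedged stand-alone sketch of the estimate, assuming 4KiB blocks and therefore 1024 block pointers per indirect block:

	#include <stdio.h>

	/* mirrors the contiguous-chunk branch of ext4_indirect_trans_blocks() */
	static int indirect_trans_blocks(int nrblocks, int addr_per_block)
	{
		return (nrblocks + addr_per_block - 1) / addr_per_block + 4;
	}

	int main(void)
	{
		/* 2048 contiguous blocks -> 2 indirect blocks + 4 = 6 credits */
		printf("%d\n", indirect_trans_blocks(2048, 1024));
		return 0;
	}
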
@@ -5540,7 +5489,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5540} 5489}
5541 5490
5542/* 5491/*
5543 * Calulate the total number of credits to reserve to fit 5492 * Calculate the total number of credits to reserve to fit
5544 * the modification of a single page into a single transaction, 5493 * the modification of a single page into a single transaction,
5545 * which may include multiple chunks of block allocations. 5494 * which may include multiple chunks of block allocations.
5546 * 5495 *
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index eb3bc2fe647e..808c554e773f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
38 unsigned int oldflags; 38 unsigned int oldflags;
39 unsigned int jflag; 39 unsigned int jflag;
40 40
41 if (!is_owner_or_cap(inode)) 41 if (!inode_owner_or_capable(inode))
42 return -EACCES; 42 return -EACCES;
43 43
44 if (get_user(flags, (int __user *) arg)) 44 if (get_user(flags, (int __user *) arg))
@@ -146,7 +146,7 @@ flags_out:
146 __u32 generation; 146 __u32 generation;
147 int err; 147 int err;
148 148
149 if (!is_owner_or_cap(inode)) 149 if (!inode_owner_or_capable(inode))
150 return -EPERM; 150 return -EPERM;
151 151
152 err = mnt_want_write(filp->f_path.mnt); 152 err = mnt_want_write(filp->f_path.mnt);
@@ -298,7 +298,7 @@ mext_out:
298 case EXT4_IOC_MIGRATE: 298 case EXT4_IOC_MIGRATE:
299 { 299 {
300 int err; 300 int err;
301 if (!is_owner_or_cap(inode)) 301 if (!inode_owner_or_capable(inode))
302 return -EACCES; 302 return -EACCES;
303 303
304 err = mnt_want_write(filp->f_path.mnt); 304 err = mnt_want_write(filp->f_path.mnt);
@@ -320,7 +320,7 @@ mext_out:
320 case EXT4_IOC_ALLOC_DA_BLKS: 320 case EXT4_IOC_ALLOC_DA_BLKS:
321 { 321 {
322 int err; 322 int err;
323 if (!is_owner_or_cap(inode)) 323 if (!inode_owner_or_capable(inode))
324 return -EACCES; 324 return -EACCES;
325 325
326 err = mnt_want_write(filp->f_path.mnt); 326 err = mnt_want_write(filp->f_path.mnt);
@@ -334,16 +334,22 @@ mext_out:
334 case FITRIM: 334 case FITRIM:
335 { 335 {
336 struct super_block *sb = inode->i_sb; 336 struct super_block *sb = inode->i_sb;
337 struct request_queue *q = bdev_get_queue(sb->s_bdev);
337 struct fstrim_range range; 338 struct fstrim_range range;
338 int ret = 0; 339 int ret = 0;
339 340
340 if (!capable(CAP_SYS_ADMIN)) 341 if (!capable(CAP_SYS_ADMIN))
341 return -EPERM; 342 return -EPERM;
342 343
344 if (!blk_queue_discard(q))
345 return -EOPNOTSUPP;
346
343 if (copy_from_user(&range, (struct fstrim_range *)arg, 347 if (copy_from_user(&range, (struct fstrim_range *)arg,
344 sizeof(range))) 348 sizeof(range)))
345 return -EFAULT; 349 return -EFAULT;
346 350
351 range.minlen = max((unsigned int)range.minlen,
352 q->limits.discard_granularity);
347 ret = ext4_trim_fs(sb, &range); 353 ret = ext4_trim_fs(sb, &range);
348 if (ret < 0) 354 if (ret < 0)
349 return ret; 355 return ret;
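
For reference, the FITRIM path above is driven from user space roughly as below; the mount point is a made-up example and error handling is minimal. Leaving minlen at zero shows the clamp added above, since the kernel raises it to the device's discard granularity before trimming:

	#include <fcntl.h>
	#include <linux/fs.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int main(void)
	{
		struct fstrim_range range;
		int fd = open("/mnt/ext4", O_RDONLY);	/* assumed mount point */

		if (fd < 0)
			return 1;
		memset(&range, 0, sizeof(range));
		range.len = (unsigned long long)-1;	/* whole filesystem */
		range.minlen = 0;	/* kernel clamps this upward */
		if (ioctl(fd, FITRIM, &range) < 0)
			perror("FITRIM");	/* EOPNOTSUPP without discard */
		else
			printf("trimmed %llu bytes\n",
			       (unsigned long long)range.len);
		close(fd);
		return 0;
	}
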
@@ -421,6 +427,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
421 return err; 427 return err;
422 } 428 }
423 case EXT4_IOC_MOVE_EXT: 429 case EXT4_IOC_MOVE_EXT:
430 case FITRIM:
424 break; 431 break;
425 default: 432 default:
426 return -ENOIOCTLCMD; 433 return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d1fe09aea73d..d8a16eecf1d5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -92,7 +92,7 @@
92 * between CPUs. It is possible to get scheduled at this point. 92 * between CPUs. It is possible to get scheduled at this point.
93 * 93 *
94 * The locality group prealloc space is used looking at whether we have 94 * The locality group prealloc space is used looking at whether we have
95 * enough free space (pa_free) withing the prealloc space. 95 * enough free space (pa_free) within the prealloc space.
96 * 96 *
97 * If we can't allocate blocks via inode prealloc and/or locality group 97 * If we can't allocate blocks via inode prealloc and/or locality group
98 * prealloc then we look at the buddy cache. The buddy cache is represented 98 * prealloc then we look at the buddy cache. The buddy cache is represented
@@ -432,9 +432,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
432 } 432 }
433 433
434 /* at order 0 we see each particular block */ 434 /* at order 0 we see each particular block */
435 *max = 1 << (e4b->bd_blkbits + 3); 435 if (order == 0) {
436 if (order == 0) 436 *max = 1 << (e4b->bd_blkbits + 3);
437 return EXT4_MB_BITMAP(e4b); 437 return EXT4_MB_BITMAP(e4b);
438 }
438 439
439 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 440 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
440 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 441 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
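
A worked example of the order-0 branch above: at order 0 the buddy bitmap carries one bit per block, so with an assumed bd_blkbits of 12 (4KiB blocks) *max becomes 1 << (12 + 3) = 32768, exactly the number of blocks in a full 4KiB-block group; higher orders take their size from the precomputed s_mb_maxs[] table instead.
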
@@ -616,7 +617,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
616 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); 617 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
617 618
618 grp = ext4_get_group_info(sb, e4b->bd_group); 619 grp = ext4_get_group_info(sb, e4b->bd_group);
619 buddy = mb_find_buddy(e4b, 0, &max);
620 list_for_each(cur, &grp->bb_prealloc_list) { 620 list_for_each(cur, &grp->bb_prealloc_list) {
621 ext4_group_t groupnr; 621 ext4_group_t groupnr;
622 struct ext4_prealloc_space *pa; 622 struct ext4_prealloc_space *pa;
@@ -635,7 +635,12 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
635#define mb_check_buddy(e4b) 635#define mb_check_buddy(e4b)
636#endif 636#endif
637 637
638/* FIXME!! need more doc */ 638/*
 639 * Divide blocks starting at @first with length @len into
 640 * smaller chunks with power-of-2 block counts.
 641 * Clear the bits in the bitmap that the chunk's blocks cover,
 642 * then increase bb_counters[] for the corresponding chunk size.
643 */
639static void ext4_mb_mark_free_simple(struct super_block *sb, 644static void ext4_mb_mark_free_simple(struct super_block *sb,
640 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, 645 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
641 struct ext4_group_info *grp) 646 struct ext4_group_info *grp)
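
The rewritten comment above describes a buddy-style decomposition. A stand-alone sketch of that chunking rule, illustrative only (the kernel routine also clears bitmap bits and bumps bb_counters[], which this sketch merely prints):

	#include <stdio.h>

	/* split [first, first + len) into maximal aligned power-of-2 chunks */
	static void mark_free_simple(unsigned int first, unsigned int len)
	{
		while (len > 0) {
			unsigned int order = 0;

			/* grow while the chunk stays aligned and within len */
			while ((first & ((1u << (order + 1)) - 1)) == 0 &&
			       (2u << order) <= len)
				order++;
			printf("chunk at %u, order %u (%u blocks)\n",
			       first, order, 1u << order);
			first += 1u << order;
			len -= 1u << order;
		}
	}

	int main(void)
	{
		mark_free_simple(5, 13); /* 5(x1), 6(x2), 8(x8), 16(x2) */
		return 0;
	}
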
@@ -2381,7 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2381 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2386 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2382 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2387 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2383 * So a two level scheme suffices for now. */ 2388 * So a two level scheme suffices for now. */
2384 sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); 2389 sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
2385 if (sbi->s_group_info == NULL) { 2390 if (sbi->s_group_info == NULL) {
2386 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2391 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2387 return -ENOMEM; 2392 return -ENOMEM;
@@ -3208,7 +3213,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3208 cur_distance = abs(goal_block - cpa->pa_pstart); 3213 cur_distance = abs(goal_block - cpa->pa_pstart);
3209 new_distance = abs(goal_block - pa->pa_pstart); 3214 new_distance = abs(goal_block - pa->pa_pstart);
3210 3215
3211 if (cur_distance < new_distance) 3216 if (cur_distance <= new_distance)
3212 return cpa; 3217 return cpa;
3213 3218
3214 /* drop the previous reference */ 3219 /* drop the previous reference */
@@ -3907,7 +3912,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3907 struct super_block *sb = ac->ac_sb; 3912 struct super_block *sb = ac->ac_sb;
3908 ext4_group_t ngroups, i; 3913 ext4_group_t ngroups, i;
3909 3914
3910 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) 3915 if (!mb_enable_debug ||
3916 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3911 return; 3917 return;
3912 3918
3913 printk(KERN_ERR "EXT4-fs: Can't allocate:" 3919 printk(KERN_ERR "EXT4-fs: Can't allocate:"
@@ -4753,7 +4759,8 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4753 * bitmap. Then issue a TRIM command on this extent and free the extent in 4759 * bitmap. Then issue a TRIM command on this extent and free the extent in
4754 * the group buddy bitmap. This is done until whole group is scanned. 4760 * the group buddy bitmap. This is done until whole group is scanned.
4755 */ 4761 */
4756ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, 4762static ext4_grpblk_t
4763ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4757 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) 4764 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
4758{ 4765{
4759 void *bitmap; 4766 void *bitmap;
@@ -4863,10 +4870,15 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4863 break; 4870 break;
4864 } 4871 }
4865 4872
4866 if (len >= EXT4_BLOCKS_PER_GROUP(sb)) 4873 /*
4867 len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block); 4874 * For all the groups except the last one, the last block will
4868 else 4875 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to
4876 * change it for the last group in which case start +
4877 * len < EXT4_BLOCKS_PER_GROUP(sb).
4878 */
4879 if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
4869 last_block = first_block + len; 4880 last_block = first_block + len;
4881 len -= last_block - first_block;
4870 4882
4871 if (e4b.bd_info->bb_free >= minlen) { 4883 if (e4b.bd_info->bb_free >= minlen) {
4872 cnt = ext4_trim_all_free(sb, &e4b, first_block, 4884 cnt = ext4_trim_all_free(sb, &e4b, first_block,
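
A worked example of the last-group clamp above, with assumed numbers: if EXT4_BLOCKS_PER_GROUP(sb) is 32768, first_block is 1000 and len is 40000, the first group trims [1000, 32768) and len drops by 31768 to 8232; in the next group first_block restarts at 0, and since 0 + 8232 < 32768 the condition fires, last_block becomes 8232, and len reaches zero once the tail is trimmed.
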
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b619322c76f0..22bd4d7f289b 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -169,7 +169,7 @@ struct ext4_allocation_context {
169 /* original request */ 169 /* original request */
170 struct ext4_free_extent ac_o_ex; 170 struct ext4_free_extent ac_o_ex;
171 171
172 /* goal request (after normalization) */ 172 /* goal request (normalized ac_o_ex) */
173 struct ext4_free_extent ac_g_ex; 173 struct ext4_free_extent ac_g_ex;
174 174
175 /* the best found extent */ 175 /* the best found extent */
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b0a126f23c20..92816b4e0f16 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -263,7 +263,7 @@ static int free_dind_blocks(handle_t *handle,
263 for (i = 0; i < max_entries; i++) { 263 for (i = 0; i < max_entries; i++) {
264 if (tmp_idata[i]) { 264 if (tmp_idata[i]) {
265 extend_credit_for_blkdel(handle, inode); 265 extend_credit_for_blkdel(handle, inode);
266 ext4_free_blocks(handle, inode, 0, 266 ext4_free_blocks(handle, inode, NULL,
267 le32_to_cpu(tmp_idata[i]), 1, 267 le32_to_cpu(tmp_idata[i]), 1,
268 EXT4_FREE_BLOCKS_METADATA | 268 EXT4_FREE_BLOCKS_METADATA |
269 EXT4_FREE_BLOCKS_FORGET); 269 EXT4_FREE_BLOCKS_FORGET);
@@ -271,7 +271,7 @@ static int free_dind_blocks(handle_t *handle,
271 } 271 }
272 put_bh(bh); 272 put_bh(bh);
273 extend_credit_for_blkdel(handle, inode); 273 extend_credit_for_blkdel(handle, inode);
274 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 274 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
275 EXT4_FREE_BLOCKS_METADATA | 275 EXT4_FREE_BLOCKS_METADATA |
276 EXT4_FREE_BLOCKS_FORGET); 276 EXT4_FREE_BLOCKS_FORGET);
277 return 0; 277 return 0;
@@ -302,7 +302,7 @@ static int free_tind_blocks(handle_t *handle,
302 } 302 }
303 put_bh(bh); 303 put_bh(bh);
304 extend_credit_for_blkdel(handle, inode); 304 extend_credit_for_blkdel(handle, inode);
305 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 305 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
306 EXT4_FREE_BLOCKS_METADATA | 306 EXT4_FREE_BLOCKS_METADATA |
307 EXT4_FREE_BLOCKS_FORGET); 307 EXT4_FREE_BLOCKS_FORGET);
308 return 0; 308 return 0;
@@ -315,7 +315,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
315 /* ei->i_data[EXT4_IND_BLOCK] */ 315 /* ei->i_data[EXT4_IND_BLOCK] */
316 if (i_data[0]) { 316 if (i_data[0]) {
317 extend_credit_for_blkdel(handle, inode); 317 extend_credit_for_blkdel(handle, inode);
318 ext4_free_blocks(handle, inode, 0, 318 ext4_free_blocks(handle, inode, NULL,
319 le32_to_cpu(i_data[0]), 1, 319 le32_to_cpu(i_data[0]), 1,
320 EXT4_FREE_BLOCKS_METADATA | 320 EXT4_FREE_BLOCKS_METADATA |
321 EXT4_FREE_BLOCKS_FORGET); 321 EXT4_FREE_BLOCKS_FORGET);
@@ -428,7 +428,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
428 } 428 }
429 put_bh(bh); 429 put_bh(bh);
430 extend_credit_for_blkdel(handle, inode); 430 extend_credit_for_blkdel(handle, inode);
431 ext4_free_blocks(handle, inode, 0, block, 1, 431 ext4_free_blocks(handle, inode, NULL, block, 1,
432 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 432 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
433 return retval; 433 return retval;
434} 434}
@@ -517,7 +517,7 @@ int ext4_ext_migrate(struct inode *inode)
517 * start with one credit accounted for 517 * start with one credit accounted for
518 * superblock modification. 518 * superblock modification.
519 * 519 *
520 * For the tmp_inode we already have commited the 520 * For the tmp_inode we already have committed the
521 * transaction that created the inode. Later as and 521 * transaction that created the inode. Later as and
522 * when we add extents we extend the journal 522 * when we add extents we extend the journal
523 */ 523 */
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index e781b7ea5630..67fd0b025858 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -40,6 +40,7 @@
40#include "xattr.h" 40#include "xattr.h"
41#include "acl.h" 41#include "acl.h"
42 42
43#include <trace/events/ext4.h>
43/* 44/*
44 * define how far ahead to read directories while searching them. 45 * define how far ahead to read directories while searching them.
45 */ 46 */
@@ -2183,6 +2184,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2183 struct ext4_dir_entry_2 *de; 2184 struct ext4_dir_entry_2 *de;
2184 handle_t *handle; 2185 handle_t *handle;
2185 2186
2187 trace_ext4_unlink_enter(dir, dentry);
2186 /* Initialize quotas before so that eventual writes go 2188 /* Initialize quotas before so that eventual writes go
2187 * in separate transaction */ 2189 * in separate transaction */
2188 dquot_initialize(dir); 2190 dquot_initialize(dir);
@@ -2228,6 +2230,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2228end_unlink: 2230end_unlink:
2229 ext4_journal_stop(handle); 2231 ext4_journal_stop(handle);
2230 brelse(bh); 2232 brelse(bh);
2233 trace_ext4_unlink_exit(dentry, retval);
2231 return retval; 2234 return retval;
2232} 2235}
2233 2236
@@ -2402,6 +2405,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2402 if (!new_inode && new_dir != old_dir && 2405 if (!new_inode && new_dir != old_dir &&
2403 EXT4_DIR_LINK_MAX(new_dir)) 2406 EXT4_DIR_LINK_MAX(new_dir))
2404 goto end_rename; 2407 goto end_rename;
2408 BUFFER_TRACE(dir_bh, "get_write_access");
2409 retval = ext4_journal_get_write_access(handle, dir_bh);
2410 if (retval)
2411 goto end_rename;
2405 } 2412 }
2406 if (!new_bh) { 2413 if (!new_bh) {
2407 retval = ext4_add_entry(handle, new_dentry, old_inode); 2414 retval = ext4_add_entry(handle, new_dentry, old_inode);
@@ -2409,7 +2416,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2409 goto end_rename; 2416 goto end_rename;
2410 } else { 2417 } else {
2411 BUFFER_TRACE(new_bh, "get write access"); 2418 BUFFER_TRACE(new_bh, "get write access");
2412 ext4_journal_get_write_access(handle, new_bh); 2419 retval = ext4_journal_get_write_access(handle, new_bh);
2420 if (retval)
2421 goto end_rename;
2413 new_de->inode = cpu_to_le32(old_inode->i_ino); 2422 new_de->inode = cpu_to_le32(old_inode->i_ino);
2414 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, 2423 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2415 EXT4_FEATURE_INCOMPAT_FILETYPE)) 2424 EXT4_FEATURE_INCOMPAT_FILETYPE))
@@ -2470,8 +2479,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2470 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); 2479 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
2471 ext4_update_dx_flag(old_dir); 2480 ext4_update_dx_flag(old_dir);
2472 if (dir_bh) { 2481 if (dir_bh) {
2473 BUFFER_TRACE(dir_bh, "get_write_access");
2474 ext4_journal_get_write_access(handle, dir_bh);
2475 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2482 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2476 cpu_to_le32(new_dir->i_ino); 2483 cpu_to_le32(new_dir->i_ino);
2477 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2484 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 955cc309142f..b6dbd056fcb1 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -259,6 +259,11 @@ static void ext4_end_bio(struct bio *bio, int error)
259 bi_sector >> (inode->i_blkbits - 9)); 259 bi_sector >> (inode->i_blkbits - 9));
260 } 260 }
261 261
262 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
263 ext4_free_io_end(io_end);
264 return;
265 }
266
262 /* Add the io_end to per-inode completed io list*/ 267 /* Add the io_end to per-inode completed io list*/
263 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 268 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
264 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); 269 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
@@ -279,9 +284,9 @@ void ext4_io_submit(struct ext4_io_submit *io)
279 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); 284 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
280 bio_put(io->io_bio); 285 bio_put(io->io_bio);
281 } 286 }
282 io->io_bio = 0; 287 io->io_bio = NULL;
283 io->io_op = 0; 288 io->io_op = 0;
284 io->io_end = 0; 289 io->io_end = NULL;
285} 290}
286 291
287static int io_submit_init(struct ext4_io_submit *io, 292static int io_submit_init(struct ext4_io_submit *io,
@@ -310,8 +315,7 @@ static int io_submit_init(struct ext4_io_submit *io,
310 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); 315 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
311 316
312 io->io_bio = bio; 317 io->io_bio = bio;
313 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 318 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
314 WRITE_SYNC_PLUG : WRITE);
315 io->io_next_block = bh->b_blocknr; 319 io->io_next_block = bh->b_blocknr;
316 return 0; 320 return 0;
317} 321}
@@ -381,8 +385,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
381 385
382 BUG_ON(!PageLocked(page)); 386 BUG_ON(!PageLocked(page));
383 BUG_ON(PageWriteback(page)); 387 BUG_ON(PageWriteback(page));
384 set_page_writeback(page);
385 ClearPageError(page);
386 388
387 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); 389 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
388 if (!io_page) { 390 if (!io_page) {
@@ -393,6 +395,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
393 io_page->p_page = page; 395 io_page->p_page = page;
394 atomic_set(&io_page->p_count, 1); 396 atomic_set(&io_page->p_count, 1);
395 get_page(page); 397 get_page(page);
398 set_page_writeback(page);
399 ClearPageError(page);
396 400
397 for (bh = head = page_buffers(page), block_start = 0; 401 for (bh = head = page_buffers(page), block_start = 0;
398 bh != head || !block_start; 402 bh != head || !block_start;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3ecc6e45d2f9..80bbc9c60c24 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -230,7 +230,7 @@ static int setup_new_group_blocks(struct super_block *sb,
230 } 230 }
231 231
232 /* Zero out all of the reserved backup group descriptor table blocks */ 232 /* Zero out all of the reserved backup group descriptor table blocks */
233 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", 233 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
234 block, sbi->s_itb_per_group); 234 block, sbi->s_itb_per_group);
235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, 235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
236 GFP_NOFS); 236 GFP_NOFS);
@@ -248,7 +248,7 @@ static int setup_new_group_blocks(struct super_block *sb,
248 248
249 /* Zero out all of the inode table blocks */ 249 /* Zero out all of the inode table blocks */
250 block = input->inode_table; 250 block = input->inode_table;
251 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", 251 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
252 block, sbi->s_itb_per_group); 252 block, sbi->s_itb_per_group);
253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); 253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
254 if (err) 254 if (err)
@@ -499,12 +499,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
499 return err; 499 return err;
500 500
501exit_inode: 501exit_inode:
502 /* ext4_journal_release_buffer(handle, iloc.bh); */ 502 /* ext4_handle_release_buffer(handle, iloc.bh); */
503 brelse(iloc.bh); 503 brelse(iloc.bh);
504exit_dindj: 504exit_dindj:
505 /* ext4_journal_release_buffer(handle, dind); */ 505 /* ext4_handle_release_buffer(handle, dind); */
506exit_sbh: 506exit_sbh:
507 /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ 507 /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
508exit_dind: 508exit_dind:
509 brelse(dind); 509 brelse(dind);
510exit_bh: 510exit_bh:
@@ -586,7 +586,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
586 /* 586 /*
587 int j; 587 int j;
588 for (j = 0; j < i; j++) 588 for (j = 0; j < i; j++)
589 ext4_journal_release_buffer(handle, primary[j]); 589 ext4_handle_release_buffer(handle, primary[j]);
590 */ 590 */
591 goto exit_bh; 591 goto exit_bh;
592 } 592 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 203f9e4a70be..8553dfb310af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -54,9 +54,9 @@
54 54
55static struct proc_dir_entry *ext4_proc_root; 55static struct proc_dir_entry *ext4_proc_root;
56static struct kset *ext4_kset; 56static struct kset *ext4_kset;
57struct ext4_lazy_init *ext4_li_info; 57static struct ext4_lazy_init *ext4_li_info;
58struct mutex ext4_li_mtx; 58static struct mutex ext4_li_mtx;
59struct ext4_features *ext4_feat; 59static struct ext4_features *ext4_feat;
60 60
61static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 61static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
62 unsigned long journal_devnum); 62 unsigned long journal_devnum);
@@ -75,6 +75,7 @@ static void ext4_write_super(struct super_block *sb);
75static int ext4_freeze(struct super_block *sb); 75static int ext4_freeze(struct super_block *sb);
76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, 76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
77 const char *dev_name, void *data); 77 const char *dev_name, void *data);
78static int ext4_feature_set_ok(struct super_block *sb, int readonly);
78static void ext4_destroy_lazyinit_thread(void); 79static void ext4_destroy_lazyinit_thread(void);
79static void ext4_unregister_li_request(struct super_block *sb); 80static void ext4_unregister_li_request(struct super_block *sb);
80static void ext4_clear_request_list(void); 81static void ext4_clear_request_list(void);
@@ -241,27 +242,44 @@ static void ext4_put_nojournal(handle_t *handle)
241 * journal_end calls result in the superblock being marked dirty, so 242 * journal_end calls result in the superblock being marked dirty, so
242 * that sync() will call the filesystem's write_super callback if 243 * that sync() will call the filesystem's write_super callback if
243 * appropriate. 244 * appropriate.
245 *
 246 * To avoid holding j_barrier for as long as a user keeps the fs frozen,
247 * ext4 prevents a new handle from being started by s_frozen, which
248 * is in an upper layer.
244 */ 249 */
245handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) 250handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
246{ 251{
247 journal_t *journal; 252 journal_t *journal;
253 handle_t *handle;
248 254
249 if (sb->s_flags & MS_RDONLY) 255 if (sb->s_flags & MS_RDONLY)
250 return ERR_PTR(-EROFS); 256 return ERR_PTR(-EROFS);
251 257
252 vfs_check_frozen(sb, SB_FREEZE_TRANS);
253 /* Special case here: if the journal has aborted behind our
254 * backs (eg. EIO in the commit thread), then we still need to
255 * take the FS itself readonly cleanly. */
256 journal = EXT4_SB(sb)->s_journal; 258 journal = EXT4_SB(sb)->s_journal;
257 if (journal) { 259 handle = ext4_journal_current_handle();
258 if (is_journal_aborted(journal)) { 260
259 ext4_abort(sb, "Detected aborted journal"); 261 /*
260 return ERR_PTR(-EROFS); 262 * If a handle has been started, it should be allowed to
261 } 263 * finish, otherwise deadlock could happen between freeze
262 return jbd2_journal_start(journal, nblocks); 264 * and others (e.g. truncate) due to the restart of the
 265 * journal handle if the filesystem is frozen and active
266 * handles are not stopped.
267 */
268 if (!handle)
269 vfs_check_frozen(sb, SB_FREEZE_TRANS);
270
271 if (!journal)
272 return ext4_get_nojournal();
273 /*
274 * Special case here: if the journal has aborted behind our
275 * backs (eg. EIO in the commit thread), then we still need to
276 * take the FS itself readonly cleanly.
277 */
278 if (is_journal_aborted(journal)) {
279 ext4_abort(sb, "Detected aborted journal");
280 return ERR_PTR(-EROFS);
263 } 281 }
264 return ext4_get_nojournal(); 282 return jbd2_journal_start(journal, nblocks);
265} 283}
266 284
267/* 285/*
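
The reordering above is what prevents the freeze deadlock the new comment describes: a task that already owns a handle may restart it without blocking on s_frozen, while only brand-new handles wait out the freeze. A hedged user-space sketch of the freeze/thaw cycle that exercises this path; the mount point is a made-up example and CAP_SYS_ADMIN is required:

	#include <fcntl.h>
	#include <linux/fs.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/ext4", O_RDONLY);	/* assumed mount point */

		if (fd < 0)
			return 1;
		if (ioctl(fd, FIFREEZE, 0) < 0) {	/* new handles now block */
			perror("FIFREEZE");
			close(fd);
			return 1;
		}
		sleep(1);		/* a snapshot would be taken here */
		if (ioctl(fd, FITHAW, 0) < 0)	/* let writers resume */
			perror("FITHAW");
		close(fd);
		return 0;
	}
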
@@ -594,7 +612,7 @@ __acquires(bitlock)
594 612
595 vaf.fmt = fmt; 613 vaf.fmt = fmt;
596 vaf.va = &args; 614 vaf.va = &args;
597 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", 615 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
598 sb->s_id, function, line, grp); 616 sb->s_id, function, line, grp);
599 if (ino) 617 if (ino)
600 printk(KERN_CONT "inode %lu: ", ino); 618 printk(KERN_CONT "inode %lu: ", ino);
@@ -616,7 +634,7 @@ __acquires(bitlock)
616 * filesystem will have already been marked read/only and the 634 * filesystem will have already been marked read/only and the
617 * journal has been aborted. We return 1 as a hint to callers 635 * journal has been aborted. We return 1 as a hint to callers
618 * who might want to use the return value from 636 * who might want to use the return value from
619 * ext4_grp_locked_error() to distinguish beween the 637 * ext4_grp_locked_error() to distinguish between the
620 * ERRORS_CONT and ERRORS_RO case, and perhaps return more 638 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
621 * aggressively from the ext4 function in question, with a 639 * aggressively from the ext4 function in question, with a
622 * more appropriate error code. 640 * more appropriate error code.
@@ -997,13 +1015,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
997 if (test_opt(sb, OLDALLOC)) 1015 if (test_opt(sb, OLDALLOC))
998 seq_puts(seq, ",oldalloc"); 1016 seq_puts(seq, ",oldalloc");
999#ifdef CONFIG_EXT4_FS_XATTR 1017#ifdef CONFIG_EXT4_FS_XATTR
1000 if (test_opt(sb, XATTR_USER) && 1018 if (test_opt(sb, XATTR_USER))
1001 !(def_mount_opts & EXT4_DEFM_XATTR_USER))
1002 seq_puts(seq, ",user_xattr"); 1019 seq_puts(seq, ",user_xattr");
1003 if (!test_opt(sb, XATTR_USER) && 1020 if (!test_opt(sb, XATTR_USER))
1004 (def_mount_opts & EXT4_DEFM_XATTR_USER)) {
1005 seq_puts(seq, ",nouser_xattr"); 1021 seq_puts(seq, ",nouser_xattr");
1006 }
1007#endif 1022#endif
1008#ifdef CONFIG_EXT4_FS_POSIX_ACL 1023#ifdef CONFIG_EXT4_FS_POSIX_ACL
1009 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) 1024 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
@@ -1041,8 +1056,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1041 !(def_mount_opts & EXT4_DEFM_NODELALLOC)) 1056 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1042 seq_puts(seq, ",nodelalloc"); 1057 seq_puts(seq, ",nodelalloc");
1043 1058
1044 if (test_opt(sb, MBLK_IO_SUBMIT)) 1059 if (!test_opt(sb, MBLK_IO_SUBMIT))
1045 seq_puts(seq, ",mblk_io_submit"); 1060 seq_puts(seq, ",nomblk_io_submit");
1046 if (sbi->s_stripe) 1061 if (sbi->s_stripe)
1047 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1062 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1048 /* 1063 /*
@@ -1451,7 +1466,7 @@ static int parse_options(char *options, struct super_block *sb,
1451 * Initialize args struct so we know whether arg was 1466 * Initialize args struct so we know whether arg was
1452 * found; some options take optional arguments. 1467 * found; some options take optional arguments.
1453 */ 1468 */
1454 args[0].to = args[0].from = 0; 1469 args[0].to = args[0].from = NULL;
1455 token = match_token(p, tokens, args); 1470 token = match_token(p, tokens, args);
1456 switch (token) { 1471 switch (token) {
1457 case Opt_bsd_df: 1472 case Opt_bsd_df:
@@ -1771,7 +1786,7 @@ set_qf_format:
1771 return 0; 1786 return 0;
1772 if (option < 0 || option > (1 << 30)) 1787 if (option < 0 || option > (1 << 30))
1773 return 0; 1788 return 0;
1774 if (!is_power_of_2(option)) { 1789 if (option && !is_power_of_2(option)) {
1775 ext4_msg(sb, KERN_ERR, 1790 ext4_msg(sb, KERN_ERR,
1776 "EXT4-fs: inode_readahead_blks" 1791 "EXT4-fs: inode_readahead_blks"
1777 " must be a power of 2"); 1792 " must be a power of 2");
@@ -2120,6 +2135,13 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2120 return; 2135 return;
2121 } 2136 }
2122 2137
2138 /* Check if feature set would not allow a r/w mount */
2139 if (!ext4_feature_set_ok(sb, 0)) {
2140 ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
2141 "unknown ROCOMPAT features");
2142 return;
2143 }
2144
2123 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { 2145 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2124 if (es->s_last_orphan) 2146 if (es->s_last_orphan)
2125 jbd_debug(1, "Errors on filesystem, " 2147 jbd_debug(1, "Errors on filesystem, "
@@ -2412,7 +2434,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2412 if (parse_strtoul(buf, 0x40000000, &t)) 2434 if (parse_strtoul(buf, 0x40000000, &t))
2413 return -EINVAL; 2435 return -EINVAL;
2414 2436
2415 if (!is_power_of_2(t)) 2437 if (t && !is_power_of_2(t))
2416 return -EINVAL; 2438 return -EINVAL;
2417 2439
2418 sbi->s_inode_readahead_blks = t; 2440 sbi->s_inode_readahead_blks = t;
@@ -2970,6 +2992,12 @@ static int ext4_register_li_request(struct super_block *sb,
2970 mutex_unlock(&ext4_li_info->li_list_mtx); 2992 mutex_unlock(&ext4_li_info->li_list_mtx);
2971 2993
2972 sbi->s_li_request = elr; 2994 sbi->s_li_request = elr;
2995 /*
 2996 * set elr to NULL here since it has been inserted into
 2997 * the request_list; its removal and freeing are
 2998 * handled by ext4_clear_request_list from now on.
2999 */
3000 elr = NULL;
2973 3001
2974 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { 3002 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
2975 ret = ext4_run_lazyinit_thread(); 3003 ret = ext4_run_lazyinit_thread();
@@ -3095,14 +3123,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3095 } 3123 }
3096 if (def_mount_opts & EXT4_DEFM_UID16) 3124 if (def_mount_opts & EXT4_DEFM_UID16)
3097 set_opt(sb, NO_UID32); 3125 set_opt(sb, NO_UID32);
3126 /* xattr user namespace & acls are now defaulted on */
3098#ifdef CONFIG_EXT4_FS_XATTR 3127#ifdef CONFIG_EXT4_FS_XATTR
3099 if (def_mount_opts & EXT4_DEFM_XATTR_USER) 3128 set_opt(sb, XATTR_USER);
3100 set_opt(sb, XATTR_USER);
3101#endif 3129#endif
3102#ifdef CONFIG_EXT4_FS_POSIX_ACL 3130#ifdef CONFIG_EXT4_FS_POSIX_ACL
3103 if (def_mount_opts & EXT4_DEFM_ACL) 3131 set_opt(sb, POSIX_ACL);
3104 set_opt(sb, POSIX_ACL);
3105#endif 3132#endif
3133 set_opt(sb, MBLK_IO_SUBMIT);
3106 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 3134 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
3107 set_opt(sb, JOURNAL_DATA); 3135 set_opt(sb, JOURNAL_DATA);
3108 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 3136 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
@@ -3380,6 +3408,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3380 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3408 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
3381 spin_lock_init(&sbi->s_next_gen_lock); 3409 spin_lock_init(&sbi->s_next_gen_lock);
3382 3410
3411 init_timer(&sbi->s_err_report);
3412 sbi->s_err_report.function = print_daily_error_info;
3413 sbi->s_err_report.data = (unsigned long) sb;
3414
3383 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3415 err = percpu_counter_init(&sbi->s_freeblocks_counter,
3384 ext4_count_free_blocks(sb)); 3416 ext4_count_free_blocks(sb));
3385 if (!err) { 3417 if (!err) {
@@ -3516,7 +3548,7 @@ no_journal:
3516 * concurrency isn't really necessary. Limit it to 1. 3548 * concurrency isn't really necessary. Limit it to 1.
3517 */ 3549 */
3518 EXT4_SB(sb)->dio_unwritten_wq = 3550 EXT4_SB(sb)->dio_unwritten_wq =
3519 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM, 1); 3551 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3520 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3552 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3521 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3553 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
3522 goto failed_mount_wq; 3554 goto failed_mount_wq;
@@ -3531,17 +3563,16 @@ no_journal:
3531 if (IS_ERR(root)) { 3563 if (IS_ERR(root)) {
3532 ext4_msg(sb, KERN_ERR, "get root inode failed"); 3564 ext4_msg(sb, KERN_ERR, "get root inode failed");
3533 ret = PTR_ERR(root); 3565 ret = PTR_ERR(root);
3566 root = NULL;
3534 goto failed_mount4; 3567 goto failed_mount4;
3535 } 3568 }
3536 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 3569 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
3537 iput(root);
3538 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); 3570 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
3539 goto failed_mount4; 3571 goto failed_mount4;
3540 } 3572 }
3541 sb->s_root = d_alloc_root(root); 3573 sb->s_root = d_alloc_root(root);
3542 if (!sb->s_root) { 3574 if (!sb->s_root) {
3543 ext4_msg(sb, KERN_ERR, "get root dentry failed"); 3575 ext4_msg(sb, KERN_ERR, "get root dentry failed");
3544 iput(root);
3545 ret = -ENOMEM; 3576 ret = -ENOMEM;
3546 goto failed_mount4; 3577 goto failed_mount4;
3547 } 3578 }
@@ -3642,9 +3673,6 @@ no_journal:
3642 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, 3673 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
3643 *sbi->s_es->s_mount_opts ? "; " : "", orig_data); 3674 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
3644 3675
3645 init_timer(&sbi->s_err_report);
3646 sbi->s_err_report.function = print_daily_error_info;
3647 sbi->s_err_report.data = (unsigned long) sb;
3648 if (es->s_error_count) 3676 if (es->s_error_count)
3649 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ 3677 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3650 3678
@@ -3657,6 +3685,8 @@ cantfind_ext4:
3657 goto failed_mount; 3685 goto failed_mount;
3658 3686
3659failed_mount4: 3687failed_mount4:
3688 iput(root);
3689 sb->s_root = NULL;
3660 ext4_msg(sb, KERN_ERR, "mount failed"); 3690 ext4_msg(sb, KERN_ERR, "mount failed");
3661 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 3691 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
3662failed_mount_wq: 3692failed_mount_wq:
@@ -3666,6 +3696,7 @@ failed_mount_wq:
3666 sbi->s_journal = NULL; 3696 sbi->s_journal = NULL;
3667 } 3697 }
3668failed_mount3: 3698failed_mount3:
3699 del_timer(&sbi->s_err_report);
3669 if (sbi->s_flex_groups) { 3700 if (sbi->s_flex_groups) {
3670 if (is_vmalloc_addr(sbi->s_flex_groups)) 3701 if (is_vmalloc_addr(sbi->s_flex_groups))
3671 vfree(sbi->s_flex_groups); 3702 vfree(sbi->s_flex_groups);
@@ -4132,6 +4163,11 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4132/* 4163/*
4133 * LVM calls this function before a (read-only) snapshot is created. This 4164 * LVM calls this function before a (read-only) snapshot is created. This
4134 * gives us a chance to flush the journal completely and mark the fs clean. 4165 * gives us a chance to flush the journal completely and mark the fs clean.
4166 *
 4167 * Note that this function alone cannot bring a filesystem to a clean
4168 * state independently, because ext4 prevents a new handle from being started
4169 * by @sb->s_frozen, which stays in an upper layer. It thus needs help from
4170 * the upper layer.
4135 */ 4171 */
4136static int ext4_freeze(struct super_block *sb) 4172static int ext4_freeze(struct super_block *sb)
4137{ 4173{
@@ -4608,17 +4644,30 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4608 4644
4609static int ext4_quota_off(struct super_block *sb, int type) 4645static int ext4_quota_off(struct super_block *sb, int type)
4610{ 4646{
4647 struct inode *inode = sb_dqopt(sb)->files[type];
4648 handle_t *handle;
4649
4611 /* Force all delayed allocation blocks to be allocated. 4650 /* Force all delayed allocation blocks to be allocated.
4612 * Caller already holds s_umount sem */ 4651 * Caller already holds s_umount sem */
4613 if (test_opt(sb, DELALLOC)) 4652 if (test_opt(sb, DELALLOC))
4614 sync_filesystem(sb); 4653 sync_filesystem(sb);
4615 4654
4655 /* Update modification times of quota files when userspace can
4656 * start looking at them */
4657 handle = ext4_journal_start(inode, 1);
4658 if (IS_ERR(handle))
4659 goto out;
4660 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
4661 ext4_mark_inode_dirty(handle, inode);
4662 ext4_journal_stop(handle);
4663
4664out:
4616 return dquot_quota_off(sb, type); 4665 return dquot_quota_off(sb, type);
4617} 4666}
4618 4667
4619/* Read data from quotafile - avoid pagecache and such because we cannot afford 4668/* Read data from quotafile - avoid pagecache and such because we cannot afford
4620 * acquiring the locks... As quota files are never truncated and quota code 4669 * acquiring the locks... As quota files are never truncated and quota code
4621 * itself serializes the operations (and noone else should touch the files) 4670 * itself serializes the operations (and no one else should touch the files)
4622 * we don't have to be afraid of races */ 4671 * we don't have to be afraid of races */
4623static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 4672static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
4624 size_t len, loff_t off) 4673 size_t len, loff_t off)
@@ -4708,9 +4757,8 @@ out:
4708 if (inode->i_size < off + len) { 4757 if (inode->i_size < off + len) {
4709 i_size_write(inode, off + len); 4758 i_size_write(inode, off + len);
4710 EXT4_I(inode)->i_disksize = inode->i_size; 4759 EXT4_I(inode)->i_disksize = inode->i_size;
4760 ext4_mark_inode_dirty(handle, inode);
4711 } 4761 }
4712 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
4713 ext4_mark_inode_dirty(handle, inode);
4714 mutex_unlock(&inode->i_mutex); 4762 mutex_unlock(&inode->i_mutex);
4715 return len; 4763 return len;
4716} 4764}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fc32176eee39..b545ca1c459c 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -735,7 +735,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
735 int offset = (char *)s->here - bs->bh->b_data; 735 int offset = (char *)s->here - bs->bh->b_data;
736 736
737 unlock_buffer(bs->bh); 737 unlock_buffer(bs->bh);
738 jbd2_journal_release_buffer(handle, bs->bh); 738 ext4_handle_release_buffer(handle, bs->bh);
739 if (ce) { 739 if (ce) {
740 mb_cache_entry_release(ce); 740 mb_cache_entry_release(ce);
741 ce = NULL; 741 ce = NULL;
@@ -833,7 +833,7 @@ inserted:
833 new_bh = sb_getblk(sb, block); 833 new_bh = sb_getblk(sb, block);
834 if (!new_bh) { 834 if (!new_bh) {
835getblk_failed: 835getblk_failed:
836 ext4_free_blocks(handle, inode, 0, block, 1, 836 ext4_free_blocks(handle, inode, NULL, block, 1,
837 EXT4_FREE_BLOCKS_METADATA); 837 EXT4_FREE_BLOCKS_METADATA);
838 error = -EIO; 838 error = -EIO;
839 goto cleanup; 839 goto cleanup;