author      Jiri Kosina <jkosina@suse.cz>   2011-04-26 04:22:15 -0400
committer   Jiri Kosina <jkosina@suse.cz>   2011-04-26 04:22:59 -0400
commit      07f9479a40cc778bc1462ada11f95b01360ae4ff (patch)
tree        0676cf38df3844004bb3ebfd99dfa67a4a8998f5 /fs/ext4
parent      9d5e6bdb3013acfb311ab407eeca0b6a6a3dedbf (diff)
parent      cd2e49e90f1cae7726c9a2c54488d881d7f1cd1c (diff)
Merge branch 'master' into for-next
Fast-forwarded to current state of Linus' tree as there are patches to be
applied for files that didn't exist on the old branch.
Diffstat (limited to 'fs/ext4')
-rw-r--r--   fs/ext4/acl.c          2
-rw-r--r--   fs/ext4/balloc.c       5
-rw-r--r--   fs/ext4/ext4.h        12
-rw-r--r--   fs/ext4/ext4_jbd2.h   11
-rw-r--r--   fs/ext4/extents.c    223
-rw-r--r--   fs/ext4/fsync.c       33
-rw-r--r--   fs/ext4/ialloc.c       8
-rw-r--r--   fs/ext4/inode.c      467
-rw-r--r--   fs/ext4/ioctl.c       15
-rw-r--r--   fs/ext4/mballoc.c     36
-rw-r--r--   fs/ext4/mballoc.h      2
-rw-r--r--   fs/ext4/migrate.c     12
-rw-r--r--   fs/ext4/namei.c       13
-rw-r--r--   fs/ext4/page-io.c     16
-rw-r--r--   fs/ext4/resize.c      12
-rw-r--r--   fs/ext4/super.c      126
-rw-r--r--   fs/ext4/xattr.c        4
17 files changed, 579 insertions, 418 deletions
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index e0270d1f8d82..21eacd7b7d79 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -433,7 +433,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
 		return -EINVAL;
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 
 	if (value) {
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index adf96b822781..1c67139ad4b4 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -21,6 +21,8 @@
 #include "ext4_jbd2.h"
 #include "mballoc.h"
 
+#include <trace/events/ext4.h>
+
 /*
  * balloc.c contains the blocks allocation and deallocation routines
  */
@@ -342,6 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 		 * We do it here so the bitmap uptodate bit
 		 * get set with buffer lock held.
 		 */
+		trace_ext4_read_block_bitmap_load(sb, block_group);
 		set_bitmap_uptodate(bh);
 		if (bh_submit_read(bh) < 0) {
 			put_bh(bh);
@@ -544,7 +547,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 *
 * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
 * it is profitable to retry the operation, this function will wait
- * for the current or commiting transaction to complete, and then
+ * for the current or committing transaction to complete, and then
 * return TRUE.
 *
 * if the total number of retries exceed three times, return FALSE.
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3aa0b72b3b94..4daaf2b753f4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -923,14 +923,14 @@ struct ext4_inode_info {
 #define test_opt2(sb, opt)		(EXT4_SB(sb)->s_mount_opt2 & \
					 EXT4_MOUNT2_##opt)
 
-#define ext4_set_bit			ext2_set_bit
+#define ext4_set_bit			__test_and_set_bit_le
 #define ext4_set_bit_atomic		ext2_set_bit_atomic
-#define ext4_clear_bit			ext2_clear_bit
+#define ext4_clear_bit			__test_and_clear_bit_le
 #define ext4_clear_bit_atomic		ext2_clear_bit_atomic
-#define ext4_test_bit			ext2_test_bit
-#define ext4_find_first_zero_bit	ext2_find_first_zero_bit
-#define ext4_find_next_zero_bit		ext2_find_next_zero_bit
-#define ext4_find_next_bit		ext2_find_next_bit
+#define ext4_test_bit			test_bit_le
+#define ext4_find_first_zero_bit	find_first_zero_bit_le
+#define ext4_find_next_zero_bit		find_next_zero_bit_le
+#define ext4_find_next_bit		find_next_bit_le
 
 /*
  * Maximal mount counts between two filesystem checks
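These macros previously went through the ext2 compatibility wrappers; after this hunk they name the generic little-endian bit operations directly. On-disk block and inode bitmaps are byte-addressed, so bit n lives in byte n/8 at position n%8 regardless of host byte order. A minimal userspace sketch of those semantics (the helpers below are local stand-ins written for illustration, not the kernel's implementations):

#include <stdint.h>
#include <stdio.h>

/* Little-endian bitmap ops as used for on-disk bitmaps:
 * bit n lives in byte n/8, at position n%8, on any host. */
static int test_bit_le(const uint8_t *map, unsigned long nr)
{
	return (map[nr >> 3] >> (nr & 7)) & 1;
}

static int test_and_set_bit_le(uint8_t *map, unsigned long nr)
{
	int old = test_bit_le(map, nr);
	map[nr >> 3] |= 1U << (nr & 7);
	return old;
}

static unsigned long find_first_zero_bit_le(const uint8_t *map,
					    unsigned long size)
{
	unsigned long nr;
	for (nr = 0; nr < size; nr++)
		if (!test_bit_le(map, nr))
			return nr;
	return size;	/* no free bit */
}

int main(void)
{
	uint8_t bitmap[8] = { 0xff, 0x07 };	/* blocks 0-10 in use */
	unsigned long free_blk = find_first_zero_bit_le(bitmap, 64);

	if (free_blk < 64 && !test_and_set_bit_le(bitmap, free_blk))
		printf("allocated block %lu\n", free_blk);	/* prints 11 */
	return 0;
}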
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d8b992e658c1..d0f53538a57f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -86,8 +86,8 @@
 
 #ifdef CONFIG_QUOTA
 /* Amount of blocks needed for quota update - we know that the structure was
- * allocated so we need to update only inode+data */
-#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
+ * allocated so we need to update only data block */
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
 /* Amount of blocks needed for quota insert/delete - we do some block writes
 * but inode, sb and group updates are done only once */
 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
@@ -202,13 +202,6 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
 	return 1;
 }
 
-static inline void ext4_journal_release_buffer(handle_t *handle,
-					       struct buffer_head *bh)
-{
-	if (ext4_handle_valid(handle))
-		jbd2_journal_release_buffer(handle, bh);
-}
-
 static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 {
 	return ext4_journal_start_sb(inode->i_sb, nblocks);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7516fb9c0bd5..4890d6f3ad15 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,6 +44,8 @@
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
+#include <trace/events/ext4.h>
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
					    struct inode *inode,
					    int needed)
@@ -664,6 +666,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		if (unlikely(!bh))
 			goto err;
 		if (!bh_uptodate_or_lock(bh)) {
+			trace_ext4_ext_load_extent(inode, block,
+						path[ppos].p_block);
 			if (bh_submit_read(bh) < 0) {
 				put_bh(bh);
 				goto err;
@@ -1034,7 +1038,7 @@ cleanup:
 		for (i = 0; i < depth; i++) {
 			if (!ablocks[i])
 				continue;
-			ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
+			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
					 EXT4_FREE_BLOCKS_METADATA);
 		}
 	}
@@ -1725,7 +1729,7 @@ repeat:
 		BUG_ON(npath->p_depth != path->p_depth);
 		eh = npath[depth].p_hdr;
 		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
-			ext_debug("next leaf isnt full(%d)\n",
+			ext_debug("next leaf isn't full(%d)\n",
				  le16_to_cpu(eh->eh_entries));
 			path = npath;
 			goto repeat;
@@ -2059,7 +2063,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 	if (err)
 		return err;
 	ext_debug("index is empty, remove it, free block %llu\n", leaf);
-	ext4_free_blocks(handle, inode, 0, leaf, 1,
+	ext4_free_blocks(handle, inode, NULL, leaf, 1,
			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
 	return err;
 }
@@ -2156,7 +2160,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		start = ext4_ext_pblock(ex) + ee_len - num;
 		ext_debug("free last %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, 0, start, num, flags);
+		ext4_free_blocks(handle, inode, NULL, start, num, flags);
 	} else if (from == le32_to_cpu(ex->ee_block)
		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2529,7 +2533,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 /*
 * This function is called by ext4_ext_map_blocks() if someone tries to write
 * to an uninitialized extent. It may result in splitting the uninitialized
- * extent into multiple extents (upto three - one initialized and two
+ * extent into multiple extents (up to three - one initialized and two
 * uninitialized).
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be initialized
@@ -3108,14 +3112,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
 {
 	int i, depth;
 	struct ext4_extent_header *eh;
-	struct ext4_extent *ex, *last_ex;
+	struct ext4_extent *last_ex;
 
 	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
 		return 0;
 
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
-	ex = path[depth].p_ext;
 
 	if (unlikely(!eh->eh_entries)) {
 		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
@@ -3171,7 +3174,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
						path, flags);
 		/*
		 * Flag the inode(non aio case) or end_io struct (aio case)
-		 * that this IO needs to convertion to written when IO is
+		 * that this IO needs to conversion to written when IO is
		 * completed
		 */
 		if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
@@ -3295,9 +3298,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
			struct ext4_map_blocks *map, int flags)
 {
 	struct ext4_ext_path *path = NULL;
-	struct ext4_extent_header *eh;
 	struct ext4_extent newex, *ex;
-	ext4_fsblk_t newblock;
+	ext4_fsblk_t newblock = 0;
 	int err = 0, depth, ret;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
@@ -3305,6 +3307,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
 	ext_debug("blocks %u/%u requested for inode %lu\n",
		  map->m_lblk, map->m_len, inode->i_ino);
+	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 
 	/* check in cache */
 	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
@@ -3352,7 +3355,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		err = -EIO;
 		goto out2;
 	}
-	eh = path[depth].p_hdr;
 
 	ex = path[depth].p_ext;
 	if (ex) {
@@ -3458,10 +3460,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		ext4_ext_mark_uninitialized(&newex);
 		/*
		 * io_end structure was created for every IO write to an
-		 * uninitialized extent. To avoid unecessary conversion,
+		 * uninitialized extent. To avoid unnecessary conversion,
		 * here we flag the IO that really needs the conversion.
		 * For non asycn direct IO case, flag the inode state
-		 * that we need to perform convertion when IO is done.
+		 * that we need to perform conversion when IO is done.
		 */
 		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
 			if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
@@ -3485,7 +3487,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		/* not a good idea to call discard here directly,
		 * but otherwise we'd need to call it every free() */
 		ext4_discard_preallocations(inode);
-		ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
+		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
				 ext4_ext_get_actual_len(&newex), 0);
 		goto out2;
 	}
@@ -3525,6 +3527,8 @@ out2:
 		ext4_ext_drop_refs(path);
 		kfree(path);
 	}
+	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
+		newblock, map->m_len, err ? err : allocated);
 	return err ? err : allocated;
 }
 
@@ -3658,6 +3662,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EOPNOTSUPP;
 
+	trace_ext4_fallocate_enter(inode, offset, len, mode);
 	map.m_lblk = offset >> blkbits;
 	/*
	 * We can't just convert len to max_blocks because
@@ -3673,6 +3678,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	ret = inode_newsize_ok(inode, (len + offset));
 	if (ret) {
 		mutex_unlock(&inode->i_mutex);
+		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
 		return ret;
 	}
 retry:
@@ -3717,6 +3723,8 @@ retry:
 		goto retry;
 	}
 	mutex_unlock(&inode->i_mutex);
+	trace_ext4_fallocate_exit(inode, offset, max_blocks,
+				ret > 0 ? ret2 : ret);
 	return ret > 0 ? ret2 : ret;
 }
 
@@ -3775,6 +3783,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 	}
 	return ret > 0 ? ret2 : ret;
 }
+
 /*
 * Callback function called for each extent to gather FIEMAP information.
 */
@@ -3782,38 +3791,162 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
		       void *data)
 {
-	struct fiemap_extent_info *fieinfo = data;
-	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
 	__u64	logical;
 	__u64	physical;
 	__u64	length;
+	loff_t	size;
 	__u32	flags = 0;
-	int	error;
+	int	ret = 0;
+	struct fiemap_extent_info *fieinfo = data;
+	unsigned char blksize_bits;
 
-	logical = (__u64)newex->ec_block << blksize_bits;
+	blksize_bits = inode->i_sb->s_blocksize_bits;
+	logical = (__u64)newex->ec_block << blksize_bits;
 
 	if (newex->ec_start == 0) {
-		pgoff_t offset;
-		struct page *page;
+		/*
+		 * No extent in extent-tree contains block @newex->ec_start,
+		 * then the block may stay in 1)a hole or 2)delayed-extent.
+		 *
+		 * Holes or delayed-extents are processed as follows.
+		 * 1. lookup dirty pages with specified range in pagecache.
+		 *    If no page is got, then there is no delayed-extent and
+		 *    return with EXT_CONTINUE.
+		 * 2. find the 1st mapped buffer,
+		 * 3. check if the mapped buffer is both in the request range
+		 *    and a delayed buffer. If not, there is no delayed-extent,
+		 *    then return.
+		 * 4. a delayed-extent is found, the extent will be collected.
+		 */
+		ext4_lblk_t end = 0;
+		pgoff_t last_offset;
+		pgoff_t offset;
+		pgoff_t index;
+		struct page **pages = NULL;
 		struct buffer_head *bh = NULL;
+		struct buffer_head *head = NULL;
+		unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
+
+		pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		if (pages == NULL)
+			return -ENOMEM;
 
 		offset = logical >> PAGE_SHIFT;
-		page = find_get_page(inode->i_mapping, offset);
-		if (!page || !page_has_buffers(page))
-			return EXT_CONTINUE;
+repeat:
+		last_offset = offset;
+		head = NULL;
+		ret = find_get_pages_tag(inode->i_mapping, &offset,
+					PAGECACHE_TAG_DIRTY, nr_pages, pages);
+
+		if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
+			/* First time, try to find a mapped buffer. */
+			if (ret == 0) {
+out:
+				for (index = 0; index < ret; index++)
+					page_cache_release(pages[index]);
+				/* just a hole. */
+				kfree(pages);
+				return EXT_CONTINUE;
+			}
 
-		bh = page_buffers(page);
+			/* Try to find the 1st mapped buffer. */
+			end = ((__u64)pages[0]->index << PAGE_SHIFT) >>
+				  blksize_bits;
+			if (!page_has_buffers(pages[0]))
+				goto out;
+			head = page_buffers(pages[0]);
+			if (!head)
+				goto out;
 
-		if (!bh)
-			return EXT_CONTINUE;
+			bh = head;
+			do {
+				if (buffer_mapped(bh)) {
+					/* get the 1st mapped buffer. */
+					if (end > newex->ec_block +
+						newex->ec_len)
+						/* The buffer is out of
+						 * the request range.
+						 */
+						goto out;
+					goto found_mapped_buffer;
+				}
+				bh = bh->b_this_page;
+				end++;
+			} while (bh != head);
 
-		if (buffer_delay(bh)) {
-			flags |= FIEMAP_EXTENT_DELALLOC;
-			page_cache_release(page);
+			/* No mapped buffer found. */
+			goto out;
 		} else {
-			page_cache_release(page);
-			return EXT_CONTINUE;
+			/*Find contiguous delayed buffers. */
+			if (ret > 0 && pages[0]->index == last_offset)
+				head = page_buffers(pages[0]);
+			bh = head;
 		}
+
+found_mapped_buffer:
+		if (bh != NULL && buffer_delay(bh)) {
+			/* 1st or contiguous delayed buffer found. */
+			if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
+				/*
+				 * 1st delayed buffer found, record
+				 * the start of extent.
+				 */
+				flags |= FIEMAP_EXTENT_DELALLOC;
+				newex->ec_block = end;
+				logical = (__u64)end << blksize_bits;
+			}
+			/* Find contiguous delayed buffers. */
+			do {
+				if (!buffer_delay(bh))
+					goto found_delayed_extent;
+				bh = bh->b_this_page;
+				end++;
+			} while (bh != head);
+
+			for (index = 1; index < ret; index++) {
+				if (!page_has_buffers(pages[index])) {
+					bh = NULL;
+					break;
+				}
+				head = page_buffers(pages[index]);
+				if (!head) {
+					bh = NULL;
+					break;
+				}
+				if (pages[index]->index !=
+					pages[0]->index + index) {
+					/* Blocks are not contiguous. */
+					bh = NULL;
+					break;
+				}
+				bh = head;
+				do {
+					if (!buffer_delay(bh))
+						/* Delayed-extent ends. */
+						goto found_delayed_extent;
+					bh = bh->b_this_page;
+					end++;
+				} while (bh != head);
+			}
+		} else if (!(flags & FIEMAP_EXTENT_DELALLOC))
+			/* a hole found. */
+			goto out;
+
+found_delayed_extent:
+		newex->ec_len = min(end - newex->ec_block,
+						(ext4_lblk_t)EXT_INIT_MAX_LEN);
+		if (ret == nr_pages && bh != NULL &&
+			newex->ec_len < EXT_INIT_MAX_LEN &&
+			buffer_delay(bh)) {
+			/* Have not collected an extent and continue. */
+			for (index = 0; index < ret; index++)
+				page_cache_release(pages[index]);
+			goto repeat;
+		}
+
+		for (index = 0; index < ret; index++)
+			page_cache_release(pages[index]);
+		kfree(pages);
 	}
 
 	physical = (__u64)newex->ec_start << blksize_bits;
@@ -3822,32 +3955,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 	if (ex && ext4_ext_is_uninitialized(ex))
 		flags |= FIEMAP_EXTENT_UNWRITTEN;
 
-	/*
-	 * If this extent reaches EXT_MAX_BLOCK, it must be last.
-	 *
-	 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
-	 * this also indicates no more allocated blocks.
-	 *
-	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
-	 */
-	if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
-	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
-		loff_t size = i_size_read(inode);
-		loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
-
+	size = i_size_read(inode);
+	if (logical + length >= size)
 		flags |= FIEMAP_EXTENT_LAST;
-		if ((flags & FIEMAP_EXTENT_DELALLOC) &&
-		    logical+length > size)
-			length = (size - logical + bs - 1) & ~(bs-1);
-	}
 
-	error = fiemap_fill_next_extent(fieinfo, logical, physical,
-					length, flags);
-	if (error < 0)
-		return error;
-	if (error == 1)
+	ret = fiemap_fill_next_extent(fieinfo, logical, physical,
					length, flags);
+	if (ret < 0)
+		return ret;
+	if (ret == 1)
 		return EXT_BREAK;
-
 	return EXT_CONTINUE;
 }
 
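The rewritten ext4_ext_fiemap_cb() above scans dirty pagecache pages so that delayed-allocation ranges are reported to fiemap instead of being treated as holes. A hedged userspace sketch of how such extents surface through the FIEMAP ioctl (the file path is hypothetical; querying a file with freshly written, not-yet-allocated data should show FIEMAP_EXTENT_DELALLOC):

#include <fcntl.h>
#include <linux/fiemap.h>
#include <linux/fs.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/ext4/testfile", O_RDONLY);	/* hypothetical path */
	struct fiemap *fm;
	unsigned int i;

	if (fd < 0)
		return 1;
	/* room for 32 extent records after the fixed header */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
		return 1;
	for (i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];
		printf("logical %llu len %llu%s%s\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_length,
		       (fe->fe_flags & FIEMAP_EXTENT_DELALLOC) ?
				" [delalloc]" : "",
		       (fe->fe_flags & FIEMAP_EXTENT_LAST) ? " [last]" : "");
	}
	free(fm);
	close(fd);
	return 0;
}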
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 7829b287822a..e9473cbe80df 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -101,7 +101,7 @@ extern int ext4_flush_completed_IO(struct inode *inode)
	 * to the work-to-be schedule is freed.
	 *
	 * Thus we need to keep the io structure still valid here after
-	 * convertion finished. The io structure has a flag to
+	 * conversion finished. The io structure has a flag to
	 * avoid double converting from both fsync and background work
	 * queue work.
	 */
@@ -125,9 +125,11 @@ extern int ext4_flush_completed_IO(struct inode *inode)
 * the parent directory's parent as well, and so on recursively, if
 * they are also freshly created.
 */
-static void ext4_sync_parent(struct inode *inode)
+static int ext4_sync_parent(struct inode *inode)
 {
+	struct writeback_control wbc;
 	struct dentry *dentry = NULL;
+	int ret = 0;
 
 	while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
 		ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
@@ -136,8 +138,17 @@ static void ext4_sync_parent(struct inode *inode)
 		if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
 			break;
 		inode = dentry->d_parent->d_inode;
-		sync_mapping_buffers(inode->i_mapping);
+		ret = sync_mapping_buffers(inode->i_mapping);
+		if (ret)
+			break;
+		memset(&wbc, 0, sizeof(wbc));
+		wbc.sync_mode = WB_SYNC_ALL;
+		wbc.nr_to_write = 0;	/* only write out the inode */
+		ret = sync_inode(inode, &wbc);
+		if (ret)
+			break;
 	}
+	return ret;
 }
 
 /*
@@ -164,20 +175,20 @@ int ext4_sync_file(struct file *file, int datasync)
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
-	trace_ext4_sync_file(file, datasync);
+	trace_ext4_sync_file_enter(file, datasync);
 
 	if (inode->i_sb->s_flags & MS_RDONLY)
 		return 0;
 
 	ret = ext4_flush_completed_IO(inode);
 	if (ret < 0)
-		return ret;
+		goto out;
 
 	if (!journal) {
 		ret = generic_file_fsync(file, datasync);
 		if (!ret && !list_empty(&inode->i_dentry))
-			ext4_sync_parent(inode);
-		return ret;
+			ret = ext4_sync_parent(inode);
+		goto out;
 	}
 
@@ -194,8 +205,10 @@ int ext4_sync_file(struct file *file, int datasync)
	 * (they were dirtied by commit). But that's OK - the blocks are
	 * safe in-journal, which is all fsync() needs to ensure.
	 */
-	if (ext4_should_journal_data(inode))
-		return ext4_force_commit(inode->i_sb);
+	if (ext4_should_journal_data(inode)) {
+		ret = ext4_force_commit(inode->i_sb);
+		goto out;
+	}
 
 	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
 	if (jbd2_log_start_commit(journal, commit_tid)) {
@@ -215,5 +228,7 @@ int ext4_sync_file(struct file *file, int datasync)
 		ret = jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ out:
+	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
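With ext4_sync_parent() now propagating errors and writing the parent directory inodes back with WB_SYNC_ALL, an fsync() of a freshly created file on a no-journal ext4 filesystem also persists the directory entries leading to it. The portable userspace idiom this automates, which careful applications still perform explicitly, looks roughly like the following sketch (paths are illustrative):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("dir/newfile", O_CREAT | O_WRONLY | O_TRUNC, 0644);
	int dirfd;

	if (fd < 0 || write(fd, "data\n", 5) != 5)
		return 1;
	if (fsync(fd) < 0)	/* persist the file's data and inode */
		return 1;
	close(fd);

	/* A freshly created name also needs the directory flushed;
	 * the patched ext4_sync_parent() does this walk on no-journal
	 * filesystems, but the portable idiom is explicit: */
	dirfd = open("dir", O_RDONLY | O_DIRECTORY);
	if (dirfd < 0 || fsync(dirfd) < 0)
		return 1;
	close(dirfd);
	return 0;
}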
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 78b79e1bd7ed..21bb2f61e502 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -152,6 +152,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
		 * We do it here so the bitmap uptodate bit
		 * get set with buffer lock held.
		 */
+		trace_ext4_load_inode_bitmap(sb, block_group);
 		set_bitmap_uptodate(bh);
 		if (bh_submit_read(bh) < 0) {
 			put_bh(bh);
@@ -649,7 +650,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 		*group = parent_group + flex_size;
 		if (*group > ngroups)
 			*group = 0;
-		return find_group_orlov(sb, parent, group, mode, 0);
+		return find_group_orlov(sb, parent, group, mode, NULL);
 	}
 
 	/*
@@ -1054,6 +1055,11 @@ got:
 		}
 	}
 
+	if (ext4_handle_valid(handle)) {
+		ei->i_sync_tid = handle->h_transaction->t_tid;
+		ei->i_datasync_tid = handle->h_transaction->t_tid;
+	}
+
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext4_std_error(sb, err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9f7f9e49914f..f2fa5e8a582c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -173,7 +173,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
 	BUG_ON(EXT4_JOURNAL(inode) == NULL);
 	jbd_debug(2, "restarting handle %p\n", handle);
 	up_write(&EXT4_I(inode)->i_data_sem);
-	ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+	ret = ext4_journal_restart(handle, nblocks);
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_discard_preallocations(inode);
 
@@ -720,7 +720,7 @@ allocated:
 	return ret;
 failed_out:
 	for (i = 0; i < index; i++)
-		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
+		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
 	return ret;
 }
 
@@ -823,20 +823,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 	return err;
 failed:
 	/* Allocation failed, free what we already allocated */
-	ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
+	ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
 	for (i = 1; i <= n ; i++) {
 		/*
		 * branch[i].bh is newly allocated, so there is no
		 * need to revoke the block, which is why we don't
		 * need to set EXT4_FREE_BLOCKS_METADATA.
		 */
-		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
				 EXT4_FREE_BLOCKS_FORGET);
 	}
 	for (i = n+1; i < indirect_blks; i++)
-		ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
+		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
 
-	ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
+	ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
 
 	return err;
 }
@@ -924,7 +924,7 @@ err_out:
 		ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
				 EXT4_FREE_BLOCKS_FORGET);
 	}
-	ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
			 blks, 0);
 
 	return err;
@@ -973,6 +973,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 	int count = 0;
 	ext4_fsblk_t first_block = 0;
 
+	trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
 	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
 	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
@@ -1058,6 +1059,8 @@ cleanup:
 		partial--;
 	}
 out:
+	trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
+				map->m_pblk, map->m_len, err);
 	return err;
 }
 
@@ -2060,7 +2063,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
-			int commit_write = 0, redirty_page = 0;
+			int commit_write = 0, skip_page = 0;
 			struct page *page = pvec.pages[i];
 
 			index = page->index;
@@ -2086,14 +2089,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
			 * If the page does not have buffers (for
			 * whatever reason), try to create them using
			 * __block_write_begin.  If this fails,
-			 * redirty the page and move on.
+			 * skip the page and move on.
			 */
 			if (!page_has_buffers(page)) {
 				if (__block_write_begin(page, 0, len,
						noalloc_get_block_write)) {
-				redirty_page:
-					redirty_page_for_writepage(mpd->wbc,
-								   page);
+				skip_page:
 					unlock_page(page);
 					continue;
 				}
@@ -2104,7 +2105,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 			block_start = 0;
 			do {
 				if (!bh)
-					goto redirty_page;
+					goto skip_page;
 				if (map && (cur_logical >= map->m_lblk) &&
				    (cur_logical <= (map->m_lblk +
					(map->m_len - 1)))) {
@@ -2120,22 +2121,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 					clear_buffer_unwritten(bh);
 				}
 
-				/* redirty page if block allocation undone */
+				/* skip page if block allocation undone */
 				if (buffer_delay(bh) || buffer_unwritten(bh))
-					redirty_page = 1;
+					skip_page = 1;
 				bh = bh->b_this_page;
 				block_start += bh->b_size;
 				cur_logical++;
 				pblock++;
 			} while (bh != page_bufs);
 
-			if (redirty_page)
-				goto redirty_page;
+			if (skip_page)
+				goto skip_page;
 
 			if (commit_write)
 				/* mark the buffer_heads as dirty & uptodate */
 				block_commit_write(page, 0, len);
 
+			clear_page_dirty_for_io(page);
 			/*
			 * Delalloc doesn't support data journalling,
			 * but eventually maybe we'll lift this
@@ -2165,8 +2167,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 	return ret;
 }
 
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
-					sector_t logical, long blk_cnt)
+static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
 {
 	int nr_pages, i;
 	pgoff_t index, end;
@@ -2174,9 +2175,8 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
 
-	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	end   = (logical + blk_cnt - 1) >>
-				(PAGE_CACHE_SHIFT - inode->i_blkbits);
+	index = mpd->first_page;
+	end   = mpd->next_page - 1;
 	while (index <= end) {
 		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
 		if (nr_pages == 0)
@@ -2279,9 +2279,8 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 		err = blks;
 		/*
		 * If get block returns EAGAIN or ENOSPC and there
-		 * appears to be free blocks we will call
-		 * ext4_writepage() for all of the pages which will
-		 * just redirty the pages.
+		 * appears to be free blocks we will just let
+		 * mpage_da_submit_io() unlock all of the pages.
		 */
 		if (err == -EAGAIN)
 			goto submit_io;
@@ -2312,8 +2311,10 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 			ext4_print_free_blocks(mpd->inode);
 		}
 		/* invalidate all the pages */
-		ext4_da_block_invalidatepages(mpd, next,
-				mpd->b_size >> mpd->inode->i_blkbits);
+		ext4_da_block_invalidatepages(mpd);
+
+		/* Mark this page range as having been completed */
+		mpd->io_done = 1;
 		return;
 	}
 	BUG_ON(blks == 0);
@@ -2438,102 +2439,6 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
- * __mpage_da_writepage - finds extent of pages and blocks
- *
- * @page: page to consider
- * @wbc: not used, we just follow rules
- * @data: context
- *
- * The function finds extents of pages and scan them for all blocks.
- */
-static int __mpage_da_writepage(struct page *page,
-				struct writeback_control *wbc,
-				struct mpage_da_data *mpd)
-{
-	struct inode *inode = mpd->inode;
-	struct buffer_head *bh, *head;
-	sector_t logical;
-
-	/*
-	 * Can we merge this page to current extent?
-	 */
-	if (mpd->next_page != page->index) {
-		/*
-		 * Nope, we can't. So, we map non-allocated blocks
-		 * and start IO on them
-		 */
-		if (mpd->next_page != mpd->first_page) {
-			mpage_da_map_and_submit(mpd);
-			/*
-			 * skip rest of the page in the page_vec
-			 */
-			redirty_page_for_writepage(wbc, page);
-			unlock_page(page);
-			return MPAGE_DA_EXTENT_TAIL;
-		}
-
-		/*
-		 * Start next extent of pages ...
-		 */
-		mpd->first_page = page->index;
-
-		/*
-		 * ... and blocks
-		 */
-		mpd->b_size = 0;
-		mpd->b_state = 0;
-		mpd->b_blocknr = 0;
-	}
-
-	mpd->next_page = page->index + 1;
-	logical = (sector_t) page->index <<
-		  (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-	if (!page_has_buffers(page)) {
-		mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
-				       (1 << BH_Dirty) | (1 << BH_Uptodate));
-		if (mpd->io_done)
-			return MPAGE_DA_EXTENT_TAIL;
-	} else {
-		/*
-		 * Page with regular buffer heads, just add all dirty ones
-		 */
-		head = page_buffers(page);
-		bh = head;
-		do {
-			BUG_ON(buffer_locked(bh));
-			/*
-			 * We need to try to allocate
-			 * unmapped blocks in the same page.
-			 * Otherwise we won't make progress
-			 * with the page in ext4_writepage
-			 */
-			if (ext4_bh_delay_or_unwritten(NULL, bh)) {
-				mpage_add_bh_to_extent(mpd, logical,
-						       bh->b_size,
-						       bh->b_state);
-				if (mpd->io_done)
-					return MPAGE_DA_EXTENT_TAIL;
-			} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
-				/*
-				 * mapped dirty buffer. We need to update
-				 * the b_state because we look at
-				 * b_state in mpage_da_map_blocks. We don't
-				 * update b_size because if we find an
-				 * unmapped buffer_head later we need to
-				 * use the b_state flag of that buffer_head.
-				 */
-				if (mpd->b_size == 0)
-					mpd->b_state = bh->b_state & BH_FLAGS;
-			}
-			logical++;
-		} while ((bh = bh->b_this_page) != head);
-	}
-
-	return 0;
-}
-
-/*
 * This is a special get_blocks_t callback which is used by
 * ext4_da_write_begin().  It will either return mapped block or
 * reserve space for a single block.
@@ -2684,7 +2589,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 * because we should have holes filled from ext4_page_mkwrite(). We even don't
 * need to file the inode to the transaction's list in ordered mode because if
 * we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), noone guarantees in which
+ * we are writing back data modified via mmap(), no one guarantees in which
 * transaction the data will hit the disk. In case we are journaling data, we
 * cannot start transaction directly because transaction start ranks above page
 * lock so we have to do some magic.
@@ -2786,7 +2691,7 @@ static int ext4_writepage(struct page *page,
 
 /*
 * This is called via ext4_da_writepages() to
- * calulate the total number of credits to reserve to fit
+ * calculate the total number of credits to reserve to fit
 * a single extent allocation into a single transaction,
 * ext4_da_writpeages() will loop calling this before
 * the block allocation.
@@ -2811,27 +2716,27 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
 
 /*
 * write_cache_pages_da - walk the list of dirty pages of the given
- * address space and call the callback function (which usually writes
- * the pages).
- *
- * This is a forked version of write_cache_pages().  Differences:
- *	Range cyclic is ignored.
- *	no_nrwrite_index_update is always presumed true
+ * address space and accumulate pages that need writing, and call
+ * mpage_da_map_and_submit to map a single contiguous memory region
+ * and then write them.
 */
 static int write_cache_pages_da(struct address_space *mapping,
				struct writeback_control *wbc,
				struct mpage_da_data *mpd,
				pgoff_t *done_index)
 {
-	int ret = 0;
-	int done = 0;
+	struct buffer_head *bh, *head;
+	struct inode *inode = mapping->host;
 	struct pagevec pvec;
-	unsigned nr_pages;
-	pgoff_t index;
-	pgoff_t end;		/* Inclusive */
+	unsigned int nr_pages;
+	sector_t logical;
+	pgoff_t index, end;
 	long nr_to_write = wbc->nr_to_write;
-	int tag;
+	int i, tag, ret = 0;
 
+	memset(mpd, 0, sizeof(struct mpage_da_data));
+	mpd->wbc = wbc;
+	mpd->inode = inode;
 	pagevec_init(&pvec, 0);
 	index = wbc->range_start >> PAGE_CACHE_SHIFT;
 	end = wbc->range_end >> PAGE_CACHE_SHIFT;
@@ -2842,13 +2747,11 @@ static int write_cache_pages_da(struct address_space *mapping,
 		tag = PAGECACHE_TAG_DIRTY;
 
 	*done_index = index;
-	while (!done && (index <= end)) {
-		int i;
-
+	while (index <= end) {
 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
-			break;
+			return 0;
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
@@ -2860,60 +2763,100 @@ static int write_cache_pages_da(struct address_space *mapping, | |||
2860 | * mapping. However, page->index will not change | 2763 | * mapping. However, page->index will not change |
2861 | * because we have a reference on the page. | 2764 | * because we have a reference on the page. |
2862 | */ | 2765 | */ |
2863 | if (page->index > end) { | 2766 | if (page->index > end) |
2864 | done = 1; | 2767 | goto out; |
2865 | break; | ||
2866 | } | ||
2867 | 2768 | ||
2868 | *done_index = page->index + 1; | 2769 | *done_index = page->index + 1; |
2869 | 2770 | ||
2771 | /* | ||
2772 | * If we can't merge this page, and we have | ||
2773 | * accumulated an contiguous region, write it | ||
2774 | */ | ||
2775 | if ((mpd->next_page != page->index) && | ||
2776 | (mpd->next_page != mpd->first_page)) { | ||
2777 | mpage_da_map_and_submit(mpd); | ||
2778 | goto ret_extent_tail; | ||
2779 | } | ||
2780 | |||
2870 | lock_page(page); | 2781 | lock_page(page); |
2871 | 2782 | ||
2872 | /* | 2783 | /* |
2873 | * Page truncated or invalidated. We can freely skip it | 2784 | * If the page is no longer dirty, or its |
2874 | * then, even for data integrity operations: the page | 2785 | * mapping no longer corresponds to inode we |
2875 | * has disappeared concurrently, so there could be no | 2786 | * are writing (which means it has been |
2876 | * real expectation of this data interity operation | 2787 | * truncated or invalidated), or the page is |
2877 | * even if there is now a new, dirty page at the same | 2788 | * already under writeback and we are not |
2878 | * pagecache address. | 2789 | * doing a data integrity writeback, skip the page |
2879 | */ | 2790 | */ |
2880 | if (unlikely(page->mapping != mapping)) { | 2791 | if (!PageDirty(page) || |
2881 | continue_unlock: | 2792 | (PageWriteback(page) && |
2793 | (wbc->sync_mode == WB_SYNC_NONE)) || | ||
2794 | unlikely(page->mapping != mapping)) { | ||
2882 | unlock_page(page); | 2795 | unlock_page(page); |
2883 | continue; | 2796 | continue; |
2884 | } | 2797 | } |
2885 | 2798 | ||
2886 | if (!PageDirty(page)) { | 2799 | if (PageWriteback(page)) |
2887 | /* someone wrote it for us */ | 2800 | wait_on_page_writeback(page); |
2888 | goto continue_unlock; | ||
2889 | } | ||
2890 | |||
2891 | if (PageWriteback(page)) { | ||
2892 | if (wbc->sync_mode != WB_SYNC_NONE) | ||
2893 | wait_on_page_writeback(page); | ||
2894 | else | ||
2895 | goto continue_unlock; | ||
2896 | } | ||
2897 | 2801 | ||
2898 | BUG_ON(PageWriteback(page)); | 2802 | BUG_ON(PageWriteback(page)); |
2899 | if (!clear_page_dirty_for_io(page)) | ||
2900 | goto continue_unlock; | ||
2901 | 2803 | ||
2902 | ret = __mpage_da_writepage(page, wbc, mpd); | 2804 | if (mpd->next_page != page->index) |
2903 | if (unlikely(ret)) { | 2805 | mpd->first_page = page->index; |
2904 | if (ret == AOP_WRITEPAGE_ACTIVATE) { | 2806 | mpd->next_page = page->index + 1; |
2905 | unlock_page(page); | 2807 | logical = (sector_t) page->index << |
2906 | ret = 0; | 2808 | (PAGE_CACHE_SHIFT - inode->i_blkbits); |
2907 | } else { | 2809 | |
2908 | done = 1; | 2810 | if (!page_has_buffers(page)) { |
2909 | break; | 2811 | mpage_add_bh_to_extent(mpd, logical, |
2910 | } | 2812 | PAGE_CACHE_SIZE, |
2813 | (1 << BH_Dirty) | (1 << BH_Uptodate)); | ||
2814 | if (mpd->io_done) | ||
2815 | goto ret_extent_tail; | ||
2816 | } else { | ||
2817 | /* | ||
2818 | * Page with regular buffer heads, | ||
2819 | * just add all dirty ones | ||
2820 | */ | ||
2821 | head = page_buffers(page); | ||
2822 | bh = head; | ||
2823 | do { | ||
2824 | BUG_ON(buffer_locked(bh)); | ||
2825 | /* | ||
2826 | * We need to try to allocate | ||
2827 | * unmapped blocks in the same page. | ||
2828 | * Otherwise we won't make progress | ||
2829 | * with the page in ext4_writepage | ||
2830 | */ | ||
2831 | if (ext4_bh_delay_or_unwritten(NULL, bh)) { | ||
2832 | mpage_add_bh_to_extent(mpd, logical, | ||
2833 | bh->b_size, | ||
2834 | bh->b_state); | ||
2835 | if (mpd->io_done) | ||
2836 | goto ret_extent_tail; | ||
2837 | } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { | ||
2838 | /* | ||
2839 | * mapped dirty buffer. We need | ||
2840 | * to update the b_state | ||
2841 | * because we look at b_state | ||
2842 | * in mpage_da_map_blocks. We | ||
2843 | * don't update b_size because | ||
2844 | * if we find an unmapped | ||
2845 | * buffer_head later we need to | ||
2846 | * use the b_state flag of that | ||
2847 | * buffer_head. | ||
2848 | */ | ||
2849 | if (mpd->b_size == 0) | ||
2850 | mpd->b_state = bh->b_state & BH_FLAGS; | ||
2851 | } | ||
2852 | logical++; | ||
2853 | } while ((bh = bh->b_this_page) != head); | ||
2911 | } | 2854 | } |
2912 | 2855 | ||
2913 | if (nr_to_write > 0) { | 2856 | if (nr_to_write > 0) { |
2914 | nr_to_write--; | 2857 | nr_to_write--; |
2915 | if (nr_to_write == 0 && | 2858 | if (nr_to_write == 0 && |
2916 | wbc->sync_mode == WB_SYNC_NONE) { | 2859 | wbc->sync_mode == WB_SYNC_NONE) |
2917 | /* | 2860 | /* |
2918 | * We stop writing back only if we are | 2861 | * We stop writing back only if we are |
2919 | * not doing integrity sync. In case of | 2862 | * not doing integrity sync. In case of |
@@ -2924,14 +2867,18 @@ continue_unlock: | |||
2924 | * pages, but have not synced all of the | 2867 | * pages, but have not synced all of the |
2925 | * old dirty pages. | 2868 | * old dirty pages. |
2926 | */ | 2869 | */ |
2927 | done = 1; | 2870 | goto out; |
2928 | break; | ||
2929 | } | ||
2930 | } | 2871 | } |
2931 | } | 2872 | } |
2932 | pagevec_release(&pvec); | 2873 | pagevec_release(&pvec); |
2933 | cond_resched(); | 2874 | cond_resched(); |
2934 | } | 2875 | } |
2876 | return 0; | ||
2877 | ret_extent_tail: | ||
2878 | ret = MPAGE_DA_EXTENT_TAIL; | ||
2879 | out: | ||
2880 | pagevec_release(&pvec); | ||
2881 | cond_resched(); | ||
2935 | return ret; | 2882 | return ret; |
2936 | } | 2883 | } |
2937 | 2884 | ||
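Read as a whole, this hunk folds the old __mpage_da_writepage() callback into write_cache_pages_da() itself: pages are skipped early, accumulated into a contiguous extent, and a non-mergeable page flushes the extent via mpage_da_map_and_submit(). Below is a minimal userspace model of that accumulation logic; struct da_ctx and map_and_submit() are invented stand-ins for mpage_da_data and mpage_da_map_and_submit(), and the real loop returns MPAGE_DA_EXTENT_TAIL after a flush instead of continuing.

```c
#include <stdio.h>

/* Hypothetical stand-in for struct mpage_da_data. */
struct da_ctx {
	long first_page;	/* first index of the accumulated extent */
	long next_page;		/* index we expect to see next */
};

/* Stand-in for mpage_da_map_and_submit(): flush the current extent. */
static void map_and_submit(struct da_ctx *mpd)
{
	printf("submit extent [%ld, %ld)\n", mpd->first_page, mpd->next_page);
}

int main(void)
{
	/* Dirty page indices as pagevec_lookup_tag() might return them. */
	long dirty[] = { 4, 5, 6, 9, 10, 14 };
	struct da_ctx mpd = { 0, 0 };

	for (unsigned int i = 0; i < sizeof(dirty) / sizeof(dirty[0]); i++) {
		long index = dirty[i];

		/* The new early check: a page that cannot be merged flushes
		 * whatever extent has been accumulated so far (the kernel
		 * then returns MPAGE_DA_EXTENT_TAIL rather than looping). */
		if (mpd.next_page != index && mpd.next_page != mpd.first_page)
			map_and_submit(&mpd);

		/* Start a new extent or extend the current one. */
		if (mpd.next_page != index)
			mpd.first_page = index;
		mpd.next_page = index + 1;
	}
	if (mpd.next_page != mpd.first_page)
		map_and_submit(&mpd);	/* flush the trailing extent */
	return 0;
}
```

Run against the sample indices, this prints the three extents [4, 7), [9, 11) and [14, 15), mirroring how the patched loop coalesces runs of dirty pages.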
@@ -2945,7 +2892,6 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2945 | struct mpage_da_data mpd; | 2892 | struct mpage_da_data mpd; |
2946 | struct inode *inode = mapping->host; | 2893 | struct inode *inode = mapping->host; |
2947 | int pages_written = 0; | 2894 | int pages_written = 0; |
2948 | long pages_skipped; | ||
2949 | unsigned int max_pages; | 2895 | unsigned int max_pages; |
2950 | int range_cyclic, cycled = 1, io_done = 0; | 2896 | int range_cyclic, cycled = 1, io_done = 0; |
2951 | int needed_blocks, ret = 0; | 2897 | int needed_blocks, ret = 0; |
@@ -3028,11 +2974,6 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
3028 | wbc->nr_to_write = desired_nr_to_write; | 2974 | wbc->nr_to_write = desired_nr_to_write; |
3029 | } | 2975 | } |
3030 | 2976 | ||
3031 | mpd.wbc = wbc; | ||
3032 | mpd.inode = mapping->host; | ||
3033 | |||
3034 | pages_skipped = wbc->pages_skipped; | ||
3035 | |||
3036 | retry: | 2977 | retry: |
3037 | if (wbc->sync_mode == WB_SYNC_ALL) | 2978 | if (wbc->sync_mode == WB_SYNC_ALL) |
3038 | tag_pages_for_writeback(mapping, index, end); | 2979 | tag_pages_for_writeback(mapping, index, end); |
@@ -3059,22 +3000,10 @@ retry: | |||
3059 | } | 3000 | } |
3060 | 3001 | ||
3061 | /* | 3002 | /* |
3062 | * Now call __mpage_da_writepage to find the next | 3003 | * Now call write_cache_pages_da() to find the next |
3063 | * contiguous region of logical blocks that need | 3004 | * contiguous region of logical blocks that need |
3064 | * blocks to be allocated by ext4. We don't actually | 3005 | * blocks to be allocated by ext4 and submit them. |
3065 | * submit the blocks for I/O here, even though | ||
3066 | * write_cache_pages thinks it will, and will set the | ||
3067 | * pages as clean for write before calling | ||
3068 | * __mpage_da_writepage(). | ||
3069 | */ | 3006 | */ |
3070 | mpd.b_size = 0; | ||
3071 | mpd.b_state = 0; | ||
3072 | mpd.b_blocknr = 0; | ||
3073 | mpd.first_page = 0; | ||
3074 | mpd.next_page = 0; | ||
3075 | mpd.io_done = 0; | ||
3076 | mpd.pages_written = 0; | ||
3077 | mpd.retval = 0; | ||
3078 | ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); | 3007 | ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); |
3079 | /* | 3008 | /* |
3080 | * If we have a contiguous extent of pages and we | 3009 | * If we have a contiguous extent of pages and we |
@@ -3096,7 +3025,6 @@ retry: | |||
3096 | * and try again | 3025 | * and try again |
3097 | */ | 3026 | */ |
3098 | jbd2_journal_force_commit_nested(sbi->s_journal); | 3027 | jbd2_journal_force_commit_nested(sbi->s_journal); |
3099 | wbc->pages_skipped = pages_skipped; | ||
3100 | ret = 0; | 3028 | ret = 0; |
3101 | } else if (ret == MPAGE_DA_EXTENT_TAIL) { | 3029 | } else if (ret == MPAGE_DA_EXTENT_TAIL) { |
3102 | /* | 3030 | /* |
@@ -3104,7 +3032,6 @@ retry: | |||
3104 | * rest of the pages | 3032 | * rest of the pages |
3105 | */ | 3033 | */ |
3106 | pages_written += mpd.pages_written; | 3034 | pages_written += mpd.pages_written; |
3107 | wbc->pages_skipped = pages_skipped; | ||
3108 | ret = 0; | 3035 | ret = 0; |
3109 | io_done = 1; | 3036 | io_done = 1; |
3110 | } else if (wbc->nr_to_write) | 3037 | } else if (wbc->nr_to_write) |
@@ -3122,11 +3049,6 @@ retry: | |||
3122 | wbc->range_end = mapping->writeback_index - 1; | 3049 | wbc->range_end = mapping->writeback_index - 1; |
3123 | goto retry; | 3050 | goto retry; |
3124 | } | 3051 | } |
3125 | if (pages_skipped != wbc->pages_skipped) | ||
3126 | ext4_msg(inode->i_sb, KERN_CRIT, | ||
3127 | "This should not happen leaving %s " | ||
3128 | "with nr_to_write = %ld ret = %d", | ||
3129 | __func__, wbc->nr_to_write, ret); | ||
3130 | 3052 | ||
3131 | /* Update index */ | 3053 | /* Update index */ |
3132 | wbc->range_cyclic = range_cyclic; | 3054 | wbc->range_cyclic = range_cyclic; |
@@ -3383,7 +3305,7 @@ int ext4_alloc_da_blocks(struct inode *inode) | |||
3383 | * the pages by calling redirty_page_for_writepage() but that | 3305 | * the pages by calling redirty_page_for_writepage() but that |
3384 | * would be ugly in the extreme. So instead we would need to | 3306 | * would be ugly in the extreme. So instead we would need to |
3385 | * replicate parts of the code in the above functions, | 3307 | * replicate parts of the code in the above functions, |
3386 | * simplifying them becuase we wouldn't actually intend to | 3308 | * simplifying them because we wouldn't actually intend to |
3387 | * write out the pages, but rather only collect contiguous | 3309 | * write out the pages, but rather only collect contiguous |
3388 | * logical block extents, call the multi-block allocator, and | 3310 | * logical block extents, call the multi-block allocator, and |
3389 | * then update the buffer heads with the block allocations. | 3311 | * then update the buffer heads with the block allocations. |
@@ -3460,6 +3382,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) | |||
3460 | 3382 | ||
3461 | static int ext4_readpage(struct file *file, struct page *page) | 3383 | static int ext4_readpage(struct file *file, struct page *page) |
3462 | { | 3384 | { |
3385 | trace_ext4_readpage(page); | ||
3463 | return mpage_readpage(page, ext4_get_block); | 3386 | return mpage_readpage(page, ext4_get_block); |
3464 | } | 3387 | } |
3465 | 3388 | ||
@@ -3494,6 +3417,8 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset) | |||
3494 | { | 3417 | { |
3495 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); | 3418 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); |
3496 | 3419 | ||
3420 | trace_ext4_invalidatepage(page, offset); | ||
3421 | |||
3497 | /* | 3422 | /* |
3498 | * free any io_end structure allocated for buffers to be discarded | 3423 | * free any io_end structure allocated for buffers to be discarded |
3499 | */ | 3424 | */ |
@@ -3515,6 +3440,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) | |||
3515 | { | 3440 | { |
3516 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); | 3441 | journal_t *journal = EXT4_JOURNAL(page->mapping->host); |
3517 | 3442 | ||
3443 | trace_ext4_releasepage(page); | ||
3444 | |||
3518 | WARN_ON(PageChecked(page)); | 3445 | WARN_ON(PageChecked(page)); |
3519 | if (!page_has_buffers(page)) | 3446 | if (!page_has_buffers(page)) |
3520 | return 0; | 3447 | return 0; |
@@ -3768,7 +3695,7 @@ retry: | |||
3768 | * | 3695 | * |
3769 | * The unwritten extents will be converted to written when DIO is completed. | 3696 |
3770 | * For async direct IO, since the IO may still be pending when we return, we | 3697 |
3771 | * set up an end_io call back function, which will do the convertion | 3698 | * set up an end_io call back function, which will do the conversion |
3772 | * when async direct IO completed. | 3699 | * when async direct IO completed. |
3773 | * | 3700 | * |
3774 | * If the O_DIRECT write will extend the file then add this inode to the | 3701 | * If the O_DIRECT write will extend the file then add this inode to the |
@@ -3791,7 +3718,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3791 | * We could direct write to holes and fallocate. | 3718 | * We could direct write to holes and fallocate. |
3792 | * | 3719 | * |
3793 | * Allocated blocks to fill the hole are marked as uninitialized | 3720 | * Allocated blocks to fill the hole are marked as uninitialized |
3794 | * to prevent paralel buffered read to expose the stale data | 3721 | * to prevent parallel buffered read to expose the stale data |
3795 | * before DIO complete the data IO. | 3722 | * before DIO complete the data IO. |
3796 | * | 3723 | * |
3797 | * As to previously fallocated extents, ext4 get_block | 3724 | * As to previously fallocated extents, ext4 get_block |
@@ -3852,7 +3779,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3852 | int err; | 3779 | int err; |
3853 | /* | 3780 | /* |
3854 | * for non AIO case, since the IO is already | 3781 | * for non AIO case, since the IO is already |
3855 | * completed, we could do the convertion right here | 3782 | * completed, we could do the conversion right here |
3856 | */ | 3783 | */ |
3857 | err = ext4_convert_unwritten_extents(inode, | 3784 | err = ext4_convert_unwritten_extents(inode, |
3858 | offset, ret); | 3785 | offset, ret); |
@@ -3873,11 +3800,16 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | |||
3873 | { | 3800 | { |
3874 | struct file *file = iocb->ki_filp; | 3801 | struct file *file = iocb->ki_filp; |
3875 | struct inode *inode = file->f_mapping->host; | 3802 | struct inode *inode = file->f_mapping->host; |
3803 | ssize_t ret; | ||
3876 | 3804 | ||
3805 | trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); | ||
3877 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3806 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
3878 | return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); | 3807 | ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); |
3879 | 3808 | else | |
3880 | return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); | 3809 | ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); |
3810 | trace_ext4_direct_IO_exit(inode, offset, | ||
3811 | iov_length(iov, nr_segs), rw, ret); | ||
3812 | return ret; | ||
3881 | } | 3813 | } |
3882 | 3814 | ||
3883 | /* | 3815 | /* |
@@ -3903,7 +3835,6 @@ static const struct address_space_operations ext4_ordered_aops = { | |||
3903 | .readpage = ext4_readpage, | 3835 | .readpage = ext4_readpage, |
3904 | .readpages = ext4_readpages, | 3836 | .readpages = ext4_readpages, |
3905 | .writepage = ext4_writepage, | 3837 | .writepage = ext4_writepage, |
3906 | .sync_page = block_sync_page, | ||
3907 | .write_begin = ext4_write_begin, | 3838 | .write_begin = ext4_write_begin, |
3908 | .write_end = ext4_ordered_write_end, | 3839 | .write_end = ext4_ordered_write_end, |
3909 | .bmap = ext4_bmap, | 3840 | .bmap = ext4_bmap, |
@@ -3919,7 +3850,6 @@ static const struct address_space_operations ext4_writeback_aops = { | |||
3919 | .readpage = ext4_readpage, | 3850 | .readpage = ext4_readpage, |
3920 | .readpages = ext4_readpages, | 3851 | .readpages = ext4_readpages, |
3921 | .writepage = ext4_writepage, | 3852 | .writepage = ext4_writepage, |
3922 | .sync_page = block_sync_page, | ||
3923 | .write_begin = ext4_write_begin, | 3853 | .write_begin = ext4_write_begin, |
3924 | .write_end = ext4_writeback_write_end, | 3854 | .write_end = ext4_writeback_write_end, |
3925 | .bmap = ext4_bmap, | 3855 | .bmap = ext4_bmap, |
@@ -3935,7 +3865,6 @@ static const struct address_space_operations ext4_journalled_aops = { | |||
3935 | .readpage = ext4_readpage, | 3865 | .readpage = ext4_readpage, |
3936 | .readpages = ext4_readpages, | 3866 | .readpages = ext4_readpages, |
3937 | .writepage = ext4_writepage, | 3867 | .writepage = ext4_writepage, |
3938 | .sync_page = block_sync_page, | ||
3939 | .write_begin = ext4_write_begin, | 3868 | .write_begin = ext4_write_begin, |
3940 | .write_end = ext4_journalled_write_end, | 3869 | .write_end = ext4_journalled_write_end, |
3941 | .set_page_dirty = ext4_journalled_set_page_dirty, | 3870 | .set_page_dirty = ext4_journalled_set_page_dirty, |
@@ -3951,7 +3880,6 @@ static const struct address_space_operations ext4_da_aops = { | |||
3951 | .readpages = ext4_readpages, | 3880 | .readpages = ext4_readpages, |
3952 | .writepage = ext4_writepage, | 3881 | .writepage = ext4_writepage, |
3953 | .writepages = ext4_da_writepages, | 3882 | .writepages = ext4_da_writepages, |
3954 | .sync_page = block_sync_page, | ||
3955 | .write_begin = ext4_da_write_begin, | 3883 | .write_begin = ext4_da_write_begin, |
3956 | .write_end = ext4_da_write_end, | 3884 | .write_end = ext4_da_write_end, |
3957 | .bmap = ext4_bmap, | 3885 | .bmap = ext4_bmap, |
@@ -4098,7 +4026,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q) | |||
4098 | * | 4026 | * |
4099 | * When we do truncate() we may have to clean the ends of several | 4027 | * When we do truncate() we may have to clean the ends of several |
4100 | * indirect blocks but leave the blocks themselves alive. Block is | 4028 | * indirect blocks but leave the blocks themselves alive. Block is |
4101 | * partially truncated if some data below the new i_size is refered | 4029 | * partially truncated if some data below the new i_size is referred |
4102 | * from it (and it is on the path to the first completely truncated | 4030 | * from it (and it is on the path to the first completely truncated |
4103 | * data block, indeed). We have to free the top of that path along | 4031 | * data block, indeed). We have to free the top of that path along |
4104 | * with everything to the right of the path. Since no allocation | 4032 | * with everything to the right of the path. Since no allocation |
@@ -4177,6 +4105,9 @@ no_top: | |||
4177 | * | 4105 | * |
4178 | * We release `count' blocks on disk, but (last - first) may be greater | 4106 | * We release `count' blocks on disk, but (last - first) may be greater |
4179 | * than `count' because there can be holes in there. | 4107 | * than `count' because there can be holes in there. |
4108 | * | ||
4109 | * Return 0 on success, 1 on invalid block range | ||
4110 | * and < 0 on fatal error. | ||
4180 | */ | 4111 | */ |
4181 | static int ext4_clear_blocks(handle_t *handle, struct inode *inode, | 4112 | static int ext4_clear_blocks(handle_t *handle, struct inode *inode, |
4182 | struct buffer_head *bh, | 4113 | struct buffer_head *bh, |
@@ -4203,33 +4134,32 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, | |||
4203 | if (bh) { | 4134 | if (bh) { |
4204 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | 4135 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
4205 | err = ext4_handle_dirty_metadata(handle, inode, bh); | 4136 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
4206 | if (unlikely(err)) { | 4137 | if (unlikely(err)) |
4207 | ext4_std_error(inode->i_sb, err); | 4138 | goto out_err; |
4208 | return 1; | ||
4209 | } | ||
4210 | } | 4139 | } |
4211 | err = ext4_mark_inode_dirty(handle, inode); | 4140 | err = ext4_mark_inode_dirty(handle, inode); |
4212 | if (unlikely(err)) { | 4141 | if (unlikely(err)) |
4213 | ext4_std_error(inode->i_sb, err); | 4142 | goto out_err; |
4214 | return 1; | ||
4215 | } | ||
4216 | err = ext4_truncate_restart_trans(handle, inode, | 4143 | err = ext4_truncate_restart_trans(handle, inode, |
4217 | blocks_for_truncate(inode)); | 4144 | blocks_for_truncate(inode)); |
4218 | if (unlikely(err)) { | 4145 | if (unlikely(err)) |
4219 | ext4_std_error(inode->i_sb, err); | 4146 | goto out_err; |
4220 | return 1; | ||
4221 | } | ||
4222 | if (bh) { | 4147 | if (bh) { |
4223 | BUFFER_TRACE(bh, "retaking write access"); | 4148 | BUFFER_TRACE(bh, "retaking write access"); |
4224 | ext4_journal_get_write_access(handle, bh); | 4149 | err = ext4_journal_get_write_access(handle, bh); |
4150 | if (unlikely(err)) | ||
4151 | goto out_err; | ||
4225 | } | 4152 | } |
4226 | } | 4153 | } |
4227 | 4154 | ||
4228 | for (p = first; p < last; p++) | 4155 | for (p = first; p < last; p++) |
4229 | *p = 0; | 4156 | *p = 0; |
4230 | 4157 | ||
4231 | ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); | 4158 | ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); |
4232 | return 0; | 4159 | return 0; |
4160 | out_err: | ||
4161 | ext4_std_error(inode->i_sb, err); | ||
4162 | return err; | ||
4233 | } | 4163 | } |
4234 | 4164 | ||
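The new comment fixes the return contract at three values, and the reworked body funnels every failure through out_err. A toy caller honoring that contract might look like the sketch below; free_run() is a hypothetical stand-in for ext4_clear_blocks() and the values are made up.

```c
#include <stdio.h>

/* free_run() models the convention documented above:
 * 0 on success, 1 on an invalid block range, < 0 on fatal error. */
static int free_run(long start, long count)
{
	if (count <= 0)
		return 1;		/* invalid block range */
	printf("freed [%ld, %ld)\n", start, start + count);
	return 0;
}

int main(void)
{
	long runs[][2] = { { 100, 4 }, { 200, 0 }, { 300, 2 } };
	int err = 0;

	for (unsigned int i = 0; i < 3 && !err; i++)
		err = free_run(runs[i][0], runs[i][1]);
	if (err < 0)
		return 1;		/* fatal: caller bails out entirely */
	/* err == 1: stop freeing but still finish the bookkeeping,
	 * mirroring the tail of ext4_free_data() in the next hunk. */
	return 0;
}
```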
4235 | /** | 4165 | /** |
@@ -4240,7 +4170,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, | |||
4240 | * @first: array of block numbers | 4170 | * @first: array of block numbers |
4241 | * @last: points immediately past the end of array | 4171 | * @last: points immediately past the end of array |
4242 | * | 4172 | * |
4243 | * We are freeing all blocks refered from that array (numbers are stored as | 4173 | * We are freeing all blocks referred from that array (numbers are stored as |
4244 | * little-endian 32-bit) and updating @inode->i_blocks appropriately. | 4174 | * little-endian 32-bit) and updating @inode->i_blocks appropriately. |
4245 | * | 4175 | * |
4246 | * We accumulate contiguous runs of blocks to free. Conveniently, if these | 4176 | * We accumulate contiguous runs of blocks to free. Conveniently, if these |
@@ -4263,7 +4193,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, | |||
4263 | ext4_fsblk_t nr; /* Current block # */ | 4193 | ext4_fsblk_t nr; /* Current block # */ |
4264 | __le32 *p; /* Pointer into inode/ind | 4194 | __le32 *p; /* Pointer into inode/ind |
4265 | for current block */ | 4195 | for current block */ |
4266 | int err; | 4196 | int err = 0; |
4267 | 4197 | ||
4268 | if (this_bh) { /* For indirect block */ | 4198 | if (this_bh) { /* For indirect block */ |
4269 | BUFFER_TRACE(this_bh, "get_write_access"); | 4199 | BUFFER_TRACE(this_bh, "get_write_access"); |
@@ -4285,9 +4215,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, | |||
4285 | } else if (nr == block_to_free + count) { | 4215 | } else if (nr == block_to_free + count) { |
4286 | count++; | 4216 | count++; |
4287 | } else { | 4217 | } else { |
4288 | if (ext4_clear_blocks(handle, inode, this_bh, | 4218 | err = ext4_clear_blocks(handle, inode, this_bh, |
4289 | block_to_free, count, | 4219 | block_to_free, count, |
4290 | block_to_free_p, p)) | 4220 | block_to_free_p, p); |
4221 | if (err) | ||
4291 | break; | 4222 | break; |
4292 | block_to_free = nr; | 4223 | block_to_free = nr; |
4293 | block_to_free_p = p; | 4224 | block_to_free_p = p; |
@@ -4296,9 +4227,12 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, | |||
4296 | } | 4227 | } |
4297 | } | 4228 | } |
4298 | 4229 | ||
4299 | if (count > 0) | 4230 | if (!err && count > 0) |
4300 | ext4_clear_blocks(handle, inode, this_bh, block_to_free, | 4231 | err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, |
4301 | count, block_to_free_p, p); | 4232 | count, block_to_free_p, p); |
4233 | if (err < 0) | ||
4234 | /* fatal error */ | ||
4235 | return; | ||
4302 | 4236 | ||
4303 | if (this_bh) { | 4237 | if (this_bh) { |
4304 | BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); | 4238 | BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); |
@@ -4328,7 +4262,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, | |||
4328 | * @last: pointer immediately past the end of array | 4262 | * @last: pointer immediately past the end of array |
4329 | * @depth: depth of the branches to free | 4263 | * @depth: depth of the branches to free |
4330 | * | 4264 | * |
4331 | * We are freeing all blocks refered from these branches (numbers are | 4265 | * We are freeing all blocks referred from these branches (numbers are |
4332 | * stored as little-endian 32-bit) and updating @inode->i_blocks | 4266 | * stored as little-endian 32-bit) and updating @inode->i_blocks |
4333 | * appropriately. | 4267 | * appropriately. |
4334 | */ | 4268 | */ |
@@ -4416,7 +4350,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
4416 | * transaction where the data blocks are | 4350 | * transaction where the data blocks are |
4417 | * actually freed. | 4351 | * actually freed. |
4418 | */ | 4352 | */ |
4419 | ext4_free_blocks(handle, inode, 0, nr, 1, | 4353 | ext4_free_blocks(handle, inode, NULL, nr, 1, |
4420 | EXT4_FREE_BLOCKS_METADATA| | 4354 | EXT4_FREE_BLOCKS_METADATA| |
4421 | EXT4_FREE_BLOCKS_FORGET); | 4355 | EXT4_FREE_BLOCKS_FORGET); |
4422 | 4356 | ||
@@ -4496,10 +4430,12 @@ void ext4_truncate(struct inode *inode) | |||
4496 | Indirect chain[4]; | 4430 | Indirect chain[4]; |
4497 | Indirect *partial; | 4431 | Indirect *partial; |
4498 | __le32 nr = 0; | 4432 | __le32 nr = 0; |
4499 | int n; | 4433 | int n = 0; |
4500 | ext4_lblk_t last_block; | 4434 | ext4_lblk_t last_block, max_block; |
4501 | unsigned blocksize = inode->i_sb->s_blocksize; | 4435 | unsigned blocksize = inode->i_sb->s_blocksize; |
4502 | 4436 | ||
4437 | trace_ext4_truncate_enter(inode); | ||
4438 | |||
4503 | if (!ext4_can_truncate(inode)) | 4439 | if (!ext4_can_truncate(inode)) |
4504 | return; | 4440 | return; |
4505 | 4441 | ||
@@ -4510,6 +4446,7 @@ void ext4_truncate(struct inode *inode) | |||
4510 | 4446 | ||
4511 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 4447 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { |
4512 | ext4_ext_truncate(inode); | 4448 | ext4_ext_truncate(inode); |
4449 | trace_ext4_truncate_exit(inode); | ||
4513 | return; | 4450 | return; |
4514 | } | 4451 | } |
4515 | 4452 | ||
@@ -4519,14 +4456,18 @@ void ext4_truncate(struct inode *inode) | |||
4519 | 4456 | ||
4520 | last_block = (inode->i_size + blocksize-1) | 4457 | last_block = (inode->i_size + blocksize-1) |
4521 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | 4458 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); |
4459 | max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) | ||
4460 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
4522 | 4461 | ||
4523 | if (inode->i_size & (blocksize - 1)) | 4462 | if (inode->i_size & (blocksize - 1)) |
4524 | if (ext4_block_truncate_page(handle, mapping, inode->i_size)) | 4463 | if (ext4_block_truncate_page(handle, mapping, inode->i_size)) |
4525 | goto out_stop; | 4464 | goto out_stop; |
4526 | 4465 | ||
4527 | n = ext4_block_to_path(inode, last_block, offsets, NULL); | 4466 | if (last_block != max_block) { |
4528 | if (n == 0) | 4467 | n = ext4_block_to_path(inode, last_block, offsets, NULL); |
4529 | goto out_stop; /* error */ | 4468 | if (n == 0) |
4469 | goto out_stop; /* error */ | ||
4470 | } | ||
4530 | 4471 | ||
4531 | /* | 4472 | /* |
4532 | * OK. This truncate is going to happen. We add the inode to the | 4473 | * OK. This truncate is going to happen. We add the inode to the |
@@ -4557,7 +4498,13 @@ void ext4_truncate(struct inode *inode) | |||
4557 | */ | 4498 | */ |
4558 | ei->i_disksize = inode->i_size; | 4499 | ei->i_disksize = inode->i_size; |
4559 | 4500 | ||
4560 | if (n == 1) { /* direct blocks */ | 4501 | if (last_block == max_block) { |
4502 | /* | ||
4503 | * It is unnecessary to free any data blocks if last_block is | ||
4504 | * equal to the indirect block limit. | ||
4505 | */ | ||
4506 | goto out_unlock; | ||
4507 | } else if (n == 1) { /* direct blocks */ | ||
4561 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | 4508 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], |
4562 | i_data + EXT4_NDIR_BLOCKS); | 4509 | i_data + EXT4_NDIR_BLOCKS); |
4563 | goto do_indirects; | 4510 | goto do_indirects; |
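The new max_block is the first block index beyond what the indirect map can address, so a truncate whose last_block already sits at that limit has nothing to free and takes the new out_unlock exit. A rough worked check of the limit, assuming 4 KiB blocks with 1024 block pointers per indirect block (the exact value lives in s_bitmap_maxbytes):

```c
#include <stdio.h>

int main(void)
{
	/* Assumed geometry: 4 KiB blocks => 1024 pointers per block. */
	unsigned long long ptrs = 1024, blksz = 4096;
	unsigned long long max_blocks =
		12 + ptrs + ptrs * ptrs + ptrs * ptrs * ptrs;

	/* ~1.07e9 blocks, i.e. roughly 4 TiB addressable. */
	printf("%llu blocks = %llu bytes\n", max_blocks, max_blocks * blksz);
	return 0;
}
```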
@@ -4617,6 +4564,7 @@ do_indirects: | |||
4617 | ; | 4564 | ; |
4618 | } | 4565 | } |
4619 | 4566 | ||
4567 | out_unlock: | ||
4620 | up_write(&ei->i_data_sem); | 4568 | up_write(&ei->i_data_sem); |
4621 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | 4569 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); |
4622 | ext4_mark_inode_dirty(handle, inode); | 4570 | ext4_mark_inode_dirty(handle, inode); |
@@ -4639,6 +4587,7 @@ out_stop: | |||
4639 | ext4_orphan_del(handle, inode); | 4587 | ext4_orphan_del(handle, inode); |
4640 | 4588 | ||
4641 | ext4_journal_stop(handle); | 4589 | ext4_journal_stop(handle); |
4590 | trace_ext4_truncate_exit(inode); | ||
4642 | } | 4591 | } |
4643 | 4592 | ||
4644 | /* | 4593 | /* |
@@ -4770,6 +4719,7 @@ make_io: | |||
4770 | * has in-inode xattrs, or we don't have this inode in memory. | 4719 | * has in-inode xattrs, or we don't have this inode in memory. |
4771 | * Read the block from disk. | 4720 | * Read the block from disk. |
4772 | */ | 4721 | */ |
4722 | trace_ext4_load_inode(inode); | ||
4773 | get_bh(bh); | 4723 | get_bh(bh); |
4774 | bh->b_end_io = end_buffer_read_sync; | 4724 | bh->b_end_io = end_buffer_read_sync; |
4775 | submit_bh(READ_META, bh); | 4725 | submit_bh(READ_META, bh); |
@@ -4875,7 +4825,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4875 | return inode; | 4825 | return inode; |
4876 | 4826 | ||
4877 | ei = EXT4_I(inode); | 4827 | ei = EXT4_I(inode); |
4878 | iloc.bh = 0; | 4828 | iloc.bh = NULL; |
4879 | 4829 | ||
4880 | ret = __ext4_get_inode_loc(inode, &iloc, 0); | 4830 | ret = __ext4_get_inode_loc(inode, &iloc, 0); |
4881 | if (ret < 0) | 4831 | if (ret < 0) |
@@ -5460,13 +5410,12 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, | |||
5460 | /* if nrblocks are contiguous */ | 5410 | /* if nrblocks are contiguous */ |
5461 | if (chunk) { | 5411 | if (chunk) { |
5462 | /* | 5412 | /* |
5463 | * With N contiguous data blocks, it need at most | 5413 | * With N contiguous data blocks, we need at most |
5464 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks | 5414 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, |
5465 | * 2 dindirect blocks | 5415 | * 2 dindirect blocks, and 1 tindirect block |
5466 | * 1 tindirect block | ||
5467 | */ | 5416 | */ |
5468 | indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); | 5417 | return DIV_ROUND_UP(nrblocks, |
5469 | return indirects + 3; | 5418 | EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; |
5470 | } | 5419 | } |
5471 | /* | 5420 | /* |
5472 | * if nrblocks are not contiguous, worst case, each block touch | 5421 |
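The rewritten contiguous-case bound collapses the old per-level list into DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(sb)) + 4, which per the comment above covers the indirect blocks the run spans plus the extra indirect, dindirect and tindirect slack. A quick numeric check, assuming 4 KiB blocks (1024 addresses per indirect block):

```c
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int addr_per_block = 1024;	/* assumed: 4 KiB block size */

	/* 3000 contiguous blocks: ceil(3000/1024) = 3 indirect blocks,
	 * plus 4 for the extra indirect/dindirect/tindirect => 7. */
	printf("%d\n", DIV_ROUND_UP(3000, addr_per_block) + 4);
	return 0;
}
```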
@@ -5540,7 +5489,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |||
5540 | } | 5489 | } |
5541 | 5490 | ||
5542 | /* | 5491 | /* |
5543 | * Calulate the total number of credits to reserve to fit | 5492 | * Calculate the total number of credits to reserve to fit |
5544 | * the modification of a single pages into a single transaction, | 5493 | * the modification of a single pages into a single transaction, |
5545 | * which may include multiple chunks of block allocations. | 5494 | * which may include multiple chunks of block allocations. |
5546 | * | 5495 | * |
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index eb3bc2fe647e..808c554e773f 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c | |||
@@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | |||
38 | unsigned int oldflags; | 38 | unsigned int oldflags; |
39 | unsigned int jflag; | 39 | unsigned int jflag; |
40 | 40 | ||
41 | if (!is_owner_or_cap(inode)) | 41 | if (!inode_owner_or_capable(inode)) |
42 | return -EACCES; | 42 | return -EACCES; |
43 | 43 | ||
44 | if (get_user(flags, (int __user *) arg)) | 44 | if (get_user(flags, (int __user *) arg)) |
@@ -146,7 +146,7 @@ flags_out: | |||
146 | __u32 generation; | 146 | __u32 generation; |
147 | int err; | 147 | int err; |
148 | 148 | ||
149 | if (!is_owner_or_cap(inode)) | 149 | if (!inode_owner_or_capable(inode)) |
150 | return -EPERM; | 150 | return -EPERM; |
151 | 151 | ||
152 | err = mnt_want_write(filp->f_path.mnt); | 152 | err = mnt_want_write(filp->f_path.mnt); |
@@ -298,7 +298,7 @@ mext_out: | |||
298 | case EXT4_IOC_MIGRATE: | 298 | case EXT4_IOC_MIGRATE: |
299 | { | 299 | { |
300 | int err; | 300 | int err; |
301 | if (!is_owner_or_cap(inode)) | 301 | if (!inode_owner_or_capable(inode)) |
302 | return -EACCES; | 302 | return -EACCES; |
303 | 303 | ||
304 | err = mnt_want_write(filp->f_path.mnt); | 304 | err = mnt_want_write(filp->f_path.mnt); |
@@ -320,7 +320,7 @@ mext_out: | |||
320 | case EXT4_IOC_ALLOC_DA_BLKS: | 320 | case EXT4_IOC_ALLOC_DA_BLKS: |
321 | { | 321 | { |
322 | int err; | 322 | int err; |
323 | if (!is_owner_or_cap(inode)) | 323 | if (!inode_owner_or_capable(inode)) |
324 | return -EACCES; | 324 | return -EACCES; |
325 | 325 | ||
326 | err = mnt_want_write(filp->f_path.mnt); | 326 | err = mnt_want_write(filp->f_path.mnt); |
@@ -334,16 +334,22 @@ mext_out: | |||
334 | case FITRIM: | 334 | case FITRIM: |
335 | { | 335 | { |
336 | struct super_block *sb = inode->i_sb; | 336 | struct super_block *sb = inode->i_sb; |
337 | struct request_queue *q = bdev_get_queue(sb->s_bdev); | ||
337 | struct fstrim_range range; | 338 | struct fstrim_range range; |
338 | int ret = 0; | 339 | int ret = 0; |
339 | 340 | ||
340 | if (!capable(CAP_SYS_ADMIN)) | 341 | if (!capable(CAP_SYS_ADMIN)) |
341 | return -EPERM; | 342 | return -EPERM; |
342 | 343 | ||
344 | if (!blk_queue_discard(q)) | ||
345 | return -EOPNOTSUPP; | ||
346 | |||
343 | if (copy_from_user(&range, (struct fstrim_range *)arg, | 347 | if (copy_from_user(&range, (struct fstrim_range *)arg, |
344 | sizeof(range))) | 348 | sizeof(range))) |
345 | return -EFAULT; | 349 | return -EFAULT; |
346 | 350 | ||
351 | range.minlen = max((unsigned int)range.minlen, | ||
352 | q->limits.discard_granularity); | ||
347 | ret = ext4_trim_fs(sb, &range); | 353 | ret = ext4_trim_fs(sb, &range); |
348 | if (ret < 0) | 354 | if (ret < 0) |
349 | return ret; | 355 | return ret; |
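With the two additions above, FITRIM now fails fast with EOPNOTSUPP on non-discard devices and silently raises too-small minlen values to the queue's discard granularity. For context, this is roughly how userspace drives the ioctl (a sketch of what fstrim(8) does, with error handling pared down; FITRIM and struct fstrim_range come from <linux/fs.h>):

```c
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <limits.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd = open(argc > 1 ? argv[1] : "/", O_RDONLY);

	if (fd < 0)
		return 1;
	memset(&range, 0, sizeof(range));
	range.len = ULLONG_MAX;		/* whole filesystem */
	range.minlen = 0;		/* kernel clamps to discard granularity */
	if (ioctl(fd, FITRIM, &range) < 0)
		perror("FITRIM");	/* EOPNOTSUPP if no discard support */
	else
		printf("trimmed %llu bytes\n",
		       (unsigned long long)range.len);
	close(fd);
	return 0;
}
```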
@@ -421,6 +427,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
421 | return err; | 427 | return err; |
422 | } | 428 | } |
423 | case EXT4_IOC_MOVE_EXT: | 429 | case EXT4_IOC_MOVE_EXT: |
430 | case FITRIM: | ||
424 | break; | 431 | break; |
425 | default: | 432 | default: |
426 | return -ENOIOCTLCMD; | 433 | return -ENOIOCTLCMD; |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d1fe09aea73d..d8a16eecf1d5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -92,7 +92,7 @@ | |||
92 | * between CPUs. It is possible to get scheduled at this point. | 92 | * between CPUs. It is possible to get scheduled at this point. |
93 | * | 93 | * |
94 | * The locality group prealloc space is used looking at whether we have | 94 | * The locality group prealloc space is used looking at whether we have |
95 | * enough free space (pa_free) withing the prealloc space. | 95 | * enough free space (pa_free) within the prealloc space. |
96 | * | 96 | * |
97 | * If we can't allocate blocks via inode prealloc or/and locality group | 97 | * If we can't allocate blocks via inode prealloc or/and locality group |
98 | * prealloc then we look at the buddy cache. The buddy cache is represented | 98 | * prealloc then we look at the buddy cache. The buddy cache is represented |
@@ -432,9 +432,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) | |||
432 | } | 432 | } |
433 | 433 | ||
434 | /* at order 0 we see each particular block */ | 434 | /* at order 0 we see each particular block */ |
435 | *max = 1 << (e4b->bd_blkbits + 3); | 435 | if (order == 0) { |
436 | if (order == 0) | 436 | *max = 1 << (e4b->bd_blkbits + 3); |
437 | return EXT4_MB_BITMAP(e4b); | 437 | return EXT4_MB_BITMAP(e4b); |
438 | } | ||
438 | 439 | ||
439 | bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; | 440 | bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; |
440 | *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; | 441 | *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; |
@@ -616,7 +617,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, | |||
616 | MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); | 617 | MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); |
617 | 618 | ||
618 | grp = ext4_get_group_info(sb, e4b->bd_group); | 619 | grp = ext4_get_group_info(sb, e4b->bd_group); |
619 | buddy = mb_find_buddy(e4b, 0, &max); | ||
620 | list_for_each(cur, &grp->bb_prealloc_list) { | 620 | list_for_each(cur, &grp->bb_prealloc_list) { |
621 | ext4_group_t groupnr; | 621 | ext4_group_t groupnr; |
622 | struct ext4_prealloc_space *pa; | 622 | struct ext4_prealloc_space *pa; |
@@ -635,7 +635,12 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, | |||
635 | #define mb_check_buddy(e4b) | 635 | #define mb_check_buddy(e4b) |
636 | #endif | 636 | #endif |
637 | 637 | ||
638 | /* FIXME!! need more doc */ | 638 | /* |
639 | * Divide the blocks starting at @first with length @len into | ||
640 | * smaller chunks of power-of-2 block counts. | ||
641 | * Clear the bits in the bitmap covered by the blocks of each chunk, | ||
642 | * then increase bb_counters[] for the corresponding chunk size. | ||
643 | */ | ||
639 | static void ext4_mb_mark_free_simple(struct super_block *sb, | 644 | static void ext4_mb_mark_free_simple(struct super_block *sb, |
640 | void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, | 645 | void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, |
641 | struct ext4_group_info *grp) | 646 | struct ext4_group_info *grp) |
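The new comment describes a buddy-style split: the free run [first, first + len) is carved greedily into power-of-two chunks, each aligned to its own size. Below is a standalone sketch of just that splitting rule; it is my simplification, and the real ext4_mb_mark_free_simple() additionally clears buddy bitmap bits and bumps bb_counters[order] per chunk.

```c
#include <stdio.h>

/* Greedy power-of-two split of [first, first + len), each chunk
 * aligned to its own size -- the decomposition the comment above
 * describes. This sketch only prints the chunks. */
static void mark_free_simple(unsigned int first, unsigned int len)
{
	while (len) {
		unsigned int chunk = 1;

		/* Largest power of two that fits and keeps alignment. */
		while (chunk * 2 <= len && (first & (chunk * 2 - 1)) == 0)
			chunk *= 2;
		printf("chunk at %u, size %u\n", first, chunk);
		first += chunk;
		len -= chunk;
	}
}

int main(void)
{
	mark_free_simple(5, 13);	/* -> 5/1, 6/2, 8/8, 16/2 */
	return 0;
}
```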
@@ -2381,7 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb) | |||
2381 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte | 2386 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte |
2382 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. | 2387 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. |
2383 | * So a two level scheme suffices for now. */ | 2388 | * So a two level scheme suffices for now. */ |
2384 | sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); | 2389 | sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); |
2385 | if (sbi->s_group_info == NULL) { | 2390 | if (sbi->s_group_info == NULL) { |
2386 | printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); | 2391 | printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); |
2387 | return -ENOMEM; | 2392 | return -ENOMEM; |
@@ -3208,7 +3213,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, | |||
3208 | cur_distance = abs(goal_block - cpa->pa_pstart); | 3213 | cur_distance = abs(goal_block - cpa->pa_pstart); |
3209 | new_distance = abs(goal_block - pa->pa_pstart); | 3214 | new_distance = abs(goal_block - pa->pa_pstart); |
3210 | 3215 | ||
3211 | if (cur_distance < new_distance) | 3216 | if (cur_distance <= new_distance) |
3212 | return cpa; | 3217 | return cpa; |
3213 | 3218 | ||
3214 | /* drop the previous reference */ | 3219 | /* drop the previous reference */ |
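Worked numbers for the `<` to `<=` change (hypothetical values): with goal_block = 100, a current candidate at pa_pstart = 90 and a newly examined preallocation at pa_pstart = 110, both distances are 10. Previously the tie fell through and the new preallocation replaced the candidate; with `<=`, the preallocation found first wins and the needless swap of references is avoided.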
@@ -3907,7 +3912,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | |||
3907 | struct super_block *sb = ac->ac_sb; | 3912 | struct super_block *sb = ac->ac_sb; |
3908 | ext4_group_t ngroups, i; | 3913 | ext4_group_t ngroups, i; |
3909 | 3914 | ||
3910 | if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) | 3915 | if (!mb_enable_debug || |
3916 | (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) | ||
3911 | return; | 3917 | return; |
3912 | 3918 | ||
3913 | printk(KERN_ERR "EXT4-fs: Can't allocate:" | 3919 | printk(KERN_ERR "EXT4-fs: Can't allocate:" |
@@ -4753,7 +4759,8 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count, | |||
4753 | * bitmap. Then issue a TRIM command on this extent and free the extent in | 4759 | * bitmap. Then issue a TRIM command on this extent and free the extent in |
4754 | * the group buddy bitmap. This is done until whole group is scanned. | 4760 | * the group buddy bitmap. This is done until whole group is scanned. |
4755 | */ | 4761 | */ |
4756 | ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, | 4762 | static ext4_grpblk_t |
4763 | ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, | ||
4757 | ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) | 4764 | ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) |
4758 | { | 4765 | { |
4759 | void *bitmap; | 4766 | void *bitmap; |
@@ -4863,10 +4870,15 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
4863 | break; | 4870 | break; |
4864 | } | 4871 | } |
4865 | 4872 | ||
4866 | if (len >= EXT4_BLOCKS_PER_GROUP(sb)) | 4873 | /* |
4867 | len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block); | 4874 | * For all the groups except the last one, last block will |
4868 | else | 4875 | * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to |
4876 | * change it for the last group in which case start + | ||
4877 | * len < EXT4_BLOCKS_PER_GROUP(sb). | ||
4878 | */ | ||
4879 | if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb)) | ||
4869 | last_block = first_block + len; | 4880 | last_block = first_block + len; |
4881 | len -= last_block - first_block; | ||
4870 | 4882 | ||
4871 | if (e4b.bd_info->bb_free >= minlen) { | 4883 | if (e4b.bd_info->bb_free >= minlen) { |
4872 | cnt = ext4_trim_all_free(sb, &e4b, first_block, | 4884 | cnt = ext4_trim_all_free(sb, &e4b, first_block, |
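The replacement bookkeeping keeps a running len and clamps last_block only in the final group, since every earlier group is trimmed all the way to EXT4_BLOCKS_PER_GROUP(sb). A compact model of that walk (per_group and the start/len values are made up; the kernel works in group-relative block units):

```c
#include <stdio.h>

int main(void)
{
	unsigned long per_group = 32768;	/* assumed group size */
	unsigned long first_block = 1000, len = 70000, last_block;

	for (int group = 0; len > 0; group++) {
		last_block = per_group;
		/* Only the last group ends short of the group boundary. */
		if (first_block + len < per_group)
			last_block = first_block + len;
		len -= last_block - first_block;
		printf("group %d: trim [%lu, %lu)\n",
		       group, first_block, last_block);
		first_block = 0;	/* later groups start at block 0 */
	}
	return 0;
}
```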
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b619322c76f0..22bd4d7f289b 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h | |||
@@ -169,7 +169,7 @@ struct ext4_allocation_context { | |||
169 | /* original request */ | 169 | /* original request */ |
170 | struct ext4_free_extent ac_o_ex; | 170 | struct ext4_free_extent ac_o_ex; |
171 | 171 | ||
172 | /* goal request (after normalization) */ | 172 | /* goal request (normalized ac_o_ex) */ |
173 | struct ext4_free_extent ac_g_ex; | 173 | struct ext4_free_extent ac_g_ex; |
174 | 174 | ||
175 | /* the best found extent */ | 175 | /* the best found extent */ |
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b0a126f23c20..92816b4e0f16 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c | |||
@@ -263,7 +263,7 @@ static int free_dind_blocks(handle_t *handle, | |||
263 | for (i = 0; i < max_entries; i++) { | 263 | for (i = 0; i < max_entries; i++) { |
264 | if (tmp_idata[i]) { | 264 | if (tmp_idata[i]) { |
265 | extend_credit_for_blkdel(handle, inode); | 265 | extend_credit_for_blkdel(handle, inode); |
266 | ext4_free_blocks(handle, inode, 0, | 266 | ext4_free_blocks(handle, inode, NULL, |
267 | le32_to_cpu(tmp_idata[i]), 1, | 267 | le32_to_cpu(tmp_idata[i]), 1, |
268 | EXT4_FREE_BLOCKS_METADATA | | 268 | EXT4_FREE_BLOCKS_METADATA | |
269 | EXT4_FREE_BLOCKS_FORGET); | 269 | EXT4_FREE_BLOCKS_FORGET); |
@@ -271,7 +271,7 @@ static int free_dind_blocks(handle_t *handle, | |||
271 | } | 271 | } |
272 | put_bh(bh); | 272 | put_bh(bh); |
273 | extend_credit_for_blkdel(handle, inode); | 273 | extend_credit_for_blkdel(handle, inode); |
274 | ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, | 274 | ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, |
275 | EXT4_FREE_BLOCKS_METADATA | | 275 | EXT4_FREE_BLOCKS_METADATA | |
276 | EXT4_FREE_BLOCKS_FORGET); | 276 | EXT4_FREE_BLOCKS_FORGET); |
277 | return 0; | 277 | return 0; |
@@ -302,7 +302,7 @@ static int free_tind_blocks(handle_t *handle, | |||
302 | } | 302 | } |
303 | put_bh(bh); | 303 | put_bh(bh); |
304 | extend_credit_for_blkdel(handle, inode); | 304 | extend_credit_for_blkdel(handle, inode); |
305 | ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, | 305 | ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, |
306 | EXT4_FREE_BLOCKS_METADATA | | 306 | EXT4_FREE_BLOCKS_METADATA | |
307 | EXT4_FREE_BLOCKS_FORGET); | 307 | EXT4_FREE_BLOCKS_FORGET); |
308 | return 0; | 308 | return 0; |
@@ -315,7 +315,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) | |||
315 | /* ei->i_data[EXT4_IND_BLOCK] */ | 315 | /* ei->i_data[EXT4_IND_BLOCK] */ |
316 | if (i_data[0]) { | 316 | if (i_data[0]) { |
317 | extend_credit_for_blkdel(handle, inode); | 317 | extend_credit_for_blkdel(handle, inode); |
318 | ext4_free_blocks(handle, inode, 0, | 318 | ext4_free_blocks(handle, inode, NULL, |
319 | le32_to_cpu(i_data[0]), 1, | 319 | le32_to_cpu(i_data[0]), 1, |
320 | EXT4_FREE_BLOCKS_METADATA | | 320 | EXT4_FREE_BLOCKS_METADATA | |
321 | EXT4_FREE_BLOCKS_FORGET); | 321 | EXT4_FREE_BLOCKS_FORGET); |
@@ -428,7 +428,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, | |||
428 | } | 428 | } |
429 | put_bh(bh); | 429 | put_bh(bh); |
430 | extend_credit_for_blkdel(handle, inode); | 430 | extend_credit_for_blkdel(handle, inode); |
431 | ext4_free_blocks(handle, inode, 0, block, 1, | 431 | ext4_free_blocks(handle, inode, NULL, block, 1, |
432 | EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); | 432 | EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); |
433 | return retval; | 433 | return retval; |
434 | } | 434 | } |
@@ -517,7 +517,7 @@ int ext4_ext_migrate(struct inode *inode) | |||
517 | * start with one credit accounted for | 517 | * start with one credit accounted for |
518 | * superblock modification. | 518 | * superblock modification. |
519 | * | 519 | * |
520 | * For the tmp_inode we already have commited the | 520 | * For the tmp_inode we already have committed the |
521 | * transaction that created the inode. Later as and | 521 |
522 | * when we add extents we extend the journal | 522 |
523 | */ | 523 | */ |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e781b7ea5630..67fd0b025858 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include "xattr.h" | 40 | #include "xattr.h" |
41 | #include "acl.h" | 41 | #include "acl.h" |
42 | 42 | ||
43 | #include <trace/events/ext4.h> | ||
43 | /* | 44 | /* |
44 | * define how far ahead to read directories while searching them. | 45 | * define how far ahead to read directories while searching them. |
45 | */ | 46 | */ |
@@ -2183,6 +2184,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) | |||
2183 | struct ext4_dir_entry_2 *de; | 2184 | struct ext4_dir_entry_2 *de; |
2184 | handle_t *handle; | 2185 | handle_t *handle; |
2185 | 2186 | ||
2187 | trace_ext4_unlink_enter(dir, dentry); | ||
2186 | /* Initialize quotas before so that eventual writes go | 2188 | /* Initialize quotas before so that eventual writes go |
2187 | * in separate transaction */ | 2189 | * in separate transaction */ |
2188 | dquot_initialize(dir); | 2190 | dquot_initialize(dir); |
@@ -2228,6 +2230,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) | |||
2228 | end_unlink: | 2230 | end_unlink: |
2229 | ext4_journal_stop(handle); | 2231 | ext4_journal_stop(handle); |
2230 | brelse(bh); | 2232 | brelse(bh); |
2233 | trace_ext4_unlink_exit(dentry, retval); | ||
2231 | return retval; | 2234 | return retval; |
2232 | } | 2235 | } |
2233 | 2236 | ||
@@ -2402,6 +2405,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
2402 | if (!new_inode && new_dir != old_dir && | 2405 | if (!new_inode && new_dir != old_dir && |
2403 | EXT4_DIR_LINK_MAX(new_dir)) | 2406 | EXT4_DIR_LINK_MAX(new_dir)) |
2404 | goto end_rename; | 2407 | goto end_rename; |
2408 | BUFFER_TRACE(dir_bh, "get_write_access"); | ||
2409 | retval = ext4_journal_get_write_access(handle, dir_bh); | ||
2410 | if (retval) | ||
2411 | goto end_rename; | ||
2405 | } | 2412 | } |
2406 | if (!new_bh) { | 2413 | if (!new_bh) { |
2407 | retval = ext4_add_entry(handle, new_dentry, old_inode); | 2414 | retval = ext4_add_entry(handle, new_dentry, old_inode); |
@@ -2409,7 +2416,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
2409 | goto end_rename; | 2416 | goto end_rename; |
2410 | } else { | 2417 | } else { |
2411 | BUFFER_TRACE(new_bh, "get write access"); | 2418 | BUFFER_TRACE(new_bh, "get write access"); |
2412 | ext4_journal_get_write_access(handle, new_bh); | 2419 | retval = ext4_journal_get_write_access(handle, new_bh); |
2420 | if (retval) | ||
2421 | goto end_rename; | ||
2413 | new_de->inode = cpu_to_le32(old_inode->i_ino); | 2422 | new_de->inode = cpu_to_le32(old_inode->i_ino); |
2414 | if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, | 2423 | if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, |
2415 | EXT4_FEATURE_INCOMPAT_FILETYPE)) | 2424 | EXT4_FEATURE_INCOMPAT_FILETYPE)) |
@@ -2470,8 +2479,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
2470 | old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); | 2479 | old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); |
2471 | ext4_update_dx_flag(old_dir); | 2480 | ext4_update_dx_flag(old_dir); |
2472 | if (dir_bh) { | 2481 | if (dir_bh) { |
2473 | BUFFER_TRACE(dir_bh, "get_write_access"); | ||
2474 | ext4_journal_get_write_access(handle, dir_bh); | ||
2475 | PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = | 2482 | PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = |
2476 | cpu_to_le32(new_dir->i_ino); | 2483 | cpu_to_le32(new_dir->i_ino); |
2477 | BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); | 2484 | BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); |
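The rename fix above moves ext4_journal_get_write_access() to before the parent-pointer update and actually checks its result, restoring the canonical jbd2 ordering: declare write access on a buffer, then modify it, then mark it dirty in the handle. Here is a compilable skeleton of that pattern; all types and helpers are stubbed, standing in for handle_t, struct buffer_head, ext4_journal_get_write_access() and ext4_handle_dirty_metadata().

```c
#include <stdio.h>

typedef struct { int aborted; } handle_t;	/* stub */
struct buffer_head { char data[64]; };		/* stub */

static int get_write_access(handle_t *h, struct buffer_head *bh)
{
	(void)bh;
	return h->aborted ? -5 /* -EIO */ : 0;	/* stub */
}

static int dirty_metadata(handle_t *h, struct buffer_head *bh)
{
	(void)h; (void)bh;
	return 0;				/* stub */
}

static int update_parent(handle_t *h, struct buffer_head *dir_bh)
{
	int err = get_write_access(h, dir_bh);	/* 1: declare intent */

	if (err)
		return err;			/* bail before touching */
	dir_bh->data[0] = 1;			/* 2: modify the buffer */
	return dirty_metadata(h, dir_bh);	/* 3: mark dirty in handle */
}

int main(void)
{
	handle_t h = { 0 };
	struct buffer_head bh = { { 0 } };

	return update_parent(&h, &bh);
}
```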
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 955cc309142f..b6dbd056fcb1 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -259,6 +259,11 @@ static void ext4_end_bio(struct bio *bio, int error) | |||
259 | bi_sector >> (inode->i_blkbits - 9)); | 259 | bi_sector >> (inode->i_blkbits - 9)); |
260 | } | 260 | } |
261 | 261 | ||
262 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | ||
263 | ext4_free_io_end(io_end); | ||
264 | return; | ||
265 | } | ||
266 | |||
262 | /* Add the io_end to per-inode completed io list*/ | 267 | /* Add the io_end to per-inode completed io list*/ |
263 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); | 268 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); |
264 | list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); | 269 | list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); |
@@ -279,9 +284,9 @@ void ext4_io_submit(struct ext4_io_submit *io) | |||
279 | BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); | 284 | BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); |
280 | bio_put(io->io_bio); | 285 | bio_put(io->io_bio); |
281 | } | 286 | } |
282 | io->io_bio = 0; | 287 | io->io_bio = NULL; |
283 | io->io_op = 0; | 288 | io->io_op = 0; |
284 | io->io_end = 0; | 289 | io->io_end = NULL; |
285 | } | 290 | } |
286 | 291 | ||
287 | static int io_submit_init(struct ext4_io_submit *io, | 292 | static int io_submit_init(struct ext4_io_submit *io, |
@@ -310,8 +315,7 @@ static int io_submit_init(struct ext4_io_submit *io, | |||
310 | io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); | 315 | io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); |
311 | 316 | ||
312 | io->io_bio = bio; | 317 | io->io_bio = bio; |
313 | io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? | 318 | io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); |
314 | WRITE_SYNC_PLUG : WRITE); | ||
315 | io->io_next_block = bh->b_blocknr; | 319 | io->io_next_block = bh->b_blocknr; |
316 | return 0; | 320 | return 0; |
317 | } | 321 | } |
@@ -381,8 +385,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
381 | 385 | ||
382 | BUG_ON(!PageLocked(page)); | 386 | BUG_ON(!PageLocked(page)); |
383 | BUG_ON(PageWriteback(page)); | 387 | BUG_ON(PageWriteback(page)); |
384 | set_page_writeback(page); | ||
385 | ClearPageError(page); | ||
386 | 388 | ||
387 | io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); | 389 | io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); |
388 | if (!io_page) { | 390 | if (!io_page) { |
@@ -393,6 +395,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
393 | io_page->p_page = page; | 395 | io_page->p_page = page; |
394 | atomic_set(&io_page->p_count, 1); | 396 | atomic_set(&io_page->p_count, 1); |
395 | get_page(page); | 397 | get_page(page); |
398 | set_page_writeback(page); | ||
399 | ClearPageError(page); | ||
396 | 400 | ||
397 | for (bh = head = page_buffers(page), block_start = 0; | 401 | for (bh = head = page_buffers(page), block_start = 0; |
398 | bh != head || !block_start; | 402 | bh != head || !block_start; |
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3ecc6e45d2f9..80bbc9c60c24 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
@@ -230,7 +230,7 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
230 | } | 230 | } |
231 | 231 | ||
232 | /* Zero out all of the reserved backup group descriptor table blocks */ | 232 | /* Zero out all of the reserved backup group descriptor table blocks */ |
233 | ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", | 233 | ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", |
234 | block, sbi->s_itb_per_group); | 234 | block, sbi->s_itb_per_group); |
235 | err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, | 235 | err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, |
236 | GFP_NOFS); | 236 | GFP_NOFS); |
@@ -248,7 +248,7 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
248 | 248 | ||
249 | /* Zero out all of the inode table blocks */ | 249 | /* Zero out all of the inode table blocks */ |
250 | block = input->inode_table; | 250 | block = input->inode_table; |
251 | ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", | 251 | ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", |
252 | block, sbi->s_itb_per_group); | 252 | block, sbi->s_itb_per_group); |
253 | err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); | 253 | err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); |
254 | if (err) | 254 | if (err) |
@@ -499,12 +499,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
499 | return err; | 499 | return err; |
500 | 500 | ||
501 | exit_inode: | 501 | exit_inode: |
502 | /* ext4_journal_release_buffer(handle, iloc.bh); */ | 502 | /* ext4_handle_release_buffer(handle, iloc.bh); */ |
503 | brelse(iloc.bh); | 503 | brelse(iloc.bh); |
504 | exit_dindj: | 504 | exit_dindj: |
505 | /* ext4_journal_release_buffer(handle, dind); */ | 505 | /* ext4_handle_release_buffer(handle, dind); */ |
506 | exit_sbh: | 506 | exit_sbh: |
507 | /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ | 507 | /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ |
508 | exit_dind: | 508 | exit_dind: |
509 | brelse(dind); | 509 | brelse(dind); |
510 | exit_bh: | 510 | exit_bh: |
@@ -586,7 +586,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, | |||
586 | /* | 586 | /* |
587 | int j; | 587 | int j; |
588 | for (j = 0; j < i; j++) | 588 | for (j = 0; j < i; j++) |
589 | ext4_journal_release_buffer(handle, primary[j]); | 589 | ext4_handle_release_buffer(handle, primary[j]); |
590 | */ | 590 | */ |
591 | goto exit_bh; | 591 | goto exit_bh; |
592 | } | 592 | } |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 203f9e4a70be..8553dfb310af 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -54,9 +54,9 @@ | |||
54 | 54 | ||
55 | static struct proc_dir_entry *ext4_proc_root; | 55 | static struct proc_dir_entry *ext4_proc_root; |
56 | static struct kset *ext4_kset; | 56 | static struct kset *ext4_kset; |
57 | struct ext4_lazy_init *ext4_li_info; | 57 | static struct ext4_lazy_init *ext4_li_info; |
58 | struct mutex ext4_li_mtx; | 58 | static struct mutex ext4_li_mtx; |
59 | struct ext4_features *ext4_feat; | 59 | static struct ext4_features *ext4_feat; |
60 | 60 | ||
61 | static int ext4_load_journal(struct super_block *, struct ext4_super_block *, | 61 | static int ext4_load_journal(struct super_block *, struct ext4_super_block *, |
62 | unsigned long journal_devnum); | 62 | unsigned long journal_devnum); |
@@ -75,6 +75,7 @@ static void ext4_write_super(struct super_block *sb); | |||
75 | static int ext4_freeze(struct super_block *sb); | 75 | static int ext4_freeze(struct super_block *sb); |
76 | static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, | 76 | static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, |
77 | const char *dev_name, void *data); | 77 | const char *dev_name, void *data); |
78 | static int ext4_feature_set_ok(struct super_block *sb, int readonly); | ||
78 | static void ext4_destroy_lazyinit_thread(void); | 79 | static void ext4_destroy_lazyinit_thread(void); |
79 | static void ext4_unregister_li_request(struct super_block *sb); | 80 | static void ext4_unregister_li_request(struct super_block *sb); |
80 | static void ext4_clear_request_list(void); | 81 | static void ext4_clear_request_list(void); |
@@ -241,27 +242,44 @@ static void ext4_put_nojournal(handle_t *handle) | |||
241 | * journal_end calls result in the superblock being marked dirty, so | 242 | * journal_end calls result in the superblock being marked dirty, so |
242 | * that sync() will call the filesystem's write_super callback if | 243 | * that sync() will call the filesystem's write_super callback if |
243 | * appropriate. | 244 | * appropriate. |
245 | * | ||
246 | * To avoid the j_barrier being held across a userspace freeze(), | ||
247 | * ext4 prevents a new handle from being started via s_frozen, which | ||
248 | * is maintained in an upper layer. | ||
244 | */ | 249 | */ |
245 | handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) | 250 | handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) |
246 | { | 251 | { |
247 | journal_t *journal; | 252 | journal_t *journal; |
253 | handle_t *handle; | ||
248 | 254 | ||
249 | if (sb->s_flags & MS_RDONLY) | 255 | if (sb->s_flags & MS_RDONLY) |
250 | return ERR_PTR(-EROFS); | 256 | return ERR_PTR(-EROFS); |
251 | 257 | ||
252 | vfs_check_frozen(sb, SB_FREEZE_TRANS); | ||
253 | /* Special case here: if the journal has aborted behind our | ||
254 | * backs (eg. EIO in the commit thread), then we still need to | ||
255 | * take the FS itself readonly cleanly. */ | ||
256 | journal = EXT4_SB(sb)->s_journal; | 258 | journal = EXT4_SB(sb)->s_journal; |
257 | if (journal) { | 259 | handle = ext4_journal_current_handle(); |
258 | if (is_journal_aborted(journal)) { | 260 | |
259 | ext4_abort(sb, "Detected aborted journal"); | 261 | /* |
260 | return ERR_PTR(-EROFS); | 262 | * If a handle has been started, it should be allowed to |
261 | } | 263 | * finish, otherwise deadlock could happen between freeze |
262 | return jbd2_journal_start(journal, nblocks); | 264 | * and others (e.g. truncate) due to the restart of the |
265 | * journal handle if the filesystem is frozen and active | ||
266 | * handles are not stopped. | ||
267 | */ | ||
268 | if (!handle) | ||
269 | vfs_check_frozen(sb, SB_FREEZE_TRANS); | ||
270 | |||
271 | if (!journal) | ||
272 | return ext4_get_nojournal(); | ||
273 | /* | ||
274 | * Special case here: if the journal has aborted behind our | ||
275 | * backs (eg. EIO in the commit thread), then we still need to | ||
276 | * take the FS itself readonly cleanly. | ||
277 | */ | ||
278 | if (is_journal_aborted(journal)) { | ||
279 | ext4_abort(sb, "Detected aborted journal"); | ||
280 | return ERR_PTR(-EROFS); | ||
263 | } | 281 | } |
264 | return ext4_get_nojournal(); | 282 | return jbd2_journal_start(journal, nblocks); |
265 | } | 283 | } |
266 | 284 | ||
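Condensed, the reworked ext4_journal_start_sb() decision order is: block on a frozen filesystem only when no handle is already running (so a restarting handle can finish and freeze cannot deadlock against truncate), fall back to the nojournal path, refuse an aborted journal, and otherwise start a handle. A small model of just that ordering; the enum and parameters are invented stand-ins for ext4_journal_current_handle(), vfs_check_frozen() and friends.

```c
#include <stdio.h>

enum outcome { WAIT_FOR_THAW, NOJOURNAL, ABORTED, START };

static enum outcome journal_start(int have_handle, int frozen,
				  int have_journal, int aborted)
{
	if (!have_handle && frozen)
		return WAIT_FOR_THAW;	/* only new work blocks on freeze */
	if (!have_journal)
		return NOJOURNAL;	/* ext4_get_nojournal() path */
	if (aborted)
		return ABORTED;		/* -EROFS after ext4_abort() */
	return START;			/* jbd2_journal_start() */
}

int main(void)
{
	/* A restarting handle on a frozen fs no longer deadlocks: */
	printf("%d\n", journal_start(1, 1, 1, 0) == START);
	return 0;
}
```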
267 | /* | 285 | /* |
@@ -594,7 +612,7 @@ __acquires(bitlock) | |||
594 | 612 | ||
595 | vaf.fmt = fmt; | 613 | vaf.fmt = fmt; |
596 | vaf.va = &args; | 614 | vaf.va = &args; |
597 | printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", | 615 | printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", |
598 | sb->s_id, function, line, grp); | 616 | sb->s_id, function, line, grp); |
599 | if (ino) | 617 | if (ino) |
600 | printk(KERN_CONT "inode %lu: ", ino); | 618 | printk(KERN_CONT "inode %lu: ", ino); |
@@ -616,7 +634,7 @@ __acquires(bitlock) | |||
616 | * filesystem will have already been marked read/only and the | 634 | * filesystem will have already been marked read/only and the |
617 | * journal has been aborted. We return 1 as a hint to callers | 635 | * journal has been aborted. We return 1 as a hint to callers |
618 | * who might what to use the return value from | 636 | * who might what to use the return value from |
619 | * ext4_grp_locked_error() to distinguish beween the | 637 | * ext4_grp_locked_error() to distinguish between the |
620 | * ERRORS_CONT and ERRORS_RO case, and perhaps return more | 638 | * ERRORS_CONT and ERRORS_RO case, and perhaps return more |
621 | * aggressively from the ext4 function in question, with a | 639 | * aggressively from the ext4 function in question, with a |
622 | * more appropriate error code. | 640 | * more appropriate error code. |
@@ -997,13 +1015,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
997 | if (test_opt(sb, OLDALLOC)) | 1015 | if (test_opt(sb, OLDALLOC)) |
998 | seq_puts(seq, ",oldalloc"); | 1016 | seq_puts(seq, ",oldalloc"); |
999 | #ifdef CONFIG_EXT4_FS_XATTR | 1017 | #ifdef CONFIG_EXT4_FS_XATTR |
1000 | if (test_opt(sb, XATTR_USER) && | 1018 | if (test_opt(sb, XATTR_USER)) |
1001 | !(def_mount_opts & EXT4_DEFM_XATTR_USER)) | ||
1002 | seq_puts(seq, ",user_xattr"); | 1019 | seq_puts(seq, ",user_xattr"); |
1003 | if (!test_opt(sb, XATTR_USER) && | 1020 | if (!test_opt(sb, XATTR_USER)) |
1004 | (def_mount_opts & EXT4_DEFM_XATTR_USER)) { | ||
1005 | seq_puts(seq, ",nouser_xattr"); | 1021 | seq_puts(seq, ",nouser_xattr"); |
1006 | } | ||
1007 | #endif | 1022 | #endif |
1008 | #ifdef CONFIG_EXT4_FS_POSIX_ACL | 1023 | #ifdef CONFIG_EXT4_FS_POSIX_ACL |
1009 | if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) | 1024 | if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) |
@@ -1041,8 +1056,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
1041 | !(def_mount_opts & EXT4_DEFM_NODELALLOC)) | 1056 | !(def_mount_opts & EXT4_DEFM_NODELALLOC)) |
1042 | seq_puts(seq, ",nodelalloc"); | 1057 | seq_puts(seq, ",nodelalloc"); |
1043 | 1058 | ||
1044 | if (test_opt(sb, MBLK_IO_SUBMIT)) | 1059 | if (!test_opt(sb, MBLK_IO_SUBMIT)) |
1045 | seq_puts(seq, ",mblk_io_submit"); | 1060 | seq_puts(seq, ",nomblk_io_submit"); |
1046 | if (sbi->s_stripe) | 1061 | if (sbi->s_stripe) |
1047 | seq_printf(seq, ",stripe=%lu", sbi->s_stripe); | 1062 | seq_printf(seq, ",stripe=%lu", sbi->s_stripe); |
1048 | /* | 1063 | /* |
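Both show_options changes above pair with the new defaults set later in ext4_fill_super(): user_xattr and mblk_io_submit become default-on, so the mount-option dump now reports whichever spelling describes the current state rather than comparing against the on-disk defaults. A small sketch of that reporting pattern (the flag names and the opts word are made up for illustration):

    #include <stdio.h>

    #define OPT_XATTR_USER 0x1
    #define OPT_MBLK_IO    0x2

    /* Emit the spelling that matches the live state; for a default-on
     * flag like mblk_io_submit, only the negative form is interesting. */
    static void show_options(FILE *seq, unsigned int opts)
    {
            fputs((opts & OPT_XATTR_USER) ? ",user_xattr" : ",nouser_xattr", seq);
            if (!(opts & OPT_MBLK_IO))
                    fputs(",nomblk_io_submit", seq);
    }

    int main(void)
    {
            show_options(stdout, OPT_XATTR_USER);  /* -> ",user_xattr,nomblk_io_submit" */
            return 0;
    }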
@@ -1451,7 +1466,7 @@ static int parse_options(char *options, struct super_block *sb, | |||
1451 | * Initialize args struct so we know whether arg was | 1466 | * Initialize args struct so we know whether arg was |
1452 | * found; some options take optional arguments. | 1467 | * found; some options take optional arguments. |
1453 | */ | 1468 | */ |
1454 | args[0].to = args[0].from = 0; | 1469 | args[0].to = args[0].from = NULL; |
1455 | token = match_token(p, tokens, args); | 1470 | token = match_token(p, tokens, args); |
1456 | switch (token) { | 1471 | switch (token) { |
1457 | case Opt_bsd_df: | 1472 | case Opt_bsd_df: |
@@ -1771,7 +1786,7 @@ set_qf_format: | |||
1771 | return 0; | 1786 | return 0; |
1772 | if (option < 0 || option > (1 << 30)) | 1787 | if (option < 0 || option > (1 << 30)) |
1773 | return 0; | 1788 | return 0; |
1774 | if (!is_power_of_2(option)) { | 1789 | if (option && !is_power_of_2(option)) { |
1775 | ext4_msg(sb, KERN_ERR, | 1790 | ext4_msg(sb, KERN_ERR, |
1776 | "EXT4-fs: inode_readahead_blks" | 1791 | "EXT4-fs: inode_readahead_blks" |
1777 | " must be a power of 2"); | 1792 | " must be a power of 2"); |
@@ -2120,6 +2135,13 @@ static void ext4_orphan_cleanup(struct super_block *sb, | |||
2120 | return; | 2135 | return; |
2121 | } | 2136 | } |
2122 | 2137 | ||
2138 | /* Check if feature set would not allow a r/w mount */ | ||
2139 | if (!ext4_feature_set_ok(sb, 0)) { | ||
2140 | ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " | ||
2141 | "unknown ROCOMPAT features"); | ||
2142 | return; | ||
2143 | } | ||
2144 | |||
2123 | if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { | 2145 | if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { |
2124 | if (es->s_last_orphan) | 2146 | if (es->s_last_orphan) |
2125 | jbd_debug(1, "Errors on filesystem, " | 2147 | jbd_debug(1, "Errors on filesystem, " |
@@ -2412,7 +2434,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, | |||
2412 | if (parse_strtoul(buf, 0x40000000, &t)) | 2434 | if (parse_strtoul(buf, 0x40000000, &t)) |
2413 | return -EINVAL; | 2435 | return -EINVAL; |
2414 | 2436 | ||
2415 | if (!is_power_of_2(t)) | 2437 | if (t && !is_power_of_2(t)) |
2416 | return -EINVAL; | 2438 | return -EINVAL; |
2417 | 2439 | ||
2418 | sbi->s_inode_readahead_blks = t; | 2440 | sbi->s_inode_readahead_blks = t; |
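This sysfs store and the mount-option parser earlier relax the same validation: inode_readahead_blks may now be zero (readahead disabled), while any non-zero value must still be a power of two. A standalone sketch of the combined predicate:

    #include <stdbool.h>

    /* Mirrors the kernel's is_power_of_2(): zero is not a power of two,
     * which is exactly why the extra "option &&" guard is needed. */
    static bool is_power_of_2(unsigned long n)
    {
            return n != 0 && (n & (n - 1)) == 0;
    }

    /* Accept 0 (readahead off) or any power of two up to 2^30. */
    static bool readahead_blks_valid(long option)
    {
            if (option < 0 || option > (1L << 30))
                    return false;
            return option == 0 || is_power_of_2((unsigned long)option);
    }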
@@ -2970,6 +2992,12 @@ static int ext4_register_li_request(struct super_block *sb, | |||
2970 | mutex_unlock(&ext4_li_info->li_list_mtx); | 2992 | mutex_unlock(&ext4_li_info->li_list_mtx); |
2971 | 2993 | ||
2972 | sbi->s_li_request = elr; | 2994 | sbi->s_li_request = elr; |
2995 | /* | ||
2996 | * Set elr to NULL here since it has been inserted into | ||
2997 | * the request_list; its removal and freeing are | ||
2998 | * handled by ext4_clear_request_list from now on. | ||
2999 | */ | ||
3000 | elr = NULL; | ||
2973 | 3001 | ||
2974 | if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { | 3002 | if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { |
2975 | ret = ext4_run_lazyinit_thread(); | 3003 | ret = ext4_run_lazyinit_thread(); |
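Nulling elr immediately after the list insertion is an ownership hand-off: the request list (emptied by ext4_clear_request_list) owns the allocation from then on, so a later error path that frees elr unconditionally just frees NULL. A generic sketch of the idiom, with made-up names:

    #include <stdlib.h>

    struct request { struct request *next; };

    static struct request *request_list;

    static int start_worker(void) { return 0; }  /* stand-in; may fail */

    static int register_request(void)
    {
            struct request *elr = calloc(1, sizeof(*elr));
            int err;

            if (!elr)
                    return -1;

            elr->next = request_list;  /* the list takes ownership here... */
            request_list = elr;
            elr = NULL;                /* ...so drop the local alias */

            err = start_worker();      /* can fail after the insertion */
            free(elr);                 /* free(NULL) is a no-op; no double free */
            return err;
    }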
@@ -3095,14 +3123,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3095 | } | 3123 | } |
3096 | if (def_mount_opts & EXT4_DEFM_UID16) | 3124 | if (def_mount_opts & EXT4_DEFM_UID16) |
3097 | set_opt(sb, NO_UID32); | 3125 | set_opt(sb, NO_UID32); |
3126 | /* xattr user namespace & acls are now defaulted on */ | ||
3098 | #ifdef CONFIG_EXT4_FS_XATTR | 3127 | #ifdef CONFIG_EXT4_FS_XATTR |
3099 | if (def_mount_opts & EXT4_DEFM_XATTR_USER) | 3128 | set_opt(sb, XATTR_USER); |
3100 | set_opt(sb, XATTR_USER); | ||
3101 | #endif | 3129 | #endif |
3102 | #ifdef CONFIG_EXT4_FS_POSIX_ACL | 3130 | #ifdef CONFIG_EXT4_FS_POSIX_ACL |
3103 | if (def_mount_opts & EXT4_DEFM_ACL) | 3131 | set_opt(sb, POSIX_ACL); |
3104 | set_opt(sb, POSIX_ACL); | ||
3105 | #endif | 3132 | #endif |
3133 | set_opt(sb, MBLK_IO_SUBMIT); | ||
3106 | if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) | 3134 | if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) |
3107 | set_opt(sb, JOURNAL_DATA); | 3135 | set_opt(sb, JOURNAL_DATA); |
3108 | else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) | 3136 | else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) |
@@ -3380,6 +3408,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3380 | get_random_bytes(&sbi->s_next_generation, sizeof(u32)); | 3408 | get_random_bytes(&sbi->s_next_generation, sizeof(u32)); |
3381 | spin_lock_init(&sbi->s_next_gen_lock); | 3409 | spin_lock_init(&sbi->s_next_gen_lock); |
3382 | 3410 | ||
3411 | init_timer(&sbi->s_err_report); | ||
3412 | sbi->s_err_report.function = print_daily_error_info; | ||
3413 | sbi->s_err_report.data = (unsigned long) sb; | ||
3414 | |||
3383 | err = percpu_counter_init(&sbi->s_freeblocks_counter, | 3415 | err = percpu_counter_init(&sbi->s_freeblocks_counter, |
3384 | ext4_count_free_blocks(sb)); | 3416 | ext4_count_free_blocks(sb)); |
3385 | if (!err) { | 3417 | if (!err) { |
@@ -3516,7 +3548,7 @@ no_journal: | |||
3516 | * concurrency isn't really necessary. Limit it to 1. | 3548 | * concurrency isn't really necessary. Limit it to 1. |
3517 | */ | 3549 | */ |
3518 | EXT4_SB(sb)->dio_unwritten_wq = | 3550 | EXT4_SB(sb)->dio_unwritten_wq = |
3519 | alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM, 1); | 3551 | alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); |
3520 | if (!EXT4_SB(sb)->dio_unwritten_wq) { | 3552 | if (!EXT4_SB(sb)->dio_unwritten_wq) { |
3521 | printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); | 3553 | printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); |
3522 | goto failed_mount_wq; | 3554 | goto failed_mount_wq; |
@@ -3531,17 +3563,16 @@ no_journal: | |||
3531 | if (IS_ERR(root)) { | 3563 | if (IS_ERR(root)) { |
3532 | ext4_msg(sb, KERN_ERR, "get root inode failed"); | 3564 | ext4_msg(sb, KERN_ERR, "get root inode failed"); |
3533 | ret = PTR_ERR(root); | 3565 | ret = PTR_ERR(root); |
3566 | root = NULL; | ||
3534 | goto failed_mount4; | 3567 | goto failed_mount4; |
3535 | } | 3568 | } |
3536 | if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { | 3569 | if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { |
3537 | iput(root); | ||
3538 | ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); | 3570 | ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); |
3539 | goto failed_mount4; | 3571 | goto failed_mount4; |
3540 | } | 3572 | } |
3541 | sb->s_root = d_alloc_root(root); | 3573 | sb->s_root = d_alloc_root(root); |
3542 | if (!sb->s_root) { | 3574 | if (!sb->s_root) { |
3543 | ext4_msg(sb, KERN_ERR, "get root dentry failed"); | 3575 | ext4_msg(sb, KERN_ERR, "get root dentry failed"); |
3544 | iput(root); | ||
3545 | ret = -ENOMEM; | 3576 | ret = -ENOMEM; |
3546 | goto failed_mount4; | 3577 | goto failed_mount4; |
3547 | } | 3578 | } |
@@ -3642,9 +3673,6 @@ no_journal: | |||
3642 | "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, | 3673 | "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, |
3643 | *sbi->s_es->s_mount_opts ? "; " : "", orig_data); | 3674 | *sbi->s_es->s_mount_opts ? "; " : "", orig_data); |
3644 | 3675 | ||
3645 | init_timer(&sbi->s_err_report); | ||
3646 | sbi->s_err_report.function = print_daily_error_info; | ||
3647 | sbi->s_err_report.data = (unsigned long) sb; | ||
3648 | if (es->s_error_count) | 3676 | if (es->s_error_count) |
3649 | mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ | 3677 | mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ |
3650 | 3678 | ||
@@ -3657,6 +3685,8 @@ cantfind_ext4: | |||
3657 | goto failed_mount; | 3685 | goto failed_mount; |
3658 | 3686 | ||
3659 | failed_mount4: | 3687 | failed_mount4: |
3688 | iput(root); | ||
3689 | sb->s_root = NULL; | ||
3660 | ext4_msg(sb, KERN_ERR, "mount failed"); | 3690 | ext4_msg(sb, KERN_ERR, "mount failed"); |
3661 | destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); | 3691 | destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); |
3662 | failed_mount_wq: | 3692 | failed_mount_wq: |
@@ -3666,6 +3696,7 @@ failed_mount_wq: | |||
3666 | sbi->s_journal = NULL; | 3696 | sbi->s_journal = NULL; |
3667 | } | 3697 | } |
3668 | failed_mount3: | 3698 | failed_mount3: |
3699 | del_timer(&sbi->s_err_report); | ||
3669 | if (sbi->s_flex_groups) { | 3700 | if (sbi->s_flex_groups) { |
3670 | if (is_vmalloc_addr(sbi->s_flex_groups)) | 3701 | if (is_vmalloc_addr(sbi->s_flex_groups)) |
3671 | vfree(sbi->s_flex_groups); | 3702 | vfree(sbi->s_flex_groups); |
@@ -4132,6 +4163,11 @@ static int ext4_sync_fs(struct super_block *sb, int wait) | |||
4132 | /* | 4163 | /* |
4133 | * LVM calls this function before a (read-only) snapshot is created. This | 4164 | * LVM calls this function before a (read-only) snapshot is created. This |
4134 | * gives us a chance to flush the journal completely and mark the fs clean. | 4165 | * gives us a chance to flush the journal completely and mark the fs clean. |
4166 | * | ||
4167 | * Note that this function alone cannot bring the filesystem into a clean | ||
4168 | * state, because ext4 prevents a new handle from being started | ||
4169 | * by @sb->s_frozen, which lives in an upper layer. It thus needs help | ||
4170 | * from that upper layer. | ||
4135 | */ | 4171 | */ |
4136 | static int ext4_freeze(struct super_block *sb) | 4172 | static int ext4_freeze(struct super_block *sb) |
4137 | { | 4173 | { |
@@ -4608,17 +4644,30 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, | |||
4608 | 4644 | ||
4609 | static int ext4_quota_off(struct super_block *sb, int type) | 4645 | static int ext4_quota_off(struct super_block *sb, int type) |
4610 | { | 4646 | { |
4647 | struct inode *inode = sb_dqopt(sb)->files[type]; | ||
4648 | handle_t *handle; | ||
4649 | |||
4611 | /* Force all delayed allocation blocks to be allocated. | 4650 | /* Force all delayed allocation blocks to be allocated. |
4612 | * Caller already holds s_umount sem */ | 4651 | * Caller already holds s_umount sem */ |
4613 | if (test_opt(sb, DELALLOC)) | 4652 | if (test_opt(sb, DELALLOC)) |
4614 | sync_filesystem(sb); | 4653 | sync_filesystem(sb); |
4615 | 4654 | ||
4655 | /* Update modification times of quota files when userspace can | ||
4656 | * start looking at them */ | ||
4657 | handle = ext4_journal_start(inode, 1); | ||
4658 | if (IS_ERR(handle)) | ||
4659 | goto out; | ||
4660 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
4661 | ext4_mark_inode_dirty(handle, inode); | ||
4662 | ext4_journal_stop(handle); | ||
4663 | |||
4664 | out: | ||
4616 | return dquot_quota_off(sb, type); | 4665 | return dquot_quota_off(sb, type); |
4617 | } | 4666 | } |
4618 | 4667 | ||
4619 | /* Read data from quotafile - avoid pagecache and such because we cannot afford | 4668 | /* Read data from quotafile - avoid pagecache and such because we cannot afford |
4620 | * acquiring the locks... As quota files are never truncated and quota code | 4669 | * acquiring the locks... As quota files are never truncated and quota code |
4621 | * itself serializes the operations (and noone else should touch the files) | 4670 | * itself serializes the operations (and no one else should touch the files) |
4622 | * we don't have to be afraid of races */ | 4671 | * we don't have to be afraid of races */ |
4623 | static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, | 4672 | static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, |
4624 | size_t len, loff_t off) | 4673 | size_t len, loff_t off) |
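The ext4_quota_off() change above refreshes the quota file's mtime/ctime inside a one-credit journal handle before quotas are switched off, so userspace sees current timestamps; if the handle cannot be started, the update is skipped and quota-off proceeds anyway. A userspace-flavoured sketch of that shape (the struct fields and journal_* helpers are stand-ins for the jbd2 API):

    #include <stddef.h>
    #include <time.h>

    struct inode  { time_t mtime, ctime; };
    struct handle { struct inode *owner; };

    static struct handle *journal_start(struct inode *inode)
    {
            static struct handle h;    /* the real call can fail */
            h.owner = inode;
            return &h;
    }

    static void mark_inode_dirty(struct handle *h, struct inode *i) { (void)h; (void)i; }
    static void journal_stop(struct handle *h) { (void)h; }
    static int turn_quota_off(void) { return 0; }

    static int quota_off(struct inode *quota_file)
    {
            struct handle *h = journal_start(quota_file);

            if (h != NULL) {           /* on failure, fall through to quota-off */
                    quota_file->mtime = quota_file->ctime = time(NULL);
                    mark_inode_dirty(h, quota_file);
                    journal_stop(h);
            }
            return turn_quota_off();
    }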
@@ -4708,9 +4757,8 @@ out: | |||
4708 | if (inode->i_size < off + len) { | 4757 | if (inode->i_size < off + len) { |
4709 | i_size_write(inode, off + len); | 4758 | i_size_write(inode, off + len); |
4710 | EXT4_I(inode)->i_disksize = inode->i_size; | 4759 | EXT4_I(inode)->i_disksize = inode->i_size; |
4760 | ext4_mark_inode_dirty(handle, inode); | ||
4711 | } | 4761 | } |
4712 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
4713 | ext4_mark_inode_dirty(handle, inode); | ||
4714 | mutex_unlock(&inode->i_mutex); | 4762 | mutex_unlock(&inode->i_mutex); |
4715 | return len; | 4763 | return len; |
4716 | } | 4764 | } |
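Taken together, the remaining super.c hunks keep mount-time setup and the failure labels symmetric: the error-report timer is armed early and disarmed at failed_mount3, and the root inode is released in exactly one place (failed_mount4) instead of at each call site. A compact sketch of the cascading-label idiom those hunks restore (the resources and labels here are generic stand-ins):

    #include <stdlib.h>

    static int finish_mount(void) { return 0; }  /* stand-in; may fail */

    static int mount_sketch(void)
    {
            void *timer = malloc(1);             /* armed early, like init_timer() */
            void *counters, *root;

            if (!timer)
                    return -1;

            counters = malloc(1);                /* percpu counters, workqueue, ... */
            if (!counters)
                    goto fail_timer;

            root = malloc(1);                    /* root inode + dentry */
            if (!root)
                    goto fail_counters;

            if (finish_mount() != 0)
                    goto fail_root;

            return 0;

            /* Labels cascade in reverse acquisition order: the root is
             * released exactly once, and the timer is always disarmed. */
    fail_root:
            free(root);                          /* iput(root) analogue */
    fail_counters:
            free(counters);
    fail_timer:
            free(timer);                         /* del_timer() analogue */
            return -1;
    }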
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fc32176eee39..b545ca1c459c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
@@ -735,7 +735,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, | |||
735 | int offset = (char *)s->here - bs->bh->b_data; | 735 | int offset = (char *)s->here - bs->bh->b_data; |
736 | 736 | ||
737 | unlock_buffer(bs->bh); | 737 | unlock_buffer(bs->bh); |
738 | jbd2_journal_release_buffer(handle, bs->bh); | 738 | ext4_handle_release_buffer(handle, bs->bh); |
739 | if (ce) { | 739 | if (ce) { |
740 | mb_cache_entry_release(ce); | 740 | mb_cache_entry_release(ce); |
741 | ce = NULL; | 741 | ce = NULL; |
@@ -833,7 +833,7 @@ inserted: | |||
833 | new_bh = sb_getblk(sb, block); | 833 | new_bh = sb_getblk(sb, block); |
834 | if (!new_bh) { | 834 | if (!new_bh) { |
835 | getblk_failed: | 835 | getblk_failed: |
836 | ext4_free_blocks(handle, inode, 0, block, 1, | 836 | ext4_free_blocks(handle, inode, NULL, block, 1, |
837 | EXT4_FREE_BLOCKS_METADATA); | 837 | EXT4_FREE_BLOCKS_METADATA); |
838 | error = -EIO; | 838 | error = -EIO; |
839 | goto cleanup; | 839 | goto cleanup; |