Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6 into common/serial-rework

Conflicts: arch/sh/kernel/cpu/sh2/setup-sh7619.c arch/sh/kernel/cpu/sh2a/setup-mxg.c arch/sh/kernel/cpu/sh2a/setup-sh7201.c arch/sh/kernel/cpu/sh2a/setup-sh7203.c arch/sh/kernel/cpu/sh2a/setup-sh7206.c arch/sh/kernel/cpu/sh3/setup-sh7705.c arch/sh/kernel/cpu/sh3/setup-sh770x.c arch/sh/kernel/cpu/sh3/setup-sh7710.c arch/sh/kernel/cpu/sh3/setup-sh7720.c arch/sh/kernel/cpu/sh4/setup-sh4-202.c arch/sh/kernel/cpu/sh4/setup-sh7750.c arch/sh/kernel/cpu/sh4/setup-sh7760.c arch/sh/kernel/cpu/sh4a/setup-sh7343.c arch/sh/kernel/cpu/sh4a/setup-sh7366.c arch/sh/kernel/cpu/sh4a/setup-sh7722.c arch/sh/kernel/cpu/sh4a/setup-sh7723.c arch/sh/kernel/cpu/sh4a/setup-sh7724.c arch/sh/kernel/cpu/sh4a/setup-sh7763.c arch/sh/kernel/cpu/sh4a/setup-sh7770.c arch/sh/kernel/cpu/sh4a/setup-sh7780.c arch/sh/kernel/cpu/sh4a/setup-sh7785.c arch/sh/kernel/cpu/sh4a/setup-sh7786.c arch/sh/kernel/cpu/sh4a/setup-shx3.c arch/sh/kernel/cpu/sh5/setup-sh5.c drivers/serial/sh-sci.c drivers/serial/sh-sci.h include/linux/serial_sci.h
author: Paul Mundt <lethal@linux-sh.org> 2011-01-13 01:06:28 -0500
committer: Paul Mundt <lethal@linux-sh.org> 2011-01-13 01:06:28 -0500
commit: f43dc23d5ea91fca257be02138a255f02d98e806 (patch)
tree: b29722f6e965316e90ac97abf79923ced250dc21 /fs/ext4/inode.c
parent: f8e53553f452dcbf67cb89c8cba63a1cd6eb4cc0 (diff)
parent: 4162cf64973df51fc885825bc9ca4d055891c49f (diff)
1 files changed, 1603 insertions, 1138 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7c17ae275af4..e80fc513eacc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,10 @@
 #include <linux/namei.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -50,13 +54,27 @@
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
 {
-        return jbd2_journal_begin_ordered_truncate(
+        trace_ext4_begin_ordered_truncate(inode, new_size);
-                                        EXT4_SB(inode->i_sb)->s_journal,
+        /*
-                                        &EXT4_I(inode)->jinode,
+         * If jinode is zero, then we never opened the file for
-                                        new_size);
+         * writing, so there's no need to call
+         * jbd2_journal_begin_ordered_truncate() since there are no
+         * outstanding writes we need to flush.
+         */
+        if (!EXT4_I(inode)->jinode)
+                return 0;
+        return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
+                                                   EXT4_I(inode)->jinode,
+                                                   new_size);
 }
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
+                                   struct buffer_head *bh_result, int create);
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
 /*
 * Test whether an inode is a fast symlink.
@@ -70,60 +88,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
 }
 /*
- * The ext4 forget function must perform a revoke if we are freeing data
- * which has been journaled.  Metadata (eg. indirect blocks) must be
- * revoked in all cases.
- *
- * "bh" may be NULL: a metadata block may have been freed from memory
- * but there may still be a record of it in the journal, and that record
- * still needs to be revoked.
- *
- * If the handle isn't valid we're not journaling so there's nothing to do.
- */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
-                struct buffer_head *bh, ext4_fsblk_t blocknr)
-{
-        int err;
-        if (!ext4_handle_valid(handle))
-                return 0;
-        might_sleep();
-        BUFFER_TRACE(bh, "enter");
-        jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
-                  "data mode %x\n",
-                  bh, is_metadata, inode->i_mode,
-                  test_opt(inode->i_sb, DATA_FLAGS));
-        /* Never use the revoke function if we are doing full data
-         * journaling: there is no need to, and a V1 superblock won't
-         * support it.  Otherwise, only skip the revoke on un-journaled
-         * data blocks. */
-        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
-            (!is_metadata && !ext4_should_journal_data(inode))) {
-                if (bh) {
-                        BUFFER_TRACE(bh, "call jbd2_journal_forget");
-                        return ext4_journal_forget(handle, bh);
-                }
-                return 0;
-        }
-        /*
-         * data!=journal && (is_metadata || should_journal_data(inode))
-         */
-        BUFFER_TRACE(bh, "call ext4_journal_revoke");
-        err = ext4_journal_revoke(handle, blocknr, bh);
-        if (err)
-                ext4_abort(inode->i_sb, __func__,
-                           "error %d when attempting revoke", err);
-        BUFFER_TRACE(bh, "exit");
-        return err;
-}
-/*
 * Work out how many blocks we need to proceed with the next chunk of a
 * truncate transaction.
 */
@@ -194,21 +158,44 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
-static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
+int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+                                 int nblocks)
 {
+        int ret;
+        /*
+         * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
+         * moment, get_block can be called only for blocks inside i_size since
+         * page cache has been already dropped and writes are blocked by
+         * i_mutex. So we can safely drop the i_data_sem here.
+         *
+         * The handle is restarted with the caller-supplied @nblocks credits
+         * (previously the argument was ignored and blocks_for_truncate()
+         * was recomputed here).  Once i_data_sem has been re-taken the
+         * inode's preallocations are discarded, and the result of
+         * ext4_journal_restart() is returned to the caller unchanged.
+         */
         BUG_ON(EXT4_JOURNAL(inode) == NULL);
         jbd_debug(2, "restarting handle %p\n", handle);
-        return ext4_journal_restart(handle, blocks_for_truncate(inode));
+        up_write(&EXT4_I(inode)->i_data_sem);
+        ret = ext4_journal_restart(handle, nblocks);
+        down_write(&EXT4_I(inode)->i_data_sem);
+        ext4_discard_preallocations(inode);
+        return ret;
 }
 /*
 * Called at the last iput() if i_nlink is zero.
 */
-void ext4_delete_inode(struct inode *inode)
+void ext4_evict_inode(struct inode *inode)
 {
        handle_t *handle;
        int err;
+        trace_ext4_evict_inode(inode);
+        if (inode->i_nlink) {
+                truncate_inode_pages(&inode->i_data, 0);
+                goto no_delete;
+        }
+        if (!is_bad_inode(inode))
+                dquot_initialize(inode);
        if (ext4_should_order_data(inode))
                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages(&inode->i_data, 0);
@@ -233,7 +220,7 @@ void ext4_delete_inode(struct inode *inode)
        inode->i_size = 0;
        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
-                ext4_warning(inode->i_sb, __func__,
+                ext4_warning(inode->i_sb,
                             "couldn't mark inode dirty (err %d)", err);
                goto stop_handle;
        }
@@ -251,10 +238,11 @@ void ext4_delete_inode(struct inode *inode)
                if (err > 0)
                        err = ext4_journal_restart(handle, 3);
                if (err != 0) {
-                        ext4_warning(inode->i_sb, __func__,
+                        ext4_warning(inode->i_sb,
                                     "couldn't extend journal (err %d)", err);
                stop_handle:
                        ext4_journal_stop(handle);
+                        ext4_orphan_del(NULL, inode);
                        goto no_delete;
                }
        }
@@ -279,13 +267,13 @@ void ext4_delete_inode(struct inode *inode)
         */
        if (ext4_mark_inode_dirty(handle, inode))
                /* If that failed, just do the required in-core inode clear. */
-                clear_inode(inode);
+                ext4_clear_inode(inode);
        else
                ext4_free_inode(handle, inode);
        ext4_journal_stop(handle);
        return;
 no_delete:
-        clear_inode(inode);     /* We must guarantee clearing of inode... */
+        ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
 }
 typedef struct {
@@ -343,9 +331,7 @@ static int ext4_block_to_path(struct inode *inode,
        int n = 0;
        int final = 0;
-        if (i_block < 0) {
+        if (i_block < direct_blocks) {
-                ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
-        } else if (i_block < direct_blocks) {
                offsets[n++] = i_block;
                final = direct_blocks;
        } else if ((i_block -= direct_blocks) < indirect_blocks) {
@@ -364,8 +350,7 @@ static int ext4_block_to_path(struct inode *inode,
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else {
-                ext4_warning(inode->i_sb, "ext4_block_to_path",
+                ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
-                             "block %lu > max in inode %lu",
                             i_block + direct_blocks +
                             indirect_blocks + double_blocks, inode->i_ino);
        }
@@ -374,9 +359,11 @@ static int ext4_block_to_path(struct inode *inode,
        return n;
 }
-static int __ext4_check_blockref(const char *function, struct inode *inode,
+static int __ext4_check_blockref(const char *function, unsigned int line,
+                                 struct inode *inode,
                                 __le32 *p, unsigned int max)
 {
+        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
        __le32 *bref = p;
        unsigned int blk;
@@ -385,9 +372,9 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
                if (blk &&
                    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
                                                    blk, 1))) {
-                        ext4_error(inode->i_sb, function,
+                        es->s_last_error_block = cpu_to_le64(blk);
-                                   "invalid block reference %u "
+                        ext4_error_inode(inode, function, line, blk,
-                                   "in inode #%lu", blk, inode->i_ino);
+                                         "invalid block");
                        return -EIO;
                }
        }
@@ -396,11 +383,13 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
 #define ext4_check_indirect_blockref(inode, bh)                         \
-        __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data,  \
+        __ext4_check_blockref(__func__, __LINE__, inode,                \
+                              (__le32 *)(bh)->b_data,                   \
                              EXT4_ADDR_PER_BLOCK((inode)->i_sb))
 #define ext4_check_inode_blockref(inode)                                \
-        __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data,   \
+        __ext4_check_blockref(__func__, __LINE__, inode,                \
+                              EXT4_I(inode)->i_data,                    \
                              EXT4_NDIR_BLOCKS)
 /**
@@ -553,19 +542,25 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 *
 *      Normally this function finds the preferred place for block allocation
 *      and returns it.
+ *      Because this is only used for non-extent files, we limit the block nr
+ *      to 32 bits.
 */
 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
                                   Indirect *partial)
 {
+        ext4_fsblk_t goal;
        /*
         * XXX need to get goal block from mballoc's data structures
         */
-        return ext4_find_near(inode, partial);
+        goal = ext4_find_near(inode, partial);
+        goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+        return goal;
 }
 /**
- *      ext4_blks_to_allocate: Look up the block map and count the number
+ *      ext4_blks_to_allocate - Look up the block map and count the number
 *      of direct blocks that need to be allocated for the given branch.
 *
 *      @branch: chain of indirect blocks
@@ -604,13 +599,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
 /**
 *      ext4_alloc_blocks - allocate multiple blocks needed for a branch
+ *      @handle: handle for this transaction
+ *      @inode: inode which needs allocated blocks
+ *      @iblock: the logical block at which to start allocating
+ *      @goal: preferred physical block of allocation
 *      @indirect_blks: the number of blocks that need to be allocated for
 *                      indirect blocks
- *
+ *      @blks: number of desired blocks
 *      @new_blocks: on return it will store the new block numbers for
 *      the indirect blocks(if needed) and the first direct block,
- *      @blks:  on return it will store the total number of allocated
+ *      @err: on return it will store the error code
- *              direct blocks
+ *
+ *      This function will return the number of blocks allocated as
+ *      requested by the passed-in parameters.
 */
 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                             ext4_lblk_t iblock, ext4_fsblk_t goal,
@@ -642,6 +643,15 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                if (*err)
                        goto failed_out;
+                if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
+                        EXT4_ERROR_INODE(inode,
+                                         "current_block %llu + count %lu > %d!",
+                                         current_block, count,
+                                         EXT4_MAX_BLOCK_FILE_PHYS);
+                        *err = -EIO;
+                        goto failed_out;
+                }
                target -= count;
                /* allocate blocks for indirect blocks */
                while (index < indirect_blks && count) {
@@ -676,6 +686,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                ar.flags = EXT4_MB_HINT_DATA;
        current_block = ext4_mb_new_blocks(handle, &ar, err);
+        if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
+                EXT4_ERROR_INODE(inode,
+                                 "current_block %llu + ar.len %d > %d!",
+                                 current_block, ar.len,
+                                 EXT4_MAX_BLOCK_FILE_PHYS);
+                *err = -EIO;
+                goto failed_out;
+        }
        if (*err && (target == blks)) {
                /*
@@ -701,15 +719,17 @@ allocated:
        return ret;
 failed_out:
        for (i = 0; i < index; i++)
-                ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
        return ret;
 }
 /**
 *      ext4_alloc_branch - allocate and set up a chain of blocks.
+ *      @handle: handle for this transaction
 *      @inode: owner
 *      @indirect_blks: number of allocated indirect blocks
 *      @blks: number of allocated direct blocks
+ *      @goal: preferred place for allocation
 *      @offsets: offsets (in the blocks) to store the pointers to next.
 *      @branch: place to store the chain in.
 *
@@ -759,13 +779,19 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
                 * parent to disk.
                 */
                bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+                if (unlikely(!bh)) {
+                        err = -EIO;
+                        goto failed;
+                }
                branch[n].bh = bh;
                lock_buffer(bh);
                BUFFER_TRACE(bh, "call get_create_access");
                err = ext4_journal_get_create_access(handle, bh);
                if (err) {
+                        /* Don't brelse(bh) here; it's done in
+                         * ext4_journal_forget() below */
                        unlock_buffer(bh);
-                        brelse(bh);
                        goto failed;
                }
@@ -796,20 +822,27 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
        return err;
 failed:
        /* Allocation failed, free what we already allocated */
+        ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
        for (i = 1; i <= n ; i++) {
-                BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
+                /*
-                ext4_journal_forget(handle, branch[i].bh);
+                 * branch[i].bh is newly allocated, so there is no
+                 * need to revoke the block, which is why we don't
+                 * need to set EXT4_FREE_BLOCKS_METADATA.
+                 */
+                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+                                 EXT4_FREE_BLOCKS_FORGET);
        }
-        for (i = 0; i < indirect_blks; i++)
+        for (i = n+1; i < indirect_blks; i++)
-                ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+                ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
-        ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
+        ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
        return err;
 }
 /**
 * ext4_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
 * @inode: owner
 * @block: (logical) number of block we are adding
 * @chain: chain of indirect blocks (with a missing link - see
@@ -882,20 +915,24 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 err_out:
        for (i = 1; i <= num; i++) {
-                BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
+                /*
-                ext4_journal_forget(handle, where[i].bh);
+                 * where[i].bh is newly allocated, so there is no
-                ext4_free_blocks(handle, inode,
+                 * need to revoke the block, which is why we don't
-                                        le32_to_cpu(where[i-1].key), 1, 0);
+                 * need to set EXT4_FREE_BLOCKS_METADATA.
+                 */
+                ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
+                                 EXT4_FREE_BLOCKS_FORGET);
        }
-        ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
+        ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+                         blks, 0);
        return err;
 }
 /*
- * The ext4_ind_get_blocks() function handles non-extents inodes
+ * The ext4_ind_map_blocks() function handles non-extents inodes
 * (i.e., using the traditional indirect/double-indirect i_blocks
- * scheme) for ext4_get_blocks().
+ * scheme) for ext4_map_blocks().
 *
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
@@ -920,9 +957,8 @@ err_out:
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
 * blocks.
 */
-static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
+static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
-                               ext4_lblk_t iblock, unsigned int maxblocks,
+                               struct ext4_map_blocks *map,
-                               struct buffer_head *bh_result,
                               int flags)
 {
        int err = -EIO;
@@ -936,9 +972,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
        int count = 0;
        ext4_fsblk_t first_block = 0;
-        J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
+        J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
        J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
-        depth = ext4_block_to_path(inode, iblock, offsets,
+        depth = ext4_block_to_path(inode, map->m_lblk, offsets,
                                   &blocks_to_boundary);
        if (depth == 0)
@@ -949,10 +985,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
        /* Simplest case - block found, no allocation needed */
        if (!partial) {
                first_block = le32_to_cpu(chain[depth - 1].key);
-                clear_buffer_new(bh_result);
                count++;
                /*map more blocks*/
-                while (count < maxblocks && count <= blocks_to_boundary) {
+                while (count < map->m_len && count <= blocks_to_boundary) {
                        ext4_fsblk_t blk;
                        blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -972,7 +1007,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
        /*
         * Okay, we need to do block allocation.
        */
-        goal = ext4_find_goal(inode, iblock, partial);
+        goal = ext4_find_goal(inode, map->m_lblk, partial);
        /* the number of blocks that need to be allocated for [d,t]indirect blocks */
        indirect_blks = (chain + depth) - partial - 1;
@@ -982,11 +1017,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
         * direct blocks to allocate for this branch.
         */
        count = ext4_blks_to_allocate(partial, indirect_blks,
-                                        maxblocks, blocks_to_boundary);
+                                      map->m_len, blocks_to_boundary);
        /*
         * Block out ext4_truncate while we alter the tree
         */
-        err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+        err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
                                &count, goal,
                                offsets + (partial - chain), partial);
@@ -998,16 +1033,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
         * may need to return -EAGAIN upwards in the worst case.  --sct
         */
        if (!err)
-                err = ext4_splice_branch(handle, inode, iblock,
+                err = ext4_splice_branch(handle, inode, map->m_lblk,
                                         partial, indirect_blks, count);
-        else
+        if (err)
                goto cleanup;
-        set_buffer_new(bh_result);
+        map->m_flags |= EXT4_MAP_NEW;
+        ext4_update_inode_fsync_trans(handle, inode, 1);
 got_it:
-        map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+        map->m_flags |= EXT4_MAP_MAPPED;
+        map->m_pblk = le32_to_cpu(chain[depth-1].key);
+        map->m_len = count;
        if (count > blocks_to_boundary)
-                set_buffer_boundary(bh_result);
+                map->m_flags |= EXT4_MAP_BOUNDARY;
        err = count;
        /* Clean up and exit */
        partial = chain + depth - 1;    /* the whole chain */
@@ -1017,125 +1056,207 @@ cleanup:
                brelse(partial->bh);
                partial--;
        }
-        BUFFER_TRACE(bh_result, "returned");
 out:
        return err;
 }
-qsize_t ext4_get_reserved_space(struct inode *inode)
+#ifdef CONFIG_QUOTA
+qsize_t *ext4_get_reserved_space(struct inode *inode)
 {
-        unsigned long long total;
+        return &EXT4_I(inode)->i_reserved_quota;
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-        total = EXT4_I(inode)->i_reserved_data_blocks +
-                EXT4_I(inode)->i_reserved_meta_blocks;
-        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-        return total;
 }
+#endif
 /*
 * Calculate the number of metadata blocks that need to be reserved
- * to allocate @blocks for non extent file based file
+ * to allocate a new block at @lblock for a non-extent-based file
 */
-static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_indirect_calc_metadata_amount(struct inode *inode,
+                                              sector_t lblock)
 {
-        int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+        struct ext4_inode_info *ei = EXT4_I(inode);
-        int ind_blks, dind_blks, tind_blks;
+        sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
+        int blk_bits;
-        /* number of new indirect blocks needed */
-        ind_blks = (blocks + icap - 1) / icap;
-        dind_blks = (ind_blks + icap - 1) / icap;
+        if (lblock < EXT4_NDIR_BLOCKS)
+                return 0;
-        tind_blks = 1;
+        lblock -= EXT4_NDIR_BLOCKS;
-        return ind_blks + dind_blks + tind_blks;
+        if (ei->i_da_metadata_calc_len &&
+            (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
+                ei->i_da_metadata_calc_len++;
+                return 0;
+        }
+        ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
+        ei->i_da_metadata_calc_len = 1;
+        blk_bits = order_base_2(lblock);
+        return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 /*
 * Calculate the number of metadata blocks that need to be reserved
- * to allocate given number of blocks
+ * to allocate a block located at @lblock
 */
-static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
-        if (!blocks)
+        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-                return 0;
+                return ext4_ext_calc_metadata_amount(inode, lblock);
-        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
-                return ext4_ext_calc_metadata_amount(inode, blocks);
-        return ext4_indirect_calc_metadata_amount(inode, blocks);
+        return ext4_indirect_calc_metadata_amount(inode, lblock);
 }
-static void ext4_da_update_reserve_space(struct inode *inode, int used)
+/*
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
+ */
+void ext4_da_update_reserve_space(struct inode *inode,
+                                        int used, int quota_claim)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-        int total, mdb, mdb_free;
+        struct ext4_inode_info *ei = EXT4_I(inode);
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-        /* recalculate the number of metablocks still need to be reserved */
-        total = EXT4_I(inode)->i_reserved_data_blocks - used;
-        mdb = ext4_calc_metadata_amount(inode, total);
-        /* figure out how many metablocks to release */
+        spin_lock(&ei->i_block_reservation_lock);
-        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+        trace_ext4_da_update_reserve_space(inode, used);
-        mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+        if (unlikely(used > ei->i_reserved_data_blocks)) {
+                ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+                         "with only %d reserved data blocks\n",
+                         __func__, inode->i_ino, used,
+                         ei->i_reserved_data_blocks);
+                WARN_ON(1);
+                used = ei->i_reserved_data_blocks;
+        }
-        if (mdb_free) {
+        /* Update per-inode reservations */
-                /* Account for allocated meta_blocks */
+        ei->i_reserved_data_blocks -= used;
-                mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+        ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+        percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+                           used + ei->i_allocated_meta_blocks);
+        ei->i_allocated_meta_blocks = 0;
-                /* update fs dirty blocks counter */
+        if (ei->i_reserved_data_blocks == 0) {
-                percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+                /*
-                EXT4_I(inode)->i_allocated_meta_blocks = 0;
+                 * We can release all of the reserved metadata blocks
-                EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+                 * only when we have written all of the delayed
+                 * allocation blocks.
+                 */
+                percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+                                   ei->i_reserved_meta_blocks);
+                ei->i_reserved_meta_blocks = 0;
+                ei->i_da_metadata_calc_len = 0;
        }
-        /* update per-inode reservations */
-        BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
-        EXT4_I(inode)->i_reserved_data_blocks -= used;
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-        /*
+        /* Update quota subsystem for data blocks */
-         * free those over-booking quota for metadata blocks
+        if (quota_claim)
-         */
+                dquot_claim_block(inode, used);
-        if (mdb_free)
+        else {
-                vfs_dq_release_reservation_block(inode, mdb_free);
+                /*
+                 * We did fallocate with an offset that is already delayed
+                 * allocated. So on delayed allocated writeback we should
+                 * not re-claim the quota for fallocated blocks.
+                 */
+                dquot_release_reservation_block(inode, used);
+        }
        /*
         * If we have done all the pending block allocations and if
         * there aren't any writers on the inode, we can discard the
         * inode's preallocations.
         */
-        if (!total && (atomic_read(&inode->i_writecount) == 0))
+        if ((ei->i_reserved_data_blocks == 0) &&
+            (atomic_read(&inode->i_writecount) == 0))
                ext4_discard_preallocations(inode);
 }
-static int check_block_validity(struct inode *inode, sector_t logical,
+static int __check_block_validity(struct inode *inode, const char *func,
-                                sector_t phys, int len)
+                                unsigned int line,
+                                struct ext4_map_blocks *map)
 {
-        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
+        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
-                ext4_error(inode->i_sb, "check_block_validity",
+                                   map->m_len)) {
-                           "inode #%lu logical block %llu mapped to %llu "
+                ext4_error_inode(inode, func, line, map->m_pblk,
-                           "(size %d)", inode->i_ino,
+                                 "lblock %lu mapped to illegal pblock "
-                           (unsigned long long) logical,
+                                 "(length %d)", (unsigned long) map->m_lblk,
-                           (unsigned long long) phys, len);
+                                 map->m_len);
-                WARN_ON(1);
                return -EIO;
        }
        return 0;
 }
+#define check_block_validity(inode, map)        \
+        __check_block_validity((inode), __func__, __LINE__, (map))
+/*
+ * Return the number of contiguous dirty pages in a given inode
+ * starting at page frame idx.
+ */
+static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
+                                    unsigned int max_pages)
+{
+        struct address_space *mapping = inode->i_mapping;
+        pgoff_t index;
+        struct pagevec pvec;
+        pgoff_t num = 0;
+        int i, nr_pages, done = 0;
+        if (max_pages == 0)
+                return 0;
+        pagevec_init(&pvec, 0);
+        while (!done) {
+                index = idx;
+                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                                              PAGECACHE_TAG_DIRTY,
+                                              (pgoff_t)PAGEVEC_SIZE);
+                if (nr_pages == 0)
+                        break;
+                for (i = 0; i < nr_pages; i++) {
+                        struct page *page = pvec.pages[i];
+                        struct buffer_head *bh, *head;
+                        lock_page(page);
+                        if (unlikely(page->mapping != mapping) ||
+                            !PageDirty(page) ||
+                            PageWriteback(page) ||
+                            page->index != idx) {
+                                done = 1;
+                                unlock_page(page);
+                                break;
+                        }
+                        if (page_has_buffers(page)) {
+                                bh = head = page_buffers(page);
+                                do {
+                                        if (!buffer_delay(bh) &&
+                                            !buffer_unwritten(bh))
+                                                done = 1;
+                                        bh = bh->b_this_page;
+                                } while (!done && (bh != head));
+                        }
+                        unlock_page(page);
+                        if (done)
+                                break;
+                        idx++;
+                        num++;
+                        if (num >= max_pages) {
+                                done = 1;
+                                break;
+                        }
+                }
+                pagevec_release(&pvec);
+        }
+        return num;
+}
 /*
- * The ext4_get_blocks() function tries to look up the requested blocks,
+ * The ext4_map_blocks() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of the i_data_sem, allocates blocks,
 * stores the allocated blocks in the result buffer head and marks it
 * mapped.
 *
- * If file type is extents based, it will call ext4_ext_get_blocks(),
+ * If file type is extents based, it will call ext4_ext_map_blocks(),
- * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping
+ * Otherwise, it calls ext4_ind_map_blocks() to handle indirect mapping
 * based files
 *
 * On success, it returns the number of blocks being mapped or allocated.
@@ -1148,32 +1269,29 @@ static int check_block_validity(struct inode *inode, sector_t logical,
 *
 * It returns the error in case of allocation failure.
 */
-int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
+int ext4_map_blocks(handle_t *handle, struct inode *inode,
-                    unsigned int max_blocks, struct buffer_head *bh,
+                    struct ext4_map_blocks *map, int flags)
-                    int flags)
 {
        int retval;
-        clear_buffer_mapped(bh);
+        map->m_flags = 0;
-        clear_buffer_unwritten(bh);
+        ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
+                  "logical block %lu\n", inode->i_ino, flags, map->m_len,
+                  (unsigned long) map->m_lblk);
        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
         */
        down_read((&EXT4_I(inode)->i_data_sem));
-        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-                retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
+                retval = ext4_ext_map_blocks(handle, inode, map, 0);
-                                bh, 0);
        } else {
-                retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
+                retval = ext4_ind_map_blocks(handle, inode, map, 0);
-                                             bh, 0);
        }
        up_read((&EXT4_I(inode)->i_data_sem));
-        if (retval > 0 && buffer_mapped(bh)) {
+        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-                int ret = check_block_validity(inode, block,
+                int ret = check_block_validity(inode, map);
-                                               bh->b_blocknr, retval);
                if (ret != 0)
                        return ret;
        }
@@ -1189,7 +1307,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
         * ext4_ext_get_block() called with create = 0 returns
         * with the buffer head unmapped.
         */
-        if (retval > 0 && buffer_mapped(bh))
+        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
                return retval;
        /*
@@ -1202,7 +1320,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
         * of BH_Unwritten and BH_Mapped flags being simultaneously
         * set on the buffer_head.
         */
-        clear_buffer_unwritten(bh);
+        map->m_flags &= ~EXT4_MAP_UNWRITTEN;
        /*
         * New blocks allocate and/or writing to uninitialized extent
@@ -1219,43 +1337,41 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
         * avoid double accounting
         */
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-                EXT4_I(inode)->i_delalloc_reserved_flag = 1;
+                ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
        /*
         * We need to check for EXT4 here because migrate
         * could have changed the inode type in between
         */
-        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-                retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
+                retval = ext4_ext_map_blocks(handle, inode, map, flags);
-                                              bh, flags);
        } else {
-                retval = ext4_ind_get_blocks(handle, inode, block,
+                retval = ext4_ind_map_blocks(handle, inode, map, flags);
-                                             max_blocks, bh, flags);
-                if (retval > 0 && buffer_new(bh)) {
+                if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
                        /*
                         * We allocated new blocks which will result in
                         * i_data's format changing.  Force the migrate
                         * to fail by clearing migrate flags
                         */
-                        EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
+                        ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
-                                                        ~EXT4_EXT_MIGRATE;
                }
-        }
+                /*
+                 * Update reserved blocks/metadata blocks after successful
+                 * block allocation which had been deferred till now. We don't
+                 * support fallocate for non extent files. So we can update
+                 * reserve space here.
+                 */
+                if ((retval > 0) &&
+                        (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
+                        ext4_da_update_reserve_space(inode, retval, 1);
+        }
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-                EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+                ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
-        /*
-         * Update reserved blocks/metadata blocks after successful
-         * block allocation which had been deferred till now.
-         */
-        if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
-                ext4_da_update_reserve_space(inode, retval);
        up_write((&EXT4_I(inode)->i_data_sem));
-        if (retval > 0 && buffer_mapped(bh)) {
+        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-                int ret = check_block_validity(inode, block,
+                int ret = check_block_validity(inode, map);
-                                               bh->b_blocknr, retval);
                if (ret != 0)
                        return ret;
        }
@@ -1265,109 +1381,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
-int ext4_get_block(struct inode *inode, sector_t iblock,
+static int _ext4_get_block(struct inode *inode, sector_t iblock,
-                   struct buffer_head *bh_result, int create)
+                           struct buffer_head *bh, int flags)
 {
        handle_t *handle = ext4_journal_current_handle();
+        struct ext4_map_blocks map;
        int ret = 0, started = 0;
-        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
        int dio_credits;
-        if (create && !handle) {
+        map.m_lblk = iblock;
+        map.m_len = bh->b_size >> inode->i_blkbits;
+        if (flags && !handle) {
                /* Direct IO write... */
-                if (max_blocks > DIO_MAX_BLOCKS)
+                if (map.m_len > DIO_MAX_BLOCKS)
-                        max_blocks = DIO_MAX_BLOCKS;
+                        map.m_len = DIO_MAX_BLOCKS;
-                dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+                dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
                handle = ext4_journal_start(inode, dio_credits);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
-                        goto out;
+                        return ret;
                }
                started = 1;
        }
-        ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+        ret = ext4_map_blocks(handle, inode, &map, flags);
-                              create ? EXT4_GET_BLOCKS_CREATE : 0);
        if (ret > 0) {
-                bh_result->b_size = (ret << inode->i_blkbits);
+                map_bh(bh, inode->i_sb, map.m_pblk);
+                bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+                bh->b_size = inode->i_sb->s_blocksize * map.m_len;
                ret = 0;
        }
        if (started)
                ext4_journal_stop(handle);
-out:
        return ret;
 }
+int ext4_get_block(struct inode *inode, sector_t iblock,
+                   struct buffer_head *bh, int create)
+{
+        return _ext4_get_block(inode, iblock, bh,
+                               create ? EXT4_GET_BLOCKS_CREATE : 0);
+}
 /*
 * `handle' can be NULL if create is zero
 */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
                                ext4_lblk_t block, int create, int *errp)
 {
-        struct buffer_head dummy;
+        struct ext4_map_blocks map;
+        struct buffer_head *bh;
        int fatal = 0, err;
-        int flags = 0;
        J_ASSERT(handle != NULL || create == 0);
-        dummy.b_state = 0;
+        map.m_lblk = block;
-        dummy.b_blocknr = -1000;
+        map.m_len = 1;
-        buffer_trace_init(&dummy.b_history);
+        err = ext4_map_blocks(handle, inode, &map,
-        if (create)
+                              create ? EXT4_GET_BLOCKS_CREATE : 0);
-                flags |= EXT4_GET_BLOCKS_CREATE;
-        err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
+        if (err < 0)
-        /*
+                *errp = err;
-         * ext4_get_blocks() returns number of blocks mapped. 0 in
+        if (err <= 0)
-         * case of a HOLE.
+                return NULL;
-         */
+        *errp = 0;
-        if (err > 0) {
-                if (err > 1)
+        bh = sb_getblk(inode->i_sb, map.m_pblk);
-                        WARN_ON(1);
+        if (!bh) {
-                err = 0;
+                *errp = -EIO;
+                return NULL;
        }
-        *errp = err;
+        if (map.m_flags & EXT4_MAP_NEW) {
-        if (!err && buffer_mapped(&dummy)) {
+                J_ASSERT(create != 0);
-                struct buffer_head *bh;
+                J_ASSERT(handle != NULL);
-                bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
-                if (!bh) {
-                        *errp = -EIO;
-                        goto err;
-                }
-                if (buffer_new(&dummy)) {
-                        J_ASSERT(create != 0);
-                        J_ASSERT(handle != NULL);
-                        /*
+                /*
-                         * Now that we do not always journal data, we should
+                 * Now that we do not always journal data, we should
-                         * keep in mind whether this should always journal the
+                 * keep in mind whether this should always journal the
-                         * new buffer as metadata.  For now, regular file
+                 * new buffer as metadata.  For now, regular file
-                         * writes use ext4_get_block instead, so it's not a
+                 * writes use ext4_get_block instead, so it's not a
-                         * problem.
+                 * problem.
-                         */
+                 */
-                        lock_buffer(bh);
+                lock_buffer(bh);
-                        BUFFER_TRACE(bh, "call get_create_access");
+                BUFFER_TRACE(bh, "call get_create_access");
-                        fatal = ext4_journal_get_create_access(handle, bh);
+                fatal = ext4_journal_get_create_access(handle, bh);
-                        if (!fatal && !buffer_uptodate(bh)) {
+                if (!fatal && !buffer_uptodate(bh)) {
-                                memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+                        memset(bh->b_data, 0, inode->i_sb->s_blocksize);
-                                set_buffer_uptodate(bh);
+                        set_buffer_uptodate(bh);
-                        }
-                        unlock_buffer(bh);
-                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                        err = ext4_handle_dirty_metadata(handle, inode, bh);
-                        if (!fatal)
-                                fatal = err;
-                } else {
-                        BUFFER_TRACE(bh, "not a new buffer");
-                }
-                if (fatal) {
-                        *errp = fatal;
-                        brelse(bh);
-                        bh = NULL;
                }
-                return bh;
+                unlock_buffer(bh);
+                BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+                err = ext4_handle_dirty_metadata(handle, inode, bh);
+                if (!fatal)
+                        fatal = err;
+        } else {
+                BUFFER_TRACE(bh, "not a new buffer");
        }
-err:
+        if (fatal) {
-        return NULL;
+                *errp = fatal;
+                brelse(bh);
+                bh = NULL;
+        }
+        return bh;
 }
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -1448,11 +1564,39 @@ static int walk_page_buffers(handle_t *handle,
 static int do_journal_get_write_access(handle_t *handle,
                                       struct buffer_head *bh)
 {
+        int dirty = buffer_dirty(bh);
+        int ret;
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
-        return ext4_journal_get_write_access(handle, bh);
+        /*
+         * __block_write_begin() could have dirtied some buffers. Clear
+         * the dirty bit as jbd2_journal_get_write_access() could complain
+         * otherwise about fs integrity issues. Setting of the dirty bit
+         * by __block_write_begin() isn't a real problem here as we clear
+         * the bit before releasing a page lock and thus writeback cannot
+         * ever write the buffer.
+         */
+        if (dirty)
+                clear_buffer_dirty(bh);
+        ret = ext4_journal_get_write_access(handle, bh);
+        if (!ret && dirty)
+                ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+        return ret;
 }
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext4_truncate_failed_write(struct inode *inode)
+{
+        truncate_inode_pages(inode->i_mapping, inode->i_size);
+        ext4_truncate(inode);
+}
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+                   struct buffer_head *bh_result, int create);
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len, unsigned flags,
                            struct page **pagep, void **fsdata)
@@ -1494,8 +1638,10 @@ retry:
        }
        *pagep = page;
-        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+        if (ext4_should_dioread_nolock(inode))
-                                ext4_get_block);
+                ret = __block_write_begin(page, pos, len, ext4_get_block_write);
+        else
+                ret = __block_write_begin(page, pos, len, ext4_get_block);
        if (!ret && ext4_should_journal_data(inode)) {
                ret = walk_page_buffers(handle, page_buffers(page),
@@ -1506,21 +1652,21 @@ retry:
                unlock_page(page);
                page_cache_release(page);
                /*
-                 * block_write_begin may have instantiated a few blocks
+                 * __block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                 * i_size_read because we hold i_mutex.
                 *
                 * Add inode to orphan list in case we crash before
                 * truncate finishes
                 */
-                if (pos + len > inode->i_size)
+                if (pos + len > inode->i_size && ext4_can_truncate(inode))
                        ext4_orphan_add(handle, inode);
                ext4_journal_stop(handle);
                if (pos + len > inode->i_size) {
-                        vmtruncate(inode, inode->i_size);
+                        ext4_truncate_failed_write(inode);
                        /*
-                         * If vmtruncate failed early the inode might
+                         * If truncate failed early the inode might
                         * still be on the orphan list; we need to
                         * make sure the inode is removed from the
                         * orphan list in that case.
@@ -1614,7 +1760,7 @@ static int ext4_ordered_write_end(struct file *file,
                ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
                copied = ret2;
-                if (pos + len > inode->i_size)
+                if (pos + len > inode->i_size && ext4_can_truncate(inode))
                        /* if we have allocated more blocks and copied
                         * less. We will have blocks allocated outside
                         * inode->i_size. So truncate them
@@ -1628,9 +1774,9 @@ static int ext4_ordered_write_end(struct file *file,
                ret = ret2;
        if (pos + len > inode->i_size) {
-                vmtruncate(inode, inode->i_size);
+                ext4_truncate_failed_write(inode);
                /*
-                 * If vmtruncate failed early the inode might still be
+                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
@@ -1655,7 +1801,7 @@ static int ext4_writeback_write_end(struct file *file,
        ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
        copied = ret2;
-        if (pos + len > inode->i_size)
+        if (pos + len > inode->i_size && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
                 * inode->i_size. So truncate them
@@ -1670,9 +1816,9 @@ static int ext4_writeback_write_end(struct file *file,
                ret = ret2;
        if (pos + len > inode->i_size) {
-                vmtruncate(inode, inode->i_size);
+                ext4_truncate_failed_write(inode);
                /*
-                 * If vmtruncate failed early the inode might still be
+                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
@@ -1712,7 +1858,7 @@ static int ext4_journalled_write_end(struct file *file,
        new_i_size = pos + copied;
        if (new_i_size > inode->i_size)
                i_size_write(inode, pos+copied);
-        EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+        ext4_set_inode_state(inode, EXT4_STATE_JDATA);
        if (new_i_size > EXT4_I(inode)->i_disksize) {
                ext4_update_i_disksize(inode, new_i_size);
                ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1722,7 +1868,7 @@ static int ext4_journalled_write_end(struct file *file,
        unlock_page(page);
        page_cache_release(page);
-        if (pos + len > inode->i_size)
+        if (pos + len > inode->i_size && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
                 * inode->i_size. So truncate them
@@ -1733,9 +1879,9 @@ static int ext4_journalled_write_end(struct file *file,
        if (!ret)
                ret = ret2;
        if (pos + len > inode->i_size) {
-                vmtruncate(inode, inode->i_size);
+                ext4_truncate_failed_write(inode);
                /*
-                 * If vmtruncate failed early the inode might still be
+                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
@@ -1746,11 +1892,16 @@ static int ext4_journalled_write_end(struct file *file,
        return ret ? ret : copied;
 }
-static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+/*
+ * Reserve a single block located at lblock
+ */
+static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
        int retries = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-        unsigned long md_needed, mdblocks, total = 0;
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        unsigned long md_needed;
+        int ret;
        /*
         * recalculate the amount of metadata blocks to reserve
@@ -1758,86 +1909,84 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
         * worse case is one extent per block
         */
 repeat:
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+        spin_lock(&ei->i_block_reservation_lock);
-        total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+        md_needed = ext4_calc_metadata_amount(inode, lblock);
-        mdblocks = ext4_calc_metadata_amount(inode, total);
+        trace_ext4_da_reserve_space(inode, md_needed);
-        BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+        spin_unlock(&ei->i_block_reservation_lock);
-        md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
-        total = md_needed + nrblocks;
        /*
-         * Make quota reservation here to prevent quota overflow
+         * We will charge metadata quota at writeout time; this saves
-         * later. Real quota accounting is done at pages writeout
+         * us from metadata over-estimation, though we may go over by
-         * time.
+         * a small amount in the end.  Here we just reserve for data.
         */
-        if (vfs_dq_reserve_block(inode, total)) {
+        ret = dquot_reserve_block(inode, 1);
-                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+        if (ret)
-                return -EDQUOT;
+                return ret;
-        }
+        /*
+         * We do still charge estimated metadata to the sb though;
-        if (ext4_claim_free_blocks(sbi, total)) {
+         * we cannot afford to run out of free blocks.
-                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+         */
+        if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
+                dquot_release_reservation_block(inode, 1);
                if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
                        yield();
                        goto repeat;
                }
-                vfs_dq_release_reservation_block(inode, total);
                return -ENOSPC;
        }
-        EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+        spin_lock(&ei->i_block_reservation_lock);
-        EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+        ei->i_reserved_data_blocks++;
+        ei->i_reserved_meta_blocks += md_needed;
+        spin_unlock(&ei->i_block_reservation_lock);
-        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
        return 0;       /* success */
 }
 static void ext4_da_release_space(struct inode *inode, int to_free)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-        int total, mdb, mdb_free, release;
+        struct ext4_inode_info *ei = EXT4_I(inode);
        if (!to_free)
                return;         /* Nothing to release, exit */
        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-        if (!EXT4_I(inode)->i_reserved_data_blocks) {
+        trace_ext4_da_release_space(inode, to_free);
+        if (unlikely(to_free > ei->i_reserved_data_blocks)) {
                /*
-                 * if there is no reserved blocks, but we try to free some
+                 * if there aren't enough reserved blocks, then the
-                 * then the counter is messed up somewhere.
+                 * counter is messed up somewhere.  Since this
-                 * but since this function is called from invalidate
+                 * function is called from invalidate page, it's
-                 * page, it's harmless to return without any action
+                 * harmless to return without any action.
                 */
-                printk(KERN_INFO "ext4 delalloc try to release %d reserved "
+                ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
-                            "blocks for inode %lu, but there is no reserved "
+                         "ino %lu, to_free %d with only %d reserved "
-                            "data blocks\n", to_free, inode->i_ino);
+                         "data blocks\n", inode->i_ino, to_free,
-                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+                         ei->i_reserved_data_blocks);
-                return;
+                WARN_ON(1);
+                to_free = ei->i_reserved_data_blocks;
        }
+        ei->i_reserved_data_blocks -= to_free;
-        /* recalculate the number of metablocks still need to be reserved */
+        if (ei->i_reserved_data_blocks == 0) {
-        total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
+                /*
-        mdb = ext4_calc_metadata_amount(inode, total);
+                 * We can release all of the reserved metadata blocks
+                 * only when we have written all of the delayed
-        /* figure out how many metablocks to release */
+                 * allocation blocks.
-        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+                 */
-        mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+                percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+                                   ei->i_reserved_meta_blocks);
-        release = to_free + mdb_free;
+                ei->i_reserved_meta_blocks = 0;
+                ei->i_da_metadata_calc_len = 0;
-        /* update fs dirty blocks counter for truncate case */
+        }
-        percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
-        /* update per-inode reservations */
+        /* update fs dirty data blocks counter */
-        BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
+        percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
-        EXT4_I(inode)->i_reserved_data_blocks -= to_free;
-        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-        EXT4_I(inode)->i_reserved_meta_blocks = mdb;
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-        vfs_dq_release_reservation_block(inode, release);
+        dquot_release_reservation_block(inode, to_free);
 }
 static void ext4_da_page_release_reservation(struct page *page,
@@ -1865,18 +2014,6 @@ static void ext4_da_page_release_reservation(struct page *page,
 * Delayed allocation stuff
 */
-struct mpage_da_data {
-        struct inode *inode;
-        sector_t b_blocknr;             /* start block number of extent */
-        size_t b_size;                  /* size of extent */
-        unsigned long b_state;          /* state of the extent */
-        unsigned long first_page, next_page;    /* extent of pages */
-        struct writeback_control *wbc;
-        int io_done;
-        int pages_written;
-        int retval;
-};
 /*
 * mpage_da_submit_io - walks through extent of pages and try to write
 * them with writepage() call back
@@ -1890,16 +2027,23 @@ struct mpage_da_data {
 *
 * As pages are already locked by write_cache_pages(), we can't use it
 */
-static int mpage_da_submit_io(struct mpage_da_data *mpd)
+static int mpage_da_submit_io(struct mpage_da_data *mpd,
+                              struct ext4_map_blocks *map)
 {
-        long pages_skipped;
        struct pagevec pvec;
        unsigned long index, end;
        int ret = 0, err, nr_pages, i;
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
+        loff_t size = i_size_read(inode);
+        unsigned int len, block_start;
+        struct buffer_head *bh, *page_bufs = NULL;
+        int journal_data = ext4_should_journal_data(inode);
+        sector_t pblock = 0, cur_logical = 0;
+        struct ext4_io_submit io_submit;
        BUG_ON(mpd->next_page <= mpd->first_page);
+        memset(&io_submit, 0, sizeof(io_submit));
        /*
         * We need to start from the first_page to the next_page - 1
         * to make sure we also write the mapped dirty buffer_heads.
@@ -1915,139 +2059,109 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
+                        int commit_write = 0, redirty_page = 0;
                        struct page *page = pvec.pages[i];
                        index = page->index;
                        if (index > end)
                                break;
+                        if (index == size >> PAGE_CACHE_SHIFT)
+                                len = size & ~PAGE_CACHE_MASK;
+                        else
+                                len = PAGE_CACHE_SIZE;
+                        if (map) {
+                                cur_logical = index << (PAGE_CACHE_SHIFT -
+                                                        inode->i_blkbits);
+                                pblock = map->m_pblk + (cur_logical -
+                                                        map->m_lblk);
+                        }
                        index++;
                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
-                        pages_skipped = mpd->wbc->pages_skipped;
-                        err = mapping->a_ops->writepage(page, mpd->wbc);
-                        if (!err && (pages_skipped == mpd->wbc->pages_skipped))
-                                /*
-                                 * have successfully written the page
-                                 * without skipping the same
-                                 */
-                                mpd->pages_written++;
                        /*
-                         * In error case, we have to continue because
+                         * If the page does not have buffers (for
-                         * remaining pages are still locked
+                         * whatever reason), try to create them using
-                         * XXX: unlock and re-dirty them?
+                         * __block_write_begin.  If this fails,
+                         * redirty the page and move on.
                         */
-                        if (ret == 0)
+                        if (!page_has_buffers(page)) {
-                                ret = err;
+                                if (__block_write_begin(page, 0, len,
-                }
+                                                noalloc_get_block_write)) {
-                pagevec_release(&pvec);
+                                redirty_page:
-        }
+                                        redirty_page_for_writepage(mpd->wbc,
-        return ret;
+                                                                   page);
-}
+                                        unlock_page(page);
+                                        continue;
-/*
+                                }
- * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
+                                commit_write = 1;
- *
+                        }
- * @mpd->inode - inode to walk through
- * @exbh->b_blocknr - first block on a disk
- * @exbh->b_size - amount of space in bytes
- * @logical - first logical block to start assignment with
- *
- * the function goes through all passed space and put actual disk
- * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
- */
-static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
-                                 struct buffer_head *exbh)
-{
-        struct inode *inode = mpd->inode;
-        struct address_space *mapping = inode->i_mapping;
-        int blocks = exbh->b_size >> inode->i_blkbits;
-        sector_t pblock = exbh->b_blocknr, cur_logical;
-        struct buffer_head *head, *bh;
-        pgoff_t index, end;
-        struct pagevec pvec;
-        int nr_pages, i;
-        index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-        end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-        cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-        pagevec_init(&pvec, 0);
-        while (index <= end) {
-                /* XXX: optimize tail */
-                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-                if (nr_pages == 0)
-                        break;
-                for (i = 0; i < nr_pages; i++) {
-                        struct page *page = pvec.pages[i];
-                        index = page->index;
-                        if (index > end)
-                                break;
-                        index++;
-                        BUG_ON(!PageLocked(page));
-                        BUG_ON(PageWriteback(page));
-                        BUG_ON(!page_has_buffers(page));
-                        bh = page_buffers(page);
-                        head = bh;
-                        /* skip blocks out of the range */
-                        do {
-                                if (cur_logical >= logical)
-                                        break;
-                                cur_logical++;
-                        } while ((bh = bh->b_this_page) != head);
+                        bh = page_bufs = page_buffers(page);
+                        block_start = 0;
                        do {
-                                if (cur_logical >= logical + blocks)
+                                if (!bh)
-                                        break;
+                                        goto redirty_page;
+                                if (map && (cur_logical >= map->m_lblk) &&
-                                if (buffer_delay(bh) ||
+                                    (cur_logical <= (map->m_lblk +
-                                                buffer_unwritten(bh)) {
+                                                     (map->m_len - 1)))) {
-                                        BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
                                        if (buffer_delay(bh)) {
                                                clear_buffer_delay(bh);
                                                bh->b_blocknr = pblock;
-                                        } else {
-                                                /*
-                                                 * unwritten already should have
-                                                 * blocknr assigned. Verify that
-                                                 */
-                                                clear_buffer_unwritten(bh);
-                                                BUG_ON(bh->b_blocknr != pblock);
                                        }
+                                        if (buffer_unwritten(bh) ||
+                                            buffer_mapped(bh))
+                                                BUG_ON(bh->b_blocknr != pblock);
+                                        if (map->m_flags & EXT4_MAP_UNINIT)
+                                                set_buffer_uninit(bh);
+                                        clear_buffer_unwritten(bh);
+                                }
-                                } else if (buffer_mapped(bh))
+                                /* redirty page if block allocation undone */
-                                        BUG_ON(bh->b_blocknr != pblock);
+                                if (buffer_delay(bh) || buffer_unwritten(bh))
+                                        redirty_page = 1;
+                                bh = bh->b_this_page;
+                                block_start += bh->b_size;
                                cur_logical++;
                                pblock++;
-                        } while ((bh = bh->b_this_page) != head);
+                        } while (bh != page_bufs);
-                }
-                pagevec_release(&pvec);
-        }
-}
+                        if (redirty_page)
+                                goto redirty_page;
-/*
+                        if (commit_write)
- * __unmap_underlying_blocks - just a helper function to unmap
+                                /* mark the buffer_heads as dirty & uptodate */
- * set of blocks described by @bh
+                                block_commit_write(page, 0, len);
- */
-static inline void __unmap_underlying_blocks(struct inode *inode,
-                                             struct buffer_head *bh)
-{
-        struct block_device *bdev = inode->i_sb->s_bdev;
-        int blocks, i;
-        blocks = bh->b_size >> inode->i_blkbits;
+                        /*
-        for (i = 0; i < blocks; i++)
+                         * Delalloc doesn't support data journalling,
-                unmap_underlying_metadata(bdev, bh->b_blocknr + i);
+                         * but eventually maybe we'll lift this
+                         * restriction.
+                         */
+                        if (unlikely(journal_data && PageChecked(page)))
+                                err = __ext4_journalled_writepage(page, len);
+                        else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
+                                err = ext4_bio_write_page(&io_submit, page,
+                                                          len, mpd->wbc);
+                        else
+                                err = block_write_full_page(page,
+                                        noalloc_get_block_write, mpd->wbc);
+                        if (!err)
+                                mpd->pages_written++;
+                        /*
+                         * In error case, we have to continue because
+                         * remaining pages are still locked
+                         */
+                        if (ret == 0)
+                                ret = err;
+                }
+                pagevec_release(&pvec);
+        }
+        ext4_io_submit(&io_submit);
+        return ret;
 }
 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
@@ -2068,17 +2182,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
                        break;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
-                        index = page->index;
+                        if (page->index > end)
-                        if (index > end)
                                break;
-                        index++;
                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
                        block_invalidatepage(page, 0);
                        ClearPageUptodate(page);
                        unlock_page(page);
                }
+                index = pvec.pages[nr_pages - 1]->index + 1;
+                pagevec_release(&pvec);
        }
        return;
 }
@@ -2086,57 +2199,54 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 static void ext4_print_free_blocks(struct inode *inode)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-        printk(KERN_EMERG "Total free blocks count %lld\n",
+        printk(KERN_CRIT "Total free blocks count %lld\n",
-                        ext4_count_free_blocks(inode->i_sb));
+               ext4_count_free_blocks(inode->i_sb));
-        printk(KERN_EMERG "Free/Dirty block details\n");
+        printk(KERN_CRIT "Free/Dirty block details\n");
-        printk(KERN_EMERG "free_blocks=%lld\n",
+        printk(KERN_CRIT "free_blocks=%lld\n",
-                        (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
+               (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
-        printk(KERN_EMERG "dirty_blocks=%lld\n",
+        printk(KERN_CRIT "dirty_blocks=%lld\n",
-                        (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+               (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
-        printk(KERN_EMERG "Block reservation details\n");
+        printk(KERN_CRIT "Block reservation details\n");
-        printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
+        printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
-                        EXT4_I(inode)->i_reserved_data_blocks);
+               EXT4_I(inode)->i_reserved_data_blocks);
-        printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
+        printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
-                        EXT4_I(inode)->i_reserved_meta_blocks);
+               EXT4_I(inode)->i_reserved_meta_blocks);
        return;
 }
 /*
- * mpage_da_map_blocks - go through given space
+ * mpage_da_map_and_submit - go through given space, map them
+ *       if necessary, and then submit them for I/O
 *
 * @mpd - bh describing space
 *
 * The function skips space we know is already mapped to disk blocks.
 *
 */
-static int mpage_da_map_blocks(struct mpage_da_data *mpd)
+static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 {
        int err, blks, get_blocks_flags;
-        struct buffer_head new;
+        struct ext4_map_blocks map, *mapp = NULL;
        sector_t next = mpd->b_blocknr;
        unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
        loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
        handle_t *handle = NULL;
        /*
-         * We consider only non-mapped and non-allocated blocks
+         * If the blocks are mapped already, or we couldn't accumulate
+         * any blocks, then proceed immediately to the submission stage.
         */
-        if ((mpd->b_state  & (1 << BH_Mapped)) &&
+        if ((mpd->b_size == 0) ||
-                !(mpd->b_state & (1 << BH_Delay)) &&
+            ((mpd->b_state  & (1 << BH_Mapped)) &&
-                !(mpd->b_state & (1 << BH_Unwritten)))
+             !(mpd->b_state & (1 << BH_Delay)) &&
-                return 0;
+             !(mpd->b_state & (1 << BH_Unwritten))))
+                goto submit_io;
-        /*
-         * If we didn't accumulate anything to write simply return
-         */
-        if (!mpd->b_size)
-                return 0;
        handle = ext4_journal_current_handle();
        BUG_ON(!handle);
        /*
-         * Call ext4_get_blocks() to allocate any delayed allocation
+         * Call ext4_map_blocks() to allocate any delayed allocation
         * blocks, or to convert an uninitialized extent to be
         * initialized (in the case where we have written into
         * one or more preallocated blocks).
@@ -2145,35 +2255,40 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
         * indicate that we are on the delayed allocation path.  This
         * affects functions in many different parts of the allocation
         * call path.  This flag exists primarily because we don't
-         * want to change *many* call functions, so ext4_get_blocks()
+         * want to change *many* call functions, so ext4_map_blocks()
-         * will set the magic i_delalloc_reserved_flag once the
+         * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
         * inode's allocation semaphore is taken.
         *
         * If the blocks in questions were delalloc blocks, set
         * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
         * variables are updated after the blocks have been allocated.
         */
-        new.b_state = 0;
+        map.m_lblk = next;
-        get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
+        map.m_len = max_blocks;
-                            EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+        get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+        if (ext4_should_dioread_nolock(mpd->inode))
+                get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
        if (mpd->b_state & (1 << BH_Delay))
-                get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
+                get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
-        blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
-                               &new, get_blocks_flags);
+        blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
        if (blks < 0) {
+                struct super_block *sb = mpd->inode->i_sb;
                err = blks;
                /*
-                 * If get block returns with error we simply
+                 * If get block returns EAGAIN or ENOSPC and there
-                 * return. Later writepage will redirty the page and
+                 * appears to be free blocks we will call
-                 * writepages will find the dirty page again
+                 * ext4_writepage() for all of the pages which will
+                 * just redirty the pages.
                 */
                if (err == -EAGAIN)
-                        return 0;
+                        goto submit_io;
                if (err == -ENOSPC &&
-                    ext4_count_free_blocks(mpd->inode->i_sb)) {
+                    ext4_count_free_blocks(sb)) {
                        mpd->retval = err;
-                        return 0;
+                        goto submit_io;
                }
                /*
@@ -2183,41 +2298,39 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                 * writepage and writepages will again try to write
                 * the same.
                 */
-                printk(KERN_EMERG "%s block allocation failed for inode %lu "
+                if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
-                                  "at logical offset %llu with max blocks "
+                        ext4_msg(sb, KERN_CRIT,
-                                  "%zd with error %d\n",
+                                 "delayed block allocation failed for inode %lu "
-                                  __func__, mpd->inode->i_ino,
+                                 "at logical offset %llu with max blocks %zd "
-                                  (unsigned long long)next,
+                                 "with error %d", mpd->inode->i_ino,
-                                  mpd->b_size >> mpd->inode->i_blkbits, err);
+                                 (unsigned long long) next,
-                printk(KERN_EMERG "This should not happen.!! "
+                                 mpd->b_size >> mpd->inode->i_blkbits, err);
-                                        "Data will be lost\n");
+                        ext4_msg(sb, KERN_CRIT,
-                if (err == -ENOSPC) {
+                                "This should not happen!! Data will be lost\n");
-                        ext4_print_free_blocks(mpd->inode);
+                        if (err == -ENOSPC)
+                                ext4_print_free_blocks(mpd->inode);
                }
                /* invalidate all the pages */
                ext4_da_block_invalidatepages(mpd, next,
                                mpd->b_size >> mpd->inode->i_blkbits);
-                return err;
+                return;
        }
        BUG_ON(blks == 0);
-        new.b_size = (blks << mpd->inode->i_blkbits);
+        mapp = &map;
+        if (map.m_flags & EXT4_MAP_NEW) {
+                struct block_device *bdev = mpd->inode->i_sb->s_bdev;
+                int i;
-        if (buffer_new(&new))
+                for (i = 0; i < map.m_len; i++)
-                __unmap_underlying_blocks(mpd->inode, &new);
+                        unmap_underlying_metadata(bdev, map.m_pblk + i);
+        }
-        /*
-         * If blocks are delayed marked, we need to
-         * put actual blocknr and drop delayed bit
-         */
-        if ((mpd->b_state & (1 << BH_Delay)) ||
-            (mpd->b_state & (1 << BH_Unwritten)))
-                mpage_put_bnr_to_bhs(mpd, next, &new);
        if (ext4_should_order_data(mpd->inode)) {
                err = ext4_jbd2_file_inode(handle, mpd->inode);
                if (err)
-                        return err;
+                        /* This only happens if the journal is aborted */
+                        return;
        }
        /*
@@ -2228,10 +2341,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                disksize = i_size_read(mpd->inode);
        if (disksize > EXT4_I(mpd->inode)->i_disksize) {
                ext4_update_i_disksize(mpd->inode, disksize);
-                return ext4_mark_inode_dirty(handle, mpd->inode);
+                err = ext4_mark_inode_dirty(handle, mpd->inode);
+                if (err)
+                        ext4_error(mpd->inode->i_sb,
+                                   "Failed to mark inode %lu dirty",
+                                   mpd->inode->i_ino);
        }
-        return 0;
+submit_io:
+        mpage_da_submit_io(mpd, mapp);
+        mpd->io_done = 1;
 }
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2253,8 +2372,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
        sector_t next;
        int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
+        /*
+         * XXX Don't go larger than mballoc is willing to allocate
+         * This is a stopgap solution.  We eventually need to fold
+         * mpage_da_submit_io() into this function and then call
+         * ext4_map_blocks() multiple times in a loop
+         */
+        if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+                goto flush_it;
        /* check if thereserved journal credits might overflow */
-        if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
+        if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
                if (nrblocks >= EXT4_MAX_TRANS_DATA) {
                        /*
                         * With non-extent format we are limited by the journal
@@ -2299,21 +2427,13 @@ flush_it:
         * We couldn't merge the block to our extent, so we
         * need to flush current  extent and start new one
         */
-        if (mpage_da_map_blocks(mpd) == 0)
+        mpage_da_map_and_submit(mpd);
-                mpage_da_submit_io(mpd);
-        mpd->io_done = 1;
        return;
 }
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 {
-        /*
+        return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
-         * unmapped buffer is possible for holes.
-         * delay buffer is possible with delayed allocation.
-         * We also need to consider unwritten buffer as unmapped.
-         */
-        return (!buffer_mapped(bh) || buffer_delay(bh) ||
-                                buffer_unwritten(bh)) && buffer_dirty(bh);
 }
 /*
@@ -2326,39 +2446,26 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
 * The function finds extents of pages and scan them for all blocks.
 */
 static int __mpage_da_writepage(struct page *page,
-                                struct writeback_control *wbc, void *data)
+                                struct writeback_control *wbc,
+                                struct mpage_da_data *mpd)
 {
-        struct mpage_da_data *mpd = data;
        struct inode *inode = mpd->inode;
        struct buffer_head *bh, *head;
        sector_t logical;
-        if (mpd->io_done) {
-                /*
-                 * Rest of the page in the page_vec
-                 * redirty then and skip then. We will
-                 * try to to write them again after
-                 * starting a new transaction
-                 */
-                redirty_page_for_writepage(wbc, page);
-                unlock_page(page);
-                return MPAGE_DA_EXTENT_TAIL;
-        }
        /*
         * Can we merge this page to current extent?
         */
        if (mpd->next_page != page->index) {
                /*
                 * Nope, we can't. So, we map non-allocated blocks
-                 * and start IO on them using writepage()
+                 * and start IO on them
                 */
                if (mpd->next_page != mpd->first_page) {
-                        if (mpage_da_map_blocks(mpd) == 0)
+                        mpage_da_map_and_submit(mpd);
-                                mpage_da_submit_io(mpd);
                        /*
                         * skip rest of the page in the page_vec
                         */
-                        mpd->io_done = 1;
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return MPAGE_DA_EXTENT_TAIL;
@@ -2398,9 +2505,9 @@ static int __mpage_da_writepage(struct page *page,
                         * We need to try to allocate
                         * unmapped blocks in the same page.
                         * Otherwise we won't make progress
-                         * with the page in ext4_da_writepage
+                         * with the page in ext4_writepage
                         */
-                        if (ext4_bh_unmapped_or_delay(NULL, bh)) {
+                        if (ext4_bh_delay_or_unwritten(NULL, bh)) {
                                mpage_add_bh_to_extent(mpd, logical,
                                                       bh->b_size,
                                                       bh->b_state);
@@ -2438,8 +2545,9 @@ static int __mpage_da_writepage(struct page *page,
 * initialized properly.
 */
 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
-                                  struct buffer_head *bh_result, int create)
+                                  struct buffer_head *bh, int create)
 {
+        struct ext4_map_blocks map;
        int ret = 0;
        sector_t invalid_block = ~((sector_t) 0xffff);
@@ -2447,165 +2555,228 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                invalid_block = ~0;
        BUG_ON(create == 0);
-        BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+        BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+        map.m_lblk = iblock;
+        map.m_len = 1;
        /*
         * first, we need to know whether the block is allocated already
         * preallocated blocks are unmapped but should treated
         * the same as allocated blocks.
         */
-        ret = ext4_get_blocks(NULL, inode, iblock, 1,  bh_result, 0);
+        ret = ext4_map_blocks(NULL, inode, &map, 0);
-        if ((ret == 0) && !buffer_delay(bh_result)) {
+        if (ret < 0)
-                /* the block isn't (pre)allocated yet, let's reserve space */
+                return ret;
+        if (ret == 0) {
+                if (buffer_delay(bh))
+                        return 0; /* Not sure this could or should happen */
                /*
-                 * XXX: __block_prepare_write() unmaps passed block,
+                 * XXX: __block_write_begin() unmaps passed block, is it OK?
-                 * is it OK?
                 */
-                ret = ext4_da_reserve_space(inode, 1);
+                ret = ext4_da_reserve_space(inode, iblock);
                if (ret)
                        /* not enough space to reserve */
                        return ret;
-                map_bh(bh_result, inode->i_sb, invalid_block);
+                map_bh(bh, inode->i_sb, invalid_block);
-                set_buffer_new(bh_result);
+                set_buffer_new(bh);
-                set_buffer_delay(bh_result);
+                set_buffer_delay(bh);
-        } else if (ret > 0) {
+                return 0;
-                bh_result->b_size = (ret << inode->i_blkbits);
-                if (buffer_unwritten(bh_result)) {
-                        /* A delayed write to unwritten bh should
-                         * be marked new and mapped.  Mapped ensures
-                         * that we don't do get_block multiple times
-                         * when we write to the same offset and new
-                         * ensures that we do proper zero out for
-                         * partial write.
-                         */
-                        set_buffer_new(bh_result);
-                        set_buffer_mapped(bh_result);
-                }
-                ret = 0;
        }
-        return ret;
+        map_bh(bh, inode->i_sb, map.m_pblk);
+        bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+        if (buffer_unwritten(bh)) {
+                /* A delayed write to unwritten bh should be marked
+                 * new and mapped.  Mapped ensures that we don't do
+                 * get_block multiple times when we write to the same
+                 * offset and new ensures that we do proper zero out
+                 * for partial write.
+                 */
+                set_buffer_new(bh);
+                set_buffer_mapped(bh);
+        }
+        return 0;
 }
 /*
 * This function is used as a standard get_block_t calback function
 * when there is no desire to allocate any blocks.  It is used as a
- * callback function for block_prepare_write(), nobh_writepage(), and
+ * callback function for block_write_begin() and block_write_full_page().
- * block_write_full_page().  These functions should only try to map a
+ * These functions should only try to map a single block at a time.
- * single block at a time.
 *
 * Since this function doesn't do block allocations even if the caller
 * requests it by passing in create=1, it is critically important that
 * any caller checks to make sure that any buffer heads are returned
 * by this function are either all already mapped or marked for
- * delayed allocation before calling nobh_writepage() or
+ * delayed allocation before calling  block_write_full_page().  Otherwise,
- * block_write_full_page().  Otherwise, b_blocknr could be left
+ * b_blocknr could be left uninitialized, and the page write functions will
- * unitialized, and the page write functions will be taken by
+ * be taken by surprise.
- * surprise.
 */
 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create)
 {
+        BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+        return _ext4_get_block(inode, iblock, bh_result, 0);
+}
+static int bget_one(handle_t *handle, struct buffer_head *bh)
+{
+        get_bh(bh);
+        return 0;
+}
+static int bput_one(handle_t *handle, struct buffer_head *bh)
+{
+        put_bh(bh);
+        return 0;
+}
+static int __ext4_journalled_writepage(struct page *page,
+                                       unsigned int len)
+{
+        struct address_space *mapping = page->mapping;
+        struct inode *inode = mapping->host;
+        struct buffer_head *page_bufs;
+        handle_t *handle = NULL;
        int ret = 0;
-        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+        int err;
-        BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+        ClearPageChecked(page);
+        page_bufs = page_buffers(page);
+        BUG_ON(!page_bufs);
+        walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
+        /* As soon as we unlock the page, it can go away, but we have
+         * references to buffers so we are safe */
+        unlock_page(page);
-        /*
+        handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
-         * we don't want to do block allocation in writepage
+        if (IS_ERR(handle)) {
-         * so call get_block_wrap with create = 0
+                ret = PTR_ERR(handle);
-         */
+                goto out;
-        ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
-        BUG_ON(create && ret == 0);
-        if (ret > 0) {
-                bh_result->b_size = (ret << inode->i_blkbits);
-                ret = 0;
        }
+        ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
+                                do_journal_get_write_access);
+        err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
+                                write_end_fn);
+        if (ret == 0)
+                ret = err;
+        err = ext4_journal_stop(handle);
+        if (!ret)
+                ret = err;
+        walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
+        ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+out:
        return ret;
 }
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 /*
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We don't even
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start a transaction directly because transaction start ranks above
+ * page lock so we have to do some magic.
+ *
 * This function can get called via...
 *   - ext4_da_writepages after taking page lock (have journal handle)
 *   - journal_submit_inode_data_buffers (no journal handle)
 *   - shrink_page_list via pdflush (no journal handle)
 *   - grab_page_cache when doing write_begin (have journal handle)
+ *
+ * We don't do any block allocation in this function. If we have a page with
+ * multiple blocks we need to write those buffer_heads that are mapped. This
+ * is important for mmap-based writes. So if, with a 1K blocksize, we do
+ * truncate(f, 1024);
+ * a = mmap(f, 0, 4096);
+ * a[0] = 'a';
+ * truncate(f, 4096);
+ * the first buffer_head in the page is mapped via the page_mkwrite callback,
+ * while the other buffer_heads are unmapped but dirty (dirtied via
+ * do_wp_page). So writepage should write the first block. If we modify
+ * the mmap area beyond 1024 we will again get a page_fault and the
+ * page_mkwrite callback will do the block allocation and mark the
+ * buffer_heads mapped.
+ *
+ * We redirty the page if any buffer_head in the page is either delayed or
+ * unwritten.
+ *
+ * We can get recursively called as shown below.
+ *
+ *      ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+ *              ext4_writepage()
+ *
+ * But since we don't do any block allocation we should not deadlock.
+ * The page also has the dirty flag cleared so we don't get recursive page_lock.
 */
-static int ext4_da_writepage(struct page *page,
+static int ext4_writepage(struct page *page,
-                                struct writeback_control *wbc)
+                          struct writeback_control *wbc)
 {
-        int ret = 0;
+        int ret = 0, commit_write = 0;
        loff_t size;
        unsigned int len;
-        struct buffer_head *page_bufs;
+        struct buffer_head *page_bufs = NULL;
        struct inode *inode = page->mapping->host;
-        trace_ext4_da_writepage(inode, page);
+        trace_ext4_writepage(inode, page);
        size = i_size_read(inode);
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
        else
                len = PAGE_CACHE_SIZE;
-        if (page_has_buffers(page)) {
+        /*
-                page_bufs = page_buffers(page);
+         * If the page does not have buffers (for whatever reason),
-                if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+         * try to create them using __block_write_begin.  If this
-                                        ext4_bh_unmapped_or_delay)) {
+         * fails, redirty the page and move on.
-                        /*
+         */
-                         * We don't want to do  block allocation
+        if (!page_has_buffers(page)) {
-                         * So redirty the page and return
+                if (__block_write_begin(page, 0, len,
-                         * We may reach here when we do a journal commit
+                                        noalloc_get_block_write)) {
-                         * via journal_submit_inode_data_buffers.
+                redirty_page:
-                         * If we don't have mapping block we just ignore
-                         * them. We can also reach here via shrink_page_list
-                         */
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return 0;
                }
-        } else {
+                commit_write = 1;
+        }
+        page_bufs = page_buffers(page);
+        if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                              ext4_bh_delay_or_unwritten)) {
                /*
-                 * The test for page_has_buffers() is subtle:
+                 * We don't want to do block allocation, so redirty
-                 * We know the page is dirty but it lost buffers. That means
+                 * the page and return.  We may reach here when we do
-                 * that at some moment in time after write_begin()/write_end()
+                 * a journal commit via journal_submit_inode_data_buffers.
-                 * has been called all buffers have been clean and thus they
+                 * We can also reach here via shrink_page_list
-                 * must have been written at least once. So they are all
-                 * mapped and we can happily proceed with mapping them
-                 * and writing the page.
-                 *
-                 * Try to initialize the buffer_heads and check whether
-                 * all are mapped and non delay. We don't want to
-                 * do block allocation here.
                 */
-                ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+                goto redirty_page;
-                                          noalloc_get_block_write);
-                if (!ret) {
-                        page_bufs = page_buffers(page);
-                        /* check whether all are mapped and non delay */
-                        if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                                ext4_bh_unmapped_or_delay)) {
-                                redirty_page_for_writepage(wbc, page);
-                                unlock_page(page);
-                                return 0;
-                        }
-                } else {
-                        /*
-                         * We can't do block allocation here
-                         * so just redity the page and unlock
-                         * and return
-                         */
-                        redirty_page_for_writepage(wbc, page);
-                        unlock_page(page);
-                        return 0;
-                }
-                /* now mark the buffer_heads as dirty and uptodate */
-                block_commit_write(page, 0, PAGE_CACHE_SIZE);
        }
+        if (commit_write)
+                /* now mark the buffer_heads as dirty and uptodate */
+                block_commit_write(page, 0, len);
-        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+        if (PageChecked(page) && ext4_should_journal_data(inode))
-                ret = nobh_writepage(page, noalloc_get_block_write, wbc);
+                /*
-        else
+                 * It's mmapped pagecache.  Add buffers and journal it.  There
+                 * doesn't seem much point in redirtying the page here.
+                 */
+                return __ext4_journalled_writepage(page, len);
+        if (buffer_uninit(page_bufs)) {
+                ext4_set_bh_endio(page_bufs, inode);
+                ret = block_write_full_page_endio(page, noalloc_get_block_write,
+                                            wbc, ext4_end_io_buffer_write);
+        } else
                ret = block_write_full_page(page, noalloc_get_block_write,
                                            wbc);
@@ -2630,13 +2801,140 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
         * number of contiguous block. So we will limit
         * number of contiguous block to a sane value
         */
-        if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
+        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
            (max_blocks > EXT4_MAX_TRANS_DATA))
                max_blocks = EXT4_MAX_TRANS_DATA;
        return ext4_chunk_trans_blocks(inode, max_blocks);
 }
+/*
+ * write_cache_pages_da - walk the list of dirty pages of the given
+ * address space and call the callback function (which usually writes
+ * the pages).
+ *
+ * This is a forked version of write_cache_pages().  Differences:
+ *      Range cyclic is ignored.
+ *      no_nrwrite_index_update is always presumed true
+ */
+static int write_cache_pages_da(struct address_space *mapping,
+                                struct writeback_control *wbc,
+                                struct mpage_da_data *mpd,
+                                pgoff_t *done_index)
+{
+        int ret = 0;
+        int done = 0;
+        struct pagevec pvec;
+        unsigned nr_pages;
+        pgoff_t index;
+        pgoff_t end;            /* Inclusive */
+        long nr_to_write = wbc->nr_to_write;
+        int tag;
+        pagevec_init(&pvec, 0);
+        index = wbc->range_start >> PAGE_CACHE_SHIFT;
+        end = wbc->range_end >> PAGE_CACHE_SHIFT;
+        if (wbc->sync_mode == WB_SYNC_ALL)
+                tag = PAGECACHE_TAG_TOWRITE;
+        else
+                tag = PAGECACHE_TAG_DIRTY;
+        *done_index = index;
+        while (!done && (index <= end)) {
+                int i;
+                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+                              min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+                if (nr_pages == 0)
+                        break;
+                for (i = 0; i < nr_pages; i++) {
+                        struct page *page = pvec.pages[i];
+                        /*
+                         * At this point, the page may be truncated or
+                         * invalidated (changing page->mapping to NULL), or
+                         * even swizzled back from swapper_space to tmpfs file
+                         * mapping. However, page->index will not change
+                         * because we have a reference on the page.
+                         */
+                        if (page->index > end) {
+                                done = 1;
+                                break;
+                        }
+                        *done_index = page->index + 1;
+                        lock_page(page);
+                        /*
+                         * Page truncated or invalidated. We can freely skip it
+                         * then, even for data integrity operations: the page
+                         * has disappeared concurrently, so there could be no
+                         * real expectation of this data integrity operation
+                         * even if there is now a new, dirty page at the same
+                         * pagecache address.
+                         */
+                        if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+                                unlock_page(page);
+                                continue;
+                        }
+                        if (!PageDirty(page)) {
+                                /* someone wrote it for us */
+                                goto continue_unlock;
+                        }
+                        if (PageWriteback(page)) {
+                                if (wbc->sync_mode != WB_SYNC_NONE)
+                                        wait_on_page_writeback(page);
+                                else
+                                        goto continue_unlock;
+                        }
+                        BUG_ON(PageWriteback(page));
+                        if (!clear_page_dirty_for_io(page))
+                                goto continue_unlock;
+                        ret = __mpage_da_writepage(page, wbc, mpd);
+                        if (unlikely(ret)) {
+                                if (ret == AOP_WRITEPAGE_ACTIVATE) {
+                                        unlock_page(page);
+                                        ret = 0;
+                                } else {
+                                        done = 1;
+                                        break;
+                                }
+                        }
+                        if (nr_to_write > 0) {
+                                nr_to_write--;
+                                if (nr_to_write == 0 &&
+                                    wbc->sync_mode == WB_SYNC_NONE) {
+                                        /*
+                                         * We stop writing back only if we are
+                                         * not doing integrity sync. In case of
+                                         * integrity sync we have to keep going
+                                         * because someone may be concurrently
+                                         * dirtying pages, and we might have
+                                         * synced a lot of newly appeared dirty
+                                         * pages, but have not synced all of the
+                                         * old dirty pages.
+                                         */
+                                        done = 1;
+                                        break;
+                                }
+                        }
+                }
+                pagevec_release(&pvec);
+                cond_resched();
+        }
+        return ret;
+}
 static int ext4_da_writepages(struct address_space *mapping,
                              struct writeback_control *wbc)
 {
@@ -2645,12 +2943,16 @@ static int ext4_da_writepages(struct address_space *mapping,
        handle_t *handle = NULL;
        struct mpage_da_data mpd;
        struct inode *inode = mapping->host;
-        int no_nrwrite_index_update;
        int pages_written = 0;
        long pages_skipped;
+        unsigned int max_pages;
        int range_cyclic, cycled = 1, io_done = 0;
-        int needed_blocks, ret = 0, nr_to_writebump = 0;
+        int needed_blocks, ret = 0;
+        long desired_nr_to_write, nr_to_writebump = 0;
+        loff_t range_start = wbc->range_start;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+        pgoff_t done_index = 0;
+        pgoff_t end;
        trace_ext4_da_writepages(inode, wbc);
@@ -2675,16 +2977,6 @@ static int ext4_da_writepages(struct address_space *mapping,
        if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
                return -EROFS;
-        /*
-         * Make sure nr_to_write is >= sbi->s_mb_stream_request
-         * This make sure small files blocks are allocated in
-         * single attempt. This ensure that small files
-         * get less fragmented.
-         */
-        if (wbc->nr_to_write < sbi->s_mb_stream_request) {
-                nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
-                wbc->nr_to_write = sbi->s_mb_stream_request;
-        }
        if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                range_whole = 1;
@@ -2696,21 +2988,54 @@ static int ext4_da_writepages(struct address_space *mapping,
                wbc->range_start = index << PAGE_CACHE_SHIFT;
                wbc->range_end  = LLONG_MAX;
                wbc->range_cyclic = 0;
-        } else
+                end = -1;
+        } else {
                index = wbc->range_start >> PAGE_CACHE_SHIFT;
+                end = wbc->range_end >> PAGE_CACHE_SHIFT;
+        }
+        /*
+         * This works around two forms of stupidity.  The first is in
+         * the writeback code, which caps the maximum number of pages
+         * written to be 1024 pages.  This is wrong on multiple
+         * levels; different architectures have a different page size,
+         * which changes the maximum amount of data which gets
+         * written.  Secondly, 4 megabytes is way too small.  XFS
+         * forces this value to be 16 megabytes by multiplying
+         * nr_to_write parameter by four, and then relies on its
+         * allocator to allocate larger extents to make them
+         * contiguous.  Unfortunately this brings us to the second
+         * stupidity, which is that ext4's mballoc code only allocates
+         * at most 2048 blocks.  So we force contiguous writes up to
+         * the number of dirty blocks in the inode, or
+         * sbi->s_max_writeback_mb_bump, whichever is smaller.
+         */
+        max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
+        if (!range_cyclic && range_whole) {
+                if (wbc->nr_to_write == LONG_MAX)
+                        desired_nr_to_write = wbc->nr_to_write;
+                else
+                        desired_nr_to_write = wbc->nr_to_write * 8;
+        } else
+                desired_nr_to_write = ext4_num_dirty_pages(inode, index,
+                                                           max_pages);
+        if (desired_nr_to_write > max_pages)
+                desired_nr_to_write = max_pages;
+        if (wbc->nr_to_write < desired_nr_to_write) {
+                nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
+                wbc->nr_to_write = desired_nr_to_write;
+        }
        mpd.wbc = wbc;
        mpd.inode = mapping->host;
-        /*
-         * we don't want write_cache_pages to update
-         * nr_to_write and writeback_index
-         */
-        no_nrwrite_index_update = wbc->no_nrwrite_index_update;
-        wbc->no_nrwrite_index_update = 1;
        pages_skipped = wbc->pages_skipped;
 retry:
+        if (wbc->sync_mode == WB_SYNC_ALL)
+                tag_pages_for_writeback(mapping, index, end);
        while (!ret && wbc->nr_to_write > 0) {
                /*
@@ -2726,10 +3051,9 @@ retry:
                handle = ext4_journal_start(inode, needed_blocks);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
-                        printk(KERN_CRIT "%s: jbd2_start: "
+                        ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
-                               "%ld pages, ino %lu; err %d\n", __func__,
+                               "%ld pages, ino %lu; err %d", __func__,
                                wbc->nr_to_write, inode->i_ino, ret);
-                        dump_stack();
                        goto out_writepages;
                }
@@ -2750,19 +3074,17 @@ retry:
                mpd.io_done = 0;
                mpd.pages_written = 0;
                mpd.retval = 0;
-                ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
+                ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
-                                        &mpd);
                /*
-                 * If we have a contigous extent of pages and we
+                 * If we have a contiguous extent of pages and we
                 * haven't done the I/O yet, map the blocks and submit
                 * them for I/O.
                 */
                if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-                        if (mpage_da_map_blocks(&mpd) == 0)
+                        mpage_da_map_and_submit(&mpd);
-                                mpage_da_submit_io(&mpd);
-                        mpd.io_done = 1;
                        ret = MPAGE_DA_EXTENT_TAIL;
                }
+                trace_ext4_da_write_pages(inode, &mpd);
                wbc->nr_to_write -= mpd.pages_written;
                ext4_journal_stop(handle);
@@ -2800,24 +3122,23 @@ retry:
                goto retry;
        }
        if (pages_skipped != wbc->pages_skipped)
-                printk(KERN_EMERG "This should not happen leaving %s "
+                ext4_msg(inode->i_sb, KERN_CRIT,
-                                "with nr_to_write = %ld ret = %d\n",
+                         "This should not happen leaving %s "
-                                __func__, wbc->nr_to_write, ret);
+                         "with nr_to_write = %ld ret = %d",
+                         __func__, wbc->nr_to_write, ret);
        /* Update index */
-        index += pages_written;
        wbc->range_cyclic = range_cyclic;
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                /*
                 * set the writeback_index so that range_cyclic
                 * mode will write it back later
                 */
-                mapping->writeback_index = index;
+                mapping->writeback_index = done_index;
 out_writepages:
-        if (!no_nrwrite_index_update)
-                wbc->no_nrwrite_index_update = 0;
        wbc->nr_to_write -= nr_to_writebump;
+        wbc->range_start = range_start;
        trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
        return ret;
 }
@@ -2841,11 +3162,18 @@ static int ext4_nonda_switch(struct super_block *sb)
        if (2 * free_blocks < 3 * dirty_blocks ||
                free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
                /*
-                 * free block count is less that 150% of dirty blocks
+                 * free block count is less than 150% of dirty blocks
-                 * or free blocks is less that watermark
+                 * or free blocks is less than watermark
                 */
                return 1;
        }
+        /*
+         * Even if we don't switch but are nearing capacity,
+         * start pushing delalloc when 1/2 of free blocks are dirty.
+         */
+        if (free_blocks < 2 * dirty_blocks)
+                writeback_inodes_sb_if_idle(sb);
        return 0;
 }
@@ -2856,13 +3184,10 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
        int ret, retries = 0;
        struct page *page;
        pgoff_t index;
-        unsigned from, to;
        struct inode *inode = mapping->host;
        handle_t *handle;
        index = pos >> PAGE_CACHE_SHIFT;
-        from = pos & (PAGE_CACHE_SIZE - 1);
-        to = from + len;
        if (ext4_nonda_switch(inode->i_sb)) {
                *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
@@ -2895,8 +3220,7 @@ retry:
        }
        *pagep = page;
-        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+        ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
-                                ext4_da_get_block_prep);
        if (ret < 0) {
                unlock_page(page);
                ext4_journal_stop(handle);
@@ -2907,7 +3231,7 @@ retry:
                 * i_size_read because we hold i_mutex.
                 */
                if (pos + len > inode->i_size)
-                        vmtruncate(inode, inode->i_size);
+                        ext4_truncate_failed_write(inode);
        }
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3030,6 +3354,8 @@ out:
 */
 int ext4_alloc_da_blocks(struct inode *inode)
 {
+        trace_ext4_alloc_da_blocks(inode);
        if (!EXT4_I(inode)->i_reserved_data_blocks &&
            !EXT4_I(inode)->i_reserved_meta_blocks)
                return 0;
@@ -3098,7 +3424,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
                filemap_write_and_wait(mapping);
        }
-        if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+        if (EXT4_JOURNAL(inode) &&
+            ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
                /*
                 * This is a REALLY heavyweight approach, but the use of
                 * bmap on dirty files is expected to be extremely rare:
@@ -3117,7 +3444,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
                 * everything they get.
                 */
-                EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
+                ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
                journal = EXT4_JOURNAL(inode);
                jbd2_journal_lock_updates(journal);
                err = jbd2_journal_flush(journal);
@@ -3130,222 +3457,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
        return generic_block_bmap(mapping, block, ext4_get_block);
 }
-static int bget_one(handle_t *handle, struct buffer_head *bh)
-{
-        get_bh(bh);
-        return 0;
-}
-static int bput_one(handle_t *handle, struct buffer_head *bh)
-{
-        put_bh(bh);
-        return 0;
-}
-/*
- * Note that we don't need to start a transaction unless we're journaling data
- * because we should have holes filled from ext4_page_mkwrite(). We even don't
- * need to file the inode to the transaction's list in ordered mode because if
- * we are writing back data added by write(), the inode is already there and if
- * we are writing back data modified via mmap(), noone guarantees in which
- * transaction the data will hit the disk. In case we are journaling data, we
- * cannot start transaction directly because transaction start ranks above page
- * lock so we have to do some magic.
- *
- * In all journaling modes block_write_full_page() will start the I/O.
- *
- * Problem:
- *
- *      ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- *              ext4_writepage()
- *
- * Similar for:
- *
- *      ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
- *
- * Same applies to ext4_get_block().  We will deadlock on various things like
- * lock_journal and i_data_sem
- *
- * Setting PF_MEMALLOC here doesn't work - too many internal memory
- * allocations fail.
- *
- * 16May01: If we're reentered then journal_current_handle() will be
- *          non-zero. We simply *return*.
- *
- * 1 July 2001: @@@ FIXME:
- *   In journalled data mode, a data buffer may be metadata against the
- *   current transaction.  But the same file is part of a shared mapping
- *   and someone does a writepage() on it.
- *
- *   We will move the buffer onto the async_data list, but *after* it has
- *   been dirtied. So there's a small window where we have dirty data on
- *   BJ_Metadata.
- *
- *   Note that this only applies to the last partial page in the file.  The
- *   bit which block_write_full_page() uses prepare/commit for.  (That's
- *   broken code anyway: it's wrong for msync()).
- *
- *   It's a rare case: affects the final partial page, for journalled data
- *   where the file is subject to bith write() and writepage() in the same
- *   transction.  To fix it we'll need a custom block_write_full_page().
- *   We'll probably need that anyway for journalling writepage() output.
- *
- * We don't honour synchronous mounts for writepage().  That would be
- * disastrous.  Any write() or metadata operation will sync the fs for
- * us.
- *
- */
-static int __ext4_normal_writepage(struct page *page,
-                                   struct writeback_control *wbc)
-{
-        struct inode *inode = page->mapping->host;
-        if (test_opt(inode->i_sb, NOBH))
-                return nobh_writepage(page, noalloc_get_block_write, wbc);
-        else
-                return block_write_full_page(page, noalloc_get_block_write,
-                                             wbc);
-}
-static int ext4_normal_writepage(struct page *page,
-                                 struct writeback_control *wbc)
-{
-        struct inode *inode = page->mapping->host;
-        loff_t size = i_size_read(inode);
-        loff_t len;
-        trace_ext4_normal_writepage(inode, page);
-        J_ASSERT(PageLocked(page));
-        if (page->index == size >> PAGE_CACHE_SHIFT)
-                len = size & ~PAGE_CACHE_MASK;
-        else
-                len = PAGE_CACHE_SIZE;
-        if (page_has_buffers(page)) {
-                /* if page has buffers it should all be mapped
-                 * and allocated. If there are not buffers attached
-                 * to the page we know the page is dirty but it lost
-                 * buffers. That means that at some moment in time
-                 * after write_begin() / write_end() has been called
-                 * all buffers have been clean and thus they must have been
-                 * written at least once. So they are all mapped and we can
-                 * happily proceed with mapping them and writing the page.
-                 */
-                BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-                                        ext4_bh_unmapped_or_delay));
-        }
-        if (!ext4_journal_current_handle())
-                return __ext4_normal_writepage(page, wbc);
-        redirty_page_for_writepage(wbc, page);
-        unlock_page(page);
-        return 0;
-}
-static int __ext4_journalled_writepage(struct page *page,
-                                       struct writeback_control *wbc)
-{
-        struct address_space *mapping = page->mapping;
-        struct inode *inode = mapping->host;
-        struct buffer_head *page_bufs;
-        handle_t *handle = NULL;
-        int ret = 0;
-        int err;
-        ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-                                  noalloc_get_block_write);
-        if (ret != 0)
-                goto out_unlock;
-        page_bufs = page_buffers(page);
-        walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
-                                                                bget_one);
-        /* As soon as we unlock the page, it can go away, but we have
-         * references to buffers so we are safe */
-        unlock_page(page);
-        handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                goto out;
-        }
-        ret = walk_page_buffers(handle, page_bufs, 0,
-                        PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-        err = walk_page_buffers(handle, page_bufs, 0,
-                                PAGE_CACHE_SIZE, NULL, write_end_fn);
-        if (ret == 0)
-                ret = err;
-        err = ext4_journal_stop(handle);
-        if (!ret)
-                ret = err;
-        walk_page_buffers(handle, page_bufs, 0,
-                                PAGE_CACHE_SIZE, NULL, bput_one);
-        EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
-        goto out;
-out_unlock:
-        unlock_page(page);
-out:
-        return ret;
-}
-static int ext4_journalled_writepage(struct page *page,
-                                     struct writeback_control *wbc)
-{
-        struct inode *inode = page->mapping->host;
-        loff_t size = i_size_read(inode);
-        loff_t len;
-        trace_ext4_journalled_writepage(inode, page);
-        J_ASSERT(PageLocked(page));
-        if (page->index == size >> PAGE_CACHE_SHIFT)
-                len = size & ~PAGE_CACHE_MASK;
-        else
-                len = PAGE_CACHE_SIZE;
-        if (page_has_buffers(page)) {
-                /* if page has buffers it should all be mapped
-                 * and allocated. If there are not buffers attached
-                 * to the page we know the page is dirty but it lost
-                 * buffers. That means that at some moment in time
-                 * after write_begin() / write_end() has been called
-                 * all buffers have been clean and thus they must have been
-                 * written at least once. So they are all mapped and we can
-                 * happily proceed with mapping them and writing the page.
-                 */
-                BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-                                        ext4_bh_unmapped_or_delay));
-        }
-        if (ext4_journal_current_handle())
-                goto no_write;
-        if (PageChecked(page)) {
-                /*
-                 * It's mmapped pagecache.  Add buffers and journal it.  There
-                 * doesn't seem much point in redirtying the page here.
-                 */
-                ClearPageChecked(page);
-                return __ext4_journalled_writepage(page, wbc);
-        } else {
-                /*
-                 * It may be a page full of checkpoint-mode buffers.  We don't
-                 * really know unless we go poke around in the buffer_heads.
-                 * But block_write_full_page will do the right thing.
-                 */
-                return block_write_full_page(page, noalloc_get_block_write,
-                                             wbc);
-        }
-no_write:
-        redirty_page_for_writepage(wbc, page);
-        unlock_page(page);
-        return 0;
-}
 static int ext4_readpage(struct file *file, struct page *page)
 {
        return mpage_readpage(page, ext4_get_block);
@@ -3358,11 +3469,36 @@ ext4_readpages(struct file *file, struct address_space *mapping,
        return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
+static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
+{
+        struct buffer_head *head, *bh;
+        unsigned int curr_off = 0;
+        if (!page_has_buffers(page))
+                return;
+        head = bh = page_buffers(page);
+        do {
+                if (offset <= curr_off && test_clear_buffer_uninit(bh)
+                                        && bh->b_private) {
+                        ext4_free_io_end(bh->b_private);
+                        bh->b_private = NULL;
+                        bh->b_end_io = NULL;
+                }
+                curr_off = curr_off + bh->b_size;
+                bh = bh->b_this_page;
+        } while (bh != head);
+}
 static void ext4_invalidatepage(struct page *page, unsigned long offset)
 {
        journal_t *journal = EXT4_JOURNAL(page->mapping->host);
        /*
+         * free any io_end structure allocated for buffers to be discarded
+         */
+        if (ext4_should_dioread_nolock(page->mapping->host))
+                ext4_invalidatepage_free_endio(page, offset);
+        /*
         * If it's a full truncate we just forget about the pending dirtying
         */
        if (offset == 0)
@@ -3388,6 +3524,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 }
 /*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
 * If the O_DIRECT write will extend the file then add this inode to the
 * orphan list.  So recovery will truncate it back to the original size
 * if the machine crashes during the write.
@@ -3396,7 +3534,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 * crashes then stale disk data _may_ be exposed inside the file. But current
 * VFS code falls back into buffered path in that case so we are safe.
 */
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
                              const struct iovec *iov, loff_t offset,
                              unsigned long nr_segs)
 {
@@ -3407,6 +3545,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
        ssize_t ret;
        int orphan = 0;
        size_t count = iov_length(iov, nr_segs);
+        int retries = 0;
        if (rw == WRITE) {
                loff_t final_size = offset + count;
@@ -3429,10 +3568,29 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
                }
        }
-        ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+retry:
+        if (rw == READ && ext4_should_dioread_nolock(inode))
+                ret = __blockdev_direct_IO(rw, iocb, inode,
+                                 inode->i_sb->s_bdev, iov,
+                                 offset, nr_segs,
+                                 ext4_get_block, NULL, NULL, 0);
+        else {
+                ret = blockdev_direct_IO(rw, iocb, inode,
+                                 inode->i_sb->s_bdev, iov,
                                 offset, nr_segs,
                                 ext4_get_block, NULL);
+                if (unlikely((rw & WRITE) && ret < 0)) {
+                        loff_t isize = i_size_read(inode);
+                        loff_t end = offset + iov_length(iov, nr_segs);
+                        if (end > isize)
+                                vmtruncate(inode, isize);
+                }
+        }
+        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+                goto retry;
        if (orphan) {
                int err;
@@ -3443,6 +3601,9 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
                         * but cannot extend i_size. Bail out and pretend
                         * the write failed... */
                        ret = PTR_ERR(handle);
+                        if (inode->i_nlink)
+                                ext4_orphan_del(NULL, inode);
                        goto out;
                }
                if (inode->i_nlink)
@@ -3471,6 +3632,254 @@ out:
 }
 /*
+ * ext4_get_block used when preparing for a DIO write or buffer write.
+ * We allocate an uninitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after the IO is complete.
+ */
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+                   struct buffer_head *bh_result, int create)
+{
+        ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
+                   inode->i_ino, create);
+        return _ext4_get_block(inode, iblock, bh_result,
+                               EXT4_GET_BLOCKS_IO_CREATE_EXT);
+}
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+                            ssize_t size, void *private, int ret,
+                            bool is_async)
+{
+        ext4_io_end_t *io_end = iocb->private;
+        struct workqueue_struct *wq;
+        unsigned long flags;
+        struct ext4_inode_info *ei;
+        /* if not async direct IO or dio with 0 bytes write, just return */
+        if (!io_end || !size)
+                goto out;
+        ext_debug("ext4_end_io_dio(): io_end 0x%p"
+                  "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+                  iocb->private, io_end->inode->i_ino, iocb, offset,
+                  size);
+        /* if not aio dio with unwritten extents, just free io and return */
+        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+                ext4_free_io_end(io_end);
+                iocb->private = NULL;
+out:
+                if (is_async)
+                        aio_complete(iocb, ret, 0);
+                return;
+        }
+        io_end->offset = offset;
+        io_end->size = size;
+        if (is_async) {
+                io_end->iocb = iocb;
+                io_end->result = ret;
+        }
+        wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+        /* Add the io_end to per-inode completed aio dio list*/
+        ei = EXT4_I(io_end->inode);
+        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+        list_add_tail(&io_end->list, &ei->i_completed_io_list);
+        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+        /* queue the work to convert unwritten extents to written */
+        queue_work(wq, &io_end->work);
+        iocb->private = NULL;
+}
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
+{
+        ext4_io_end_t *io_end = bh->b_private;
+        struct workqueue_struct *wq;
+        struct inode *inode;
+        unsigned long flags;
+        if (!test_clear_buffer_uninit(bh) || !io_end)
+                goto out;
+        if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
+                printk("sb umounted, discard end_io request for inode %lu\n",
+                        io_end->inode->i_ino);
+                ext4_free_io_end(io_end);
+                goto out;
+        }
+        io_end->flag = EXT4_IO_END_UNWRITTEN;
+        inode = io_end->inode;
+        /* Add the io_end to per-inode completed io list*/
+        spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+        list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+        spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+        wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+        /* queue the work to convert unwritten extents to written */
+        queue_work(wq, &io_end->work);
+out:
+        bh->b_private = NULL;
+        bh->b_end_io = NULL;
+        clear_buffer_uninit(bh);
+        end_buffer_async_write(bh, uptodate);
+}
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
+{
+        ext4_io_end_t *io_end;
+        struct page *page = bh->b_page;
+        loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
+        size_t size = bh->b_size;
+retry:
+        io_end = ext4_init_io_end(inode, GFP_ATOMIC);
+        if (!io_end) {
+                pr_warning_ratelimited("%s: allocation fail\n", __func__);
+                schedule();
+                goto retry;
+        }
+        io_end->offset = offset;
+        io_end->size = size;
+        /*
+         * We need to hold a reference to the page to make sure it
+         * doesn't get evicted before ext4_end_io_work() has a chance
+         * to convert the extent from unwritten to written.
+         */
+        io_end->page = page;
+        get_page(io_end->page);
+        bh->b_private = io_end;
+        bh->b_end_io = ext4_end_io_buffer_write;
+        return 0;
+}
+/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks and mark them as uninitialized.
+ * If those blocks were preallocated, we make sure they are split, but
+ * still keep the range to write as uninitialized.
+ *
+ * The unwritten extents will be converted to written when DIO is completed.
+ * For async direct IO, since the IO may still be pending on return, we
+ * set up an end_io callback function, which will do the conversion
+ * when the async direct IO completes.
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list.  So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ */
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+                              const struct iovec *iov, loff_t offset,
+                              unsigned long nr_segs)
+{
+        struct file *file = iocb->ki_filp;
+        struct inode *inode = file->f_mapping->host;
+        ssize_t ret;
+        size_t count = iov_length(iov, nr_segs);
+        loff_t final_size = offset + count;
+        if (rw == WRITE && final_size <= inode->i_size) {
+                /*
+                 * We could direct write to holes and fallocate.
+                 *
+                 * Allocated blocks to fill the hole are marked as uninitialized
+                 * to prevent a parallel buffered read from exposing stale data
+                 * before DIO completes the data IO.
+                 *
+                 * As to previously fallocated extents, ext4 get_block
+                 * will just simply mark the buffer mapped but still
+                 * keep the extents uninitialized.
+                 *
+                 * For the non-AIO case, we will convert those unwritten
+                 * extents to written after returning from blockdev_direct_IO.
+                 *
+                 * For async DIO, the conversion needs to be deferred until
+                 * the IO is completed. The ext4 end_io callback function
+                 * will be called to take care of the conversion work.
+                 * Here, for the async case, we allocate an io_end structure
+                 * to hook to the iocb.
+                 */
+                iocb->private = NULL;
+                EXT4_I(inode)->cur_aio_dio = NULL;
+                if (!is_sync_kiocb(iocb)) {
+                        iocb->private = ext4_init_io_end(inode, GFP_NOFS);
+                        if (!iocb->private)
+                                return -ENOMEM;
+                        /*
+                         * we save the io structure for current async
+                         * direct IO, so that later ext4_map_blocks()
+                         * could flag the io structure whether there
+                         * is a unwritten extents needs to be converted
+                         * when IO is completed.
+                         */
+                        EXT4_I(inode)->cur_aio_dio = iocb->private;
+                }
+                ret = blockdev_direct_IO(rw, iocb, inode,
+                                         inode->i_sb->s_bdev, iov,
+                                         offset, nr_segs,
+                                         ext4_get_block_write,
+                                         ext4_end_io_dio);
+                if (iocb->private)
+                        EXT4_I(inode)->cur_aio_dio = NULL;
+                /*
+                 * The io_end structure takes a reference to the inode,
+                 * that structure needs to be destroyed and the
+                 * reference to the inode need to be dropped, when IO is
+                 * complete, even with 0 byte write, or failed.
+                 *
+                 * In the successful AIO DIO case, the io_end structure will be
+                 * destroyed and the reference to the inode will be dropped
+                 * after the end_io callback function is called.
+                 *
+                 * In the case there is 0 byte write, or error case, since
+                 * VFS direct IO won't invoke the end_io call back function,
+                 * we need to free the end_io structure here.
+                 */
+                if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+                        ext4_free_io_end(iocb->private);
+                        iocb->private = NULL;
+                } else if (ret > 0 && ext4_test_inode_state(inode,
+                                                EXT4_STATE_DIO_UNWRITTEN)) {
+                        int err;
+                        /*
+                         * for the non-AIO case, since the IO is already
+                         * completed, we can do the conversion right here
+                         */
+                        err = ext4_convert_unwritten_extents(inode,
+                                                             offset, ret);
+                        if (err < 0)
+                                ret = err;
+                        ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+                }
+                return ret;
+        }
+        /* reads, and writes extending past i_size, fall back to the old way */
+        return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+                              const struct iovec *iov, loff_t offset,
+                              unsigned long nr_segs)
+{
+        struct file *file = iocb->ki_filp;
+        struct inode *inode = file->f_mapping->host;
+        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+                return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+        return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+/*
 * Pages can be marked dirty completely asynchronously from ext4's journalling
 * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
 * much here because ->set_page_dirty is called under VFS locks.  The page is
@@ -3492,7 +3901,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
 static const struct address_space_operations ext4_ordered_aops = {
        .readpage               = ext4_readpage,
        .readpages              = ext4_readpages,
-        .writepage              = ext4_normal_writepage,
+        .writepage              = ext4_writepage,
        .sync_page              = block_sync_page,
        .write_begin            = ext4_write_begin,
        .write_end              = ext4_ordered_write_end,
@@ -3502,12 +3911,13 @@ static const struct address_space_operations ext4_ordered_aops = {
        .direct_IO              = ext4_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+        .error_remove_page      = generic_error_remove_page,
 };
 static const struct address_space_operations ext4_writeback_aops = {
        .readpage               = ext4_readpage,
        .readpages              = ext4_readpages,
-        .writepage              = ext4_normal_writepage,
+        .writepage              = ext4_writepage,
        .sync_page              = block_sync_page,
        .write_begin            = ext4_write_begin,
        .write_end              = ext4_writeback_write_end,
@@ -3517,12 +3927,13 @@ static const struct address_space_operations ext4_writeback_aops = {
        .direct_IO              = ext4_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+        .error_remove_page      = generic_error_remove_page,
 };
 static const struct address_space_operations ext4_journalled_aops = {
        .readpage               = ext4_readpage,
        .readpages              = ext4_readpages,
-        .writepage              = ext4_journalled_writepage,
+        .writepage              = ext4_writepage,
        .sync_page              = block_sync_page,
        .write_begin            = ext4_write_begin,
        .write_end              = ext4_journalled_write_end,
@@ -3531,12 +3942,13 @@ static const struct address_space_operations ext4_journalled_aops = {
        .invalidatepage         = ext4_invalidatepage,
        .releasepage            = ext4_releasepage,
        .is_partially_uptodate  = block_is_partially_uptodate,
+        .error_remove_page      = generic_error_remove_page,
 };
 static const struct address_space_operations ext4_da_aops = {
        .readpage               = ext4_readpage,
        .readpages              = ext4_readpages,
-        .writepage              = ext4_da_writepage,
+        .writepage              = ext4_writepage,
        .writepages             = ext4_da_writepages,
        .sync_page              = block_sync_page,
        .write_begin            = ext4_da_write_begin,
@@ -3547,6 +3959,7 @@ static const struct address_space_operations ext4_da_aops = {
        .direct_IO              = ext4_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+        .error_remove_page      = generic_error_remove_page,
 };
 void ext4_set_aops(struct inode *inode)
@@ -3583,7 +3996,8 @@ int ext4_block_truncate_page(handle_t *handle,
        struct page *page;
        int err = 0;
-        page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+        page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+                                   mapping_gfp_mask(mapping) & ~__GFP_FS);
        if (!page)
                return -EINVAL;
@@ -3591,17 +4005,6 @@ int ext4_block_truncate_page(handle_t *handle,
        length = blocksize - (offset & (blocksize - 1));
        iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-        /*
-         * For "nobh" option,  we can only work if we don't need to
-         * read-in the page - otherwise we create buffers to do the IO.
-         */
-        if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
-             ext4_should_writeback_data(inode) && PageUptodate(page)) {
-                zero_user(page, offset, length);
-                set_page_dirty(page);
-                goto unlock;
-        }
        if (!page_has_buffers(page))
                create_empty_buffers(page, blocksize, 0);
@@ -3658,7 +4061,7 @@ int ext4_block_truncate_page(handle_t *handle,
        if (ext4_should_journal_data(inode)) {
                err = ext4_handle_dirty_metadata(handle, inode, bh);
        } else {
-                if (ext4_should_order_data(inode))
+                if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
                        err = ext4_jbd2_file_inode(handle, inode);
                mark_buffer_dirty(bh);
        }
@@ -3725,7 +4128,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
        int k, err;
        *top = 0;
-        /* Make k index the deepest non-null offest + 1 */
+        /* Make k index the deepest non-null offset + 1 */
        for (k = depth; k > 1 && !offsets[k-1]; k--)
                ;
        partial = ext4_get_branch(inode, k, offsets, chain, &err);
@@ -3774,47 +4177,58 @@ no_top:
 * We release `count' blocks on disk, but (last - first) may be greater
 * than `count' because there can be holes in there.
 */
-static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
+static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
-                              struct buffer_head *bh,
+                             struct buffer_head *bh,
-                              ext4_fsblk_t block_to_free,
+                             ext4_fsblk_t block_to_free,
-                              unsigned long count, __le32 *first,
+                             unsigned long count, __le32 *first,
-                              __le32 *last)
+                             __le32 *last)
 {
        __le32 *p;
+        int     flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+        int     err;
+        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+                flags |= EXT4_FREE_BLOCKS_METADATA;
+        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
+                                   count)) {
+                EXT4_ERROR_INODE(inode, "attempt to clear invalid "
+                                 "blocks %llu len %lu",
+                                 (unsigned long long) block_to_free, count);
+                return 1;
+        }
        if (try_to_extend_transaction(handle, inode)) {
                if (bh) {
                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                        ext4_handle_dirty_metadata(handle, inode, bh);
+                        err = ext4_handle_dirty_metadata(handle, inode, bh);
+                        if (unlikely(err)) {
+                                ext4_std_error(inode->i_sb, err);
+                                return 1;
+                        }
+                }
+                err = ext4_mark_inode_dirty(handle, inode);
+                if (unlikely(err)) {
+                        ext4_std_error(inode->i_sb, err);
+                        return 1;
+                }
+                err = ext4_truncate_restart_trans(handle, inode,
+                                                  blocks_for_truncate(inode));
+                if (unlikely(err)) {
+                        ext4_std_error(inode->i_sb, err);
+                        return 1;
                }
-                ext4_mark_inode_dirty(handle, inode);
-                ext4_journal_test_restart(handle, inode);
                if (bh) {
                        BUFFER_TRACE(bh, "retaking write access");
                        ext4_journal_get_write_access(handle, bh);
                }
        }
-        /*
+        for (p = first; p < last; p++)
-         * Any buffers which are on the journal will be in memory. We
+                *p = 0;
-         * find them on the hash table so jbd2_journal_revoke() will
-         * run jbd2_journal_forget() on them.  We've already detached
-         * each block from the file, so bforget() in
-         * jbd2_journal_forget() should be safe.
-         *
-         * AKPM: turn on bforget in jbd2_journal_forget()!!!
-         */
-        for (p = first; p < last; p++) {
-                u32 nr = le32_to_cpu(*p);
-                if (nr) {
-                        struct buffer_head *tbh;
-                        *p = 0;
+        ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
-                        tbh = sb_find_get_block(inode->i_sb, nr);
+        return 0;
-                        ext4_forget(handle, 0, inode, tbh, nr);
-                }
-        }
-        ext4_free_blocks(handle, inode, block_to_free, count, 0);
 }
 /**
@@ -3870,9 +4284,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
                        } else if (nr == block_to_free + count) {
                                count++;
                        } else {
-                                ext4_clear_blocks(handle, inode, this_bh,
+                                if (ext4_clear_blocks(handle, inode, this_bh,
-                                                  block_to_free,
+                                                      block_to_free, count,
-                                                  count, block_to_free_p, p);
+                                                      block_to_free_p, p))
+                                        break;
                                block_to_free = nr;
                                block_to_free_p = p;
                                count = 1;
@@ -3896,11 +4311,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
                if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
                        ext4_handle_dirty_metadata(handle, inode, this_bh);
                else
-                        ext4_error(inode->i_sb, __func__,
+                        EXT4_ERROR_INODE(inode,
-                                   "circular indirect block detected, "
+                                         "circular indirect block detected at "
-                                   "inode=%lu, block=%llu",
+                                         "block %llu",
-                                   inode->i_ino,
+                                (unsigned long long) this_bh->b_blocknr);
-                                   (unsigned long long) this_bh->b_blocknr);
        }
 }
@@ -3936,6 +4350,15 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                        if (!nr)
                                continue;               /* A hole */
+                        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+                                                   nr, 1)) {
+                                EXT4_ERROR_INODE(inode,
+                                                 "invalid indirect mapped "
+                                                 "block %lu (level %d)",
+                                                 (unsigned long) nr, depth);
+                                break;
+                        }
                        /* Go read the buffer for the next level down */
                        bh = sb_bread(inode->i_sb, nr);
@@ -3944,9 +4367,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                         * (should be rare).
                         */
                        if (!bh) {
-                                ext4_error(inode->i_sb, "ext4_free_branches",
+                                EXT4_ERROR_INODE_BLOCK(inode, nr,
-                                           "Read failure, inode=%lu, block=%llu",
+                                                       "Read failure");
-                                           inode->i_ino, nr);
                                continue;
                        }
@@ -3956,27 +4378,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                        (__le32 *) bh->b_data,
                                        (__le32 *) bh->b_data + addr_per_block,
                                        depth);
+                        brelse(bh);
-                        /*
-                         * We've probably journalled the indirect block several
-                         * times during the truncate.  But it's no longer
-                         * needed and we now drop it from the transaction via
-                         * jbd2_journal_revoke().
-                         *
-                         * That's easy if it's exclusively part of this
-                         * transaction.  But if it's part of the committing
-                         * transaction then jbd2_journal_forget() will simply
-                         * brelse() it.  That means that if the underlying
-                         * block is reallocated in ext4_get_block(),
-                         * unmap_underlying_metadata() will find this block
-                         * and will try to get rid of it.  damn, damn.
-                         *
-                         * If this block has already been committed to the
-                         * journal, a revoke record will be written.  And
-                         * revoke records must be emitted *before* clearing
-                         * this block's bit in the bitmaps.
-                         */
-                        ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
                        /*
                         * Everything below this this pointer has been
@@ -3998,10 +4400,24 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                return;
                        if (try_to_extend_transaction(handle, inode)) {
                                ext4_mark_inode_dirty(handle, inode);
-                                ext4_journal_test_restart(handle, inode);
+                                ext4_truncate_restart_trans(handle, inode,
+                                            blocks_for_truncate(inode));
                        }
-                        ext4_free_blocks(handle, inode, nr, 1, 1);
+                        /*
+                         * The forget flag here is critical because if
+                         * we are journaling (and not doing data
+                         * journaling), we have to make sure a revoke
+                         * record is written to prevent the journal
+                         * replay from overwriting the (former)
+                         * indirect block if it gets reallocated as a
+                         * data block.  This must happen in the same
+                         * transaction where the data blocks are
+                         * actually freed.
+                         */
+                        ext4_free_blocks(handle, inode, 0, nr, 1,
+                                         EXT4_FREE_BLOCKS_METADATA|
+                                         EXT4_FREE_BLOCKS_FORGET);
                        if (parent_bh) {
                                /*
@@ -4086,11 +4502,12 @@ void ext4_truncate(struct inode *inode)
        if (!ext4_can_truncate(inode))
                return;
-        if (ei->i_disksize && inode->i_size == 0 &&
+        ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
-            !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
-                ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
-        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+        if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+                ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                ext4_ext_truncate(inode);
                return;
        }
@@ -4258,9 +4675,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
        bh = sb_getblk(sb, block);
        if (!bh) {
-                ext4_error(sb, "ext4_get_inode_loc", "unable to read "
+                EXT4_ERROR_INODE_BLOCK(inode, block,
-                           "inode block - inode=%lu, block=%llu",
+                                       "unable to read itable block");
-                           inode->i_ino, block);
                return -EIO;
        }
        if (!buffer_uptodate(bh)) {
@@ -4358,9 +4774,8 @@ make_io:
                submit_bh(READ_META, bh);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh)) {
-                        ext4_error(sb, __func__,
+                        EXT4_ERROR_INODE_BLOCK(inode, block,
-                                   "unable to read inode block - inode=%lu, "
+                                               "unable to read itable block");
-                                   "block=%llu", inode->i_ino, block);
                        brelse(bh);
                        return -EIO;
                }
@@ -4374,7 +4789,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 {
        /* We have all inode data except xattrs in memory here. */
        return __ext4_get_inode_loc(inode, iloc,
-                !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
+                !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
 }
 void ext4_set_inode_flags(struct inode *inode)
@@ -4397,20 +4812,26 @@ void ext4_set_inode_flags(struct inode *inode)
 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
 void ext4_get_inode_flags(struct ext4_inode_info *ei)
 {
-        unsigned int flags = ei->vfs_inode.i_flags;
+        unsigned int vfs_fl;
+        unsigned long old_fl, new_fl;
-        ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
-                        EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);
+        do {
-        if (flags & S_SYNC)
+                vfs_fl = ei->vfs_inode.i_flags;
-                ei->i_flags |= EXT4_SYNC_FL;
+                old_fl = ei->i_flags;
-        if (flags & S_APPEND)
+                new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
-                ei->i_flags |= EXT4_APPEND_FL;
+                                EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
-        if (flags & S_IMMUTABLE)
+                                EXT4_DIRSYNC_FL);
-                ei->i_flags |= EXT4_IMMUTABLE_FL;
+                if (vfs_fl & S_SYNC)
-        if (flags & S_NOATIME)
+                        new_fl |= EXT4_SYNC_FL;
-                ei->i_flags |= EXT4_NOATIME_FL;
+                if (vfs_fl & S_APPEND)
-        if (flags & S_DIRSYNC)
+                        new_fl |= EXT4_APPEND_FL;
-                ei->i_flags |= EXT4_DIRSYNC_FL;
+                if (vfs_fl & S_IMMUTABLE)
+                        new_fl |= EXT4_IMMUTABLE_FL;
+                if (vfs_fl & S_NOATIME)
+                        new_fl |= EXT4_NOATIME_FL;
+                if (vfs_fl & S_DIRSYNC)
+                        new_fl |= EXT4_DIRSYNC_FL;
+        } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
 }
 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -4425,7 +4846,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
                /* we are using combined 48 bit field */
                i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
                                        le32_to_cpu(raw_inode->i_blocks_lo);
-                if (ei->i_flags & EXT4_HUGE_FILE_FL) {
+                if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
                        /* i_blocks represent file system block size */
                        return i_blocks  << (inode->i_blkbits - 9);
                } else {
@@ -4441,8 +4862,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        struct ext4_iloc iloc;
        struct ext4_inode *raw_inode;
        struct ext4_inode_info *ei;
-        struct buffer_head *bh;
        struct inode *inode;
+        journal_t *journal = EXT4_SB(sb)->s_journal;
        long ret;
        int block;
@@ -4453,15 +4874,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                return inode;
        ei = EXT4_I(inode);
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
+        iloc.bh = 0;
-        ei->i_acl = EXT4_ACL_NOT_CACHED;
-        ei->i_default_acl = EXT4_ACL_NOT_CACHED;
-#endif
        ret = __ext4_get_inode_loc(inode, &iloc, 0);
        if (ret < 0)
                goto bad_inode;
-        bh = iloc.bh;
        raw_inode = ext4_raw_inode(&iloc);
        inode->i_mode = le16_to_cpu(raw_inode->i_mode);
        inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -4472,7 +4889,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        }
        inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
-        ei->i_state = 0;
+        ext4_clear_state_flags(ei);     /* Only relevant on 32-bit archs */
        ei->i_dir_start_lookup = 0;
        ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
        /* We now have enough fields to check if the inode was active or not.
@@ -4484,7 +4901,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                if (inode->i_mode == 0 ||
                    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
                        /* this inode is deleted */
-                        brelse(bh);
                        ret = -ESTALE;
                        goto bad_inode;
                }
@@ -4501,6 +4917,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                        ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
        inode->i_size = ext4_isize(raw_inode);
        ei->i_disksize = inode->i_size;
+#ifdef CONFIG_QUOTA
+        ei->i_reserved_quota = 0;
+#endif
        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
        ei->i_block_group = iloc.block_group;
        ei->i_last_alloc_group = ~0;
@@ -4512,11 +4931,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                ei->i_data[block] = raw_inode->i_block[block];
        INIT_LIST_HEAD(&ei->i_orphan);
+        /*
+         * Set transaction id's of transactions that have to be committed
+         * to finish f[data]sync. We set them to currently running transaction
+         * as we cannot be sure that the inode or some of its metadata isn't
+         * part of the transaction - the inode could have been reclaimed and
+         * now it is reread from disk.
+         */
+        if (journal) {
+                transaction_t *transaction;
+                tid_t tid;
+                read_lock(&journal->j_state_lock);
+                if (journal->j_running_transaction)
+                        transaction = journal->j_running_transaction;
+                else
+                        transaction = journal->j_committing_transaction;
+                if (transaction)
+                        tid = transaction->t_tid;
+                else
+                        tid = journal->j_commit_sequence;
+                read_unlock(&journal->j_state_lock);
+                ei->i_sync_tid = tid;
+                ei->i_datasync_tid = tid;
+        }
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
                if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
                    EXT4_INODE_SIZE(inode->i_sb)) {
-                        brelse(bh);
                        ret = -EIO;
                        goto bad_inode;
                }
@@ -4529,7 +4972,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                                        EXT4_GOOD_OLD_INODE_SIZE +
                                        ei->i_extra_isize;
                        if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
-                                ei->i_state |= EXT4_STATE_XATTR;
+                                ext4_set_inode_state(inode, EXT4_STATE_XATTR);
                }
        } else
                ei->i_extra_isize = 0;
@@ -4548,16 +4991,12 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        ret = 0;
        if (ei->i_file_acl &&
-            ((ei->i_file_acl <
+            !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
-              (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
+                EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
-               EXT4_SB(sb)->s_gdb_count)) ||
+                                 ei->i_file_acl);
-             (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
-                ext4_error(sb, __func__,
-                           "bad extended attribute block %llu in inode #%lu",
-                           ei->i_file_acl, inode->i_ino);
                ret = -EIO;
                goto bad_inode;
-        } else if (ei->i_flags & EXT4_EXTENTS_FL) {
+        } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                    (S_ISLNK(inode->i_mode) &&
                     !ext4_inode_is_fast_symlink(inode)))
@@ -4569,10 +5008,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                /* Validate block references which are part of inode */
                ret = ext4_check_inode_blockref(inode);
        }
-        if (ret) {
+        if (ret)
-                brelse(bh);
                goto bad_inode;
-        }
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext4_file_inode_operations;
@@ -4600,11 +5037,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                        init_special_inode(inode, inode->i_mode,
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
        } else {
-                brelse(bh);
                ret = -EIO;
-                ext4_error(inode->i_sb, __func__,
+                EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
-                           "bogus i_mode (%o) for inode=%lu",
-                           inode->i_mode, inode->i_ino);
                goto bad_inode;
        }
        brelse(iloc.bh);
@@ -4613,6 +5047,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        return inode;
 bad_inode:
+        brelse(iloc.bh);
        iget_failed(inode);
        return ERR_PTR(ret);
 }
@@ -4632,7 +5067,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
                 */
                raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
                raw_inode->i_blocks_high = 0;
-                ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+                ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
                return 0;
        }
        if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
@@ -4645,9 +5080,9 @@ static int ext4_inode_blocks_set(handle_t *handle,
                 */
                raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
                raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
-                ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+                ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
        } else {
-                ei->i_flags |= EXT4_HUGE_FILE_FL;
+                ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
                /* i_block is stored in file system block size */
                i_blocks = i_blocks >> (inode->i_blkbits - 9);
                raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
@@ -4674,7 +5109,7 @@ static int ext4_do_update_inode(handle_t *handle,
        /* For fields not not tracking in the in-memory inode,
         * initialise them to zero for new inodes. */
-        if (ei->i_state & EXT4_STATE_NEW)
+        if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
                memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
        ext4_get_inode_flags(ei);
@@ -4713,8 +5148,7 @@ static int ext4_do_update_inode(handle_t *handle,
        if (ext4_inode_blocks_set(handle, raw_inode, ei))
                goto out_brelse;
        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
-        /* clear the migrate flag in the raw_inode */
+        raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
-        raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_HURD))
                raw_inode->i_file_acl_high =
@@ -4739,7 +5173,7 @@ static int ext4_do_update_inode(handle_t *handle,
                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
                        sb->s_dirt = 1;
                        ext4_handle_sync(handle);
-                        err = ext4_handle_dirty_metadata(handle, inode,
+                        err = ext4_handle_dirty_metadata(handle, NULL,
                                        EXT4_SB(sb)->s_sbh);
                }
        }
@@ -4768,11 +5202,12 @@ static int ext4_do_update_inode(handle_t *handle,
        }
        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-        rc = ext4_handle_dirty_metadata(handle, inode, bh);
+        rc = ext4_handle_dirty_metadata(handle, NULL, bh);
        if (!err)
                err = rc;
-        ei->i_state &= ~EXT4_STATE_NEW;
+        ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+        ext4_update_inode_fsync_trans(handle, inode, 0);
 out_brelse:
        brelse(bh);
        ext4_std_error(inode->i_sb, err);
@@ -4814,21 +5249,40 @@ out_brelse:
 * `stuff()' is running, and the new i_size will be lost.  Plus the inode
 * will no longer be on the superblock's dirty inode list.
 */
-int ext4_write_inode(struct inode *inode, int wait)
+int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
+        int err;
        if (current->flags & PF_MEMALLOC)
                return 0;
-        if (ext4_journal_current_handle()) {
+        if (EXT4_SB(inode->i_sb)->s_journal) {
-                jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+                if (ext4_journal_current_handle()) {
-                dump_stack();
+                        jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
-                return -EIO;
+                        dump_stack();
-        }
+                        return -EIO;
+                }
-        if (!wait)
+                if (wbc->sync_mode != WB_SYNC_ALL)
-                return 0;
+                        return 0;
+                err = ext4_force_commit(inode->i_sb);
+        } else {
+                struct ext4_iloc iloc;
-        return ext4_force_commit(inode->i_sb);
+                err = __ext4_get_inode_loc(inode, &iloc, 0);
+                if (err)
+                        return err;
+                if (wbc->sync_mode == WB_SYNC_ALL)
+                        sync_dirty_buffer(iloc.bh);
+                if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+                        EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
+                                         "IO error syncing inode");
+                        err = -EIO;
+                }
+                brelse(iloc.bh);
+        }
+        return err;
 }
 /*
@@ -4859,25 +5313,28 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct inode *inode = dentry->d_inode;
        int error, rc = 0;
+        int orphan = 0;
        const unsigned int ia_valid = attr->ia_valid;
        error = inode_change_ok(inode, attr);
        if (error)
                return error;
+        if (is_quota_modification(inode, attr))
+                dquot_initialize(inode);
        if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
                (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
                handle_t *handle;
                /* (user+group)*(old+new) structure, inode write (sb,
                 * inode block, ? - but truncate inode update has it) */
-                handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
+                handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
-                                        EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+                                        EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
                }
-                error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+                error = dquot_transfer(inode, attr);
                if (error) {
                        ext4_journal_stop(handle);
                        return error;
@@ -4893,18 +5350,18 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
        }
        if (attr->ia_valid & ATTR_SIZE) {
-                if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+                if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-                        if (attr->ia_size > sbi->s_bitmap_maxbytes) {
+                        if (attr->ia_size > sbi->s_bitmap_maxbytes)
-                                error = -EFBIG;
+                                return -EFBIG;
-                                goto err_out;
-                        }
                }
        }
        if (S_ISREG(inode->i_mode) &&
-            attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+            attr->ia_valid & ATTR_SIZE &&
+            (attr->ia_size < inode->i_size ||
+             (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
                handle_t *handle;
                handle = ext4_journal_start(inode, 3);
@@ -4912,8 +5369,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                        error = PTR_ERR(handle);
                        goto err_out;
                }
+                if (ext4_handle_valid(handle)) {
-                error = ext4_orphan_add(handle, inode);
+                        error = ext4_orphan_add(handle, inode);
+                        orphan = 1;
+                }
                EXT4_I(inode)->i_disksize = attr->ia_size;
                rc = ext4_mark_inode_dirty(handle, inode);
                if (!error)
@@ -4931,18 +5390,30 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                                        goto err_out;
                                }
                                ext4_orphan_del(handle, inode);
+                                orphan = 0;
                                ext4_journal_stop(handle);
                                goto err_out;
                        }
                }
+                /* ext4_truncate will clear the flag */
+                if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
+                        ext4_truncate(inode);
        }
-        rc = inode_setattr(inode, attr);
+        if ((attr->ia_valid & ATTR_SIZE) &&
+            attr->ia_size != i_size_read(inode))
+                rc = vmtruncate(inode, attr->ia_size);
-        /* If inode_setattr's call to ext4_truncate failed to get a
+        if (!rc) {
-         * transaction handle at all, we need to clean up the in-core
+                setattr_copy(inode, attr);
-         * orphan list manually. */
+                mark_inode_dirty(inode);
-        if (inode->i_nlink)
+        }
+        /*
+         * If the call to ext4_truncate failed to get a transaction handle at
+         * all, we need to clean up the in-core orphan list manually.
+         */
+        if (orphan && inode->i_nlink)
                ext4_orphan_del(NULL, inode);
        if (!rc && (ia_valid & ATTR_MODE))
@@ -4974,9 +5445,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
         * will return the blocks that include the delayed allocation
         * blocks for this file.
         */
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
        delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
-        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
        stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
        return 0;
@@ -5009,7 +5478,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
-        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
        return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
 }
@@ -5020,12 +5489,12 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 * worse case, the indexs blocks spread over different block groups
 *
 * If datablocks are discontiguous, they are possible to spread over
- * different block groups too. If they are contiugous, with flexbg,
+ * different block groups too. If they are contiguous, with flexbg,
 * they could still across block group boundary.
 *
 * Also account for superblock, inode, quota and xattr blocks
 */
-int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
        ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
        int gdpblocks;
@@ -5096,7 +5565,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks.
+ * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
 *
 * journal buffers for data blocks are not included here, as DIO
 * and fallocate do no need to journal data buffers.
@@ -5162,7 +5631,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
 {
        struct ext4_inode *raw_inode;
        struct ext4_xattr_ibody_header *header;
-        struct ext4_xattr_entry *entry;
        if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
                return 0;
@@ -5170,11 +5638,10 @@ static int ext4_expand_extra_isize(struct inode *inode,
        raw_inode = ext4_raw_inode(&iloc);
        header = IHDR(inode, raw_inode);
-        entry = IFIRST(header);
        /* No extended attributes present */
-        if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
+        if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
-                header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+            header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
                memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
                        new_extra_isize);
                EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5215,10 +5682,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
        int err, ret;
        might_sleep();
+        trace_ext4_mark_inode_dirty(inode, _RET_IP_);
        err = ext4_reserve_inode_write(handle, inode, &iloc);
        if (ext4_handle_valid(handle) &&
            EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
-            !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
+            !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
                /*
                 * We need extra buffer credits since we may write into EA block
                 * with this same handle. If journal_extend fails, then it will
@@ -5232,10 +5700,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
                                                      sbi->s_want_extra_isize,
                                                      iloc, handle);
                        if (ret) {
-                                EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+                                ext4_set_inode_state(inode,
+                                                     EXT4_STATE_NO_EXPAND);
                                if (mnt_count !=
                                        le16_to_cpu(sbi->s_es->s_mnt_count)) {
-                                        ext4_warning(inode->i_sb, __func__,
+                                        ext4_warning(inode->i_sb,
                                        "Unable to expand inode %lu. Delete"
                                        " some EAs or run e2fsck.",
                                        inode->i_ino);
@@ -5257,7 +5726,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 * i_size has been changed by generic_commit_write() and we thus need
 * to include the updated inode in the current transaction.
 *
- * Also, vfs_dq_alloc_block() will always dirty the inode when blocks
+ * Also, dquot_alloc_block() will always dirty the inode when blocks
 * are allocated to the file.
 *
 * If the inode is marked synchronous, we don't honour that here - doing
@@ -5266,27 +5735,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 */
 void ext4_dirty_inode(struct inode *inode)
 {
-        handle_t *current_handle = ext4_journal_current_handle();
        handle_t *handle;
-        if (!ext4_handle_valid(current_handle)) {
-                ext4_mark_inode_dirty(current_handle, inode);
-                return;
-        }
        handle = ext4_journal_start(inode, 2);
        if (IS_ERR(handle))
                goto out;
-        if (current_handle &&
-                current_handle->h_transaction != handle->h_transaction) {
+        ext4_mark_inode_dirty(handle, inode);
-                /* This task has a transaction open against a different fs */
-                printk(KERN_EMERG "%s: transactions do not match!\n",
-                       __func__);
-        } else {
-                jbd_debug(5, "marking dirty.  outer handle=%p\n",
-                                current_handle);
-                ext4_mark_inode_dirty(handle, inode);
-        }
        ext4_journal_stop(handle);
 out:
        return;
@@ -5312,7 +5768,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
                        err = jbd2_journal_get_write_access(handle, iloc.bh);
                        if (!err)
                                err = ext4_handle_dirty_metadata(handle,
-                                                                 inode,
+                                                                 NULL,
                                                                 iloc.bh);
                        brelse(iloc.bh);
                }
@@ -5356,9 +5812,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
         */
        if (val)
-                EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
+                ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
        else
-                EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
+                ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
        ext4_set_aops(inode);
        jbd2_journal_unlock_updates(journal);
@@ -5413,12 +5869,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        else
                len = PAGE_CACHE_SIZE;
+        lock_page(page);
+        /*
+         * return if we have all the buffers mapped. This avoids
+         * the need to call write_begin/write_end which does a
+         * journal_start/journal_stop which can block and take
+         * a long time
+         */
        if (page_has_buffers(page)) {
-                /* return if we have all the buffers mapped */
                if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-                                       ext4_bh_unmapped))
+                                        ext4_bh_unmapped)) {
+                        unlock_page(page);
                        goto out_unlock;
+                }
        }
+        unlock_page(page);
        /*
         * OK, we need to fill the hole... Do write_begin write_end
         * to do block allocation/reservation.We are not holding
author	Paul Mundt <lethal@linux-sh.org>	2011-01-13 01:06:28 -0500
committer	Paul Mundt <lethal@linux-sh.org>	2011-01-13 01:06:28 -0500
commit	f43dc23d5ea91fca257be02138a255f02d98e806 (patch)
tree	b29722f6e965316e90ac97abf79923ced250dc21 /fs/ext4/inode.c
parent	f8e53553f452dcbf67cb89c8cba63a1cd6eb4cc0 (diff)
parent	4162cf64973df51fc885825bc9ca4d055891c49f (diff)