path: root/fs/ext4/inode.c
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--  fs/ext4/inode.c  1198
1 file changed, 532 insertions(+), 666 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b8debeb3965..e3126c051006 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,7 +39,9 @@
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h> 41#include <linux/kernel.h>
42#include <linux/printk.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/ratelimit.h>
43 45
44#include "ext4_jbd2.h" 46#include "ext4_jbd2.h"
45#include "xattr.h" 47#include "xattr.h"
@@ -53,13 +55,27 @@
53static inline int ext4_begin_ordered_truncate(struct inode *inode, 55static inline int ext4_begin_ordered_truncate(struct inode *inode,
54 loff_t new_size) 56 loff_t new_size)
55{ 57{
56 return jbd2_journal_begin_ordered_truncate( 58 trace_ext4_begin_ordered_truncate(inode, new_size);
57 EXT4_SB(inode->i_sb)->s_journal, 59 /*
58 &EXT4_I(inode)->jinode, 60 * If jinode is zero, then we never opened the file for
59 new_size); 61 * writing, so there's no need to call
62 * jbd2_journal_begin_ordered_truncate() since there's no
63 * outstanding writes we need to flush.
64 */
65 if (!EXT4_I(inode)->jinode)
66 return 0;
67 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
68 EXT4_I(inode)->jinode,
69 new_size);
60} 70}
61 71
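Stripped of the side-by-side markup, the patched helper reads as follows (a
restatement using only the identifiers visible in the hunk above):

    static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                                  loff_t new_size)
    {
            trace_ext4_begin_ordered_truncate(inode, new_size);
            /*
             * A NULL jinode means the file was never opened for writing,
             * so there are no ordered-mode writes to flush before the
             * truncate.
             */
            if (!EXT4_I(inode)->jinode)
                    return 0;
            return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
                                                       EXT4_I(inode)->jinode,
                                                       new_size);
    }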
62static void ext4_invalidatepage(struct page *page, unsigned long offset); 72static void ext4_invalidatepage(struct page *page, unsigned long offset);
73static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
74 struct buffer_head *bh_result, int create);
75static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
76static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
77static int __ext4_journalled_writepage(struct page *page, unsigned int len);
78static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
63 79
64/* 80/*
65 * Test whether an inode is a fast symlink. 81 * Test whether an inode is a fast symlink.
@@ -157,7 +173,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
157 BUG_ON(EXT4_JOURNAL(inode) == NULL); 173 BUG_ON(EXT4_JOURNAL(inode) == NULL);
158 jbd_debug(2, "restarting handle %p\n", handle); 174 jbd_debug(2, "restarting handle %p\n", handle);
159 up_write(&EXT4_I(inode)->i_data_sem); 175 up_write(&EXT4_I(inode)->i_data_sem);
160 ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); 176 ret = ext4_journal_restart(handle, nblocks);
161 down_write(&EXT4_I(inode)->i_data_sem); 177 down_write(&EXT4_I(inode)->i_data_sem);
162 ext4_discard_preallocations(inode); 178 ext4_discard_preallocations(inode);
163 179
@@ -172,6 +188,7 @@ void ext4_evict_inode(struct inode *inode)
172 handle_t *handle; 188 handle_t *handle;
173 int err; 189 int err;
174 190
191 trace_ext4_evict_inode(inode);
175 if (inode->i_nlink) { 192 if (inode->i_nlink) {
176 truncate_inode_pages(&inode->i_data, 0); 193 truncate_inode_pages(&inode->i_data, 0);
177 goto no_delete; 194 goto no_delete;
@@ -544,7 +561,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
544} 561}
545 562
546/** 563/**
547 * ext4_blks_to_allocate: Look up the block map and count the number 564 * ext4_blks_to_allocate - Look up the block map and count the number
548 * of direct blocks need to be allocated for the given branch. 565 * of direct blocks need to be allocated for the given branch.
549 * 566 *
550 * @branch: chain of indirect blocks 567 * @branch: chain of indirect blocks
@@ -583,13 +600,19 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
583 600
584/** 601/**
585 * ext4_alloc_blocks: multiple allocate blocks needed for a branch 602 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
603 * @handle: handle for this transaction
604 * @inode: inode which needs allocated blocks
605 * @iblock: the logical block to start allocated at
606 * @goal: preferred physical block of allocation
586 * @indirect_blks: the number of blocks need to allocate for indirect 607 * @indirect_blks: the number of blocks need to allocate for indirect
587 * blocks 608 * blocks
588 * 609 * @blks: number of desired blocks
589 * @new_blocks: on return it will store the new block numbers for 610 * @new_blocks: on return it will store the new block numbers for
590 * the indirect blocks(if needed) and the first direct block, 611 * the indirect blocks(if needed) and the first direct block,
591 * @blks: on return it will store the total number of allocated 612 * @err: on return it will store the error code
592 * direct blocks 613 *
614 * This function will return the number of blocks allocated as
615 * requested by the passed-in parameters.
593 */ 616 */
594static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 617static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
595 ext4_lblk_t iblock, ext4_fsblk_t goal, 618 ext4_lblk_t iblock, ext4_fsblk_t goal,
@@ -616,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
616 while (target > 0) { 639 while (target > 0) {
617 count = target; 640 count = target;
618 /* allocating blocks for indirect blocks and direct blocks */ 641 /* allocating blocks for indirect blocks and direct blocks */
619 current_block = ext4_new_meta_blocks(handle, inode, 642 current_block = ext4_new_meta_blocks(handle, inode, goal,
620 goal, &count, err); 643 0, &count, err);
621 if (*err) 644 if (*err)
622 goto failed_out; 645 goto failed_out;
623 646
@@ -697,15 +720,17 @@ allocated:
697 return ret; 720 return ret;
698failed_out: 721failed_out:
699 for (i = 0; i < index; i++) 722 for (i = 0; i < index; i++)
700 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 723 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
701 return ret; 724 return ret;
702} 725}
703 726
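The 0 -> NULL switch in the cleanup loop above (repeated at the
ext4_free_blocks() call sites further down) is type hygiene rather than a
behaviour change: the third argument is a struct buffer_head pointer, so the
null-pointer literal belongs there rather than the integer 0, which sparse
flags. The corrected call shape:

    /* bh is NULL when freeing by block number alone */
    ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);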
704/** 727/**
705 * ext4_alloc_branch - allocate and set up a chain of blocks. 728 * ext4_alloc_branch - allocate and set up a chain of blocks.
729 * @handle: handle for this transaction
706 * @inode: owner 730 * @inode: owner
707 * @indirect_blks: number of allocated indirect blocks 731 * @indirect_blks: number of allocated indirect blocks
708 * @blks: number of allocated direct blocks 732 * @blks: number of allocated direct blocks
733 * @goal: preferred place for allocation
709 * @offsets: offsets (in the blocks) to store the pointers to next. 734 * @offsets: offsets (in the blocks) to store the pointers to next.
710 * @branch: place to store the chain in. 735 * @branch: place to store the chain in.
711 * 736 *
@@ -755,6 +780,11 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
755 * parent to disk. 780 * parent to disk.
756 */ 781 */
757 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
783 if (unlikely(!bh)) {
784 err = -EIO;
785 goto failed;
786 }
787
758 branch[n].bh = bh; 788 branch[n].bh = bh;
759 lock_buffer(bh); 789 lock_buffer(bh);
760 BUFFER_TRACE(bh, "call get_create_access"); 790 BUFFER_TRACE(bh, "call get_create_access");
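sb_getblk() can return NULL under memory pressure, and the old code went on
to dereference the result unconditionally. The added guard follows the usual
pattern, unwinding through the function's existing failed: label:

    bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
    if (unlikely(!bh)) {
            err = -EIO;     /* report the lost buffer as an I/O error */
            goto failed;    /* free the blocks allocated so far */
    }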
@@ -793,26 +823,27 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
793 return err; 823 return err;
794failed: 824failed:
795 /* Allocation failed, free what we already allocated */ 825 /* Allocation failed, free what we already allocated */
796 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
797 for (i = 1; i <= n ; i++) { 827 for (i = 1; i <= n ; i++) {
798 /* 828 /*
799 * branch[i].bh is newly allocated, so there is no 829 * branch[i].bh is newly allocated, so there is no
800 * need to revoke the block, which is why we don't 830 * need to revoke the block, which is why we don't
801 * need to set EXT4_FREE_BLOCKS_METADATA. 831 * need to set EXT4_FREE_BLOCKS_METADATA.
802 */ 832 */
803 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
804 EXT4_FREE_BLOCKS_FORGET); 834 EXT4_FREE_BLOCKS_FORGET);
805 } 835 }
806 for (i = n+1; i < indirect_blks; i++) 836 for (i = n+1; i < indirect_blks; i++)
807 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
808 838
809 ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); 839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
810 840
811 return err; 841 return err;
812} 842}
813 843
814/** 844/**
815 * ext4_splice_branch - splice the allocated branch onto inode. 845 * ext4_splice_branch - splice the allocated branch onto inode.
846 * @handle: handle for this transaction
816 * @inode: owner 847 * @inode: owner
817 * @block: (logical) number of block we are adding 848 * @block: (logical) number of block we are adding
818 * @chain: chain of indirect blocks (with a missing link - see 849 * @chain: chain of indirect blocks (with a missing link - see
@@ -893,7 +924,7 @@ err_out:
893 ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
894 EXT4_FREE_BLOCKS_FORGET); 925 EXT4_FREE_BLOCKS_FORGET);
895 } 926 }
896 ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), 927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
897 blks, 0); 928 blks, 0);
898 929
899 return err; 930 return err;
@@ -942,6 +973,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
942 int count = 0; 973 int count = 0;
943 ext4_fsblk_t first_block = 0; 974 ext4_fsblk_t first_block = 0;
944 975
976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
945 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
946 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
947 depth = ext4_block_to_path(inode, map->m_lblk, offsets, 979 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
@@ -1027,6 +1059,8 @@ cleanup:
1027 partial--; 1059 partial--;
1028 } 1060 }
1029out: 1061out:
1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
1063 map->m_pblk, map->m_len, err);
1030 return err; 1064 return err;
1031} 1065}
1032 1066
@@ -1068,7 +1102,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1068 * Calculate the number of metadata blocks need to reserve 1102 * Calculate the number of metadata blocks need to reserve
1069 * to allocate a block located at @lblock 1103 * to allocate a block located at @lblock
1070 */ 1104 */
1071static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1105static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1072{ 1106{
1073 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1074 return ext4_ext_calc_metadata_amount(inode, lblock); 1108 return ext4_ext_calc_metadata_amount(inode, lblock);
@@ -1207,8 +1241,10 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1207 break; 1241 break;
1208 idx++; 1242 idx++;
1209 num++; 1243 num++;
1210 if (num >= max_pages) 1244 if (num >= max_pages) {
1245 done = 1;
1211 break; 1246 break;
1247 }
1212 } 1248 }
1213 pagevec_release(&pvec); 1249 pagevec_release(&pvec);
1214 } 1250 }
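The old code broke out of the inner for loop once max_pages was reached, but
the enclosing while kept refilling the pagevec and scanning further. Setting
done lets the outer loop observe the limit as well. In outline (a sketch of
this function's loop structure, with the per-page checks abbreviated):

    while (!done) {
            nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
                                          PAGECACHE_TAG_DIRTY,
                                          (pgoff_t)PAGEVEC_SIZE);
            if (nr_pages == 0)
                    break;
            for (i = 0; i < nr_pages; i++) {
                    /* ... stop at the first non-dirty page ... */
                    num++;
                    if (num >= max_pages) {
                            done = 1;       /* terminate the outer loop too */
                            break;
                    }
            }
            pagevec_release(&pvec);
    }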
@@ -1305,7 +1341,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1305 * avoid double accounting 1341 * avoid double accounting
1306 */ 1342 */
1307 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1343 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1308 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1344 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1309 /* 1345 /*
1310 * We need to check for EXT4 here because migrate 1346 * We need to check for EXT4 here because migrate
1311 * could have changed the inode type in between 1347 * could have changed the inode type in between
@@ -1335,7 +1371,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1335 ext4_da_update_reserve_space(inode, retval, 1); 1371 ext4_da_update_reserve_space(inode, retval, 1);
1336 } 1372 }
1337 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1373 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1338 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1374 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
1339 1375
1340 up_write((&EXT4_I(inode)->i_data_sem)); 1376 up_write((&EXT4_I(inode)->i_data_sem));
1341 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1377 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
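i_delalloc_reserved_flag was a dedicated field in struct ext4_inode_info;
the patch retires it in favour of a bit in the generic inode state set. The
bracket around the allocation, assembled from the two hunks above:

    if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
            ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
    /* ... perform the allocation with i_data_sem held ... */
    if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
            ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
    up_write((&EXT4_I(inode)->i_data_sem));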
@@ -1538,10 +1574,10 @@ static int do_journal_get_write_access(handle_t *handle,
1538 if (!buffer_mapped(bh) || buffer_freed(bh)) 1574 if (!buffer_mapped(bh) || buffer_freed(bh))
1539 return 0; 1575 return 0;
1540 /* 1576 /*
1541 * __block_prepare_write() could have dirtied some buffers. Clean 1577 * __block_write_begin() could have dirtied some buffers. Clean
1542 * the dirty bit as jbd2_journal_get_write_access() could complain 1578 * the dirty bit as jbd2_journal_get_write_access() could complain
1543 * otherwise about fs integrity issues. Setting of the dirty bit 1579 * otherwise about fs integrity issues. Setting of the dirty bit
1544 * by __block_prepare_write() isn't a real problem here as we clear 1580 * by __block_write_begin() isn't a real problem here as we clear
1545 * the bit before releasing a page lock and thus writeback cannot 1581 * the bit before releasing a page lock and thus writeback cannot
1546 * ever write the buffer. 1582 * ever write the buffer.
1547 */ 1583 */
@@ -1863,7 +1899,7 @@ static int ext4_journalled_write_end(struct file *file,
1863/* 1899/*
1864 * Reserve a single block located at lblock 1900 * Reserve a single block located at lblock
1865 */ 1901 */
1866static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) 1902static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1867{ 1903{
1868 int retries = 0; 1904 int retries = 0;
1869 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1894,7 +1930,7 @@ repeat:
1894 * We do still charge estimated metadata to the sb though; 1930 * We do still charge estimated metadata to the sb though;
1895 * we cannot afford to run out of free blocks. 1931 * we cannot afford to run out of free blocks.
1896 */ 1932 */
1897 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1898 dquot_release_reservation_block(inode, 1); 1934 dquot_release_reservation_block(inode, 1);
1899 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1900 yield(); 1936 yield();
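ext4_claim_free_blocks() grows a flags argument here (passed as 0). The
surrounding logic is the usual reserve-then-claim dance: quota is charged
first and must be released again if the filesystem cannot actually supply
the blocks. A sketch of the flow, assuming the dquot_reserve_block() charge
that opens this function:

    repeat:
            if (dquot_reserve_block(inode, 1))
                    return -EDQUOT;         /* quota refused the charge */
            if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
                    /* undo the quota charge before retrying or failing */
                    dquot_release_reservation_block(inode, 1);
                    if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
                            yield();
                            goto repeat;
                    }
                    return -ENOSPC;
            }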
@@ -1995,16 +2031,23 @@ static void ext4_da_page_release_reservation(struct page *page,
1995 * 2031 *
1996 * As pages are already locked by write_cache_pages(), we can't use it 2032 * As pages are already locked by write_cache_pages(), we can't use it
1997 */ 2033 */
1998static int mpage_da_submit_io(struct mpage_da_data *mpd) 2034static int mpage_da_submit_io(struct mpage_da_data *mpd,
2035 struct ext4_map_blocks *map)
1999{ 2036{
2000 long pages_skipped;
2001 struct pagevec pvec; 2037 struct pagevec pvec;
2002 unsigned long index, end; 2038 unsigned long index, end;
2003 int ret = 0, err, nr_pages, i; 2039 int ret = 0, err, nr_pages, i;
2004 struct inode *inode = mpd->inode; 2040 struct inode *inode = mpd->inode;
2005 struct address_space *mapping = inode->i_mapping; 2041 struct address_space *mapping = inode->i_mapping;
2042 loff_t size = i_size_read(inode);
2043 unsigned int len, block_start;
2044 struct buffer_head *bh, *page_bufs = NULL;
2045 int journal_data = ext4_should_journal_data(inode);
2046 sector_t pblock = 0, cur_logical = 0;
2047 struct ext4_io_submit io_submit;
2006 2048
2007 BUG_ON(mpd->next_page <= mpd->first_page); 2049 BUG_ON(mpd->next_page <= mpd->first_page);
2050 memset(&io_submit, 0, sizeof(io_submit));
2008 /* 2051 /*
2009 * We need to start from the first_page to the next_page - 1 2052 * We need to start from the first_page to the next_page - 1
2010 * to make sure we also write the mapped dirty buffer_heads. 2053 * to make sure we also write the mapped dirty buffer_heads.
@@ -2020,124 +2063,111 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2020 if (nr_pages == 0) 2063 if (nr_pages == 0)
2021 break; 2064 break;
2022 for (i = 0; i < nr_pages; i++) { 2065 for (i = 0; i < nr_pages; i++) {
2066 int commit_write = 0, skip_page = 0;
2023 struct page *page = pvec.pages[i]; 2067 struct page *page = pvec.pages[i];
2024 2068
2025 index = page->index; 2069 index = page->index;
2026 if (index > end) 2070 if (index > end)
2027 break; 2071 break;
2072
2073 if (index == size >> PAGE_CACHE_SHIFT)
2074 len = size & ~PAGE_CACHE_MASK;
2075 else
2076 len = PAGE_CACHE_SIZE;
2077 if (map) {
2078 cur_logical = index << (PAGE_CACHE_SHIFT -
2079 inode->i_blkbits);
2080 pblock = map->m_pblk + (cur_logical -
2081 map->m_lblk);
2082 }
2028 index++; 2083 index++;
2029 2084
2030 BUG_ON(!PageLocked(page)); 2085 BUG_ON(!PageLocked(page));
2031 BUG_ON(PageWriteback(page)); 2086 BUG_ON(PageWriteback(page));
2032 2087
2033 pages_skipped = mpd->wbc->pages_skipped;
2034 err = mapping->a_ops->writepage(page, mpd->wbc);
2035 if (!err && (pages_skipped == mpd->wbc->pages_skipped))
2036 /*
2037 * have successfully written the page
2038 * without skipping the same
2039 */
2040 mpd->pages_written++;
2041 /* 2088 /*
2042 * In error case, we have to continue because 2089 * If the page does not have buffers (for
2043 * remaining pages are still locked 2090 * whatever reason), try to create them using
2044 * XXX: unlock and re-dirty them? 2091 * __block_write_begin. If this fails,
2092 * skip the page and move on.
2045 */ 2093 */
2046 if (ret == 0) 2094 if (!page_has_buffers(page)) {
2047 ret = err; 2095 if (__block_write_begin(page, 0, len,
2048 } 2096 noalloc_get_block_write)) {
2049 pagevec_release(&pvec); 2097 skip_page:
2050 } 2098 unlock_page(page);
2051 return ret; 2099 continue;
2052} 2100 }
2053 2101 commit_write = 1;
2054/* 2102 }
2055 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
2056 *
2057 * the function goes through all passed space and put actual disk
2058 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2059 */
2060static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2061 struct ext4_map_blocks *map)
2062{
2063 struct inode *inode = mpd->inode;
2064 struct address_space *mapping = inode->i_mapping;
2065 int blocks = map->m_len;
2066 sector_t pblock = map->m_pblk, cur_logical;
2067 struct buffer_head *head, *bh;
2068 pgoff_t index, end;
2069 struct pagevec pvec;
2070 int nr_pages, i;
2071
2072 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2073 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2074 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2075
2076 pagevec_init(&pvec, 0);
2077
2078 while (index <= end) {
2079 /* XXX: optimize tail */
2080 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2081 if (nr_pages == 0)
2082 break;
2083 for (i = 0; i < nr_pages; i++) {
2084 struct page *page = pvec.pages[i];
2085
2086 index = page->index;
2087 if (index > end)
2088 break;
2089 index++;
2090
2091 BUG_ON(!PageLocked(page));
2092 BUG_ON(PageWriteback(page));
2093 BUG_ON(!page_has_buffers(page));
2094
2095 bh = page_buffers(page);
2096 head = bh;
2097
2098 /* skip blocks out of the range */
2099 do {
2100 if (cur_logical >= map->m_lblk)
2101 break;
2102 cur_logical++;
2103 } while ((bh = bh->b_this_page) != head);
2104 2103
2104 bh = page_bufs = page_buffers(page);
2105 block_start = 0;
2105 do { 2106 do {
2106 if (cur_logical >= map->m_lblk + blocks) 2107 if (!bh)
2107 break; 2108 goto skip_page;
2108 2109 if (map && (cur_logical >= map->m_lblk) &&
2109 if (buffer_delay(bh) || buffer_unwritten(bh)) { 2110 (cur_logical <= (map->m_lblk +
2110 2111 (map->m_len - 1)))) {
2111 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2112
2113 if (buffer_delay(bh)) { 2112 if (buffer_delay(bh)) {
2114 clear_buffer_delay(bh); 2113 clear_buffer_delay(bh);
2115 bh->b_blocknr = pblock; 2114 bh->b_blocknr = pblock;
2116 } else {
2117 /*
2118 * unwritten already should have
2119 * blocknr assigned. Verify that
2120 */
2121 clear_buffer_unwritten(bh);
2122 BUG_ON(bh->b_blocknr != pblock);
2123 } 2115 }
2116 if (buffer_unwritten(bh) ||
2117 buffer_mapped(bh))
2118 BUG_ON(bh->b_blocknr != pblock);
2119 if (map->m_flags & EXT4_MAP_UNINIT)
2120 set_buffer_uninit(bh);
2121 clear_buffer_unwritten(bh);
2122 }
2124 2123
2125 } else if (buffer_mapped(bh)) 2124 /* skip page if block allocation undone */
2126 BUG_ON(bh->b_blocknr != pblock); 2125 if (buffer_delay(bh) || buffer_unwritten(bh))
2127 2126 skip_page = 1;
2128 if (map->m_flags & EXT4_MAP_UNINIT) 2127 bh = bh->b_this_page;
2129 set_buffer_uninit(bh); 2128 block_start += bh->b_size;
2130 cur_logical++; 2129 cur_logical++;
2131 pblock++; 2130 pblock++;
2132 } while ((bh = bh->b_this_page) != head); 2131 } while (bh != page_bufs);
2132
2133 if (skip_page)
2134 goto skip_page;
2135
2136 if (commit_write)
2137 /* mark the buffer_heads as dirty & uptodate */
2138 block_commit_write(page, 0, len);
2139
2140 clear_page_dirty_for_io(page);
2141 /*
2142 * Delalloc doesn't support data journalling,
2143 * but eventually maybe we'll lift this
2144 * restriction.
2145 */
2146 if (unlikely(journal_data && PageChecked(page)))
2147 err = __ext4_journalled_writepage(page, len);
2148 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2149 err = ext4_bio_write_page(&io_submit, page,
2150 len, mpd->wbc);
2151 else
2152 err = block_write_full_page(page,
2153 noalloc_get_block_write, mpd->wbc);
2154
2155 if (!err)
2156 mpd->pages_written++;
2157 /*
2158 * In error case, we have to continue because
2159 * remaining pages are still locked
2160 */
2161 if (ret == 0)
2162 ret = err;
2133 } 2163 }
2134 pagevec_release(&pvec); 2164 pagevec_release(&pvec);
2135 } 2165 }
2166 ext4_io_submit(&io_submit);
2167 return ret;
2136} 2168}
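This rewrite folds two old passes into one: the removed mpage_put_bnr_to_bhs()
walk, which patched physical block numbers into delayed buffers, and the
per-page ->writepage() call. Condensed, the new per-page flow is roughly as
below; in_map_range() is a hypothetical shorthand for the m_lblk/m_len bounds
test spelled out in the hunk, and error bookkeeping is elided:

    if (!page_has_buffers(page)) {
            if (__block_write_begin(page, 0, len, noalloc_get_block_write))
                    goto skip_page;         /* cannot even attach buffers */
            commit_write = 1;
    }
    bh = page_bufs = page_buffers(page);
    do {
            if (map && in_map_range(cur_logical, map)) {
                    if (buffer_delay(bh)) {
                            clear_buffer_delay(bh);
                            bh->b_blocknr = pblock; /* real block number */
                    }
                    if (map->m_flags & EXT4_MAP_UNINIT)
                            set_buffer_uninit(bh);
                    clear_buffer_unwritten(bh);
            }
            /* any buffer still delayed/unwritten => allocation undone */
            if (buffer_delay(bh) || buffer_unwritten(bh))
                    skip_page = 1;
            cur_logical++;
            pblock++;
    } while ((bh = bh->b_this_page) != page_bufs);
    if (skip_page)
            goto skip_page;                 /* unlock and move on */
    if (commit_write)
            block_commit_write(page, 0, len);
    clear_page_dirty_for_io(page);
    if (journal_data && PageChecked(page))
            err = __ext4_journalled_writepage(page, len);
    else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
            err = ext4_bio_write_page(&io_submit, page, len, mpd->wbc);
    else
            err = block_write_full_page(page, noalloc_get_block_write,
                                        mpd->wbc);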
2137 2169
2138 2170static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
2139static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2140 sector_t logical, long blk_cnt)
2141{ 2171{
2142 int nr_pages, i; 2172 int nr_pages, i;
2143 pgoff_t index, end; 2173 pgoff_t index, end;
@@ -2145,9 +2175,8 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2145 struct inode *inode = mpd->inode; 2175 struct inode *inode = mpd->inode;
2146 struct address_space *mapping = inode->i_mapping; 2176 struct address_space *mapping = inode->i_mapping;
2147 2177
2148 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2178 index = mpd->first_page;
2149 end = (logical + blk_cnt - 1) >> 2179 end = mpd->next_page - 1;
2150 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2151 while (index <= end) { 2180 while (index <= end) {
2152 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2153 if (nr_pages == 0) 2182 if (nr_pages == 0)
@@ -2187,35 +2216,32 @@ static void ext4_print_free_blocks(struct inode *inode)
2187} 2216}
2188 2217
2189/* 2218/*
2190 * mpage_da_map_blocks - go through given space 2219 * mpage_da_map_and_submit - go through given space, map them
2220 * if necessary, and then submit them for I/O
2191 * 2221 *
2192 * @mpd - bh describing space 2222 * @mpd - bh describing space
2193 * 2223 *
2194 * The function skips space we know is already mapped to disk blocks. 2224 * The function skips space we know is already mapped to disk blocks.
2195 * 2225 *
2196 */ 2226 */
2197static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2227static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2198{ 2228{
2199 int err, blks, get_blocks_flags; 2229 int err, blks, get_blocks_flags;
2200 struct ext4_map_blocks map; 2230 struct ext4_map_blocks map, *mapp = NULL;
2201 sector_t next = mpd->b_blocknr; 2231 sector_t next = mpd->b_blocknr;
2202 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2232 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2203 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2233 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
2204 handle_t *handle = NULL; 2234 handle_t *handle = NULL;
2205 2235
2206 /* 2236 /*
2207 * We consider only non-mapped and non-allocated blocks 2237 * If the blocks are mapped already, or we couldn't accumulate
2238 * any blocks, then proceed immediately to the submission stage.
2208 */ 2239 */
2209 if ((mpd->b_state & (1 << BH_Mapped)) && 2240 if ((mpd->b_size == 0) ||
2210 !(mpd->b_state & (1 << BH_Delay)) && 2241 ((mpd->b_state & (1 << BH_Mapped)) &&
2211 !(mpd->b_state & (1 << BH_Unwritten))) 2242 !(mpd->b_state & (1 << BH_Delay)) &&
2212 return 0; 2243 !(mpd->b_state & (1 << BH_Unwritten))))
2213 2244 goto submit_io;
2214 /*
2215 * If we didn't accumulate anything to write simply return
2216 */
2217 if (!mpd->b_size)
2218 return 0;
2219 2245
2220 handle = ext4_journal_current_handle(); 2246 handle = ext4_journal_current_handle();
2221 BUG_ON(!handle); 2247 BUG_ON(!handle);
@@ -2231,7 +2257,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2231 * affects functions in many different parts of the allocation 2257 * affects functions in many different parts of the allocation
2232 * call path. This flag exists primarily because we don't 2258 * call path. This flag exists primarily because we don't
2233 * want to change *many* call functions, so ext4_map_blocks() 2259 * want to change *many* call functions, so ext4_map_blocks()
2234 * will set the magic i_delalloc_reserved_flag once the 2260 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
2235 * inode's allocation semaphore is taken. 2261 * inode's allocation semaphore is taken.
2236 * 2262 *
2237 * If the blocks in questions were delalloc blocks, set 2263 * If the blocks in questions were delalloc blocks, set
@@ -2252,17 +2278,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2252 2278
2253 err = blks; 2279 err = blks;
2254 /* 2280 /*
2255 * If get block returns with error we simply 2281 * If get block returns EAGAIN or ENOSPC and there
2256 * return. Later writepage will redirty the page and 2282 * appears to be free blocks we will just let
2257 * writepages will find the dirty page again 2283 * mpage_da_submit_io() unlock all of the pages.
2258 */ 2284 */
2259 if (err == -EAGAIN) 2285 if (err == -EAGAIN)
2260 return 0; 2286 goto submit_io;
2261 2287
2262 if (err == -ENOSPC && 2288 if (err == -ENOSPC &&
2263 ext4_count_free_blocks(sb)) { 2289 ext4_count_free_blocks(sb)) {
2264 mpd->retval = err; 2290 mpd->retval = err;
2265 return 0; 2291 goto submit_io;
2266 } 2292 }
2267 2293
2268 /* 2294 /*
@@ -2285,12 +2311,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2285 ext4_print_free_blocks(mpd->inode); 2311 ext4_print_free_blocks(mpd->inode);
2286 } 2312 }
2287 /* invalidate all the pages */ 2313 /* invalidate all the pages */
2288 ext4_da_block_invalidatepages(mpd, next, 2314 ext4_da_block_invalidatepages(mpd);
2289 mpd->b_size >> mpd->inode->i_blkbits); 2315
2290 return err; 2316 /* Mark this page range as having been completed */
2317 mpd->io_done = 1;
2318 return;
2291 } 2319 }
2292 BUG_ON(blks == 0); 2320 BUG_ON(blks == 0);
2293 2321
2322 mapp = &map;
2294 if (map.m_flags & EXT4_MAP_NEW) { 2323 if (map.m_flags & EXT4_MAP_NEW) {
2295 struct block_device *bdev = mpd->inode->i_sb->s_bdev; 2324 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2296 int i; 2325 int i;
@@ -2299,18 +2328,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2299 unmap_underlying_metadata(bdev, map.m_pblk + i); 2328 unmap_underlying_metadata(bdev, map.m_pblk + i);
2300 } 2329 }
2301 2330
2302 /*
2303 * If blocks are delayed marked, we need to
2304 * put actual blocknr and drop delayed bit
2305 */
2306 if ((mpd->b_state & (1 << BH_Delay)) ||
2307 (mpd->b_state & (1 << BH_Unwritten)))
2308 mpage_put_bnr_to_bhs(mpd, &map);
2309
2310 if (ext4_should_order_data(mpd->inode)) { 2331 if (ext4_should_order_data(mpd->inode)) {
2311 err = ext4_jbd2_file_inode(handle, mpd->inode); 2332 err = ext4_jbd2_file_inode(handle, mpd->inode);
2312 if (err) 2333 if (err)
2313 return err; 2334 /* This only happens if the journal is aborted */
2335 return;
2314 } 2336 }
2315 2337
2316 /* 2338 /*
@@ -2321,10 +2343,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2321 disksize = i_size_read(mpd->inode); 2343 disksize = i_size_read(mpd->inode);
2322 if (disksize > EXT4_I(mpd->inode)->i_disksize) { 2344 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
2323 ext4_update_i_disksize(mpd->inode, disksize); 2345 ext4_update_i_disksize(mpd->inode, disksize);
2324 return ext4_mark_inode_dirty(handle, mpd->inode); 2346 err = ext4_mark_inode_dirty(handle, mpd->inode);
2347 if (err)
2348 ext4_error(mpd->inode->i_sb,
2349 "Failed to mark inode %lu dirty",
2350 mpd->inode->i_ino);
2325 } 2351 }
2326 2352
2327 return 0; 2353submit_io:
2354 mpage_da_submit_io(mpd, mapp);
2355 mpd->io_done = 1;
2328} 2356}
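Where mpage_da_map_blocks() returned a status and left submission to its
callers, mpage_da_map_and_submit() owns the whole sequence: every path ends
either in submission or in invalidating the accumulated pages. A skeleton,
with the already-mapped test abbreviated as nothing_to_map() (a hypothetical
name; the hunk spells out the b_size/b_state checks):

    static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
    {
            struct ext4_map_blocks map, *mapp = NULL;

            if (nothing_to_map(mpd))        /* empty, or already mapped */
                    goto submit_io;

            blks = ext4_map_blocks(handle, mpd->inode, &map,
                                   get_blocks_flags);
            if (blks < 0) {
                    err = blks;
                    /* transient: leave mapp NULL so that submit_io just
                     * unlocks the pages for a later pass */
                    if (err == -EAGAIN ||
                        (err == -ENOSPC && ext4_count_free_blocks(sb)))
                            goto submit_io;
                    /* fatal: throw the pages away and give up the range */
                    ext4_da_block_invalidatepages(mpd);
                    mpd->io_done = 1;
                    return;
            }
            mapp = &map;
    submit_io:
            mpage_da_submit_io(mpd, mapp);
            mpd->io_done = 1;
    }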
2329 2357
2330#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2358#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -2401,9 +2429,7 @@ flush_it:
2401 * We couldn't merge the block to our extent, so we 2429 * We couldn't merge the block to our extent, so we
2402 * need to flush current extent and start new one 2430 * need to flush current extent and start new one
2403 */ 2431 */
2404 if (mpage_da_map_blocks(mpd) == 0) 2432 mpage_da_map_and_submit(mpd);
2405 mpage_da_submit_io(mpd);
2406 mpd->io_done = 1;
2407 return; 2433 return;
2408} 2434}
2409 2435
@@ -2413,104 +2439,6 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2413} 2439}
2414 2440
2415/* 2441/*
2416 * __mpage_da_writepage - finds extent of pages and blocks
2417 *
2418 * @page: page to consider
2419 * @wbc: not used, we just follow rules
2420 * @data: context
2421 *
2422 * The function finds extents of pages and scan them for all blocks.
2423 */
2424static int __mpage_da_writepage(struct page *page,
2425 struct writeback_control *wbc, void *data)
2426{
2427 struct mpage_da_data *mpd = data;
2428 struct inode *inode = mpd->inode;
2429 struct buffer_head *bh, *head;
2430 sector_t logical;
2431
2432 /*
2433 * Can we merge this page to current extent?
2434 */
2435 if (mpd->next_page != page->index) {
2436 /*
2437 * Nope, we can't. So, we map non-allocated blocks
2438 * and start IO on them using writepage()
2439 */
2440 if (mpd->next_page != mpd->first_page) {
2441 if (mpage_da_map_blocks(mpd) == 0)
2442 mpage_da_submit_io(mpd);
2443 /*
2444 * skip rest of the page in the page_vec
2445 */
2446 mpd->io_done = 1;
2447 redirty_page_for_writepage(wbc, page);
2448 unlock_page(page);
2449 return MPAGE_DA_EXTENT_TAIL;
2450 }
2451
2452 /*
2453 * Start next extent of pages ...
2454 */
2455 mpd->first_page = page->index;
2456
2457 /*
2458 * ... and blocks
2459 */
2460 mpd->b_size = 0;
2461 mpd->b_state = 0;
2462 mpd->b_blocknr = 0;
2463 }
2464
2465 mpd->next_page = page->index + 1;
2466 logical = (sector_t) page->index <<
2467 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2468
2469 if (!page_has_buffers(page)) {
2470 mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
2471 (1 << BH_Dirty) | (1 << BH_Uptodate));
2472 if (mpd->io_done)
2473 return MPAGE_DA_EXTENT_TAIL;
2474 } else {
2475 /*
2476 * Page with regular buffer heads, just add all dirty ones
2477 */
2478 head = page_buffers(page);
2479 bh = head;
2480 do {
2481 BUG_ON(buffer_locked(bh));
2482 /*
2483 * We need to try to allocate
2484 * unmapped blocks in the same page.
2485 * Otherwise we won't make progress
2486 * with the page in ext4_writepage
2487 */
2488 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2489 mpage_add_bh_to_extent(mpd, logical,
2490 bh->b_size,
2491 bh->b_state);
2492 if (mpd->io_done)
2493 return MPAGE_DA_EXTENT_TAIL;
2494 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2495 /*
2496 * mapped dirty buffer. We need to update
2497 * the b_state because we look at
2498 * b_state in mpage_da_map_blocks. We don't
2499 * update b_size because if we find an
2500 * unmapped buffer_head later we need to
2501 * use the b_state flag of that buffer_head.
2502 */
2503 if (mpd->b_size == 0)
2504 mpd->b_state = bh->b_state & BH_FLAGS;
2505 }
2506 logical++;
2507 } while ((bh = bh->b_this_page) != head);
2508 }
2509
2510 return 0;
2511}
2512
2513/*
2514 * This is a special get_blocks_t callback which is used by 2442 * This is a special get_blocks_t callback which is used by
2515 * ext4_da_write_begin(). It will either return mapped block or 2443 * ext4_da_write_begin(). It will either return mapped block or
2516 * reserve space for a single block. 2444 * reserve space for a single block.
@@ -2550,8 +2478,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2550 if (buffer_delay(bh)) 2478 if (buffer_delay(bh))
2551 return 0; /* Not sure this could or should happen */ 2479 return 0; /* Not sure this could or should happen */
2552 /* 2480 /*
2553 * XXX: __block_prepare_write() unmaps passed block, 2481 * XXX: __block_write_begin() unmaps passed block, is it OK?
2554 * is it OK?
2555 */ 2482 */
2556 ret = ext4_da_reserve_space(inode, iblock); 2483 ret = ext4_da_reserve_space(inode, iblock);
2557 if (ret) 2484 if (ret)
@@ -2583,7 +2510,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2583/* 2510/*
2584 * This function is used as a standard get_block_t calback function 2511 * This function is used as a standard get_block_t calback function
2585 * when there is no desire to allocate any blocks. It is used as a 2512 * when there is no desire to allocate any blocks. It is used as a
2586 * callback function for block_prepare_write() and block_write_full_page(). 2513 * callback function for block_write_begin() and block_write_full_page().
2587 * These functions should only try to map a single block at a time. 2514 * These functions should only try to map a single block at a time.
2588 * 2515 *
2589 * Since this function doesn't do block allocations even if the caller 2516 * Since this function doesn't do block allocations even if the caller
@@ -2623,6 +2550,7 @@ static int __ext4_journalled_writepage(struct page *page,
2623 int ret = 0; 2550 int ret = 0;
2624 int err; 2551 int err;
2625 2552
2553 ClearPageChecked(page);
2626 page_bufs = page_buffers(page); 2554 page_bufs = page_buffers(page);
2627 BUG_ON(!page_bufs); 2555 BUG_ON(!page_bufs);
2628 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 2556 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@ -2661,7 +2589,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2661 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2589 * because we should have holes filled from ext4_page_mkwrite(). We even don't
2662 * need to file the inode to the transaction's list in ordered mode because if 2590 * need to file the inode to the transaction's list in ordered mode because if
2663 * we are writing back data added by write(), the inode is already there and if 2591 * we are writing back data added by write(), the inode is already there and if
2664 * we are writing back data modified via mmap(), noone guarantees in which 2592 * we are writing back data modified via mmap(), no one guarantees in which
2665 * transaction the data will hit the disk. In case we are journaling data, we 2593 * transaction the data will hit the disk. In case we are journaling data, we
2666 * cannot start transaction directly because transaction start ranks above page 2594 * cannot start transaction directly because transaction start ranks above page
2667 * lock so we have to do some magic. 2595 * lock so we have to do some magic.
@@ -2700,84 +2628,57 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2700static int ext4_writepage(struct page *page, 2628static int ext4_writepage(struct page *page,
2701 struct writeback_control *wbc) 2629 struct writeback_control *wbc)
2702{ 2630{
2703 int ret = 0; 2631 int ret = 0, commit_write = 0;
2704 loff_t size; 2632 loff_t size;
2705 unsigned int len; 2633 unsigned int len;
2706 struct buffer_head *page_bufs = NULL; 2634 struct buffer_head *page_bufs = NULL;
2707 struct inode *inode = page->mapping->host; 2635 struct inode *inode = page->mapping->host;
2708 2636
2709 trace_ext4_writepage(inode, page); 2637 trace_ext4_writepage(page);
2710 size = i_size_read(inode); 2638 size = i_size_read(inode);
2711 if (page->index == size >> PAGE_CACHE_SHIFT) 2639 if (page->index == size >> PAGE_CACHE_SHIFT)
2712 len = size & ~PAGE_CACHE_MASK; 2640 len = size & ~PAGE_CACHE_MASK;
2713 else 2641 else
2714 len = PAGE_CACHE_SIZE; 2642 len = PAGE_CACHE_SIZE;
2715 2643
2716 if (page_has_buffers(page)) { 2644 /*
2717 page_bufs = page_buffers(page); 2645 * If the page does not have buffers (for whatever reason),
2718 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2646 * try to create them using __block_write_begin. If this
2719 ext4_bh_delay_or_unwritten)) { 2647 * fails, redirty the page and move on.
2720 /* 2648 */
2721 * We don't want to do block allocation 2649 if (!page_has_buffers(page)) {
2722 * So redirty the page and return 2650 if (__block_write_begin(page, 0, len,
2723 * We may reach here when we do a journal commit 2651 noalloc_get_block_write)) {
2724 * via journal_submit_inode_data_buffers. 2652 redirty_page:
2725 * If we don't have mapping block we just ignore
2726 * them. We can also reach here via shrink_page_list
2727 */
2728 redirty_page_for_writepage(wbc, page); 2653 redirty_page_for_writepage(wbc, page);
2729 unlock_page(page); 2654 unlock_page(page);
2730 return 0; 2655 return 0;
2731 } 2656 }
2732 } else { 2657 commit_write = 1;
2658 }
2659 page_bufs = page_buffers(page);
2660 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2661 ext4_bh_delay_or_unwritten)) {
2733 /* 2662 /*
2734 * The test for page_has_buffers() is subtle: 2663 * We don't want to do block allocation, so redirty
2735 * We know the page is dirty but it lost buffers. That means 2664 * the page and return. We may reach here when we do
2736 * that at some moment in time after write_begin()/write_end() 2665 * a journal commit via journal_submit_inode_data_buffers.
2737 * has been called all buffers have been clean and thus they 2666 * We can also reach here via shrink_page_list
2738 * must have been written at least once. So they are all
2739 * mapped and we can happily proceed with mapping them
2740 * and writing the page.
2741 *
2742 * Try to initialize the buffer_heads and check whether
2743 * all are mapped and non delay. We don't want to
2744 * do block allocation here.
2745 */ 2667 */
2746 ret = block_prepare_write(page, 0, len, 2668 goto redirty_page;
2747 noalloc_get_block_write); 2669 }
2748 if (!ret) { 2670 if (commit_write)
2749 page_bufs = page_buffers(page);
2750 /* check whether all are mapped and non delay */
2751 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2752 ext4_bh_delay_or_unwritten)) {
2753 redirty_page_for_writepage(wbc, page);
2754 unlock_page(page);
2755 return 0;
2756 }
2757 } else {
2758 /*
2759 * We can't do block allocation here
2760 * so just redity the page and unlock
2761 * and return
2762 */
2763 redirty_page_for_writepage(wbc, page);
2764 unlock_page(page);
2765 return 0;
2766 }
2767 /* now mark the buffer_heads as dirty and uptodate */ 2671 /* now mark the buffer_heads as dirty and uptodate */
2768 block_commit_write(page, 0, len); 2672 block_commit_write(page, 0, len);
2769 }
2770 2673
2771 if (PageChecked(page) && ext4_should_journal_data(inode)) { 2674 if (PageChecked(page) && ext4_should_journal_data(inode))
2772 /* 2675 /*
2773 * It's mmapped pagecache. Add buffers and journal it. There 2676 * It's mmapped pagecache. Add buffers and journal it. There
2774 * doesn't seem much point in redirtying the page here. 2677 * doesn't seem much point in redirtying the page here.
2775 */ 2678 */
2776 ClearPageChecked(page);
2777 return __ext4_journalled_writepage(page, len); 2679 return __ext4_journalled_writepage(page, len);
2778 }
2779 2680
2780 if (page_bufs && buffer_uninit(page_bufs)) { 2681 if (buffer_uninit(page_bufs)) {
2781 ext4_set_bh_endio(page_bufs, inode); 2682 ext4_set_bh_endio(page_bufs, inode);
2782 ret = block_write_full_page_endio(page, noalloc_get_block_write, 2683 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2783 wbc, ext4_end_io_buffer_write); 2684 wbc, ext4_end_io_buffer_write);
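The rewrite collapses three separate redirty-and-return blocks into a single
label, and ClearPageChecked() moves into __ext4_journalled_writepage() itself
(see the earlier hunk). Buffers are created on demand with
__block_write_begin(), handed a get_block callback that never allocates, and
any page still holding delayed or unwritten buffers is simply redirtied for a
later pass. In outline (label placement rearranged for readability):

    if (!page_has_buffers(page)) {
            if (__block_write_begin(page, 0, len, noalloc_get_block_write))
                    goto redirty_page;      /* can't map without allocating */
            commit_write = 1;
    }
    page_bufs = page_buffers(page);
    if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
                          ext4_bh_delay_or_unwritten))
            goto redirty_page;              /* needs allocation; not here */
    if (commit_write)
            block_commit_write(page, 0, len); /* dirty + uptodate buffers */
    if (PageChecked(page) && ext4_should_journal_data(inode))
            return __ext4_journalled_writepage(page, len);
    /* ... otherwise submit via the endio-aware or plain writeout path ... */

    redirty_page:
            redirty_page_for_writepage(wbc, page);
            unlock_page(page);
            return 0;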
@@ -2790,7 +2691,7 @@ static int ext4_writepage(struct page *page,
2790 2691
2791/* 2692/*
2792 * This is called via ext4_da_writepages() to 2693 * This is called via ext4_da_writepages() to
2793 * calulate the total number of credits to reserve to fit 2694 * calculate the total number of credits to reserve to fit
2794 * a single extent allocation into a single transaction, 2695 * a single extent allocation into a single transaction,
2795 * ext4_da_writpeages() will loop calling this before 2696 * ext4_da_writpeages() will loop calling this before
2796 * the block allocation. 2697 * the block allocation.
@@ -2815,37 +2716,42 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2815 2716
2816/* 2717/*
2817 * write_cache_pages_da - walk the list of dirty pages of the given 2718 * write_cache_pages_da - walk the list of dirty pages of the given
2818 * address space and call the callback function (which usually writes 2719 * address space and accumulate pages that need writing, and call
2819 * the pages). 2720 * mpage_da_map_and_submit to map a single contiguous memory region
2820 * 2721 * and then write them.
2821 * This is a forked version of write_cache_pages(). Differences:
2822 * Range cyclic is ignored.
2823 * no_nrwrite_index_update is always presumed true
2824 */ 2722 */
2825static int write_cache_pages_da(struct address_space *mapping, 2723static int write_cache_pages_da(struct address_space *mapping,
2826 struct writeback_control *wbc, 2724 struct writeback_control *wbc,
2827 struct mpage_da_data *mpd) 2725 struct mpage_da_data *mpd,
2726 pgoff_t *done_index)
2828{ 2727{
2829 int ret = 0; 2728 struct buffer_head *bh, *head;
2830 int done = 0; 2729 struct inode *inode = mapping->host;
2831 struct pagevec pvec; 2730 struct pagevec pvec;
2832 int nr_pages; 2731 unsigned int nr_pages;
2833 pgoff_t index; 2732 sector_t logical;
2834 pgoff_t end; /* Inclusive */ 2733 pgoff_t index, end;
2835 long nr_to_write = wbc->nr_to_write; 2734 long nr_to_write = wbc->nr_to_write;
2836 2735 int i, tag, ret = 0;
2736
2737 memset(mpd, 0, sizeof(struct mpage_da_data));
2738 mpd->wbc = wbc;
2739 mpd->inode = inode;
2837 pagevec_init(&pvec, 0); 2740 pagevec_init(&pvec, 0);
2838 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2741 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2839 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2742 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2840 2743
2841 while (!done && (index <= end)) { 2744 if (wbc->sync_mode == WB_SYNC_ALL)
2842 int i; 2745 tag = PAGECACHE_TAG_TOWRITE;
2746 else
2747 tag = PAGECACHE_TAG_DIRTY;
2843 2748
2844 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2749 *done_index = index;
2845 PAGECACHE_TAG_DIRTY, 2750 while (index <= end) {
2751 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2846 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2752 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2847 if (nr_pages == 0) 2753 if (nr_pages == 0)
2848 break; 2754 return 0;
2849 2755
2850 for (i = 0; i < nr_pages; i++) { 2756 for (i = 0; i < nr_pages; i++) {
2851 struct page *page = pvec.pages[i]; 2757 struct page *page = pvec.pages[i];
@@ -2857,58 +2763,98 @@ static int write_cache_pages_da(struct address_space *mapping,
2857 * mapping. However, page->index will not change 2763 * mapping. However, page->index will not change
2858 * because we have a reference on the page. 2764 * because we have a reference on the page.
2859 */ 2765 */
2860 if (page->index > end) { 2766 if (page->index > end)
2861 done = 1; 2767 goto out;
2862 break; 2768
2769 *done_index = page->index + 1;
2770
2771 /*
2772 * If we can't merge this page, and we have
2773 * accumulated an contiguous region, write it
2774 */
2775 if ((mpd->next_page != page->index) &&
2776 (mpd->next_page != mpd->first_page)) {
2777 mpage_da_map_and_submit(mpd);
2778 goto ret_extent_tail;
2863 } 2779 }
2864 2780
2865 lock_page(page); 2781 lock_page(page);
2866 2782
2867 /* 2783 /*
2868 * Page truncated or invalidated. We can freely skip it 2784 * If the page is no longer dirty, or its
2869 * then, even for data integrity operations: the page 2785 * mapping no longer corresponds to inode we
2870 * has disappeared concurrently, so there could be no 2786 * are writing (which means it has been
2871 * real expectation of this data interity operation 2787 * truncated or invalidated), or the page is
2872 * even if there is now a new, dirty page at the same 2788 * already under writeback and we are not
2873 * pagecache address. 2789 * doing a data integrity writeback, skip the page
2874 */ 2790 */
2875 if (unlikely(page->mapping != mapping)) { 2791 if (!PageDirty(page) ||
2876continue_unlock: 2792 (PageWriteback(page) &&
2793 (wbc->sync_mode == WB_SYNC_NONE)) ||
2794 unlikely(page->mapping != mapping)) {
2877 unlock_page(page); 2795 unlock_page(page);
2878 continue; 2796 continue;
2879 } 2797 }
2880 2798
2881 if (!PageDirty(page)) { 2799 wait_on_page_writeback(page);
2882 /* someone wrote it for us */
2883 goto continue_unlock;
2884 }
2885
2886 if (PageWriteback(page)) {
2887 if (wbc->sync_mode != WB_SYNC_NONE)
2888 wait_on_page_writeback(page);
2889 else
2890 goto continue_unlock;
2891 }
2892
2893 BUG_ON(PageWriteback(page)); 2800 BUG_ON(PageWriteback(page));
2894 if (!clear_page_dirty_for_io(page))
2895 goto continue_unlock;
2896 2801
2897 ret = __mpage_da_writepage(page, wbc, mpd); 2802 if (mpd->next_page != page->index)
2898 if (unlikely(ret)) { 2803 mpd->first_page = page->index;
2899 if (ret == AOP_WRITEPAGE_ACTIVATE) { 2804 mpd->next_page = page->index + 1;
2900 unlock_page(page); 2805 logical = (sector_t) page->index <<
2901 ret = 0; 2806 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2902 } else { 2807
2903 done = 1; 2808 if (!page_has_buffers(page)) {
2904 break; 2809 mpage_add_bh_to_extent(mpd, logical,
2905 } 2810 PAGE_CACHE_SIZE,
2811 (1 << BH_Dirty) | (1 << BH_Uptodate));
2812 if (mpd->io_done)
2813 goto ret_extent_tail;
2814 } else {
2815 /*
2816 * Page with regular buffer heads,
2817 * just add all dirty ones
2818 */
2819 head = page_buffers(page);
2820 bh = head;
2821 do {
2822 BUG_ON(buffer_locked(bh));
2823 /*
2824 * We need to try to allocate
2825 * unmapped blocks in the same page.
2826 * Otherwise we won't make progress
2827 * with the page in ext4_writepage
2828 */
2829 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2830 mpage_add_bh_to_extent(mpd, logical,
2831 bh->b_size,
2832 bh->b_state);
2833 if (mpd->io_done)
2834 goto ret_extent_tail;
2835 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2836 /*
2837 * mapped dirty buffer. We need
2838 * to update the b_state
2839 * because we look at b_state
2840 * in mpage_da_map_blocks. We
2841 * don't update b_size because
2842 * if we find an unmapped
2843 * buffer_head later we need to
2844 * use the b_state flag of that
2845 * buffer_head.
2846 */
2847 if (mpd->b_size == 0)
2848 mpd->b_state = bh->b_state & BH_FLAGS;
2849 }
2850 logical++;
2851 } while ((bh = bh->b_this_page) != head);
2906 } 2852 }
2907 2853
2908 if (nr_to_write > 0) { 2854 if (nr_to_write > 0) {
2909 nr_to_write--; 2855 nr_to_write--;
2910 if (nr_to_write == 0 && 2856 if (nr_to_write == 0 &&
2911 wbc->sync_mode == WB_SYNC_NONE) { 2857 wbc->sync_mode == WB_SYNC_NONE)
2912 /* 2858 /*
2913 * We stop writing back only if we are 2859 * We stop writing back only if we are
2914 * not doing integrity sync. In case of 2860 * not doing integrity sync. In case of
@@ -2919,14 +2865,18 @@ continue_unlock:
2919 * pages, but have not synced all of the 2865 * pages, but have not synced all of the
2920 * old dirty pages. 2866 * old dirty pages.
2921 */ 2867 */
2922 done = 1; 2868 goto out;
2923 break;
2924 }
2925 } 2869 }
2926 } 2870 }
2927 pagevec_release(&pvec); 2871 pagevec_release(&pvec);
2928 cond_resched(); 2872 cond_resched();
2929 } 2873 }
2874 return 0;
2875ret_extent_tail:
2876 ret = MPAGE_DA_EXTENT_TAIL;
2877out:
2878 pagevec_release(&pvec);
2879 cond_resched();
2930 return ret; 2880 return ret;
2931} 2881}
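Two behavioural points in this fork of write_cache_pages(): the radix-tree
lookup tag now depends on the sync mode (WB_SYNC_ALL walks pages pre-tagged
TOWRITE, so pages dirtied during the sweep are not dragged in), and the skip
tests are consolidated into one condition. Directly from the hunk:

    if (wbc->sync_mode == WB_SYNC_ALL)
            tag = PAGECACHE_TAG_TOWRITE;
    else
            tag = PAGECACHE_TAG_DIRTY;
    /* ... for each page found by pagevec_lookup_tag() ... */
    lock_page(page);
    if (!PageDirty(page) ||                   /* someone wrote it for us */
        (PageWriteback(page) &&
         (wbc->sync_mode == WB_SYNC_NONE)) || /* best effort: don't wait */
        unlikely(page->mapping != mapping)) { /* truncated or invalidated */
            unlock_page(page);
            continue;
    }
    wait_on_page_writeback(page);             /* integrity sync waits */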
2932 2882
@@ -2940,13 +2890,14 @@ static int ext4_da_writepages(struct address_space *mapping,
2940 struct mpage_da_data mpd; 2890 struct mpage_da_data mpd;
2941 struct inode *inode = mapping->host; 2891 struct inode *inode = mapping->host;
2942 int pages_written = 0; 2892 int pages_written = 0;
2943 long pages_skipped;
2944 unsigned int max_pages; 2893 unsigned int max_pages;
2945 int range_cyclic, cycled = 1, io_done = 0; 2894 int range_cyclic, cycled = 1, io_done = 0;
2946 int needed_blocks, ret = 0; 2895 int needed_blocks, ret = 0;
2947 long desired_nr_to_write, nr_to_writebump = 0; 2896 long desired_nr_to_write, nr_to_writebump = 0;
2948 loff_t range_start = wbc->range_start; 2897 loff_t range_start = wbc->range_start;
2949 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2898 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2899 pgoff_t done_index = 0;
2900 pgoff_t end;
2950 2901
2951 trace_ext4_da_writepages(inode, wbc); 2902 trace_ext4_da_writepages(inode, wbc);
2952 2903
@@ -2982,8 +2933,11 @@ static int ext4_da_writepages(struct address_space *mapping,
2982 wbc->range_start = index << PAGE_CACHE_SHIFT; 2933 wbc->range_start = index << PAGE_CACHE_SHIFT;
2983 wbc->range_end = LLONG_MAX; 2934 wbc->range_end = LLONG_MAX;
2984 wbc->range_cyclic = 0; 2935 wbc->range_cyclic = 0;
2985 } else 2936 end = -1;
2937 } else {
2986 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2938 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2939 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2940 }
2987 2941
2988 /* 2942 /*
2989 * This works around two forms of stupidity. The first is in 2943 * This works around two forms of stupidity. The first is in
@@ -3002,9 +2956,12 @@ static int ext4_da_writepages(struct address_space *mapping,
3002 * sbi->max_writeback_mb_bump whichever is smaller. 2956 * sbi->max_writeback_mb_bump whichever is smaller.
3003 */ 2957 */
3004 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); 2958 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
3005 if (!range_cyclic && range_whole) 2959 if (!range_cyclic && range_whole) {
3006 desired_nr_to_write = wbc->nr_to_write * 8; 2960 if (wbc->nr_to_write == LONG_MAX)
3007 else 2961 desired_nr_to_write = wbc->nr_to_write;
2962 else
2963 desired_nr_to_write = wbc->nr_to_write * 8;
2964 } else
3008 desired_nr_to_write = ext4_num_dirty_pages(inode, index, 2965 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
3009 max_pages); 2966 max_pages);
3010 if (desired_nr_to_write > max_pages) 2967 if (desired_nr_to_write > max_pages)
@@ -3015,12 +2972,10 @@ static int ext4_da_writepages(struct address_space *mapping,
3015 wbc->nr_to_write = desired_nr_to_write; 2972 wbc->nr_to_write = desired_nr_to_write;
3016 } 2973 }
3017 2974
3018 mpd.wbc = wbc;
3019 mpd.inode = mapping->host;
3020
3021 pages_skipped = wbc->pages_skipped;
3022
3023retry: 2975retry:
2976 if (wbc->sync_mode == WB_SYNC_ALL)
2977 tag_pages_for_writeback(mapping, index, end);
2978
3024 while (!ret && wbc->nr_to_write > 0) { 2979 while (!ret && wbc->nr_to_write > 0) {
3025 2980
3026 /* 2981 /*
@@ -3043,32 +2998,18 @@ retry:
3043 } 2998 }
3044 2999
3045 /* 3000 /*
3046 * Now call __mpage_da_writepage to find the next 3001 * Now call write_cache_pages_da() to find the next
3047 * contiguous region of logical blocks that need 3002 * contiguous region of logical blocks that need
3048 * blocks to be allocated by ext4. We don't actually 3003 * blocks to be allocated by ext4 and submit them.
3049 * submit the blocks for I/O here, even though
3050 * write_cache_pages thinks it will, and will set the
3051 * pages as clean for write before calling
3052 * __mpage_da_writepage().
3053 */ 3004 */
3054 mpd.b_size = 0; 3005 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3055 mpd.b_state = 0;
3056 mpd.b_blocknr = 0;
3057 mpd.first_page = 0;
3058 mpd.next_page = 0;
3059 mpd.io_done = 0;
3060 mpd.pages_written = 0;
3061 mpd.retval = 0;
3062 ret = write_cache_pages_da(mapping, wbc, &mpd);
3063 /* 3006 /*
3064 * If we have a contiguous extent of pages and we 3007 * If we have a contiguous extent of pages and we
3065 * haven't done the I/O yet, map the blocks and submit 3008 * haven't done the I/O yet, map the blocks and submit
3066 * them for I/O. 3009 * them for I/O.
3067 */ 3010 */
3068 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 3011 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3069 if (mpage_da_map_blocks(&mpd) == 0) 3012 mpage_da_map_and_submit(&mpd);
3070 mpage_da_submit_io(&mpd);
3071 mpd.io_done = 1;
3072 ret = MPAGE_DA_EXTENT_TAIL; 3013 ret = MPAGE_DA_EXTENT_TAIL;
3073 } 3014 }
3074 trace_ext4_da_write_pages(inode, &mpd); 3015 trace_ext4_da_write_pages(inode, &mpd);
@@ -3082,7 +3023,6 @@ retry:
3082 * and try again 3023 * and try again
3083 */ 3024 */
3084 jbd2_journal_force_commit_nested(sbi->s_journal); 3025 jbd2_journal_force_commit_nested(sbi->s_journal);
3085 wbc->pages_skipped = pages_skipped;
3086 ret = 0; 3026 ret = 0;
3087 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 3027 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
3088 /* 3028 /*
@@ -3090,7 +3030,6 @@ retry:
3090 * rest of the pages 3030 * rest of the pages
3091 */ 3031 */
3092 pages_written += mpd.pages_written; 3032 pages_written += mpd.pages_written;
3093 wbc->pages_skipped = pages_skipped;
3094 ret = 0; 3033 ret = 0;
3095 io_done = 1; 3034 io_done = 1;
3096 } else if (wbc->nr_to_write) 3035 } else if (wbc->nr_to_write)
@@ -3108,21 +3047,15 @@ retry:
3108 wbc->range_end = mapping->writeback_index - 1; 3047 wbc->range_end = mapping->writeback_index - 1;
3109 goto retry; 3048 goto retry;
3110 } 3049 }
3111 if (pages_skipped != wbc->pages_skipped)
3112 ext4_msg(inode->i_sb, KERN_CRIT,
3113 "This should not happen leaving %s "
3114 "with nr_to_write = %ld ret = %d",
3115 __func__, wbc->nr_to_write, ret);
3116 3050
3117 /* Update index */ 3051 /* Update index */
3118 index += pages_written;
3119 wbc->range_cyclic = range_cyclic; 3052 wbc->range_cyclic = range_cyclic;
3120 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 3053 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
3121 /* 3054 /*
3122 * set the writeback_index so that range_cyclic 3055 * set the writeback_index so that range_cyclic
3123 * mode will write it back later 3056 * mode will write it back later
3124 */ 3057 */
3125 mapping->writeback_index = index; 3058 mapping->writeback_index = done_index;
3126 3059
3127out_writepages: 3060out_writepages:
3128 wbc->nr_to_write -= nr_to_writebump; 3061 wbc->nr_to_write -= nr_to_writebump;
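Two related changes close the loop with the tagged walker above: in
data-integrity mode the dirty pages are tagged once with
tag_pages_for_writeback() before the transaction loop, pairing with the
PAGECACHE_TAG_TOWRITE lookup in write_cache_pages_da(), and the cyclic
writeback resume point is now the done_index reported back by the walker
rather than a locally maintained page count. Schematically:

    retry:
            if (wbc->sync_mode == WB_SYNC_ALL)
                    tag_pages_for_writeback(mapping, index, end);
            while (!ret && wbc->nr_to_write > 0) {
                    /* start a transaction, then walk and submit pages */
                    ret = write_cache_pages_da(mapping, wbc, &mpd,
                                               &done_index);
                    /* map any trailing extent; on ENOSPC force a commit
                     * and retry */
            }
            /* ... */
            if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                    mapping->writeback_index = done_index;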
@@ -3367,10 +3300,10 @@ int ext4_alloc_da_blocks(struct inode *inode)
3367 * doing I/O at all. 3300 * doing I/O at all.
3368 * 3301 *
3369 * We could call write_cache_pages(), and then redirty all of 3302 * We could call write_cache_pages(), and then redirty all of
3370 * the pages by calling redirty_page_for_writeback() but that 3303 * the pages by calling redirty_page_for_writepage() but that
3371 * would be ugly in the extreme. So instead we would need to 3304 * would be ugly in the extreme. So instead we would need to
3372 * replicate parts of the code in the above functions, 3305 * replicate parts of the code in the above functions,
3373 * simplifying them becuase we wouldn't actually intend to 3306 * simplifying them because we wouldn't actually intend to
3374 * write out the pages, but rather only collect contiguous 3307 * write out the pages, but rather only collect contiguous
3375 * logical block extents, call the multi-block allocator, and 3308 * logical block extents, call the multi-block allocator, and
3376 * then update the buffer heads with the block allocations. 3309 * then update the buffer heads with the block allocations.
@@ -3447,6 +3380,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3447 3380
3448static int ext4_readpage(struct file *file, struct page *page) 3381static int ext4_readpage(struct file *file, struct page *page)
3449{ 3382{
3383 trace_ext4_readpage(page);
3450 return mpage_readpage(page, ext4_get_block); 3384 return mpage_readpage(page, ext4_get_block);
3451} 3385}
3452 3386
@@ -3457,15 +3391,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3457 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3391 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3458} 3392}
3459 3393
3460static void ext4_free_io_end(ext4_io_end_t *io)
3461{
3462 BUG_ON(!io);
3463 if (io->page)
3464 put_page(io->page);
3465 iput(io->inode);
3466 kfree(io);
3467}
3468
3469static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) 3394static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3470{ 3395{
3471 struct buffer_head *head, *bh; 3396 struct buffer_head *head, *bh;
@@ -3490,6 +3415,8 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
3490{ 3415{
3491 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3416 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3492 3417
3418 trace_ext4_invalidatepage(page, offset);
3419
3493 /* 3420 /*
3494 * free any io_end structure allocated for buffers to be discarded 3421 * free any io_end structure allocated for buffers to be discarded
3495 */ 3422 */
@@ -3511,6 +3438,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3511{ 3438{
3512 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3439 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3513 3440
3441 trace_ext4_releasepage(page);
3442
3514 WARN_ON(PageChecked(page)); 3443 WARN_ON(PageChecked(page));
3515 if (!page_has_buffers(page)) 3444 if (!page_has_buffers(page))
3516 return 0; 3445 return 0;
@@ -3582,7 +3511,7 @@ retry:
3582 loff_t end = offset + iov_length(iov, nr_segs); 3511 loff_t end = offset + iov_length(iov, nr_segs);
3583 3512
3584 if (end > isize) 3513 if (end > isize)
3585 vmtruncate(inode, isize); 3514 ext4_truncate_failed_write(inode);
3586 } 3515 }
3587 } 3516 }
3588 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3517 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3642,173 +3571,6 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3642 EXT4_GET_BLOCKS_IO_CREATE_EXT); 3571 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3643} 3572}
3644 3573
3645static void dump_completed_IO(struct inode * inode)
3646{
3647#ifdef EXT4_DEBUG
3648 struct list_head *cur, *before, *after;
3649 ext4_io_end_t *io, *io0, *io1;
3650 unsigned long flags;
3651
3652 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3653 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
3654 return;
3655 }
3656
3657 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3658 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3659 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3660 cur = &io->list;
3661 before = cur->prev;
3662 io0 = container_of(before, ext4_io_end_t, list);
3663 after = cur->next;
3664 io1 = container_of(after, ext4_io_end_t, list);
3665
3666 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3667 io, inode->i_ino, io0, io1);
3668 }
3669 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3670#endif
3671}
3672
3673/*
3674 * check a range of space and convert unwritten extents to written.
3675 */
3676static int ext4_end_io_nolock(ext4_io_end_t *io)
3677{
3678 struct inode *inode = io->inode;
3679 loff_t offset = io->offset;
3680 ssize_t size = io->size;
3681 int ret = 0;
3682
3683 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3684 "list->prev 0x%p\n",
3685 io, inode->i_ino, io->list.next, io->list.prev);
3686
3687 if (list_empty(&io->list))
3688 return ret;
3689
3690 if (io->flag != EXT4_IO_UNWRITTEN)
3691 return ret;
3692
3693 ret = ext4_convert_unwritten_extents(inode, offset, size);
3694 if (ret < 0) {
3695 printk(KERN_EMERG "%s: failed to convert unwritten "
3696 "extents to written extents, error is %d, "
3697 "io is still on inode %lu aio dio list\n",
3698 __func__, ret, inode->i_ino);
3699 return ret;
3700 }
3701
3702 if (io->iocb)
3703 aio_complete(io->iocb, io->result, 0);
3704 /* clear the DIO AIO unwritten flag */
3705 io->flag = 0;
3706 return ret;
3707}
3708
3709/*
3710 * work on completed aio dio IO, to convert unwritten extents to written extents
3711 */
3712static void ext4_end_io_work(struct work_struct *work)
3713{
3714 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3715 struct inode *inode = io->inode;
3716 struct ext4_inode_info *ei = EXT4_I(inode);
3717 unsigned long flags;
3718 int ret;
3719
3720 mutex_lock(&inode->i_mutex);
3721 ret = ext4_end_io_nolock(io);
3722 if (ret < 0) {
3723 mutex_unlock(&inode->i_mutex);
3724 return;
3725 }
3726
3727 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3728 if (!list_empty(&io->list))
3729 list_del_init(&io->list);
3730 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3731 mutex_unlock(&inode->i_mutex);
3732 ext4_free_io_end(io);
3733}
3734
3735/*
3736 * This function is called from ext4_sync_file().
3737 *
3738 * When IO is completed, the work to convert unwritten extents to
3739 * written is queued on workqueue but may not get immediately
3740 * scheduled. When fsync is called, we need to ensure the
3741 * conversion is complete before fsync returns.
3742 * The inode keeps track of a list of pending/completed IO that
3743 * might need to do the conversion. This function walks through
3744 * the list and converts the related unwritten extents for completed IO
3745 * to written.
3746 * The function returns 0 on success and a negative error code on failure.
3747 */
3748int flush_completed_IO(struct inode *inode)
3749{
3750 ext4_io_end_t *io;
3751 struct ext4_inode_info *ei = EXT4_I(inode);
3752 unsigned long flags;
3753 int ret = 0;
3754 int ret2 = 0;
3755
3756 if (list_empty(&ei->i_completed_io_list))
3757 return ret;
3758
3759 dump_completed_IO(inode);
3760 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3761 while (!list_empty(&ei->i_completed_io_list)){
3762 io = list_entry(ei->i_completed_io_list.next,
3763 ext4_io_end_t, list);
3764 /*
3765 * Calling ext4_end_io_nolock() to convert completed
3766 * IO to written.
3767 *
3768 * When ext4_sync_file() is called, run_queue() may already
3769 * be about to flush the work corresponding to this io structure.
3770 * It will be upset if it finds that the io structure related
3771 * to the work to be scheduled has been freed.
3772 *
3773 * Thus we need to keep the io structure valid here after the
3774 * conversion has finished. The io structure has a flag to
3775 * avoid double converting from both fsync and background work
3776 * queue work.
3777 */
3778 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3779 ret = ext4_end_io_nolock(io);
3780 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3781 if (ret < 0)
3782 ret2 = ret;
3783 else
3784 list_del_init(&io->list);
3785 }
3786 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3787 return (ret2 < 0) ? ret2 : 0;
3788}
3789
3790static ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
3791{
3792 ext4_io_end_t *io = NULL;
3793
3794 io = kmalloc(sizeof(*io), flags);
3795
3796 if (io) {
3797 igrab(inode);
3798 io->inode = inode;
3799 io->flag = 0;
3800 io->offset = 0;
3801 io->size = 0;
3802 io->page = NULL;
3803 io->iocb = NULL;
3804 io->result = 0;
3805 INIT_WORK(&io->work, ext4_end_io_work);
3806 INIT_LIST_HEAD(&io->list);
3807 }
3808
3809 return io;
3810}
3811
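The io_end helpers deleted above (ext4_free_io_end, ext4_end_io_nolock, ext4_end_io_work, flush_completed_IO, ext4_init_io_end) are not simply dropped; in this series they appear to move to fs/ext4/page-io.c. The locking shape flush_completed_IO() used is still worth noting: the spinlock is released around the blocking conversion and retaken before touching the list again. A user-space pthread analogue, all names illustrative:

    #include <pthread.h>
    #include <stddef.h>

    struct io_end_sketch {
            struct io_end_sketch *next;
            int (*convert)(struct io_end_sketch *);  /* may block */
    };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct io_end_sketch *completed_list;

    static int flush_completed(void)
    {
            int err, ret = 0;

            pthread_mutex_lock(&list_lock);
            while (completed_list) {
                    struct io_end_sketch *io = completed_list;
                    struct io_end_sketch **pp;

                    /* Drop the lock around the blocking conversion; new
                     * completions may be appended to the list meanwhile. */
                    pthread_mutex_unlock(&list_lock);
                    err = io->convert(io);
                    pthread_mutex_lock(&list_lock);
                    if (err < 0) {
                            ret = err;
                            break;  /* leave the entry in place for a retry */
                    }
                    /* unlink io wherever it sits now */
                    for (pp = &completed_list; *pp && *pp != io; )
                            pp = &(*pp)->next;
                    if (*pp)
                            *pp = io->next;
            }
            pthread_mutex_unlock(&list_lock);
            return ret;
    }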
3812static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3574static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3813 ssize_t size, void *private, int ret, 3575 ssize_t size, void *private, int ret,
3814 bool is_async) 3576 bool is_async)
@@ -3828,7 +3590,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3828 size); 3590 size);
3829 3591
3830 /* if not aio dio with unwritten extents, just free io and return */ 3592 /* if not aio dio with unwritten extents, just free io and return */
3831 if (io_end->flag != EXT4_IO_UNWRITTEN){ 3593 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3832 ext4_free_io_end(io_end); 3594 ext4_free_io_end(io_end);
3833 iocb->private = NULL; 3595 iocb->private = NULL;
3834out: 3596out:
@@ -3845,14 +3607,14 @@ out:
3845 } 3607 }
3846 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3608 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3847 3609
3848 /* queue the work to convert unwritten extents to written */
3849 queue_work(wq, &io_end->work);
3850
3851 /* Add the io_end to per-inode completed aio dio list */ 3610 /* Add the io_end to per-inode completed aio dio list */
3852 ei = EXT4_I(io_end->inode); 3611 ei = EXT4_I(io_end->inode);
3853 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 3612 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3854 list_add_tail(&io_end->list, &ei->i_completed_io_list); 3613 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3855 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 3614 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3615
3616 /* queue the work to convert unwritten extents to written */
3617 queue_work(wq, &io_end->work);
3856 iocb->private = NULL; 3618 iocb->private = NULL;
3857} 3619}
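The reorder in the hunk above follows the publish-then-wake rule: once queue_work() has run, the worker may execute immediately on another CPU and consume (and free) the io_end, so linking it into the per-inode list afterwards would race with that free. The same rule as a minimal pthread sketch, names illustrative:

    #include <pthread.h>
    #include <stddef.h>

    struct work_item {
            struct work_item *next;
    };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t kicked = PTHREAD_COND_INITIALIZER;
    static struct work_item *pending;

    static void submit(struct work_item *w)
    {
            pthread_mutex_lock(&lock);
            w->next = pending;
            pending = w;                    /* 1: publish on the list */
            pthread_mutex_unlock(&lock);
            pthread_cond_signal(&kicked);   /* 2: only then wake the worker */
    }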
3858 3620
@@ -3873,7 +3635,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3873 goto out; 3635 goto out;
3874 } 3636 }
3875 3637
3876 io_end->flag = EXT4_IO_UNWRITTEN; 3638 io_end->flag = EXT4_IO_END_UNWRITTEN;
3877 inode = io_end->inode; 3639 inode = io_end->inode;
3878 3640
3879 /* Add the io_end to per-inode completed io list*/ 3641 /* Add the io_end to per-inode completed io list*/
@@ -3901,8 +3663,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3901retry: 3663retry:
3902 io_end = ext4_init_io_end(inode, GFP_ATOMIC); 3664 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3903 if (!io_end) { 3665 if (!io_end) {
3904 if (printk_ratelimit()) 3666 pr_warn_ratelimited("%s: allocation fail\n", __func__);
3905 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3906 schedule(); 3667 schedule();
3907 goto retry; 3668 goto retry;
3908 } 3669 }
@@ -3926,13 +3687,13 @@ retry:
3926 * preallocated extents, and those writes extend the file, no need to 3687 * preallocated extents, and those writes extend the file, no need to
3927 * fall back to buffered IO. 3688 * fall back to buffered IO.
3928 * 3689 *
3929 * For holes, we fallocate those blocks, mark them as unintialized 3690 * For holes, we fallocate those blocks, mark them as uninitialized
3930 * If those blocks were preallocated, we make sure they are split, but 3691 * If those blocks were preallocated, we make sure they are split, but
3931 * still keep the range to write as unintialized. 3692 * still keep the range to write as uninitialized.
3932 * 3693 *
3933 * The unwritten extents will be converted to written when DIO is completed. 3694 * The unwritten extents will be converted to written when DIO is completed.
3934 * For async direct IO, since the IO may still be pending when we return, we 3695 * For async direct IO, since the IO may still be pending when we return, we
3935 * set up an end_io callback function, which will do the conversion 3696 * set up an end_io callback function, which will do the conversion
3936 * when the async direct IO is completed. 3697 * when the async direct IO is completed.
3937 * 3698 *
3938 * If the O_DIRECT write will extend the file then add this inode to the 3699 * If the O_DIRECT write will extend the file then add this inode to the
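The comment block above describes a small extent state machine. A purely illustrative sketch of it:

    enum extent_state { EXT_HOLE, EXT_UNWRITTEN, EXT_WRITTEN };

    /* DIO into a hole first allocates blocks but leaves them unwritten,
     * so a racing buffered read sees zeroes rather than stale disk data. */
    static enum extent_state dio_write_begin(enum extent_state s)
    {
            return s == EXT_HOLE ? EXT_UNWRITTEN : s;
    }

    /* Only when the data IO has completed (synchronously, or in the
     * end_io callback for AIO) does the extent become written. */
    static enum extent_state dio_write_complete(enum extent_state s)
    {
            return s == EXT_UNWRITTEN ? EXT_WRITTEN : s;
    }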
@@ -3955,7 +3716,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3955 * We could direct write to holes and fallocate. 3716 * We could direct write to holes and fallocate.
3956 * 3717 *
3957 * Allocated blocks to fill the hole are marked as uninitialized 3718 * Allocated blocks to fill the hole are marked as uninitialized
3958 * to prevent a parallel buffered read from exposing the stale data 3719 * to prevent a parallel buffered read from exposing the stale data
3959 * before DIO complete the data IO. 3720 * before DIO complete the data IO.
3960 * 3721 *
3961 * As to previously fallocated extents, ext4 get_block 3722 * As to previously fallocated extents, ext4 get_block
@@ -4016,7 +3777,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
4016 int err; 3777 int err;
4017 /* 3778 /*
4018 * for non AIO case, since the IO is already 3779 * for non AIO case, since the IO is already
4019 * completed, we could do the convertion right here 3780 * completed, we could do the conversion right here
4020 */ 3781 */
4021 err = ext4_convert_unwritten_extents(inode, 3782 err = ext4_convert_unwritten_extents(inode,
4022 offset, ret); 3783 offset, ret);
@@ -4037,11 +3798,16 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
4037{ 3798{
4038 struct file *file = iocb->ki_filp; 3799 struct file *file = iocb->ki_filp;
4039 struct inode *inode = file->f_mapping->host; 3800 struct inode *inode = file->f_mapping->host;
3801 ssize_t ret;
4040 3802
3803 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
4041 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3804 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4042 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3805 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
4043 3806 else
4044 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3807 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3808 trace_ext4_direct_IO_exit(inode, offset,
3809 iov_length(iov, nr_segs), rw, ret);
3810 return ret;
4045} 3811}
4046 3812
4047/* 3813/*
@@ -4067,7 +3833,6 @@ static const struct address_space_operations ext4_ordered_aops = {
4067 .readpage = ext4_readpage, 3833 .readpage = ext4_readpage,
4068 .readpages = ext4_readpages, 3834 .readpages = ext4_readpages,
4069 .writepage = ext4_writepage, 3835 .writepage = ext4_writepage,
4070 .sync_page = block_sync_page,
4071 .write_begin = ext4_write_begin, 3836 .write_begin = ext4_write_begin,
4072 .write_end = ext4_ordered_write_end, 3837 .write_end = ext4_ordered_write_end,
4073 .bmap = ext4_bmap, 3838 .bmap = ext4_bmap,
@@ -4083,7 +3848,6 @@ static const struct address_space_operations ext4_writeback_aops = {
4083 .readpage = ext4_readpage, 3848 .readpage = ext4_readpage,
4084 .readpages = ext4_readpages, 3849 .readpages = ext4_readpages,
4085 .writepage = ext4_writepage, 3850 .writepage = ext4_writepage,
4086 .sync_page = block_sync_page,
4087 .write_begin = ext4_write_begin, 3851 .write_begin = ext4_write_begin,
4088 .write_end = ext4_writeback_write_end, 3852 .write_end = ext4_writeback_write_end,
4089 .bmap = ext4_bmap, 3853 .bmap = ext4_bmap,
@@ -4099,7 +3863,6 @@ static const struct address_space_operations ext4_journalled_aops = {
4099 .readpage = ext4_readpage, 3863 .readpage = ext4_readpage,
4100 .readpages = ext4_readpages, 3864 .readpages = ext4_readpages,
4101 .writepage = ext4_writepage, 3865 .writepage = ext4_writepage,
4102 .sync_page = block_sync_page,
4103 .write_begin = ext4_write_begin, 3866 .write_begin = ext4_write_begin,
4104 .write_end = ext4_journalled_write_end, 3867 .write_end = ext4_journalled_write_end,
4105 .set_page_dirty = ext4_journalled_set_page_dirty, 3868 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -4115,7 +3878,6 @@ static const struct address_space_operations ext4_da_aops = {
4115 .readpages = ext4_readpages, 3878 .readpages = ext4_readpages,
4116 .writepage = ext4_writepage, 3879 .writepage = ext4_writepage,
4117 .writepages = ext4_da_writepages, 3880 .writepages = ext4_da_writepages,
4118 .sync_page = block_sync_page,
4119 .write_begin = ext4_da_write_begin, 3881 .write_begin = ext4_da_write_begin,
4120 .write_end = ext4_da_write_end, 3882 .write_end = ext4_da_write_end,
4121 .bmap = ext4_bmap, 3883 .bmap = ext4_bmap,
@@ -4152,9 +3914,30 @@ void ext4_set_aops(struct inode *inode)
4152int ext4_block_truncate_page(handle_t *handle, 3914int ext4_block_truncate_page(handle_t *handle,
4153 struct address_space *mapping, loff_t from) 3915 struct address_space *mapping, loff_t from)
4154{ 3916{
3917 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3918 unsigned length;
3919 unsigned blocksize;
3920 struct inode *inode = mapping->host;
3921
3922 blocksize = inode->i_sb->s_blocksize;
3923 length = blocksize - (offset & (blocksize - 1));
3924
3925 return ext4_block_zero_page_range(handle, mapping, from, length);
3926}
3927
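The new wrapper reduces ext4_block_truncate_page() to "zero from 'from' to the end of its block"; the length arithmetic relies on the block size being a power of two. A small self-contained check of that arithmetic, with sample numbers only:

    #include <assert.h>

    static unsigned tail_len(unsigned long long from, unsigned blocksize)
    {
            /* bytes from 'from' to the end of the block containing it */
            return blocksize - (from & (blocksize - 1));
    }

    int main(void)
    {
            assert(tail_len(10000, 4096) == 2288); /* zeroes 10000..12287 */
            assert(tail_len(8192, 4096) == 4096);  /* aligned: whole block */
            return 0;
    }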
3928/*
3929 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3930 * starting from file offset 'from'. The range to be zeroed must
3931 * be contained within one block. If the specified range exceeds
3932 * the end of the block, it will be shortened to the end of the block
3933 * that corresponds to 'from'.
3934 */
3935int ext4_block_zero_page_range(handle_t *handle,
3936 struct address_space *mapping, loff_t from, loff_t length)
3937{
4155 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3938 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
4156 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3939 unsigned offset = from & (PAGE_CACHE_SIZE-1);
4157 unsigned blocksize, length, pos; 3940 unsigned blocksize, max, pos;
4158 ext4_lblk_t iblock; 3941 ext4_lblk_t iblock;
4159 struct inode *inode = mapping->host; 3942 struct inode *inode = mapping->host;
4160 struct buffer_head *bh; 3943 struct buffer_head *bh;
@@ -4167,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
4167 return -EINVAL; 3950 return -EINVAL;
4168 3951
4169 blocksize = inode->i_sb->s_blocksize; 3952 blocksize = inode->i_sb->s_blocksize;
4170 length = blocksize - (offset & (blocksize - 1)); 3953 max = blocksize - (offset & (blocksize - 1));
3954
3955 /*
3956 * correct length if it does not fall between
3957 * 'from' and the end of the block
3958 */
3959 if (length > max || length < 0)
3960 length = max;
3961
4171 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3962 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
4172 3963
4173 if (!page_has_buffers(page)) 3964 if (!page_has_buffers(page))
@@ -4226,7 +4017,7 @@ int ext4_block_truncate_page(handle_t *handle,
4226 if (ext4_should_journal_data(inode)) { 4017 if (ext4_should_journal_data(inode)) {
4227 err = ext4_handle_dirty_metadata(handle, inode, bh); 4018 err = ext4_handle_dirty_metadata(handle, inode, bh);
4228 } else { 4019 } else {
4229 if (ext4_should_order_data(inode)) 4020 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
4230 err = ext4_jbd2_file_inode(handle, inode); 4021 err = ext4_jbd2_file_inode(handle, inode);
4231 mark_buffer_dirty(bh); 4022 mark_buffer_dirty(bh);
4232 } 4023 }
@@ -4262,7 +4053,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
4262 * 4053 *
4263 * When we do truncate() we may have to clean the ends of several 4054 * When we do truncate() we may have to clean the ends of several
4264 * indirect blocks but leave the blocks themselves alive. Block is 4055 * indirect blocks but leave the blocks themselves alive. Block is
4265 * partially truncated if some data below the new i_size is refered 4056 * partially truncated if some data below the new i_size is referred
4266 * from it (and it is on the path to the first completely truncated 4057 * from it (and it is on the path to the first completely truncated
4267 * data block, indeed). We have to free the top of that path along 4058 * data block, indeed). We have to free the top of that path along
4268 * with everything to the right of the path. Since no allocation 4059 * with everything to the right of the path. Since no allocation
@@ -4341,6 +4132,9 @@ no_top:
4341 * 4132 *
4342 * We release `count' blocks on disk, but (last - first) may be greater 4133 * We release `count' blocks on disk, but (last - first) may be greater
4343 * than `count' because there can be holes in there. 4134 * than `count' because there can be holes in there.
4135 *
4136 * Return 0 on success, 1 on invalid block range
4137 * and < 0 on fatal error.
4344 */ 4138 */
4345static int ext4_clear_blocks(handle_t *handle, struct inode *inode, 4139static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4346 struct buffer_head *bh, 4140 struct buffer_head *bh,
@@ -4350,6 +4144,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4350{ 4144{
4351 __le32 *p; 4145 __le32 *p;
4352 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 4146 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4147 int err;
4353 4148
4354 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4149 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4355 flags |= EXT4_FREE_BLOCKS_METADATA; 4150 flags |= EXT4_FREE_BLOCKS_METADATA;
@@ -4365,22 +4160,33 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4365 if (try_to_extend_transaction(handle, inode)) { 4160 if (try_to_extend_transaction(handle, inode)) {
4366 if (bh) { 4161 if (bh) {
4367 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4162 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4368 ext4_handle_dirty_metadata(handle, inode, bh); 4163 err = ext4_handle_dirty_metadata(handle, inode, bh);
4164 if (unlikely(err))
4165 goto out_err;
4369 } 4166 }
4370 ext4_mark_inode_dirty(handle, inode); 4167 err = ext4_mark_inode_dirty(handle, inode);
4371 ext4_truncate_restart_trans(handle, inode, 4168 if (unlikely(err))
4372 blocks_for_truncate(inode)); 4169 goto out_err;
4170 err = ext4_truncate_restart_trans(handle, inode,
4171 blocks_for_truncate(inode));
4172 if (unlikely(err))
4173 goto out_err;
4373 if (bh) { 4174 if (bh) {
4374 BUFFER_TRACE(bh, "retaking write access"); 4175 BUFFER_TRACE(bh, "retaking write access");
4375 ext4_journal_get_write_access(handle, bh); 4176 err = ext4_journal_get_write_access(handle, bh);
4177 if (unlikely(err))
4178 goto out_err;
4376 } 4179 }
4377 } 4180 }
4378 4181
4379 for (p = first; p < last; p++) 4182 for (p = first; p < last; p++)
4380 *p = 0; 4183 *p = 0;
4381 4184
4382 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); 4185 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4383 return 0; 4186 return 0;
4187out_err:
4188 ext4_std_error(inode->i_sb, err);
4189 return err;
4384} 4190}
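The rewritten ext4_clear_blocks() funnels every failing journal call to a single out_err label, so the first error is both reported via ext4_std_error() and returned to the caller. The shape of that pattern, reduced to a sketch with hypothetical step functions:

    static int do_steps(int (*a)(void), int (*b)(void), int (*c)(void))
    {
            int err;

            err = a();
            if (err)
                    goto out_err;
            err = b();
            if (err)
                    goto out_err;
            err = c();
            if (err)
                    goto out_err;
            return 0;
    out_err:
            /* single exit: report once, return the first failure */
            return err;
    }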
4385 4191
4386/** 4192/**
@@ -4391,7 +4197,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4391 * @first: array of block numbers 4197 * @first: array of block numbers
4392 * @last: points immediately past the end of array 4198 * @last: points immediately past the end of array
4393 * 4199 *
4394 * We are freeing all blocks refered from that array (numbers are stored as 4200 * We are freeing all blocks referred from that array (numbers are stored as
4395 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 4201 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4396 * 4202 *
4397 * We accumulate contiguous runs of blocks to free. Conveniently, if these 4203 * We accumulate contiguous runs of blocks to free. Conveniently, if these
@@ -4414,7 +4220,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4414 ext4_fsblk_t nr; /* Current block # */ 4220 ext4_fsblk_t nr; /* Current block # */
4415 __le32 *p; /* Pointer into inode/ind 4221 __le32 *p; /* Pointer into inode/ind
4416 for current block */ 4222 for current block */
4417 int err; 4223 int err = 0;
4418 4224
4419 if (this_bh) { /* For indirect block */ 4225 if (this_bh) { /* For indirect block */
4420 BUFFER_TRACE(this_bh, "get_write_access"); 4226 BUFFER_TRACE(this_bh, "get_write_access");
@@ -4436,9 +4242,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4436 } else if (nr == block_to_free + count) { 4242 } else if (nr == block_to_free + count) {
4437 count++; 4243 count++;
4438 } else { 4244 } else {
4439 if (ext4_clear_blocks(handle, inode, this_bh, 4245 err = ext4_clear_blocks(handle, inode, this_bh,
4440 block_to_free, count, 4246 block_to_free, count,
4441 block_to_free_p, p)) 4247 block_to_free_p, p);
4248 if (err)
4442 break; 4249 break;
4443 block_to_free = nr; 4250 block_to_free = nr;
4444 block_to_free_p = p; 4251 block_to_free_p = p;
@@ -4447,9 +4254,12 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4447 } 4254 }
4448 } 4255 }
4449 4256
4450 if (count > 0) 4257 if (!err && count > 0)
4451 ext4_clear_blocks(handle, inode, this_bh, block_to_free, 4258 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4452 count, block_to_free_p, p); 4259 count, block_to_free_p, p);
4260 if (err < 0)
4261 /* fatal error */
4262 return;
4453 4263
4454 if (this_bh) { 4264 if (this_bh) {
4455 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 4265 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
@@ -4479,7 +4289,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4479 * @last: pointer immediately past the end of array 4289 * @last: pointer immediately past the end of array
4480 * @depth: depth of the branches to free 4290 * @depth: depth of the branches to free
4481 * 4291 *
4482 * We are freeing all blocks refered from these branches (numbers are 4292 * We are freeing all blocks referred from these branches (numbers are
4483 * stored as little-endian 32-bit) and updating @inode->i_blocks 4293 * stored as little-endian 32-bit) and updating @inode->i_blocks
4484 * appropriately. 4294 * appropriately.
4485 */ 4295 */
@@ -4530,6 +4340,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4530 (__le32 *) bh->b_data, 4340 (__le32 *) bh->b_data,
4531 (__le32 *) bh->b_data + addr_per_block, 4341 (__le32 *) bh->b_data + addr_per_block,
4532 depth); 4342 depth);
4343 brelse(bh);
4533 4344
4534 /* 4345 /*
4535 * Everything below this pointer has been 4346 * Everything below this pointer has been
@@ -4566,7 +4377,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4566 * transaction where the data blocks are 4377 * transaction where the data blocks are
4567 * actually freed. 4378 * actually freed.
4568 */ 4379 */
4569 ext4_free_blocks(handle, inode, 0, nr, 1, 4380 ext4_free_blocks(handle, inode, NULL, nr, 1,
4570 EXT4_FREE_BLOCKS_METADATA| 4381 EXT4_FREE_BLOCKS_METADATA|
4571 EXT4_FREE_BLOCKS_FORGET); 4382 EXT4_FREE_BLOCKS_FORGET);
4572 4383
@@ -4596,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4596 4407
4597int ext4_can_truncate(struct inode *inode) 4408int ext4_can_truncate(struct inode *inode)
4598{ 4409{
4599 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4600 return 0;
4601 if (S_ISREG(inode->i_mode)) 4410 if (S_ISREG(inode->i_mode))
4602 return 1; 4411 return 1;
4603 if (S_ISDIR(inode->i_mode)) 4412 if (S_ISDIR(inode->i_mode))
@@ -4608,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode)
4608} 4417}
4609 4418
4610/* 4419/*
4420 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4421 * associated with the given offset and length
4422 *
4423 * @inode: File inode
4424 * @offset: The offset where the hole will begin
4425 * @len: The length of the hole
4426 *
4427 * Returns: 0 on success or negative on failure
4428 */
4429
4430int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4431{
4432 struct inode *inode = file->f_path.dentry->d_inode;
4433 if (!S_ISREG(inode->i_mode))
4434 return -ENOTSUPP;
4435
4436 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4437 /* TODO: Add support for non extent hole punching */
4438 return -ENOTSUPP;
4439 }
4440
4441 return ext4_ext_punch_hole(file, offset, length);
4442}
4443
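From user space, the new ext4_punch_hole() is reached through fallocate(2) with FALLOC_FL_PUNCH_HOLE, which must be combined with FALLOC_FL_KEEP_SIZE; at this point only extent-mapped files are supported, so anything else fails. A minimal caller, with an assumed test file name:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <linux/falloc.h>

    int main(void)
    {
            int fd = open("testfile", O_RDWR);      /* assumed test file */

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* punch a 1 MiB hole at offset 4 MiB, keeping i_size */
            if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                          4 << 20, 1 << 20))
                    perror("fallocate");    /* e.g. indirect-mapped file */
            close(fd);
            return 0;
    }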
4444/*
4611 * ext4_truncate() 4445 * ext4_truncate()
4612 * 4446 *
4613 * We block out ext4_get_block() block instantiations across the entire 4447 * We block out ext4_get_block() block instantiations across the entire
@@ -4646,10 +4480,12 @@ void ext4_truncate(struct inode *inode)
4646 Indirect chain[4]; 4480 Indirect chain[4];
4647 Indirect *partial; 4481 Indirect *partial;
4648 __le32 nr = 0; 4482 __le32 nr = 0;
4649 int n; 4483 int n = 0;
4650 ext4_lblk_t last_block; 4484 ext4_lblk_t last_block, max_block;
4651 unsigned blocksize = inode->i_sb->s_blocksize; 4485 unsigned blocksize = inode->i_sb->s_blocksize;
4652 4486
4487 trace_ext4_truncate_enter(inode);
4488
4653 if (!ext4_can_truncate(inode)) 4489 if (!ext4_can_truncate(inode))
4654 return; 4490 return;
4655 4491
@@ -4660,6 +4496,7 @@ void ext4_truncate(struct inode *inode)
4660 4496
4661 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4497 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4662 ext4_ext_truncate(inode); 4498 ext4_ext_truncate(inode);
4499 trace_ext4_truncate_exit(inode);
4663 return; 4500 return;
4664 } 4501 }
4665 4502
@@ -4669,14 +4506,18 @@ void ext4_truncate(struct inode *inode)
4669 4506
4670 last_block = (inode->i_size + blocksize-1) 4507 last_block = (inode->i_size + blocksize-1)
4671 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4508 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4509 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4510 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4672 4511
4673 if (inode->i_size & (blocksize - 1)) 4512 if (inode->i_size & (blocksize - 1))
4674 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 4513 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4675 goto out_stop; 4514 goto out_stop;
4676 4515
4677 n = ext4_block_to_path(inode, last_block, offsets, NULL); 4516 if (last_block != max_block) {
4678 if (n == 0) 4517 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4679 goto out_stop; /* error */ 4518 if (n == 0)
4519 goto out_stop; /* error */
4520 }
4680 4521
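last_block and max_block above are both round-up divisions by the block size; when the file already ends at the indirect-addressing limit there is no data block left to free. Worked numbers for the rounding, illustrative only:

    #include <assert.h>

    static unsigned long blocks_covering(unsigned long long size,
                                         unsigned blkbits)
    {
            /* (size + blocksize - 1) >> blkbits */
            return (size + (1ULL << blkbits) - 1) >> blkbits;
    }

    int main(void)
    {
            assert(blocks_covering(10000, 12) == 3); /* 4 KiB blocks */
            assert(blocks_covering(8192, 12) == 2);  /* exact multiple */
            return 0;
    }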
4681 /* 4522 /*
4682 * OK. This truncate is going to happen. We add the inode to the 4523 * OK. This truncate is going to happen. We add the inode to the
@@ -4707,7 +4548,13 @@ void ext4_truncate(struct inode *inode)
4707 */ 4548 */
4708 ei->i_disksize = inode->i_size; 4549 ei->i_disksize = inode->i_size;
4709 4550
4710 if (n == 1) { /* direct blocks */ 4551 if (last_block == max_block) {
4552 /*
4553 * It is unnecessary to free any data blocks if last_block is
4554 * equal to the indirect block limit.
4555 */
4556 goto out_unlock;
4557 } else if (n == 1) { /* direct blocks */
4711 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 4558 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4712 i_data + EXT4_NDIR_BLOCKS); 4559 i_data + EXT4_NDIR_BLOCKS);
4713 goto do_indirects; 4560 goto do_indirects;
@@ -4767,6 +4614,7 @@ do_indirects:
4767 ; 4614 ;
4768 } 4615 }
4769 4616
4617out_unlock:
4770 up_write(&ei->i_data_sem); 4618 up_write(&ei->i_data_sem);
4771 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4619 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4772 ext4_mark_inode_dirty(handle, inode); 4620 ext4_mark_inode_dirty(handle, inode);
@@ -4789,6 +4637,7 @@ out_stop:
4789 ext4_orphan_del(handle, inode); 4637 ext4_orphan_del(handle, inode);
4790 4638
4791 ext4_journal_stop(handle); 4639 ext4_journal_stop(handle);
4640 trace_ext4_truncate_exit(inode);
4792} 4641}
4793 4642
4794/* 4643/*
@@ -4818,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
4818 /* 4667 /*
4819 * Figure out the offset within the block group inode table 4668 * Figure out the offset within the block group inode table
4820 */ 4669 */
4821 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 4670 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4822 inode_offset = ((inode->i_ino - 1) % 4671 inode_offset = ((inode->i_ino - 1) %
4823 EXT4_INODES_PER_GROUP(sb)); 4672 EXT4_INODES_PER_GROUP(sb));
4824 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4673 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
@@ -4920,6 +4769,7 @@ make_io:
4920 * has in-inode xattrs, or we don't have this inode in memory. 4769 * has in-inode xattrs, or we don't have this inode in memory.
4921 * Read the block from disk. 4770 * Read the block from disk.
4922 */ 4771 */
4772 trace_ext4_load_inode(inode);
4923 get_bh(bh); 4773 get_bh(bh);
4924 bh->b_end_io = end_buffer_read_sync; 4774 bh->b_end_io = end_buffer_read_sync;
4925 submit_bh(READ_META, bh); 4775 submit_bh(READ_META, bh);
@@ -5025,7 +4875,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5025 return inode; 4875 return inode;
5026 4876
5027 ei = EXT4_I(inode); 4877 ei = EXT4_I(inode);
5028 iloc.bh = 0; 4878 iloc.bh = NULL;
5029 4879
5030 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4880 ret = __ext4_get_inode_loc(inode, &iloc, 0);
5031 if (ret < 0) 4881 if (ret < 0)
@@ -5040,7 +4890,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5040 } 4890 }
5041 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4891 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
5042 4892
5043 ei->i_state_flags = 0; 4893 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
5044 ei->i_dir_start_lookup = 0; 4894 ei->i_dir_start_lookup = 0;
5045 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4895 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
5046 /* We now have enough fields to check if the inode was active or not. 4896 /* We now have enough fields to check if the inode was active or not.
@@ -5299,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle,
5299 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 5149 if (ext4_inode_blocks_set(handle, raw_inode, ei))
5300 goto out_brelse; 5150 goto out_brelse;
5301 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 5151 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
5302 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 5152 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
5303 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 5153 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
5304 cpu_to_le32(EXT4_OS_HURD)) 5154 cpu_to_le32(EXT4_OS_HURD))
5305 raw_inode->i_file_acl_high = 5155 raw_inode->i_file_acl_high =
@@ -5464,6 +5314,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5464{ 5314{
5465 struct inode *inode = dentry->d_inode; 5315 struct inode *inode = dentry->d_inode;
5466 int error, rc = 0; 5316 int error, rc = 0;
5317 int orphan = 0;
5467 const unsigned int ia_valid = attr->ia_valid; 5318 const unsigned int ia_valid = attr->ia_valid;
5468 5319
5469 error = inode_change_ok(inode, attr); 5320 error = inode_change_ok(inode, attr);
@@ -5510,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5510 5361
5511 if (S_ISREG(inode->i_mode) && 5362 if (S_ISREG(inode->i_mode) &&
5512 attr->ia_valid & ATTR_SIZE && 5363 attr->ia_valid & ATTR_SIZE &&
5513 (attr->ia_size < inode->i_size || 5364 (attr->ia_size < inode->i_size)) {
5514 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5515 handle_t *handle; 5365 handle_t *handle;
5516 5366
5517 handle = ext4_journal_start(inode, 3); 5367 handle = ext4_journal_start(inode, 3);
@@ -5519,8 +5369,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5519 error = PTR_ERR(handle); 5369 error = PTR_ERR(handle);
5520 goto err_out; 5370 goto err_out;
5521 } 5371 }
5522 5372 if (ext4_handle_valid(handle)) {
5523 error = ext4_orphan_add(handle, inode); 5373 error = ext4_orphan_add(handle, inode);
5374 orphan = 1;
5375 }
5524 EXT4_I(inode)->i_disksize = attr->ia_size; 5376 EXT4_I(inode)->i_disksize = attr->ia_size;
5525 rc = ext4_mark_inode_dirty(handle, inode); 5377 rc = ext4_mark_inode_dirty(handle, inode);
5526 if (!error) 5378 if (!error)
@@ -5538,18 +5390,20 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5538 goto err_out; 5390 goto err_out;
5539 } 5391 }
5540 ext4_orphan_del(handle, inode); 5392 ext4_orphan_del(handle, inode);
5393 orphan = 0;
5541 ext4_journal_stop(handle); 5394 ext4_journal_stop(handle);
5542 goto err_out; 5395 goto err_out;
5543 } 5396 }
5544 } 5397 }
5545 /* ext4_truncate will clear the flag */
5546 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5547 ext4_truncate(inode);
5548 } 5398 }
5549 5399
5550 if ((attr->ia_valid & ATTR_SIZE) && 5400 if (attr->ia_valid & ATTR_SIZE) {
5551 attr->ia_size != i_size_read(inode)) 5401 if (attr->ia_size != i_size_read(inode)) {
5552 rc = vmtruncate(inode, attr->ia_size); 5402 truncate_setsize(inode, attr->ia_size);
5403 ext4_truncate(inode);
5404 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
5405 ext4_truncate(inode);
5406 }
5553 5407
5554 if (!rc) { 5408 if (!rc) {
5555 setattr_copy(inode, attr); 5409 setattr_copy(inode, attr);
@@ -5560,7 +5414,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5560 * If the call to ext4_truncate failed to get a transaction handle at 5414 * If the call to ext4_truncate failed to get a transaction handle at
5561 * all, we need to clean up the in-core orphan list manually. 5415 * all, we need to clean up the in-core orphan list manually.
5562 */ 5416 */
5563 if (inode->i_nlink) 5417 if (orphan && inode->i_nlink)
5564 ext4_orphan_del(NULL, inode); 5418 ext4_orphan_del(NULL, inode);
5565 5419
5566 if (!rc && (ia_valid & ATTR_MODE)) 5420 if (!rc && (ia_valid & ATTR_MODE))
@@ -5592,9 +5446,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5592 * will return the blocks that include the delayed allocation 5446 * will return the blocks that include the delayed allocation
5593 * blocks for this file. 5447 * blocks for this file.
5594 */ 5448 */
5595 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
5596 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 5449 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
5597 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
5598 5450
5599 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 5451 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
5600 return 0; 5452 return 0;
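stat->blocks is counted in 512-byte sectors, hence the shift left by the block-size bits and right by 9 in the line above. Sample arithmetic with illustrative values:

    #include <assert.h>

    static unsigned long long delalloc_sectors(unsigned long blocks,
                                               unsigned blkbits)
    {
            return ((unsigned long long)blocks << blkbits) >> 9;
    }

    int main(void)
    {
            /* ten delayed-allocation 4 KiB blocks = 80 sectors */
            assert(delalloc_sectors(10, 12) == 80);
            return 0;
    }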
@@ -5608,13 +5460,12 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5608 /* if nrblocks are contiguous */ 5460 /* if nrblocks are contiguous */
5609 if (chunk) { 5461 if (chunk) {
5610 /* 5462 /*
5611 * With N contiguous data blocks, it need at most 5463 * With N contiguous data blocks, we need at most
5612 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks 5464 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5613 * 2 dindirect blocks 5465 * 2 dindirect blocks, and 1 tindirect block
5614 * 1 tindirect block
5615 */ 5466 */
5616 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); 5467 return DIV_ROUND_UP(nrblocks,
5617 return indirects + 3; 5468 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5618 } 5469 }
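The contiguous case now uses DIV_ROUND_UP and budgets one extra indirect block for misalignment on top of the 2 dindirect and 1 tindirect blocks. With illustrative numbers (4 KiB blocks, so 1024 block pointers per indirect block):

    #include <assert.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            unsigned addr_per_block = 4096 / 4;     /* 1024 pointers */
            unsigned nrblocks = 1500;

            /* ceil(1500/1024) = 2 indirect blocks, +1 for misalignment,
             * +2 dindirect, +1 tindirect = 6 metadata blocks */
            assert(DIV_ROUND_UP(nrblocks, addr_per_block) + 4 == 6);
            return 0;
    }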
5619 /* 5470 /*
5620 * if nrblocks are not contiguous, worst case, each block touches 5471 * if nrblocks are not contiguous, worst case, each block touches
@@ -5643,7 +5494,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5643 * 5494 *
5644 * Also account for superblock, inode, quota and xattr blocks 5495 * Also account for superblock, inode, quota and xattr blocks
5645 */ 5496 */
5646int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5497static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5647{ 5498{
5648 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 5499 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5649 int gdpblocks; 5500 int gdpblocks;
@@ -5688,7 +5539,7 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5688} 5539}
5689 5540
5690/* 5541/*
5691 * Calulate the total number of credits to reserve to fit 5542 * Calculate the total number of credits to reserve to fit
5692 * the modification of a single page into a single transaction, 5543 * the modification of a single page into a single transaction,
5693 * which may include multiple chunks of block allocations. 5544 * which may include multiple chunks of block allocations.
5694 * 5545 *
@@ -5831,6 +5682,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5831 int err, ret; 5682 int err, ret;
5832 5683
5833 might_sleep(); 5684 might_sleep();
5685 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5834 err = ext4_reserve_inode_write(handle, inode, &iloc); 5686 err = ext4_reserve_inode_write(handle, inode, &iloc);
5835 if (ext4_handle_valid(handle) && 5687 if (ext4_handle_valid(handle) &&
5836 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5688 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
@@ -5881,7 +5733,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5881 * so would cause a commit on atime updates, which we don't bother doing. 5733 * so would cause a commit on atime updates, which we don't bother doing.
5882 * We handle synchronous inodes at the highest possible level. 5734 * We handle synchronous inodes at the highest possible level.
5883 */ 5735 */
5884void ext4_dirty_inode(struct inode *inode) 5736void ext4_dirty_inode(struct inode *inode, int flags)
5885{ 5737{
5886 handle_t *handle; 5738 handle_t *handle;
5887 5739
@@ -6009,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6009 goto out_unlock; 5861 goto out_unlock;
6010 } 5862 }
6011 ret = 0; 5863 ret = 0;
6012 if (PageMappedToDisk(page)) 5864
6013 goto out_unlock; 5865 lock_page(page);
5866 wait_on_page_writeback(page);
5867 if (PageMappedToDisk(page)) {
5868 up_read(&inode->i_alloc_sem);
5869 return VM_FAULT_LOCKED;
5870 }
6014 5871
6015 if (page->index == size >> PAGE_CACHE_SHIFT) 5872 if (page->index == size >> PAGE_CACHE_SHIFT)
6016 len = size & ~PAGE_CACHE_MASK; 5873 len = size & ~PAGE_CACHE_MASK;
6017 else 5874 else
6018 len = PAGE_CACHE_SIZE; 5875 len = PAGE_CACHE_SIZE;
6019 5876
6020 lock_page(page);
6021 /* 5877 /*
6022 * return if we have all the buffers mapped. This avoids 5878 * return if we have all the buffers mapped. This avoids
6023 * the need to call write_begin/write_end which does a 5879 * the need to call write_begin/write_end which does a
@@ -6027,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6027 if (page_has_buffers(page)) { 5883 if (page_has_buffers(page)) {
6028 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5884 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
6029 ext4_bh_unmapped)) { 5885 ext4_bh_unmapped)) {
6030 unlock_page(page); 5886 up_read(&inode->i_alloc_sem);
6031 goto out_unlock; 5887 return VM_FAULT_LOCKED;
6032 } 5888 }
6033 } 5889 }
6034 unlock_page(page); 5890 unlock_page(page);
@@ -6048,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6048 if (ret < 0) 5904 if (ret < 0)
6049 goto out_unlock; 5905 goto out_unlock;
6050 ret = 0; 5906 ret = 0;
5907
5908 /*
5909 * write_begin/end might have created a dirty page and someone
5910 * could wander in and start the IO. Make sure that hasn't
5911 * happened.
5912 */
5913 lock_page(page);
5914 wait_on_page_writeback(page);
5915 up_read(&inode->i_alloc_sem);
5916 return VM_FAULT_LOCKED;
6051out_unlock: 5917out_unlock:
6052 if (ret) 5918 if (ret)
6053 ret = VM_FAULT_SIGBUS; 5919 ret = VM_FAULT_SIGBUS;
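The page_mkwrite() rewrite returns VM_FAULT_LOCKED with the page still locked after waiting out writeback, and re-takes the lock after write_begin/write_end precisely because IO could have started on the freshly dirtied page in between. The same prepare-then-recheck-under-lock shape as a pthread sketch, names illustrative:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t writeback_done = PTHREAD_COND_INITIALIZER;
    static bool under_writeback;

    /* Returns with page_lock held, like returning VM_FAULT_LOCKED. */
    static void make_page_writable(void (*prepare)(void))
    {
            prepare();      /* write_begin/write_end analogue; unlocked */

            pthread_mutex_lock(&page_lock);
            while (under_writeback)         /* IO may have raced in */
                    pthread_cond_wait(&writeback_done, &page_lock);
            /* caller now owns a stable, locked page */
    }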