Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--	fs/ext4/inode.c	664
1 file changed, 297 insertions(+), 367 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cd818d8bb221..88049d8d30cb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,10 +132,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
-				struct buffer_head *bh_result, int create);
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
 static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
@@ -238,7 +234,8 @@ void ext4_evict_inode(struct inode *inode)
 	 * protection against it
 	 */
 	sb_start_intwrite(inode->i_sb);
-	handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+				    ext4_blocks_for_truncate(inode)+3);
 	if (IS_ERR(handle)) {
 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
 		/*
@@ -346,7 +343,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
 	spin_lock(&ei->i_block_reservation_lock);
 	trace_ext4_da_update_reserve_space(inode, used, quota_claim);
 	if (unlikely(used > ei->i_reserved_data_blocks)) {
-		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+		ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
 			 "with only %d reserved data blocks",
 			 __func__, inode->i_ino, used,
 			 ei->i_reserved_data_blocks);
@@ -355,10 +352,12 @@ void ext4_da_update_reserve_space(struct inode *inode,
 	}
 
 	if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
-		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d "
-			 "with only %d reserved metadata blocks\n", __func__,
-			 inode->i_ino, ei->i_allocated_meta_blocks,
-			 ei->i_reserved_meta_blocks);
+		ext4_warning(inode->i_sb, "ino %lu, allocated %d "
+			 "with only %d reserved metadata blocks "
+			 "(releasing %d blocks with reserved %d data blocks)",
+			 inode->i_ino, ei->i_allocated_meta_blocks,
+			 ei->i_reserved_meta_blocks, used,
+			 ei->i_reserved_data_blocks);
 		WARN_ON(1);
 		ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
 	}
@@ -508,12 +507,33 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		    struct ext4_map_blocks *map, int flags)
 {
+	struct extent_status es;
 	int retval;
 
 	map->m_flags = 0;
 	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
 		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
 		  (unsigned long) map->m_lblk);
+
+	/* Lookup extent status tree firstly */
+	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+			map->m_pblk = ext4_es_pblock(&es) +
+					map->m_lblk - es.es_lblk;
+			map->m_flags |= ext4_es_is_written(&es) ?
+					EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+			retval = es.es_len - (map->m_lblk - es.es_lblk);
+			if (retval > map->m_len)
+				retval = map->m_len;
+			map->m_len = retval;
+		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+			retval = 0;
+		} else {
+			BUG_ON(1);
+		}
+		goto found;
+	}
+
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
@@ -527,20 +547,27 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		retval = ext4_ind_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
 	}
+	if (retval > 0) {
+		int ret;
+		unsigned long long status;
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+		    ext4_find_delalloc_range(inode, map->m_lblk,
+					     map->m_lblk + map->m_len - 1))
+			status |= EXTENT_STATUS_DELAYED;
+		ret = ext4_es_insert_extent(inode, map->m_lblk,
+					    map->m_len, map->m_pblk, status);
+		if (ret < 0)
+			retval = ret;
+	}
 	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
 		up_read((&EXT4_I(inode)->i_data_sem));
 
+found:
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret;
-		if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
-			/* delayed alloc may be allocated by fallocate and
-			 * coverted to initialized by directIO.
-			 * we need to handle delayed extent here.
-			 */
-			down_write((&EXT4_I(inode)->i_data_sem));
-			goto delayed_mapped;
-		}
-		ret = check_block_validity(inode, map);
+		int ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -560,16 +587,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		return retval;
 
 	/*
-	 * When we call get_blocks without the create flag, the
-	 * BH_Unwritten flag could have gotten set if the blocks
-	 * requested were part of a uninitialized extent. We need to
-	 * clear this flag now that we are committed to convert all or
-	 * part of the uninitialized extent to be an initialized
-	 * extent. This is because we need to avoid the combination
-	 * of BH_Unwritten and BH_Mapped flags being simultaneously
-	 * set on the buffer_head.
+	 * Here we clear m_flags because after allocating an new extent,
+	 * it will be set again.
 	 */
-	map->m_flags &= ~EXT4_MAP_UNWRITTEN;
+	map->m_flags &= ~EXT4_MAP_FLAGS;
 
 	/*
 	 * New blocks allocate and/or writing to uninitialized extent
@@ -615,18 +636,23 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		    (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
 			ext4_da_update_reserve_space(inode, retval, 1);
 	}
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
-	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+	if (retval > 0) {
 		int ret;
-delayed_mapped:
-		/* delayed allocation blocks has been allocated */
-		ret = ext4_es_remove_extent(inode, map->m_lblk,
-					    map->m_len);
-		if (ret < 0)
-			retval = ret;
-	}
+		unsigned long long status;
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+		    ext4_find_delalloc_range(inode, map->m_lblk,
+					     map->m_lblk + map->m_len - 1))
+			status |= EXTENT_STATUS_DELAYED;
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    map->m_pblk, status);
+		if (ret < 0)
+			retval = ret;
 	}
 
 	up_write((&EXT4_I(inode)->i_data_sem));
@@ -660,7 +686,8 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 		if (map.m_len > DIO_MAX_BLOCKS)
 			map.m_len = DIO_MAX_BLOCKS;
 		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
-		handle = ext4_journal_start(inode, dio_credits);
+		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+					    dio_credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			return ret;
@@ -707,14 +734,16 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 	/* ensure we send some value back into *errp */
 	*errp = 0;
 
+	if (create && err == 0)
+		err = -ENOSPC;	/* should never happen */
 	if (err < 0)
 		*errp = err;
 	if (err <= 0)
 		return NULL;
 
 	bh = sb_getblk(inode->i_sb, map.m_pblk);
-	if (!bh) {
-		*errp = -EIO;
+	if (unlikely(!bh)) {
+		*errp = -ENOMEM;
 		return NULL;
 	}
 	if (map.m_flags & EXT4_MAP_NEW) {
@@ -808,11 +837,10 @@ int ext4_walk_page_buffers(handle_t *handle,
  * and the commit_write(). So doing the jbd2_journal_start at the start of
  * prepare_write() is the right place.
  *
- * Also, this function can nest inside ext4_writepage() ->
- * block_write_full_page(). In that case, we *know* that ext4_writepage()
- * has generated enough buffer credits to do the whole page. So we won't
- * block on the journal in that case, which is good, because the caller may
- * be PF_MEMALLOC.
+ * Also, this function can nest inside ext4_writepage(). In that case, we
+ * *know* that ext4_writepage() has generated enough buffer credits to do the
+ * whole page. So we won't block on the journal in that case, which is good,
+ * because the caller may be PF_MEMALLOC.
  *
  * By accident, ext4 can be reentered when a transaction is open via
  * quota file writes. If we were to commit the transaction while thus
@@ -878,32 +906,40 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 		ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
 						    flags, pagep);
 		if (ret < 0)
-			goto out;
-		if (ret == 1) {
-			ret = 0;
-			goto out;
-		}
+			return ret;
+		if (ret == 1)
+			return 0;
 	}
 
-retry:
-	handle = ext4_journal_start(inode, needed_blocks);
+	/*
+	 * grab_cache_page_write_begin() can take a long time if the
+	 * system is thrashing due to memory pressure, or if the page
+	 * is being written back. So grab it first before we start
+	 * the transaction handle. This also allows us to allocate
+	 * the page (if needed) without using GFP_NOFS.
+	 */
+retry_grab:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	unlock_page(page);
+
+retry_journal:
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
 	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
+		page_cache_release(page);
+		return PTR_ERR(handle);
 	}
 
-	/* We cannot recurse into the filesystem as the transaction is already
-	 * started */
-	flags |= AOP_FLAG_NOFS;
-
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	if (!page) {
+	lock_page(page);
+	if (page->mapping != mapping) {
+		/* The page got truncated from under us */
+		unlock_page(page);
+		page_cache_release(page);
 		ext4_journal_stop(handle);
-		ret = -ENOMEM;
-		goto out;
+		goto retry_grab;
 	}
-
-	*pagep = page;
+	wait_on_page_writeback(page);
 
 	if (ext4_should_dioread_nolock(inode))
 		ret = __block_write_begin(page, pos, len, ext4_get_block_write);
@@ -918,7 +954,6 @@ retry:
 
 	if (ret) {
 		unlock_page(page);
-		page_cache_release(page);
 		/*
 		 * __block_write_begin may have instantiated a few blocks
 		 * outside i_size. Trim these off again. Don't need
@@ -942,11 +977,14 @@ retry:
 			if (inode->i_nlink)
 				ext4_orphan_del(NULL, inode);
 		}
-	}
 
-	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-out:
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_journal;
+		page_cache_release(page);
+		return ret;
+	}
+	*pagep = page;
 	return ret;
 }
 
@@ -1256,7 +1294,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 		 * function is called from invalidate page, it's
 		 * harmless to return without any action.
 		 */
-		ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+		ext4_warning(inode->i_sb, "ext4_da_release_space: "
 			 "ino %lu, to_free %d with only %d reserved "
 			 "data blocks", inode->i_ino, to_free,
 			 ei->i_reserved_data_blocks);
@@ -1357,7 +1395,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 	loff_t size = i_size_read(inode);
 	unsigned int len, block_start;
 	struct buffer_head *bh, *page_bufs = NULL;
-	int journal_data = ext4_should_journal_data(inode);
 	sector_t pblock = 0, cur_logical = 0;
 	struct ext4_io_submit io_submit;
 
@@ -1378,7 +1415,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
-			int commit_write = 0, skip_page = 0;
+			int skip_page = 0;
 			struct page *page = pvec.pages[i];
 
 			index = page->index;
@@ -1400,27 +1437,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
 
-			/*
-			 * If the page does not have buffers (for
-			 * whatever reason), try to create them using
-			 * __block_write_begin. If this fails,
-			 * skip the page and move on.
-			 */
-			if (!page_has_buffers(page)) {
-				if (__block_write_begin(page, 0, len,
-						noalloc_get_block_write)) {
-				skip_page:
-					unlock_page(page);
-					continue;
-				}
-				commit_write = 1;
-			}
-
 			bh = page_bufs = page_buffers(page);
 			block_start = 0;
 			do {
-				if (!bh)
-					goto skip_page;
 				if (map && (cur_logical >= map->m_lblk) &&
 				    (cur_logical <= (map->m_lblk +
 						(map->m_len - 1)))) {
@@ -1448,33 +1467,14 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 					pblock++;
 			} while (bh != page_bufs);
 
-			if (skip_page)
-				goto skip_page;
-
-			if (commit_write)
-				/* mark the buffer_heads as dirty & uptodate */
-				block_commit_write(page, 0, len);
+			if (skip_page) {
+				unlock_page(page);
+				continue;
+			}
 
 			clear_page_dirty_for_io(page);
-			/*
-			 * Delalloc doesn't support data journalling,
-			 * but eventually maybe we'll lift this
-			 * restriction.
-			 */
-			if (unlikely(journal_data && PageChecked(page)))
-				err = __ext4_journalled_writepage(page, len);
-			else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
-				err = ext4_bio_write_page(&io_submit, page,
-							  len, mpd->wbc);
-			else if (buffer_uninit(page_bufs)) {
-				ext4_set_bh_endio(page_bufs, inode);
-				err = block_write_full_page_endio(page,
-						noalloc_get_block_write,
-						mpd->wbc, ext4_end_io_buffer_write);
-			} else
-				err = block_write_full_page(page,
-						noalloc_get_block_write, mpd->wbc);
-
+			err = ext4_bio_write_page(&io_submit, page, len,
+						  mpd->wbc);
 			if (!err)
 				mpd->pages_written++;
 			/*
@@ -1640,7 +1640,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 		       (unsigned long long) next,
 		       mpd->b_size >> mpd->inode->i_blkbits, err);
 		ext4_msg(sb, KERN_CRIT,
-			"This should not happen!! Data will be lost\n");
+			"This should not happen!! Data will be lost");
 		if (err == -ENOSPC)
 			ext4_print_free_blocks(mpd->inode);
 	}
@@ -1690,16 +1690,16 @@ submit_io:
  *
  * @mpd->lbh - extent of blocks
  * @logical - logical number of the block in the file
- * @bh - bh of the block (used to access block's state)
+ * @b_state - b_state of the buffer head added
  *
  * the function is used to collect contig. blocks in same state
  */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
-				   sector_t logical, size_t b_size,
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
 				   unsigned long b_state)
 {
 	sector_t next;
-	int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
+	int blkbits = mpd->inode->i_blkbits;
+	int nrblocks = mpd->b_size >> blkbits;
 
 	/*
 	 * XXX Don't go larger than mballoc is willing to allocate
@@ -1707,11 +1707,11 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	 * mpage_da_submit_io() into this function and then call
 	 * ext4_map_blocks() multiple times in a loop
 	 */
-	if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+	if (nrblocks >= (8*1024*1024 >> blkbits))
 		goto flush_it;
 
-	/* check if thereserved journal credits might overflow */
-	if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
+	/* check if the reserved journal credits might overflow */
+	if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
 		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
 			/*
 			 * With non-extent format we are limited by the journal
@@ -1720,16 +1720,6 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 			 * nrblocks. So limit nrblocks.
 			 */
 			goto flush_it;
-		} else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
-			   EXT4_MAX_TRANS_DATA) {
-			/*
-			 * Adding the new buffer_head would make it cross the
-			 * allowed limit for which we have journal credit
-			 * reserved. So limit the new bh->b_size
-			 */
-			b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
-						mpd->inode->i_blkbits;
-			/* we will do mpage_da_submit_io in the next loop */
 		}
 	}
 	/*
@@ -1737,7 +1727,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	 */
 	if (mpd->b_size == 0) {
 		mpd->b_blocknr = logical;
-		mpd->b_size = b_size;
+		mpd->b_size = 1 << blkbits;
 		mpd->b_state = b_state & BH_FLAGS;
 		return;
 	}
@@ -1747,7 +1737,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	 * Can we merge the block to our big extent?
 	 */
 	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
-		mpd->b_size += b_size;
+		mpd->b_size += 1 << blkbits;
 		return;
 	}
 
@@ -1775,6 +1765,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 			      struct ext4_map_blocks *map,
 			      struct buffer_head *bh)
 {
+	struct extent_status es;
 	int retval;
 	sector_t invalid_block = ~((sector_t) 0xffff);
 
@@ -1785,6 +1776,42 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 	ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
 		  "logical block %lu\n", inode->i_ino, map->m_len,
 		  (unsigned long) map->m_lblk);
+
+	/* Lookup extent status tree firstly */
+	if (ext4_es_lookup_extent(inode, iblock, &es)) {
+
+		if (ext4_es_is_hole(&es)) {
+			retval = 0;
+			down_read((&EXT4_I(inode)->i_data_sem));
+			goto add_delayed;
+		}
+
+		/*
+		 * Delayed extent could be allocated by fallocate.
+		 * So we need to check it.
+		 */
+		if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
+			map_bh(bh, inode->i_sb, invalid_block);
+			set_buffer_new(bh);
+			set_buffer_delay(bh);
+			return 0;
+		}
+
+		map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
+		retval = es.es_len - (iblock - es.es_lblk);
+		if (retval > map->m_len)
+			retval = map->m_len;
+		map->m_len = retval;
+		if (ext4_es_is_written(&es))
+			map->m_flags |= EXT4_MAP_MAPPED;
+		else if (ext4_es_is_unwritten(&es))
+			map->m_flags |= EXT4_MAP_UNWRITTEN;
+		else
+			BUG_ON(1);
+
+		return retval;
+	}
+
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
@@ -1803,11 +1830,15 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
 		retval = 0;
 	} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
+		retval = ext4_ext_map_blocks(NULL, inode, map,
+					     EXT4_GET_BLOCKS_NO_PUT_HOLE);
 	else
-		retval = ext4_ind_map_blocks(NULL, inode, map, 0);
+		retval = ext4_ind_map_blocks(NULL, inode, map,
+					     EXT4_GET_BLOCKS_NO_PUT_HOLE);
 
+add_delayed:
 	if (retval == 0) {
+		int ret;
 		/*
 		 * XXX: __block_prepare_write() unmaps passed block,
 		 * is it OK?
@@ -1815,15 +1846,20 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		/* If the block was allocated from previously allocated cluster,
 		 * then we dont need to reserve it again. */
 		if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
-			retval = ext4_da_reserve_space(inode, iblock);
-			if (retval)
+			ret = ext4_da_reserve_space(inode, iblock);
+			if (ret) {
 				/* not enough space to reserve */
+				retval = ret;
 				goto out_unlock;
+			}
 		}
 
-		retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
-		if (retval)
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    ~0, EXTENT_STATUS_DELAYED);
+		if (ret) {
+			retval = ret;
 			goto out_unlock;
+		}
 
 		/* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
 		 * and it should not appear on the bh->b_state.
@@ -1833,6 +1869,16 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		map_bh(bh, inode->i_sb, invalid_block);
 		set_buffer_new(bh);
 		set_buffer_delay(bh);
+	} else if (retval > 0) {
+		int ret;
+		unsigned long long status;
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    map->m_pblk, status);
+		if (ret != 0)
+			retval = ret;
 	}
 
 out_unlock:
@@ -1890,27 +1936,6 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 	return 0;
 }
 
-/*
- * This function is used as a standard get_block_t calback function
- * when there is no desire to allocate any blocks. It is used as a
- * callback function for block_write_begin() and block_write_full_page().
- * These functions should only try to map a single block at a time.
- *
- * Since this function doesn't do block allocations even if the caller
- * requests it by passing in create=1, it is critically important that
- * any caller checks to make sure that any buffer heads are returned
- * by this function are either all already mapped or marked for
- * delayed allocation before calling block_write_full_page(). Otherwise,
- * b_blocknr could be left unitialized, and the page write functions will
- * be taken by surprise.
- */
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
-				   struct buffer_head *bh_result, int create)
-{
-	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
-	return _ext4_get_block(inode, iblock, bh_result, 0);
-}
-
 static int bget_one(handle_t *handle, struct buffer_head *bh)
 {
 	get_bh(bh);
@@ -1955,7 +1980,8 @@ static int __ext4_journalled_writepage(struct page *page,
 	 * references to buffers so we are safe */
 	unlock_page(page);
 
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+				    ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		goto out;
@@ -2035,11 +2061,12 @@ out:
 static int ext4_writepage(struct page *page,
 			  struct writeback_control *wbc)
 {
-	int ret = 0, commit_write = 0;
+	int ret = 0;
 	loff_t size;
 	unsigned int len;
 	struct buffer_head *page_bufs = NULL;
 	struct inode *inode = page->mapping->host;
+	struct ext4_io_submit io_submit;
 
 	trace_ext4_writepage(page);
 	size = i_size_read(inode);
@@ -2048,39 +2075,29 @@ static int ext4_writepage(struct page *page,
 	else
 		len = PAGE_CACHE_SIZE;
 
+	page_bufs = page_buffers(page);
 	/*
-	 * If the page does not have buffers (for whatever reason),
-	 * try to create them using __block_write_begin. If this
-	 * fails, redirty the page and move on.
+	 * We cannot do block allocation or other extent handling in this
+	 * function. If there are buffers needing that, we have to redirty
+	 * the page. But we may reach here when we do a journal commit via
+	 * journal_submit_inode_data_buffers() and in that case we must write
+	 * allocated buffers to achieve data=ordered mode guarantees.
 	 */
-	if (!page_has_buffers(page)) {
-		if (__block_write_begin(page, 0, len,
-					noalloc_get_block_write)) {
-		redirty_page:
-			redirty_page_for_writepage(wbc, page);
+	if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+				   ext4_bh_delay_or_unwritten)) {
+		redirty_page_for_writepage(wbc, page);
+		if (current->flags & PF_MEMALLOC) {
+			/*
+			 * For memory cleaning there's no point in writing only
+			 * some buffers. So just bail out. Warn if we came here
+			 * from direct reclaim.
+			 */
+			WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
+							== PF_MEMALLOC);
 			unlock_page(page);
 			return 0;
 		}
-		commit_write = 1;
 	}
-	page_bufs = page_buffers(page);
-	if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-				   ext4_bh_delay_or_unwritten)) {
-		/*
-		 * We don't want to do block allocation, so redirty
-		 * the page and return. We may reach here when we do
-		 * a journal commit via journal_submit_inode_data_buffers.
-		 * We can also reach here via shrink_page_list but it
-		 * should never be for direct reclaim so warn if that
-		 * happens
-		 */
-		WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
-							PF_MEMALLOC);
-		goto redirty_page;
-	}
-	if (commit_write)
-		/* now mark the buffer_heads as dirty and uptodate */
-		block_commit_write(page, 0, len);
 
 	if (PageChecked(page) && ext4_should_journal_data(inode))
 		/*
@@ -2089,14 +2106,9 @@ static int ext4_writepage(struct page *page,
 		 */
 		return __ext4_journalled_writepage(page, len);
 
-	if (buffer_uninit(page_bufs)) {
-		ext4_set_bh_endio(page_bufs, inode);
-		ret = block_write_full_page_endio(page, noalloc_get_block_write,
-						  wbc, ext4_end_io_buffer_write);
-	} else
-		ret = block_write_full_page(page, noalloc_get_block_write,
-					    wbc);
-
+	memset(&io_submit, 0, sizeof(io_submit));
+	ret = ext4_bio_write_page(&io_submit, page, len, wbc);
+	ext4_io_submit(&io_submit);
 	return ret;
 }
 
@@ -2228,51 +2240,38 @@ static int write_cache_pages_da(handle_t *handle,
 			logical = (sector_t) page->index <<
 				(PAGE_CACHE_SHIFT - inode->i_blkbits);
 
-			if (!page_has_buffers(page)) {
-				mpage_add_bh_to_extent(mpd, logical,
-						       PAGE_CACHE_SIZE,
-					(1 << BH_Dirty) | (1 << BH_Uptodate));
-				if (mpd->io_done)
-					goto ret_extent_tail;
-			} else {
-				/*
-				 * Page with regular buffer heads,
-				 * just add all dirty ones
-				 */
-				head = page_buffers(page);
-				bh = head;
-				do {
-					BUG_ON(buffer_locked(bh));
-					/*
-					 * We need to try to allocate
-					 * unmapped blocks in the same page.
-					 * Otherwise we won't make progress
-					 * with the page in ext4_writepage
-					 */
-					if (ext4_bh_delay_or_unwritten(NULL, bh)) {
-						mpage_add_bh_to_extent(mpd, logical,
-								       bh->b_size,
-								       bh->b_state);
-						if (mpd->io_done)
-							goto ret_extent_tail;
-					} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
-						/*
-						 * mapped dirty buffer. We need
-						 * to update the b_state
-						 * because we look at b_state
-						 * in mpage_da_map_blocks. We
-						 * don't update b_size because
-						 * if we find an unmapped
-						 * buffer_head later we need to
-						 * use the b_state flag of that
-						 * buffer_head.
-						 */
-						if (mpd->b_size == 0)
-							mpd->b_state = bh->b_state & BH_FLAGS;
-					}
-					logical++;
-				} while ((bh = bh->b_this_page) != head);
-			}
+			/* Add all dirty buffers to mpd */
+			head = page_buffers(page);
+			bh = head;
+			do {
+				BUG_ON(buffer_locked(bh));
+				/*
+				 * We need to try to allocate unmapped blocks
+				 * in the same page. Otherwise we won't make
+				 * progress with the page in ext4_writepage
+				 */
+				if (ext4_bh_delay_or_unwritten(NULL, bh)) {
+					mpage_add_bh_to_extent(mpd, logical,
+							       bh->b_state);
+					if (mpd->io_done)
+						goto ret_extent_tail;
+				} else if (buffer_dirty(bh) &&
+					   buffer_mapped(bh)) {
+					/*
+					 * mapped dirty buffer. We need to
+					 * update the b_state because we look
+					 * at b_state in mpage_da_map_blocks.
+					 * We don't update b_size because if we
+					 * find an unmapped buffer_head later
+					 * we need to use the b_state flag of
+					 * that buffer_head.
+					 */
+					if (mpd->b_size == 0)
+						mpd->b_state =
+							bh->b_state & BH_FLAGS;
+				}
+				logical++;
+			} while ((bh = bh->b_this_page) != head);
 
 		if (nr_to_write > 0) {
 			nr_to_write--;
@@ -2413,7 +2412,8 @@ retry:
 		needed_blocks = ext4_da_writepages_trans_blocks(inode);
 
 		/* start a new transaction*/
-		handle = ext4_journal_start(inode, needed_blocks);
+		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+					    needed_blocks);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
@@ -2555,42 +2555,52 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 							pos, len, flags,
 							pagep, fsdata);
 		if (ret < 0)
-			goto out;
-		if (ret == 1) {
-			ret = 0;
-			goto out;
-		}
+			return ret;
+		if (ret == 1)
+			return 0;
 	}
 
-retry:
+	/*
+	 * grab_cache_page_write_begin() can take a long time if the
+	 * system is thrashing due to memory pressure, or if the page
+	 * is being written back. So grab it first before we start
+	 * the transaction handle. This also allows us to allocate
+	 * the page (if needed) without using GFP_NOFS.
+	 */
+retry_grab:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	unlock_page(page);
+
 	/*
 	 * With delayed allocation, we don't log the i_disksize update
 	 * if there is delayed block allocation. But we still need
 	 * to journalling the i_disksize update if writes to the end
 	 * of file which has an already mapped buffer.
 	 */
-	handle = ext4_journal_start(inode, 1);
+retry_journal:
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
 	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
+		page_cache_release(page);
+		return PTR_ERR(handle);
 	}
-	/* We cannot recurse into the filesystem as the transaction is already
-	 * started */
-	flags |= AOP_FLAG_NOFS;
 
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	if (!page) {
+	lock_page(page);
+	if (page->mapping != mapping) {
+		/* The page got truncated from under us */
+		unlock_page(page);
+		page_cache_release(page);
 		ext4_journal_stop(handle);
-		ret = -ENOMEM;
-		goto out;
+		goto retry_grab;
 	}
-	*pagep = page;
+	/* In case writeback began while the page was unlocked */
+	wait_on_page_writeback(page);
 
 	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
 	if (ret < 0) {
 		unlock_page(page);
 		ext4_journal_stop(handle);
-		page_cache_release(page);
 		/*
 		 * block_write_begin may have instantiated a few blocks
 		 * outside i_size. Trim these off again. Don't need
@@ -2598,11 +2608,16 @@ retry:
 		 */
 		if (pos + len > inode->i_size)
 			ext4_truncate_failed_write(inode);
+
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_journal;
+
+		page_cache_release(page);
+		return ret;
 	}
 
-	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-out:
+	*pagep = page;
 	return ret;
 }
 
@@ -2858,36 +2873,10 @@ ext4_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
-static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
-{
-	struct buffer_head *head, *bh;
-	unsigned int curr_off = 0;
-
-	if (!page_has_buffers(page))
-		return;
-	head = bh = page_buffers(page);
-	do {
-		if (offset <= curr_off && test_clear_buffer_uninit(bh)
-					&& bh->b_private) {
-			ext4_free_io_end(bh->b_private);
-			bh->b_private = NULL;
-			bh->b_end_io = NULL;
-		}
-		curr_off = curr_off + bh->b_size;
-		bh = bh->b_this_page;
-	} while (bh != head);
-}
-
 static void ext4_invalidatepage(struct page *page, unsigned long offset)
 {
 	trace_ext4_invalidatepage(page, offset);
 
-	/*
-	 * free any io_end structure allocated for buffers to be discarded
-	 */
-	if (ext4_should_dioread_nolock(page->mapping->host))
-		ext4_invalidatepage_free_endio(page, offset);
-
 	/* No journalling happens on data buffers when this function is used */
 	WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
 
@@ -2977,9 +2966,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
 		ext4_free_io_end(io_end);
 out:
+		inode_dio_done(inode);
 		if (is_async)
 			aio_complete(iocb, ret, 0);
-		inode_dio_done(inode);
 		return;
 	}
 
@@ -2993,65 +2982,6 @@ out:
 	ext4_add_complete_io(io_end);
 }
 
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
-{
-	ext4_io_end_t *io_end = bh->b_private;
-	struct inode *inode;
-
-	if (!test_clear_buffer_uninit(bh) || !io_end)
-		goto out;
-
-	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
-		ext4_msg(io_end->inode->i_sb, KERN_INFO,
-			 "sb umounted, discard end_io request for inode %lu",
-			 io_end->inode->i_ino);
-		ext4_free_io_end(io_end);
-		goto out;
-	}
-
-	/*
-	 * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
-	 * but being more careful is always safe for the future change.
-	 */
-	inode = io_end->inode;
-	ext4_set_io_unwritten_flag(inode, io_end);
-	ext4_add_complete_io(io_end);
-out:
-	bh->b_private = NULL;
-	bh->b_end_io = NULL;
-	clear_buffer_uninit(bh);
-	end_buffer_async_write(bh, uptodate);
-}
-
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
-{
-	ext4_io_end_t *io_end;
-	struct page *page = bh->b_page;
-	loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
-	size_t size = bh->b_size;
-
-retry:
-	io_end = ext4_init_io_end(inode, GFP_ATOMIC);
-	if (!io_end) {
-		pr_warn_ratelimited("%s: allocation fail\n", __func__);
-		schedule();
-		goto retry;
-	}
-	io_end->offset = offset;
-	io_end->size = size;
-	/*
-	 * We need to hold a reference to the page to make sure it
-	 * doesn't get evicted before ext4_end_io_work() has a chance
-	 * to convert the extent from written to unwritten.
-	 */
-	io_end->page = page;
-	get_page(io_end->page);
-
-	bh->b_private = io_end;
-	bh->b_end_io = ext4_end_io_buffer_write;
-	return 0;
-}
-
 /*
  * For ext4 extent files, ext4 will do direct-io write to holes,
  * preallocated extents, and those write extend the file, no need to
@@ -3557,16 +3487,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 	if (!S_ISREG(inode->i_mode))
 		return -EOPNOTSUPP;
 
-	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-		/* TODO: Add support for non extent hole punching */
-		return -EOPNOTSUPP;
-	}
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		return ext4_ind_punch_hole(file, offset, length);
 
 	if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
 		/* TODO: Add support for bigalloc file systems */
 		return -EOPNOTSUPP;
 	}
 
+	trace_ext4_punch_hole(inode, offset, length);
+
 	return ext4_ext_punch_hole(file, offset, length);
 }
 
@@ -3660,11 +3590,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
 	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
 
 	bh = sb_getblk(sb, block);
-	if (!bh) {
-		EXT4_ERROR_INODE_BLOCK(inode, block,
-				       "unable to read itable block");
-		return -EIO;
-	}
+	if (unlikely(!bh))
+		return -ENOMEM;
 	if (!buffer_uptodate(bh)) {
 		lock_buffer(bh);
 
@@ -3696,7 +3623,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
 
 			/* Is the inode bitmap in cache? */
 			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
-			if (!bitmap_bh)
+			if (unlikely(!bitmap_bh))
 				goto make_io;
 
 			/*
@@ -4404,8 +4331,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 
 		/* (user+group)*(old+new) structure, inode write (sb,
 		 * inode block, ? - but truncate inode update has it) */
-		handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
-					EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
+		handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+			(EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
+			 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			goto err_out;
@@ -4440,7 +4368,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	    (attr->ia_size < inode->i_size)) {
 		handle_t *handle;
 
-		handle = ext4_journal_start(inode, 3);
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			goto err_out;
@@ -4460,7 +4388,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 							    attr->ia_size);
 			if (error) {
 				/* Do as much error cleanup as possible */
-				handle = ext4_journal_start(inode, 3);
+				handle = ext4_journal_start(inode,
+							    EXT4_HT_INODE, 3);
 				if (IS_ERR(handle)) {
 					ext4_orphan_del(NULL, inode);
 					goto err_out;
@@ -4801,7 +4730,7 @@ void ext4_dirty_inode(struct inode *inode, int flags)
 {
 	handle_t *handle;
 
-	handle = ext4_journal_start(inode, 2);
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
 	if (IS_ERR(handle))
 		goto out;
 
@@ -4902,7 +4831,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 
 	/* Finally we can mark the inode as dirty. */
 
-	handle = ext4_journal_start(inode, 1);
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -4980,7 +4909,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	else
 		get_block = ext4_get_block;
 retry_alloc:
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+				    ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = VM_FAULT_SIGBUS;
 		goto out;