path: root/fs/ext4/inode.c
author		Linus Torvalds <torvalds@linux-foundation.org>	2013-02-26 17:52:45 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-02-26 17:52:45 -0500
commit		6515925b8259549b7f2187e25d3260306e3e85e5 (patch)
tree		7d51487f308f8f0ac95d3113606c39ba592111ba /fs/ext4/inode.c
parent		bbbd27e694ce2c5fde9c8fcedbea618dd9153fe7 (diff)
parent		304e220f0879198b1f5309ad6f0be862b4009491 (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Theodore Ts'o:
 "The one new feature added in this patch series is the ability to use
  the "punch hole" functionality for inodes that are not using extent
  maps.

  In the bug fix category, we fixed some races in the AIO and fstrim
  code, and some potential NULL pointer dereferences and memory leaks
  in error handling code paths.

  In the optimization category, we fixed a performance regression in
  the jbd2 layer introduced by commit d9b01934d56a ("jbd: fix fsync()
  tid wraparound bug", introduced in v3.0) which shows up in the AIM7
  benchmark.  We also further optimized jbd2 by minimizing the amount
  of time that transaction handles are held active.

  This patch series also features some additional enhancement of the
  extent status tree, which is now used to cache extent information in
  a more efficient/compact form than what we use on-disk."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (65 commits)
  ext4: fix free clusters calculation in bigalloc filesystem
  ext4: no need to remove extent if len is 0 in ext4_es_remove_extent()
  ext4: fix xattr block allocation/release with bigalloc
  ext4: reclaim extents from extent status tree
  ext4: adjust some functions for reclaiming extents from extent status tree
  ext4: remove single extent cache
  ext4: lookup block mapping in extent status tree
  ext4: track all extent status in extent status tree
  ext4: let ext4_ext_map_blocks return EXT4_MAP_UNWRITTEN flag
  ext4: rename and improbe ext4_es_find_extent()
  ext4: add physical block and status member into extent status tree
  ext4: refine extent status tree
  ext4: use ERR_PTR() abstraction for ext4_append()
  ext4: refactor code to read directory blocks into ext4_read_dirblock()
  ext4: add debugging context for warning in ext4_da_update_reserve_space()
  ext4: use KERN_WARNING for warning messages
  jbd2: use module parameters instead of debugfs for jbd_debug
  ext4: use module parameters instead of debugfs for mballoc_debug
  ext4: start handle at the last possible moment when creating inodes
  ext4: fix the number of credits needed for acl ops with inline data
  ...
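For readers new to the extent status tree mentioned above: in the diff below, ext4_map_blocks() gains a cache-first lookup, so a hit in the in-memory tree avoids taking i_data_sem and re-walking the on-disk extent tree. The following standalone C sketch models only that control flow; all names are invented except where they match identifiers visible in the diff (a single cached extent stands in for the real red-black tree in fs/ext4/extents_status.c). It is an illustration, not kernel code.

/* Toy model of the "lookup extent status tree firstly" pattern. */
#include <stdio.h>
#include <stdbool.h>

struct extent {			/* cached logical -> physical run */
	unsigned lblk;		/* first logical block */
	unsigned len;		/* number of blocks */
	unsigned pblk;		/* first physical block */
	bool written;		/* written vs. unwritten */
};

/* One-entry "tree" standing in for the red-black tree. */
static struct extent cache = { .lblk = 100, .len = 8, .pblk = 5000, .written = true };

static bool es_lookup(unsigned lblk, struct extent *es)
{
	if (lblk >= cache.lblk && lblk < cache.lblk + cache.len) {
		*es = cache;
		return true;
	}
	return false;		/* miss: caller falls back to the on-disk tree */
}

int main(void)
{
	struct extent es;
	unsigned lblk = 103;

	if (es_lookup(lblk, &es))	/* hit: no lock, no disk access */
		printf("lblk %u -> pblk %u (%s)\n", lblk,
		       es.pblk + (lblk - es.lblk),
		       es.written ? "written" : "unwritten");
	else
		printf("miss: fall back to extent tree lookup\n");
	return 0;
}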
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--	fs/ext4/inode.c	664
1 file changed, 297 insertions(+), 367 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cd818d8bb221..88049d8d30cb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,10 +132,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
-				   struct buffer_head *bh_result, int create);
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
 static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
@@ -238,7 +234,8 @@ void ext4_evict_inode(struct inode *inode)
 	 * protection against it
 	 */
 	sb_start_intwrite(inode->i_sb);
-	handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+				    ext4_blocks_for_truncate(inode)+3);
 	if (IS_ERR(handle)) {
 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
 		/*
@@ -346,7 +343,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
 	spin_lock(&ei->i_block_reservation_lock);
 	trace_ext4_da_update_reserve_space(inode, used, quota_claim);
 	if (unlikely(used > ei->i_reserved_data_blocks)) {
-		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+		ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
 			 "with only %d reserved data blocks",
 			 __func__, inode->i_ino, used,
 			 ei->i_reserved_data_blocks);
@@ -355,10 +352,12 @@ void ext4_da_update_reserve_space(struct inode *inode,
 	}
 
 	if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
-		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d "
-			 "with only %d reserved metadata blocks\n", __func__,
-			 inode->i_ino, ei->i_allocated_meta_blocks,
-			 ei->i_reserved_meta_blocks);
+		ext4_warning(inode->i_sb, "ino %lu, allocated %d "
+			     "with only %d reserved metadata blocks "
+			     "(releasing %d blocks with reserved %d data blocks)",
+			     inode->i_ino, ei->i_allocated_meta_blocks,
+			     ei->i_reserved_meta_blocks, used,
+			     ei->i_reserved_data_blocks);
 		WARN_ON(1);
 		ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
 	}
@@ -508,12 +507,33 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		    struct ext4_map_blocks *map, int flags)
 {
+	struct extent_status es;
 	int retval;
 
 	map->m_flags = 0;
 	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
		  (unsigned long) map->m_lblk);
+
+	/* Lookup extent status tree firstly */
+	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+			map->m_pblk = ext4_es_pblock(&es) +
+					map->m_lblk - es.es_lblk;
+			map->m_flags |= ext4_es_is_written(&es) ?
+					EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+			retval = es.es_len - (map->m_lblk - es.es_lblk);
+			if (retval > map->m_len)
+				retval = map->m_len;
+			map->m_len = retval;
+		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+			retval = 0;
+		} else {
+			BUG_ON(1);
+		}
+		goto found;
+	}
+
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
@@ -527,20 +547,27 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		retval = ext4_ind_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
 	}
+	if (retval > 0) {
+		int ret;
+		unsigned long long status;
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+		    ext4_find_delalloc_range(inode, map->m_lblk,
+					     map->m_lblk + map->m_len - 1))
+			status |= EXTENT_STATUS_DELAYED;
+		ret = ext4_es_insert_extent(inode, map->m_lblk,
+					    map->m_len, map->m_pblk, status);
+		if (ret < 0)
+			retval = ret;
+	}
 	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
 		up_read((&EXT4_I(inode)->i_data_sem));
 
+found:
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret;
-		if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
-			/* delayed alloc may be allocated by fallocate and
-			 * coverted to initialized by directIO.
-			 * we need to handle delayed extent here.
-			 */
-			down_write((&EXT4_I(inode)->i_data_sem));
-			goto delayed_mapped;
-		}
-		ret = check_block_validity(inode, map);
+		int ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -560,16 +587,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		return retval;
 
 	/*
-	 * When we call get_blocks without the create flag, the
-	 * BH_Unwritten flag could have gotten set if the blocks
-	 * requested were part of a uninitialized extent.  We need to
-	 * clear this flag now that we are committed to convert all or
-	 * part of the uninitialized extent to be an initialized
-	 * extent.  This is because we need to avoid the combination
-	 * of BH_Unwritten and BH_Mapped flags being simultaneously
-	 * set on the buffer_head.
+	 * Here we clear m_flags because after allocating an new extent,
+	 * it will be set again.
 	 */
-	map->m_flags &= ~EXT4_MAP_UNWRITTEN;
+	map->m_flags &= ~EXT4_MAP_FLAGS;
 
 	/*
 	 * New blocks allocate and/or writing to uninitialized extent
@@ -615,18 +636,23 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		    (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
 			ext4_da_update_reserve_space(inode, retval, 1);
 	}
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
-	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+	if (retval > 0) {
 		int ret;
-delayed_mapped:
-		/* delayed allocation blocks has been allocated */
-		ret = ext4_es_remove_extent(inode, map->m_lblk,
-					    map->m_len);
-		if (ret < 0)
-			retval = ret;
-	}
+		unsigned long long status;
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+		    ext4_find_delalloc_range(inode, map->m_lblk,
+					     map->m_lblk + map->m_len - 1))
+			status |= EXTENT_STATUS_DELAYED;
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    map->m_pblk, status);
+		if (ret < 0)
+			retval = ret;
 	}
 
 	up_write((&EXT4_I(inode)->i_data_sem));
@@ -660,7 +686,8 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 	if (map.m_len > DIO_MAX_BLOCKS)
 		map.m_len = DIO_MAX_BLOCKS;
 	dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
-	handle = ext4_journal_start(inode, dio_credits);
+	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+				    dio_credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		return ret;
@@ -707,14 +734,16 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 	/* ensure we send some value back into *errp */
 	*errp = 0;
 
+	if (create && err == 0)
+		err = -ENOSPC;	/* should never happen */
 	if (err < 0)
 		*errp = err;
 	if (err <= 0)
 		return NULL;
 
 	bh = sb_getblk(inode->i_sb, map.m_pblk);
-	if (!bh) {
-		*errp = -EIO;
+	if (unlikely(!bh)) {
+		*errp = -ENOMEM;
 		return NULL;
 	}
 	if (map.m_flags & EXT4_MAP_NEW) {
@@ -808,11 +837,10 @@ int ext4_walk_page_buffers(handle_t *handle,
  * and the commit_write().  So doing the jbd2_journal_start at the start of
  * prepare_write() is the right place.
  *
- * Also, this function can nest inside ext4_writepage() ->
- * block_write_full_page(). In that case, we *know* that ext4_writepage()
- * has generated enough buffer credits to do the whole page.  So we won't
- * block on the journal in that case, which is good, because the caller may
- * be PF_MEMALLOC.
+ * Also, this function can nest inside ext4_writepage().  In that case, we
+ * *know* that ext4_writepage() has generated enough buffer credits to do the
+ * whole page.  So we won't block on the journal in that case, which is good,
+ * because the caller may be PF_MEMALLOC.
  *
  * By accident, ext4 can be reentered when a transaction is open via
  * quota file writes.  If we were to commit the transaction while thus
@@ -878,32 +906,40 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 		ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
 						    flags, pagep);
 		if (ret < 0)
-			goto out;
-		if (ret == 1) {
-			ret = 0;
-			goto out;
-		}
+			return ret;
+		if (ret == 1)
+			return 0;
 	}
 
-retry:
-	handle = ext4_journal_start(inode, needed_blocks);
+	/*
+	 * grab_cache_page_write_begin() can take a long time if the
+	 * system is thrashing due to memory pressure, or if the page
+	 * is being written back.  So grab it first before we start
+	 * the transaction handle.  This also allows us to allocate
+	 * the page (if needed) without using GFP_NOFS.
+	 */
+retry_grab:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	unlock_page(page);
+
+retry_journal:
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
 	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
+		page_cache_release(page);
+		return PTR_ERR(handle);
 	}
 
-	/* We cannot recurse into the filesystem as the transaction is already
-	 * started */
-	flags |= AOP_FLAG_NOFS;
-
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	if (!page) {
+	lock_page(page);
+	if (page->mapping != mapping) {
+		/* The page got truncated from under us */
+		unlock_page(page);
+		page_cache_release(page);
 		ext4_journal_stop(handle);
-		ret = -ENOMEM;
-		goto out;
+		goto retry_grab;
 	}
-
-	*pagep = page;
+	wait_on_page_writeback(page);
 
 	if (ext4_should_dioread_nolock(inode))
 		ret = __block_write_begin(page, pos, len, ext4_get_block_write);
@@ -918,7 +954,6 @@ retry:
 
 	if (ret) {
 		unlock_page(page);
-		page_cache_release(page);
 		/*
 		 * __block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
@@ -942,11 +977,14 @@ retry:
 			if (inode->i_nlink)
 				ext4_orphan_del(NULL, inode);
 		}
-	}
 
-	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-out:
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_journal;
+		page_cache_release(page);
+		return ret;
+	}
+	*pagep = page;
 	return ret;
 }
 
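The hunk above reorders ext4_write_begin(): the page is grabbed (a step that may block or allocate memory) before the journal handle is started, and the mapping is re-validated after relocking. A minimal userspace model of that retry scheme follows; every name is an invented stand-in (journal_start() for ext4_journal_start(), the struct for the page cache), so treat it as a sketch of the control flow only.

#include <stdio.h>

struct page { const void *mapping; };

static int journal_start(void) { return 0; }	/* ext4_journal_start() stand-in */
static void journal_stop(void) { }		/* ext4_journal_stop() stand-in  */

static int write_begin(struct page *page, const void *mapping)
{
	int tries = 2;

retry_grab:
	if (tries-- == 0)
		return -1;
	/* 1. Take the page first: this step may block or allocate memory,
	 *    which must not happen while a handle pins a transaction. */

	/* 2. Only now start the (cheap to hold, costly to block on) handle. */
	if (journal_start() != 0)
		return -1;

	/* 3. Re-validate: the page may have been truncated while unlocked. */
	if (page->mapping != mapping) {
		journal_stop();
		goto retry_grab;
	}
	return 0;
}

int main(void)
{
	static const int inode_mapping = 0;
	struct page p = { .mapping = &inode_mapping };

	printf("write_begin: %d\n", write_begin(&p, &inode_mapping));
	return 0;
}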
@@ -1256,7 +1294,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 		 * function is called from invalidate page, it's
 		 * harmless to return without any action.
 		 */
-		ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+		ext4_warning(inode->i_sb, "ext4_da_release_space: "
 			 "ino %lu, to_free %d with only %d reserved "
 			 "data blocks", inode->i_ino, to_free,
 			 ei->i_reserved_data_blocks);
@@ -1357,7 +1395,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 	loff_t size = i_size_read(inode);
 	unsigned int len, block_start;
 	struct buffer_head *bh, *page_bufs = NULL;
-	int journal_data = ext4_should_journal_data(inode);
 	sector_t pblock = 0, cur_logical = 0;
 	struct ext4_io_submit io_submit;
 
@@ -1378,7 +1415,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
-			int commit_write = 0, skip_page = 0;
+			int skip_page = 0;
 			struct page *page = pvec.pages[i];
 
 			index = page->index;
@@ -1400,27 +1437,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
 
-			/*
-			 * If the page does not have buffers (for
-			 * whatever reason), try to create them using
-			 * __block_write_begin.  If this fails,
-			 * skip the page and move on.
-			 */
-			if (!page_has_buffers(page)) {
-				if (__block_write_begin(page, 0, len,
-						noalloc_get_block_write)) {
-				skip_page:
-					unlock_page(page);
-					continue;
-				}
-				commit_write = 1;
-			}
-
 			bh = page_bufs = page_buffers(page);
 			block_start = 0;
 			do {
-				if (!bh)
-					goto skip_page;
 				if (map && (cur_logical >= map->m_lblk) &&
 				    (cur_logical <= (map->m_lblk +
 						(map->m_len - 1)))) {
@@ -1448,33 +1467,14 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
 					pblock++;
 			} while (bh != page_bufs);
 
-			if (skip_page)
-				goto skip_page;
-
-			if (commit_write)
-				/* mark the buffer_heads as dirty & uptodate */
-				block_commit_write(page, 0, len);
+			if (skip_page) {
+				unlock_page(page);
+				continue;
+			}
 
 			clear_page_dirty_for_io(page);
-			/*
-			 * Delalloc doesn't support data journalling,
-			 * but eventually maybe we'll lift this
-			 * restriction.
-			 */
-			if (unlikely(journal_data && PageChecked(page)))
-				err = __ext4_journalled_writepage(page, len);
-			else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
-				err = ext4_bio_write_page(&io_submit, page,
-							len, mpd->wbc);
-			else if (buffer_uninit(page_bufs)) {
-				ext4_set_bh_endio(page_bufs, inode);
-				err = block_write_full_page_endio(page,
-					noalloc_get_block_write,
-					mpd->wbc, ext4_end_io_buffer_write);
-			} else
-				err = block_write_full_page(page,
-						noalloc_get_block_write, mpd->wbc);
-
+			err = ext4_bio_write_page(&io_submit, page, len,
+						  mpd->wbc);
 			if (!err)
 				mpd->pages_written++;
 			/*
@@ -1640,7 +1640,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
 			       (unsigned long long) next,
 			       mpd->b_size >> mpd->inode->i_blkbits, err);
 			ext4_msg(sb, KERN_CRIT,
-				"This should not happen!! Data will be lost\n");
+				"This should not happen!! Data will be lost");
 			if (err == -ENOSPC)
 				ext4_print_free_blocks(mpd->inode);
 		}
@@ -1690,16 +1690,16 @@ submit_io:
  *
  * @mpd->lbh - extent of blocks
  * @logical - logical number of the block in the file
- * @bh - bh of the block (used to access block's state)
+ * @b_state - b_state of the buffer head added
  *
  * the function is used to collect contig. blocks in same state
  */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
-				   sector_t logical, size_t b_size,
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
 				   unsigned long b_state)
 {
 	sector_t next;
-	int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
+	int blkbits = mpd->inode->i_blkbits;
+	int nrblocks = mpd->b_size >> blkbits;
 
 	/*
 	 * XXX Don't go larger than mballoc is willing to allocate
@@ -1707,11 +1707,11 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	 * mpage_da_submit_io() into this function and then call
 	 * ext4_map_blocks() multiple times in a loop
 	 */
-	if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+	if (nrblocks >= (8*1024*1024 >> blkbits))
 		goto flush_it;
 
-	/* check if thereserved journal credits might overflow */
-	if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
+	/* check if the reserved journal credits might overflow */
+	if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
 		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
 			/*
 			 * With non-extent format we are limited by the journal
@@ -1720,16 +1720,6 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 			 * nrblocks.  So limit nrblocks.
 			 */
 			goto flush_it;
-		} else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
-			   EXT4_MAX_TRANS_DATA) {
-			/*
-			 * Adding the new buffer_head would make it cross the
-			 * allowed limit for which we have journal credit
-			 * reserved. So limit the new bh->b_size
-			 */
-			b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
-						mpd->inode->i_blkbits;
-			/* we will do mpage_da_submit_io in the next loop */
 		}
 	}
 	/*
@@ -1737,7 +1727,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	 */
 	if (mpd->b_size == 0) {
 		mpd->b_blocknr = logical;
-		mpd->b_size = b_size;
+		mpd->b_size = 1 << blkbits;
 		mpd->b_state = b_state & BH_FLAGS;
 		return;
 	}
@@ -1747,7 +1737,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	 * Can we merge the block to our big extent?
 	 */
 	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
-		mpd->b_size += b_size;
+		mpd->b_size += 1 << blkbits;
 		return;
 	}
 
@@ -1775,6 +1765,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 			      struct ext4_map_blocks *map,
 			      struct buffer_head *bh)
 {
+	struct extent_status es;
 	int retval;
 	sector_t invalid_block = ~((sector_t) 0xffff);
 
@@ -1785,6 +1776,42 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 	ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
 		  "logical block %lu\n", inode->i_ino, map->m_len,
 		  (unsigned long) map->m_lblk);
+
+	/* Lookup extent status tree firstly */
+	if (ext4_es_lookup_extent(inode, iblock, &es)) {
+
+		if (ext4_es_is_hole(&es)) {
+			retval = 0;
+			down_read((&EXT4_I(inode)->i_data_sem));
+			goto add_delayed;
+		}
+
+		/*
+		 * Delayed extent could be allocated by fallocate.
+		 * So we need to check it.
+		 */
+		if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
+			map_bh(bh, inode->i_sb, invalid_block);
+			set_buffer_new(bh);
+			set_buffer_delay(bh);
+			return 0;
+		}
+
+		map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
+		retval = es.es_len - (iblock - es.es_lblk);
+		if (retval > map->m_len)
+			retval = map->m_len;
+		map->m_len = retval;
+		if (ext4_es_is_written(&es))
+			map->m_flags |= EXT4_MAP_MAPPED;
+		else if (ext4_es_is_unwritten(&es))
+			map->m_flags |= EXT4_MAP_UNWRITTEN;
+		else
+			BUG_ON(1);
+
+		return retval;
+	}
+
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
@@ -1803,11 +1830,15 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
 		retval = 0;
 	} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
+		retval = ext4_ext_map_blocks(NULL, inode, map,
+					     EXT4_GET_BLOCKS_NO_PUT_HOLE);
 	else
-		retval = ext4_ind_map_blocks(NULL, inode, map, 0);
+		retval = ext4_ind_map_blocks(NULL, inode, map,
+					     EXT4_GET_BLOCKS_NO_PUT_HOLE);
 
+add_delayed:
 	if (retval == 0) {
+		int ret;
 		/*
 		 * XXX: __block_prepare_write() unmaps passed block,
 		 * is it OK?
@@ -1815,15 +1846,20 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		/* If the block was allocated from previously allocated cluster,
 		 * then we dont need to reserve it again. */
 		if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
-			retval = ext4_da_reserve_space(inode, iblock);
-			if (retval)
+			ret = ext4_da_reserve_space(inode, iblock);
+			if (ret) {
 				/* not enough space to reserve */
+				retval = ret;
 				goto out_unlock;
+			}
 		}
 
-		retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
-		if (retval)
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    ~0, EXTENT_STATUS_DELAYED);
+		if (ret) {
+			retval = ret;
 			goto out_unlock;
+		}
 
 		/* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
 		 * and it should not appear on the bh->b_state.
@@ -1833,6 +1869,16 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		map_bh(bh, inode->i_sb, invalid_block);
 		set_buffer_new(bh);
 		set_buffer_delay(bh);
+	} else if (retval > 0) {
+		int ret;
+		unsigned long long status;
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    map->m_pblk, status);
+		if (ret != 0)
+			retval = ret;
 	}
 
 out_unlock:
@@ -1890,27 +1936,6 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 	return 0;
 }
 
-/*
- * This function is used as a standard get_block_t calback function
- * when there is no desire to allocate any blocks.  It is used as a
- * callback function for block_write_begin() and block_write_full_page().
- * These functions should only try to map a single block at a time.
- *
- * Since this function doesn't do block allocations even if the caller
- * requests it by passing in create=1, it is critically important that
- * any caller checks to make sure that any buffer heads are returned
- * by this function are either all already mapped or marked for
- * delayed allocation before calling  block_write_full_page().  Otherwise,
- * b_blocknr could be left unitialized, and the page write functions will
- * be taken by surprise.
- */
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
-				   struct buffer_head *bh_result, int create)
-{
-	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
-	return _ext4_get_block(inode, iblock, bh_result, 0);
-}
-
 static int bget_one(handle_t *handle, struct buffer_head *bh)
 {
 	get_bh(bh);
@@ -1955,7 +1980,8 @@ static int __ext4_journalled_writepage(struct page *page,
 	 * references to buffers so we are safe */
 	unlock_page(page);
 
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+				    ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		goto out;
@@ -2035,11 +2061,12 @@ out:
 static int ext4_writepage(struct page *page,
 			  struct writeback_control *wbc)
 {
-	int ret = 0, commit_write = 0;
+	int ret = 0;
 	loff_t size;
 	unsigned int len;
 	struct buffer_head *page_bufs = NULL;
 	struct inode *inode = page->mapping->host;
+	struct ext4_io_submit io_submit;
 
 	trace_ext4_writepage(page);
 	size = i_size_read(inode);
@@ -2048,39 +2075,29 @@ static int ext4_writepage(struct page *page,
 	else
 		len = PAGE_CACHE_SIZE;
 
+	page_bufs = page_buffers(page);
 	/*
-	 * If the page does not have buffers (for whatever reason),
-	 * try to create them using __block_write_begin.  If this
-	 * fails, redirty the page and move on.
+	 * We cannot do block allocation or other extent handling in this
+	 * function.  If there are buffers needing that, we have to redirty
+	 * the page.  But we may reach here when we do a journal commit via
+	 * journal_submit_inode_data_buffers() and in that case we must write
+	 * allocated buffers to achieve data=ordered mode guarantees.
 	 */
-	if (!page_has_buffers(page)) {
-		if (__block_write_begin(page, 0, len,
-					noalloc_get_block_write)) {
-		redirty_page:
-			redirty_page_for_writepage(wbc, page);
+	if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+				   ext4_bh_delay_or_unwritten)) {
+		redirty_page_for_writepage(wbc, page);
+		if (current->flags & PF_MEMALLOC) {
+			/*
+			 * For memory cleaning there's no point in writing only
+			 * some buffers. So just bail out. Warn if we came here
+			 * from direct reclaim.
+			 */
+			WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
+							== PF_MEMALLOC);
 			unlock_page(page);
 			return 0;
 		}
-		commit_write = 1;
 	}
-	page_bufs = page_buffers(page);
-	if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-				   ext4_bh_delay_or_unwritten)) {
-		/*
-		 * We don't want to do block allocation, so redirty
-		 * the page and return.  We may reach here when we do
-		 * a journal commit via journal_submit_inode_data_buffers.
-		 * We can also reach here via shrink_page_list but it
-		 * should never be for direct reclaim so warn if that
-		 * happens
-		 */
-		WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
-			     PF_MEMALLOC);
-		goto redirty_page;
-	}
-	if (commit_write)
-		/* now mark the buffer_heads as dirty and uptodate */
-		block_commit_write(page, 0, len);
 
 	if (PageChecked(page) && ext4_should_journal_data(inode))
 		/*
@@ -2089,14 +2106,9 @@ static int ext4_writepage(struct page *page,
 		 */
 		return __ext4_journalled_writepage(page, len);
 
-	if (buffer_uninit(page_bufs)) {
-		ext4_set_bh_endio(page_bufs, inode);
-		ret = block_write_full_page_endio(page, noalloc_get_block_write,
-						  wbc, ext4_end_io_buffer_write);
-	} else
-		ret = block_write_full_page(page, noalloc_get_block_write,
-					    wbc);
-
+	memset(&io_submit, 0, sizeof(io_submit));
+	ret = ext4_bio_write_page(&io_submit, page, len, wbc);
+	ext4_io_submit(&io_submit);
 	return ret;
 }
 
@@ -2228,51 +2240,38 @@ static int write_cache_pages_da(handle_t *handle,
 			logical = (sector_t) page->index <<
 				(PAGE_CACHE_SHIFT - inode->i_blkbits);
 
-			if (!page_has_buffers(page)) {
-				mpage_add_bh_to_extent(mpd, logical,
-						       PAGE_CACHE_SIZE,
-					(1 << BH_Dirty) | (1 << BH_Uptodate));
-				if (mpd->io_done)
-					goto ret_extent_tail;
-			} else {
-				/*
-				 * Page with regular buffer heads,
-				 * just add all dirty ones
-				 */
-				head = page_buffers(page);
-				bh = head;
-				do {
-					BUG_ON(buffer_locked(bh));
-					/*
-					 * We need to try to allocate
-					 * unmapped blocks in the same page.
-					 * Otherwise we won't make progress
-					 * with the page in ext4_writepage
-					 */
-					if (ext4_bh_delay_or_unwritten(NULL, bh)) {
-						mpage_add_bh_to_extent(mpd, logical,
-								       bh->b_size,
-								       bh->b_state);
-						if (mpd->io_done)
-							goto ret_extent_tail;
-					} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
-						/*
-						 * mapped dirty buffer. We need
-						 * to update the b_state
-						 * because we look at b_state
-						 * in mpage_da_map_blocks. We
-						 * don't update b_size because
-						 * if we find an unmapped
-						 * buffer_head later we need to
-						 * use the b_state flag of that
-						 * buffer_head.
-						 */
-						if (mpd->b_size == 0)
-							mpd->b_state = bh->b_state & BH_FLAGS;
-					}
-					logical++;
-				} while ((bh = bh->b_this_page) != head);
-			}
+			/* Add all dirty buffers to mpd */
+			head = page_buffers(page);
+			bh = head;
+			do {
+				BUG_ON(buffer_locked(bh));
+				/*
+				 * We need to try to allocate unmapped blocks
+				 * in the same page.  Otherwise we won't make
+				 * progress with the page in ext4_writepage
+				 */
+				if (ext4_bh_delay_or_unwritten(NULL, bh)) {
+					mpage_add_bh_to_extent(mpd, logical,
+							       bh->b_state);
+					if (mpd->io_done)
+						goto ret_extent_tail;
+				} else if (buffer_dirty(bh) &&
+					   buffer_mapped(bh)) {
+					/*
+					 * mapped dirty buffer. We need to
+					 * update the b_state because we look
+					 * at b_state in mpage_da_map_blocks.
+					 * We don't update b_size because if we
+					 * find an unmapped buffer_head later
+					 * we need to use the b_state flag of
+					 * that buffer_head.
+					 */
+					if (mpd->b_size == 0)
+						mpd->b_state =
+							bh->b_state & BH_FLAGS;
+				}
+				logical++;
+			} while ((bh = bh->b_this_page) != head);
 
 			if (nr_to_write > 0) {
 				nr_to_write--;
@@ -2413,7 +2412,8 @@ retry:
 		needed_blocks = ext4_da_writepages_trans_blocks(inode);
 
 		/* start a new transaction*/
-		handle = ext4_journal_start(inode, needed_blocks);
+		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
					    needed_blocks);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
@@ -2555,42 +2555,52 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 						      pos, len, flags,
 						      pagep, fsdata);
 		if (ret < 0)
-			goto out;
-		if (ret == 1) {
-			ret = 0;
-			goto out;
-		}
+			return ret;
+		if (ret == 1)
+			return 0;
 	}
 
-retry:
+	/*
+	 * grab_cache_page_write_begin() can take a long time if the
+	 * system is thrashing due to memory pressure, or if the page
+	 * is being written back.  So grab it first before we start
+	 * the transaction handle.  This also allows us to allocate
+	 * the page (if needed) without using GFP_NOFS.
+	 */
+retry_grab:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	unlock_page(page);
+
 	/*
 	 * With delayed allocation, we don't log the i_disksize update
 	 * if there is delayed block allocation. But we still need
 	 * to journalling the i_disksize update if writes to the end
 	 * of file which has an already mapped buffer.
 	 */
-	handle = ext4_journal_start(inode, 1);
+retry_journal:
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
 	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
+		page_cache_release(page);
+		return PTR_ERR(handle);
 	}
-	/* We cannot recurse into the filesystem as the transaction is already
-	 * started */
-	flags |= AOP_FLAG_NOFS;
 
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	if (!page) {
+	lock_page(page);
+	if (page->mapping != mapping) {
+		/* The page got truncated from under us */
+		unlock_page(page);
+		page_cache_release(page);
 		ext4_journal_stop(handle);
-		ret = -ENOMEM;
-		goto out;
+		goto retry_grab;
 	}
-	*pagep = page;
+	/* In case writeback began while the page was unlocked */
+	wait_on_page_writeback(page);
 
 	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
 	if (ret < 0) {
 		unlock_page(page);
 		ext4_journal_stop(handle);
-		page_cache_release(page);
 		/*
 		 * block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
@@ -2598,11 +2608,16 @@ retry:
 		 */
 		if (pos + len > inode->i_size)
 			ext4_truncate_failed_write(inode);
+
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_journal;
+
+		page_cache_release(page);
+		return ret;
 	}
 
-	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-out:
+	*pagep = page;
 	return ret;
 }
 
@@ -2858,36 +2873,10 @@ ext4_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
-static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
-{
-	struct buffer_head *head, *bh;
-	unsigned int curr_off = 0;
-
-	if (!page_has_buffers(page))
-		return;
-	head = bh = page_buffers(page);
-	do {
-		if (offset <= curr_off && test_clear_buffer_uninit(bh)
-					&& bh->b_private) {
-			ext4_free_io_end(bh->b_private);
-			bh->b_private = NULL;
-			bh->b_end_io = NULL;
-		}
-		curr_off = curr_off + bh->b_size;
-		bh = bh->b_this_page;
-	} while (bh != head);
-}
-
 static void ext4_invalidatepage(struct page *page, unsigned long offset)
 {
 	trace_ext4_invalidatepage(page, offset);
 
-	/*
-	 * free any io_end structure allocated for buffers to be discarded
-	 */
-	if (ext4_should_dioread_nolock(page->mapping->host))
-		ext4_invalidatepage_free_endio(page, offset);
-
 	/* No journalling happens on data buffers when this function is used */
 	WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
 
@@ -2977,9 +2966,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
 		ext4_free_io_end(io_end);
 out:
+		inode_dio_done(inode);
 		if (is_async)
 			aio_complete(iocb, ret, 0);
-		inode_dio_done(inode);
 		return;
 	}
 
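The one-line move above is an ordering fix: once aio_complete() runs, userspace may close the file and the inode can be torn down, so the outstanding-DIO count must be dropped first. A userspace sketch of that shape follows; the struct and helpers are invented analogues, not the kernel API.

#include <stdio.h>
#include <stdlib.h>

struct inode_stub { int dio_count; };

static struct inode_stub *inode;

static void complete_io(void)
{
	/* After completion, the other side may tear everything down. */
	free(inode);
	inode = NULL;
}

int main(void)
{
	inode = calloc(1, sizeof(*inode));
	inode->dio_count = 1;

	inode->dio_count--;	/* inode_dio_done() analogue: run FIRST */
	complete_io();		/* aio_complete() analogue: run LAST   */
	/* Reversing the two lines would touch freed memory. */
	return 0;
}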
@@ -2993,65 +2982,6 @@ out:
 	ext4_add_complete_io(io_end);
 }
 
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
-{
-	ext4_io_end_t *io_end = bh->b_private;
-	struct inode *inode;
-
-	if (!test_clear_buffer_uninit(bh) || !io_end)
-		goto out;
-
-	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
-		ext4_msg(io_end->inode->i_sb, KERN_INFO,
-			 "sb umounted, discard end_io request for inode %lu",
-			 io_end->inode->i_ino);
-		ext4_free_io_end(io_end);
-		goto out;
-	}
-
-	/*
-	 * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
-	 * but being more careful is always safe for the future change.
-	 */
-	inode = io_end->inode;
-	ext4_set_io_unwritten_flag(inode, io_end);
-	ext4_add_complete_io(io_end);
-out:
-	bh->b_private = NULL;
-	bh->b_end_io = NULL;
-	clear_buffer_uninit(bh);
-	end_buffer_async_write(bh, uptodate);
-}
-
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
-{
-	ext4_io_end_t *io_end;
-	struct page *page = bh->b_page;
-	loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
-	size_t size = bh->b_size;
-
-retry:
-	io_end = ext4_init_io_end(inode, GFP_ATOMIC);
-	if (!io_end) {
-		pr_warn_ratelimited("%s: allocation fail\n", __func__);
-		schedule();
-		goto retry;
-	}
-	io_end->offset = offset;
-	io_end->size = size;
-	/*
-	 * We need to hold a reference to the page to make sure it
-	 * doesn't get evicted before ext4_end_io_work() has a chance
-	 * to convert the extent from written to unwritten.
-	 */
-	io_end->page = page;
-	get_page(io_end->page);
-
-	bh->b_private = io_end;
-	bh->b_end_io = ext4_end_io_buffer_write;
-	return 0;
-}
-
 /*
  * For ext4 extent files, ext4 will do direct-io write to holes,
  * preallocated extents, and those write extend the file, no need to
@@ -3557,16 +3487,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 	if (!S_ISREG(inode->i_mode))
 		return -EOPNOTSUPP;
 
-	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-		/* TODO: Add support for non extent hole punching */
-		return -EOPNOTSUPP;
-	}
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		return ext4_ind_punch_hole(file, offset, length);
 
 	if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
 		/* TODO: Add support for bigalloc file systems */
 		return -EOPNOTSUPP;
 	}
 
+	trace_ext4_punch_hole(inode, offset, length);
+
 	return ext4_ext_punch_hole(file, offset, length);
 }
 
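The ext4_ind_punch_hole() call above is where the series' headline feature lands: hole punching for indirect-block (non-extent) inodes, which previously returned -EOPNOTSUPP. From userspace the feature is exercised through the standard fallocate(2) interface. A minimal demonstration follows; the file name and sizes are arbitrary, and whether the indirect path is taken depends on the inode lacking the extents flag (e.g. a filesystem created or mounted without extents).

#define _GNU_SOURCE	/* for fallocate() and FALLOC_FL_* in <fcntl.h> */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);
	if (fd < 0) { perror("open"); return 1; }

	/* Make sure there is something to punch. */
	if (ftruncate(fd, 1 << 20) < 0) { perror("ftruncate"); return 1; }

	/* Deallocate 256 KiB starting at 128 KiB; the file size is kept
	 * (PUNCH_HOLE requires KEEP_SIZE). */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      128 * 1024, 256 * 1024) < 0)
		perror("fallocate");	/* EOPNOTSUPP before this series
					 * on non-extent ext4 inodes */

	close(fd);
	return 0;
}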
@@ -3660,11 +3590,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
 	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
 
 	bh = sb_getblk(sb, block);
-	if (!bh) {
-		EXT4_ERROR_INODE_BLOCK(inode, block,
-				       "unable to read itable block");
-		return -EIO;
-	}
+	if (unlikely(!bh))
+		return -ENOMEM;
 	if (!buffer_uptodate(bh)) {
 		lock_buffer(bh);
 
@@ -3696,7 +3623,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
 
 			/* Is the inode bitmap in cache? */
 			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
-			if (!bitmap_bh)
+			if (unlikely(!bitmap_bh))
 				goto make_io;
 
 			/*
@@ -4404,8 +4331,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 
 		/* (user+group)*(old+new) structure, inode write (sb,
 		 * inode block, ? - but truncate inode update has it) */
-		handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
-					EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
+		handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+			(EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
+			 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			goto err_out;
@@ -4440,7 +4368,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	    (attr->ia_size < inode->i_size)) {
 		handle_t *handle;
 
-		handle = ext4_journal_start(inode, 3);
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			goto err_out;
@@ -4460,7 +4388,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 							    attr->ia_size);
 			if (error) {
 				/* Do as much error cleanup as possible */
-				handle = ext4_journal_start(inode, 3);
+				handle = ext4_journal_start(inode,
+							    EXT4_HT_INODE, 3);
 				if (IS_ERR(handle)) {
 					ext4_orphan_del(NULL, inode);
 					goto err_out;
@@ -4801,7 +4730,7 @@ void ext4_dirty_inode(struct inode *inode, int flags)
 {
 	handle_t *handle;
 
-	handle = ext4_journal_start(inode, 2);
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
 	if (IS_ERR(handle))
 		goto out;
 
@@ -4902,7 +4831,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 
 	/* Finally we can mark the inode as dirty. */
 
-	handle = ext4_journal_start(inode, 1);
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -4980,7 +4909,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	else
 		get_block = ext4_get_block;
retry_alloc:
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+				    ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = VM_FAULT_SIGBUS;
 		goto out;