Diffstat (limited to 'fs/xfs/xfs_aops.c')
-rw-r--r--  fs/xfs/xfs_aops.c  1027
1 file changed, 382 insertions(+), 645 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a9ebabfe7587..d445a64b979e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -36,6 +36,21 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/writeback.h> 37#include <linux/writeback.h>
38 38
39/* flags for direct write completions */
40#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
41#define XFS_DIO_FLAG_APPEND (1 << 1)
42
43/*
44 * structure owned by writepages passed to individual writepage calls
45 */
46struct xfs_writepage_ctx {
47 struct xfs_bmbt_irec imap;
48 bool imap_valid;
49 unsigned int io_type;
50 struct xfs_ioend *ioend;
51 sector_t last_block;
52};
53
39void 54void
40xfs_count_page_state( 55xfs_count_page_state(
41 struct page *page, 56 struct page *page,
@@ -214,10 +229,12 @@ xfs_end_io(
214 struct xfs_inode *ip = XFS_I(ioend->io_inode); 229 struct xfs_inode *ip = XFS_I(ioend->io_inode);
215 int error = 0; 230 int error = 0;
216 231
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 232 /*
233 * Set an error if the mount has shut down and proceed with end I/O
234 * processing so it can perform whatever cleanups are necessary.
235 */
236 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
218 ioend->io_error = -EIO; 237 ioend->io_error = -EIO;
219 goto done;
220 }
221 238
222 /* 239 /*
223 * For unwritten extents we need to issue transactions to convert a 240 * For unwritten extents we need to issue transactions to convert a
@@ -265,7 +282,7 @@ xfs_alloc_ioend(
265 */ 282 */
266 atomic_set(&ioend->io_remaining, 1); 283 atomic_set(&ioend->io_remaining, 1);
267 ioend->io_error = 0; 284 ioend->io_error = 0;
268 ioend->io_list = NULL; 285 INIT_LIST_HEAD(&ioend->io_list);
269 ioend->io_type = type; 286 ioend->io_type = type;
270 ioend->io_inode = inode; 287 ioend->io_inode = inode;
271 ioend->io_buffer_head = NULL; 288 ioend->io_buffer_head = NULL;
@@ -283,8 +300,7 @@ xfs_map_blocks(
283 struct inode *inode, 300 struct inode *inode,
284 loff_t offset, 301 loff_t offset,
285 struct xfs_bmbt_irec *imap, 302 struct xfs_bmbt_irec *imap,
286 int type, 303 int type)
287 int nonblocking)
288{ 304{
289 struct xfs_inode *ip = XFS_I(inode); 305 struct xfs_inode *ip = XFS_I(inode);
290 struct xfs_mount *mp = ip->i_mount; 306 struct xfs_mount *mp = ip->i_mount;
@@ -300,12 +316,7 @@ xfs_map_blocks(
300 if (type == XFS_IO_UNWRITTEN) 316 if (type == XFS_IO_UNWRITTEN)
301 bmapi_flags |= XFS_BMAPI_IGSTATE; 317 bmapi_flags |= XFS_BMAPI_IGSTATE;
302 318
303 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 319 xfs_ilock(ip, XFS_ILOCK_SHARED);
304 if (nonblocking)
305 return -EAGAIN;
306 xfs_ilock(ip, XFS_ILOCK_SHARED);
307 }
308
309 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 320 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
310 (ip->i_df.if_flags & XFS_IFEXTENTS)); 321 (ip->i_df.if_flags & XFS_IFEXTENTS));
311 ASSERT(offset <= mp->m_super->s_maxbytes); 322 ASSERT(offset <= mp->m_super->s_maxbytes);
@@ -341,7 +352,7 @@ xfs_map_blocks(
341 return 0; 352 return 0;
342} 353}
343 354
344STATIC int 355STATIC bool
345xfs_imap_valid( 356xfs_imap_valid(
346 struct inode *inode, 357 struct inode *inode,
347 struct xfs_bmbt_irec *imap, 358 struct xfs_bmbt_irec *imap,
@@ -414,8 +425,7 @@ xfs_start_buffer_writeback(
414STATIC void 425STATIC void
415xfs_start_page_writeback( 426xfs_start_page_writeback(
416 struct page *page, 427 struct page *page,
417 int clear_dirty, 428 int clear_dirty)
418 int buffers)
419{ 429{
420 ASSERT(PageLocked(page)); 430 ASSERT(PageLocked(page));
421 ASSERT(!PageWriteback(page)); 431 ASSERT(!PageWriteback(page));
@@ -434,10 +444,6 @@ xfs_start_page_writeback(
434 set_page_writeback_keepwrite(page); 444 set_page_writeback_keepwrite(page);
435 445
436 unlock_page(page); 446 unlock_page(page);
437
438 /* If no buffers on the page are to be written, finish it here */
439 if (!buffers)
440 end_page_writeback(page);
441} 447}
442 448
443static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) 449static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
@@ -446,153 +452,101 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
446} 452}
447 453
448/* 454/*
449 * Submit all of the bios for all of the ioends we have saved up, covering the 455 * Submit all of the bios for an ioend. We are only passed a single ioend at a
450 * initial writepage page and also any probed pages. 456 * time; the caller is responsible for chaining prior to submission.
451 *
452 * Because we may have multiple ioends spanning a page, we need to start
453 * writeback on all the buffers before we submit them for I/O. If we mark the
454 * buffers as we got, then we can end up with a page that only has buffers
455 * marked async write and I/O complete on can occur before we mark the other
456 * buffers async write.
457 *
458 * The end result of this is that we trip a bug in end_page_writeback() because
459 * we call it twice for the one page as the code in end_buffer_async_write()
460 * assumes that all buffers on the page are started at the same time.
461 *
462 * The fix is two passes across the ioend list - one to start writeback on the
463 * buffer_heads, and then submit them for I/O on the second pass.
464 * 457 *
465 * If @fail is non-zero, it means that we have a situation where some part of 458 * If @fail is non-zero, it means that we have a situation where some part of
466 * the submission process has failed after we have marked pages for writeback 459 * the submission process has failed after we have marked pages for writeback
467 * and unlocked them. In this situation, we need to fail the ioend chain rather 460 * and unlocked them. In this situation, we need to fail the ioend chain rather
468 * than submit it to IO. This typically only happens on a filesystem shutdown. 461 * than submit it to IO. This typically only happens on a filesystem shutdown.
469 */ 462 */
470STATIC void 463STATIC int
471xfs_submit_ioend( 464xfs_submit_ioend(
472 struct writeback_control *wbc, 465 struct writeback_control *wbc,
473 xfs_ioend_t *ioend, 466 xfs_ioend_t *ioend,
474 int fail) 467 int status)
475{ 468{
476 xfs_ioend_t *head = ioend;
477 xfs_ioend_t *next;
478 struct buffer_head *bh; 469 struct buffer_head *bh;
479 struct bio *bio; 470 struct bio *bio;
480 sector_t lastblock = 0; 471 sector_t lastblock = 0;
481 472
482 /* Pass 1 - start writeback */ 473 /* Reserve log space if we might write beyond the on-disk inode size. */
483 do { 474 if (!status &&
484 next = ioend->io_list; 475 ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
485 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) 476 status = xfs_setfilesize_trans_alloc(ioend);
486 xfs_start_buffer_writeback(bh); 477 /*
487 } while ((ioend = next) != NULL); 478 * If we are failing the IO now, just mark the ioend with an
479 * error and finish it. This will run IO completion immediately
480 * as there is only one reference to the ioend at this point in
481 * time.
482 */
483 if (status) {
484 ioend->io_error = status;
485 xfs_finish_ioend(ioend);
486 return status;
487 }
488 488
489 /* Pass 2 - submit I/O */ 489 bio = NULL;
490 ioend = head; 490 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
491 do {
492 next = ioend->io_list;
493 bio = NULL;
494 491
495 /* 492 if (!bio) {
496 * If we are failing the IO now, just mark the ioend with an 493retry:
497 * error and finish it. This will run IO completion immediately 494 bio = xfs_alloc_ioend_bio(bh);
498 * as there is only one reference to the ioend at this point in 495 } else if (bh->b_blocknr != lastblock + 1) {
499 * time. 496 xfs_submit_ioend_bio(wbc, ioend, bio);
500 */ 497 goto retry;
501 if (fail) {
502 ioend->io_error = fail;
503 xfs_finish_ioend(ioend);
504 continue;
505 } 498 }
506 499
507 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 500 if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
508
509 if (!bio) {
510 retry:
511 bio = xfs_alloc_ioend_bio(bh);
512 } else if (bh->b_blocknr != lastblock + 1) {
513 xfs_submit_ioend_bio(wbc, ioend, bio);
514 goto retry;
515 }
516
517 if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
518 xfs_submit_ioend_bio(wbc, ioend, bio);
519 goto retry;
520 }
521
522 lastblock = bh->b_blocknr;
523 }
524 if (bio)
525 xfs_submit_ioend_bio(wbc, ioend, bio); 501 xfs_submit_ioend_bio(wbc, ioend, bio);
526 xfs_finish_ioend(ioend); 502 goto retry;
527 } while ((ioend = next) != NULL); 503 }
528}
529
530/*
531 * Cancel submission of all buffer_heads so far in this endio.
532 * Toss the endio too. Only ever called for the initial page
533 * in a writepage request, so only ever one page.
534 */
535STATIC void
536xfs_cancel_ioend(
537 xfs_ioend_t *ioend)
538{
539 xfs_ioend_t *next;
540 struct buffer_head *bh, *next_bh;
541
542 do {
543 next = ioend->io_list;
544 bh = ioend->io_buffer_head;
545 do {
546 next_bh = bh->b_private;
547 clear_buffer_async_write(bh);
548 /*
549 * The unwritten flag is cleared when added to the
550 * ioend. We're not submitting for I/O so mark the
551 * buffer unwritten again for next time around.
552 */
553 if (ioend->io_type == XFS_IO_UNWRITTEN)
554 set_buffer_unwritten(bh);
555 unlock_buffer(bh);
556 } while ((bh = next_bh) != NULL);
557 504
558 mempool_free(ioend, xfs_ioend_pool); 505 lastblock = bh->b_blocknr;
559 } while ((ioend = next) != NULL); 506 }
507 if (bio)
508 xfs_submit_ioend_bio(wbc, ioend, bio);
509 xfs_finish_ioend(ioend);
510 return 0;
560} 511}
561 512
562/* 513/*
563 * Test to see if we've been building up a completion structure for 514 * Test to see if we've been building up a completion structure for
564 * earlier buffers -- if so, we try to append to this ioend if we 515 * earlier buffers -- if so, we try to append to this ioend if we
565 * can, otherwise we finish off any current ioend and start another. 516 * can, otherwise we finish off any current ioend and start another.
566 * Return true if we've finished the given ioend. 517 * Return the ioend we finished off so that the caller can submit it
518 * once it has finished processing the dirty page.
567 */ 519 */
568STATIC void 520STATIC void
569xfs_add_to_ioend( 521xfs_add_to_ioend(
570 struct inode *inode, 522 struct inode *inode,
571 struct buffer_head *bh, 523 struct buffer_head *bh,
572 xfs_off_t offset, 524 xfs_off_t offset,
573 unsigned int type, 525 struct xfs_writepage_ctx *wpc,
574 xfs_ioend_t **result, 526 struct list_head *iolist)
575 int need_ioend)
576{ 527{
577 xfs_ioend_t *ioend = *result; 528 if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
578 529 bh->b_blocknr != wpc->last_block + 1 ||
579 if (!ioend || need_ioend || type != ioend->io_type) { 530 offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
580 xfs_ioend_t *previous = *result; 531 struct xfs_ioend *new;
581 532
582 ioend = xfs_alloc_ioend(inode, type); 533 if (wpc->ioend)
583 ioend->io_offset = offset; 534 list_add(&wpc->ioend->io_list, iolist);
584 ioend->io_buffer_head = bh; 535
585 ioend->io_buffer_tail = bh; 536 new = xfs_alloc_ioend(inode, wpc->io_type);
586 if (previous) 537 new->io_offset = offset;
587 previous->io_list = ioend; 538 new->io_buffer_head = bh;
588 *result = ioend; 539 new->io_buffer_tail = bh;
540 wpc->ioend = new;
589 } else { 541 } else {
590 ioend->io_buffer_tail->b_private = bh; 542 wpc->ioend->io_buffer_tail->b_private = bh;
591 ioend->io_buffer_tail = bh; 543 wpc->ioend->io_buffer_tail = bh;
592 } 544 }
593 545
594 bh->b_private = NULL; 546 bh->b_private = NULL;
595 ioend->io_size += bh->b_size; 547 wpc->ioend->io_size += bh->b_size;
548 wpc->last_block = bh->b_blocknr;
549 xfs_start_buffer_writeback(bh);
596} 550}
597 551
598STATIC void 552STATIC void
@@ -678,183 +632,6 @@ xfs_check_page_type(
678 return false; 632 return false;
679} 633}
680 634
681/*
682 * Allocate & map buffers for page given the extent map. Write it out.
683 * except for the original page of a writepage, this is called on
684 * delalloc/unwritten pages only, for the original page it is possible
685 * that the page has no mapping at all.
686 */
687STATIC int
688xfs_convert_page(
689 struct inode *inode,
690 struct page *page,
691 loff_t tindex,
692 struct xfs_bmbt_irec *imap,
693 xfs_ioend_t **ioendp,
694 struct writeback_control *wbc)
695{
696 struct buffer_head *bh, *head;
697 xfs_off_t end_offset;
698 unsigned long p_offset;
699 unsigned int type;
700 int len, page_dirty;
701 int count = 0, done = 0, uptodate = 1;
702 xfs_off_t offset = page_offset(page);
703
704 if (page->index != tindex)
705 goto fail;
706 if (!trylock_page(page))
707 goto fail;
708 if (PageWriteback(page))
709 goto fail_unlock_page;
710 if (page->mapping != inode->i_mapping)
711 goto fail_unlock_page;
712 if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
713 goto fail_unlock_page;
714
715 /*
716 * page_dirty is initially a count of buffers on the page before
717 * EOF and is decremented as we move each into a cleanable state.
718 *
719 * Derivation:
720 *
721 * End offset is the highest offset that this page should represent.
722 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
723 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
724 * hence give us the correct page_dirty count. On any other page,
725 * it will be zero and in that case we need page_dirty to be the
726 * count of buffers on the page.
727 */
728 end_offset = min_t(unsigned long long,
729 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
730 i_size_read(inode));
731
732 /*
733 * If the current map does not span the entire page we are about to try
734 * to write, then give up. The only way we can write a page that spans
735 * multiple mappings in a single writeback iteration is via the
736 * xfs_vm_writepage() function. Data integrity writeback requires the
737 * entire page to be written in a single attempt, otherwise the part of
738 * the page we don't write here doesn't get written as part of the data
739 * integrity sync.
740 *
741 * For normal writeback, we also don't attempt to write partial pages
742 * here as it simply means that write_cache_pages() will see it under
743 * writeback and ignore the page until some point in the future, at
744 * which time this will be the only page in the file that needs
745 * writeback. Hence for more optimal IO patterns, we should always
746 * avoid partial page writeback due to multiple mappings on a page here.
747 */
748 if (!xfs_imap_valid(inode, imap, end_offset))
749 goto fail_unlock_page;
750
751 len = 1 << inode->i_blkbits;
752 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
753 PAGE_CACHE_SIZE);
754 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
755 page_dirty = p_offset / len;
756
757 /*
758 * The moment we find a buffer that doesn't match our current type
759 * specification or can't be written, abort the loop and start
760 * writeback. As per the above xfs_imap_valid() check, only
761 * xfs_vm_writepage() can handle partial page writeback fully - we are
762 * limited here to the buffers that are contiguous with the current
763 * ioend, and hence a buffer we can't write breaks that contiguity and
764 * we have to defer the rest of the IO to xfs_vm_writepage().
765 */
766 bh = head = page_buffers(page);
767 do {
768 if (offset >= end_offset)
769 break;
770 if (!buffer_uptodate(bh))
771 uptodate = 0;
772 if (!(PageUptodate(page) || buffer_uptodate(bh))) {
773 done = 1;
774 break;
775 }
776
777 if (buffer_unwritten(bh) || buffer_delay(bh) ||
778 buffer_mapped(bh)) {
779 if (buffer_unwritten(bh))
780 type = XFS_IO_UNWRITTEN;
781 else if (buffer_delay(bh))
782 type = XFS_IO_DELALLOC;
783 else
784 type = XFS_IO_OVERWRITE;
785
786 /*
787 * imap should always be valid because of the above
788 * partial page end_offset check on the imap.
789 */
790 ASSERT(xfs_imap_valid(inode, imap, offset));
791
792 lock_buffer(bh);
793 if (type != XFS_IO_OVERWRITE)
794 xfs_map_at_offset(inode, bh, imap, offset);
795 xfs_add_to_ioend(inode, bh, offset, type,
796 ioendp, done);
797
798 page_dirty--;
799 count++;
800 } else {
801 done = 1;
802 break;
803 }
804 } while (offset += len, (bh = bh->b_this_page) != head);
805
806 if (uptodate && bh == head)
807 SetPageUptodate(page);
808
809 if (count) {
810 if (--wbc->nr_to_write <= 0 &&
811 wbc->sync_mode == WB_SYNC_NONE)
812 done = 1;
813 }
814 xfs_start_page_writeback(page, !page_dirty, count);
815
816 return done;
817 fail_unlock_page:
818 unlock_page(page);
819 fail:
820 return 1;
821}
822
823/*
824 * Convert & write out a cluster of pages in the same extent as defined
825 * by mp and following the start page.
826 */
827STATIC void
828xfs_cluster_write(
829 struct inode *inode,
830 pgoff_t tindex,
831 struct xfs_bmbt_irec *imap,
832 xfs_ioend_t **ioendp,
833 struct writeback_control *wbc,
834 pgoff_t tlast)
835{
836 struct pagevec pvec;
837 int done = 0, i;
838
839 pagevec_init(&pvec, 0);
840 while (!done && tindex <= tlast) {
841 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
842
843 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
844 break;
845
846 for (i = 0; i < pagevec_count(&pvec); i++) {
847 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
848 imap, ioendp, wbc);
849 if (done)
850 break;
851 }
852
853 pagevec_release(&pvec);
854 cond_resched();
855 }
856}
857
858STATIC void 635STATIC void
859xfs_vm_invalidatepage( 636xfs_vm_invalidatepage(
860 struct page *page, 637 struct page *page,
@@ -932,6 +709,164 @@ out_invalidate:
932} 709}
933 710
934/* 711/*
712 * We implement an immediate ioend submission policy here to avoid needing to
713 * chain multiple ioends and hence nest mempool allocations which can violate
714 * forward progress guarantees we need to provide. The current ioend we are
715 * adding buffers to is cached on the writepage context, and if the new buffer
716 * does not append to the cached ioend it will create a new ioend and cache that
717 * instead.
718 *
719 * If a new ioend is created and cached, the old ioend is returned and queued
720 * locally for submission once the entire page is processed or an error has been
721 * detected. While ioends are submitted immediately after they are completed,
722 * batching optimisations are provided by higher level block plugging.
723 *
724 * At the end of a writeback pass, there will be a cached ioend remaining on the
725 * writepage context that the caller will need to submit.
726 */
727static int
728xfs_writepage_map(
729 struct xfs_writepage_ctx *wpc,
730 struct writeback_control *wbc,
731 struct inode *inode,
732 struct page *page,
733 loff_t offset,
734 __uint64_t end_offset)
735{
736 LIST_HEAD(submit_list);
737 struct xfs_ioend *ioend, *next;
738 struct buffer_head *bh, *head;
739 ssize_t len = 1 << inode->i_blkbits;
740 int error = 0;
741 int count = 0;
742 int uptodate = 1;
743
744 bh = head = page_buffers(page);
745 offset = page_offset(page);
746 do {
747 if (offset >= end_offset)
748 break;
749 if (!buffer_uptodate(bh))
750 uptodate = 0;
751
752 /*
753 * set_page_dirty dirties all buffers in a page, independent
754 * of their state. The dirty state however is entirely
755 * meaningless for holes (!mapped && uptodate), so skip
756 * buffers covering holes here.
757 */
758 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
759 wpc->imap_valid = false;
760 continue;
761 }
762
763 if (buffer_unwritten(bh)) {
764 if (wpc->io_type != XFS_IO_UNWRITTEN) {
765 wpc->io_type = XFS_IO_UNWRITTEN;
766 wpc->imap_valid = false;
767 }
768 } else if (buffer_delay(bh)) {
769 if (wpc->io_type != XFS_IO_DELALLOC) {
770 wpc->io_type = XFS_IO_DELALLOC;
771 wpc->imap_valid = false;
772 }
773 } else if (buffer_uptodate(bh)) {
774 if (wpc->io_type != XFS_IO_OVERWRITE) {
775 wpc->io_type = XFS_IO_OVERWRITE;
776 wpc->imap_valid = false;
777 }
778 } else {
779 if (PageUptodate(page))
780 ASSERT(buffer_mapped(bh));
781 /*
782 * This buffer is not uptodate and will not be
783 * written to disk. Ensure that we will put any
784 * subsequent writeable buffers into a new
785 * ioend.
786 */
787 wpc->imap_valid = false;
788 continue;
789 }
790
791 if (wpc->imap_valid)
792 wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
793 offset);
794 if (!wpc->imap_valid) {
795 error = xfs_map_blocks(inode, offset, &wpc->imap,
796 wpc->io_type);
797 if (error)
798 goto out;
799 wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
800 offset);
801 }
802 if (wpc->imap_valid) {
803 lock_buffer(bh);
804 if (wpc->io_type != XFS_IO_OVERWRITE)
805 xfs_map_at_offset(inode, bh, &wpc->imap, offset);
806 xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
807 count++;
808 }
809
810 } while (offset += len, ((bh = bh->b_this_page) != head));
811
812 if (uptodate && bh == head)
813 SetPageUptodate(page);
814
815 ASSERT(wpc->ioend || list_empty(&submit_list));
816
817out:
818 /*
819 * On error, we have to fail the ioend here because we have locked
820 * buffers in the ioend. If we don't do this, we'll deadlock
821 * invalidating the page as that tries to lock the buffers on the page.
822 * Also, because we may have set pages under writeback, we have to make
823 * sure we run IO completion to mark the error state of the IO
824 * appropriately, so we can't cancel the ioend directly here. That means
825 * we have to mark this page as under writeback if we included any
826 * buffers from it in the ioend chain so that completion treats it
827 * correctly.
828 *
829 * If we didn't include the page in the ioend, then on error we can
830 * simply discard and unlock it as there are no other users of the page
831 * or its buffers right now. The caller will still need to trigger
832 * submission of outstanding ioends on the writepage context so they are
833 * treated correctly on error.
834 */
835 if (count) {
836 xfs_start_page_writeback(page, !error);
837
838 /*
839 * Preserve the original error if there was one, otherwise catch
840 * submission errors here and propagate into subsequent ioend
841 * submissions.
842 */
843 list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
844 int error2;
845
846 list_del_init(&ioend->io_list);
847 error2 = xfs_submit_ioend(wbc, ioend, error);
848 if (error2 && !error)
849 error = error2;
850 }
851 } else if (error) {
852 xfs_aops_discard_page(page);
853 ClearPageUptodate(page);
854 unlock_page(page);
855 } else {
856 /*
857 * We can end up here with no error and nothing to write if we
858 * race with a partial page truncate on a sub-page block sized
859 * filesystem. In that case we need to mark the page clean.
860 */
861 xfs_start_page_writeback(page, 1);
862 end_page_writeback(page);
863 }
864
865 mapping_set_error(page->mapping, error);
866 return error;
867}
868
869/*
935 * Write out a dirty page. 870 * Write out a dirty page.
936 * 871 *
937 * For delalloc space on the page we need to allocate space and flush it. 872 * For delalloc space on the page we need to allocate space and flush it.
@@ -940,22 +875,16 @@ out_invalidate:
940 * For any other dirty buffer heads on the page we should flush them. 875 * For any other dirty buffer heads on the page we should flush them.
941 */ 876 */
942STATIC int 877STATIC int
943xfs_vm_writepage( 878xfs_do_writepage(
944 struct page *page, 879 struct page *page,
945 struct writeback_control *wbc) 880 struct writeback_control *wbc,
881 void *data)
946{ 882{
883 struct xfs_writepage_ctx *wpc = data;
947 struct inode *inode = page->mapping->host; 884 struct inode *inode = page->mapping->host;
948 struct buffer_head *bh, *head;
949 struct xfs_bmbt_irec imap;
950 xfs_ioend_t *ioend = NULL, *iohead = NULL;
951 loff_t offset; 885 loff_t offset;
952 unsigned int type;
953 __uint64_t end_offset; 886 __uint64_t end_offset;
954 pgoff_t end_index, last_index; 887 pgoff_t end_index;
955 ssize_t len;
956 int err, imap_valid = 0, uptodate = 1;
957 int count = 0;
958 int nonblocking = 0;
959 888
960 trace_xfs_writepage(inode, page, 0, 0); 889 trace_xfs_writepage(inode, page, 0, 0);
961 890
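The submit_list handling added above boils down to a small pattern: queue completed ioends on a local list while walking the page, then submit each one afterwards, preserving the first error seen while still pushing the remaining entries through completion. A minimal userspace C sketch of that error-propagation loop follows; the names (unit, submit_unit) are illustrative only and not part of XFS.

#include <stdio.h>

struct unit {
	struct unit *next;
	int id;
};

/* Pretend submission: fail unit 2 to show how the first error is kept. */
static int submit_unit(struct unit *u, int prior_error)
{
	if (prior_error) {
		/* Still "complete" the unit so its resources are released. */
		printf("unit %d: failed with prior error %d\n", u->id, prior_error);
		return prior_error;
	}
	if (u->id == 2) {
		printf("unit %d: submission failed\n", u->id);
		return -5;	/* stand-in for -EIO */
	}
	printf("unit %d: submitted\n", u->id);
	return 0;
}

int main(void)
{
	struct unit c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
	struct unit *head = &a;
	int error = 0;

	/* Submit everything; keep the first error but do not stop early. */
	for (struct unit *u = head; u; u = u->next) {
		int error2 = submit_unit(u, error);
		if (error2 && !error)
			error = error2;
	}
	printf("first error: %d\n", error);
	return 0;
}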
@@ -982,12 +911,9 @@ xfs_vm_writepage(
982 if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) 911 if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
983 goto redirty; 912 goto redirty;
984 913
985 /* Is this page beyond the end of the file? */
986 offset = i_size_read(inode);
987 end_index = offset >> PAGE_CACHE_SHIFT;
988 last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
989
990 /* 914 /*
915 * Is this page beyond the end of the file?
916 *
991 * The page index is less than the end_index, adjust the end_offset 917 * The page index is less than the end_index, adjust the end_offset
992 * to the highest offset that this page should represent. 918 * to the highest offset that this page should represent.
993 * ----------------------------------------------------- 919 * -----------------------------------------------------
@@ -998,6 +924,8 @@ xfs_vm_writepage(
998 * | desired writeback range | see else | 924 * | desired writeback range | see else |
999 * ---------------------------------^------------------| 925 * ---------------------------------^------------------|
1000 */ 926 */
927 offset = i_size_read(inode);
928 end_index = offset >> PAGE_CACHE_SHIFT;
1001 if (page->index < end_index) 929 if (page->index < end_index)
1002 end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; 930 end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
1003 else { 931 else {
@@ -1049,152 +977,7 @@ xfs_vm_writepage(
1049 end_offset = offset; 977 end_offset = offset;
1050 } 978 }
1051 979
1052 len = 1 << inode->i_blkbits; 980 return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
1053
1054 bh = head = page_buffers(page);
1055 offset = page_offset(page);
1056 type = XFS_IO_OVERWRITE;
1057
1058 if (wbc->sync_mode == WB_SYNC_NONE)
1059 nonblocking = 1;
1060
1061 do {
1062 int new_ioend = 0;
1063
1064 if (offset >= end_offset)
1065 break;
1066 if (!buffer_uptodate(bh))
1067 uptodate = 0;
1068
1069 /*
1070 * set_page_dirty dirties all buffers in a page, independent
1071 * of their state. The dirty state however is entirely
1072 * meaningless for holes (!mapped && uptodate), so skip
1073 * buffers covering holes here.
1074 */
1075 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1076 imap_valid = 0;
1077 continue;
1078 }
1079
1080 if (buffer_unwritten(bh)) {
1081 if (type != XFS_IO_UNWRITTEN) {
1082 type = XFS_IO_UNWRITTEN;
1083 imap_valid = 0;
1084 }
1085 } else if (buffer_delay(bh)) {
1086 if (type != XFS_IO_DELALLOC) {
1087 type = XFS_IO_DELALLOC;
1088 imap_valid = 0;
1089 }
1090 } else if (buffer_uptodate(bh)) {
1091 if (type != XFS_IO_OVERWRITE) {
1092 type = XFS_IO_OVERWRITE;
1093 imap_valid = 0;
1094 }
1095 } else {
1096 if (PageUptodate(page))
1097 ASSERT(buffer_mapped(bh));
1098 /*
1099 * This buffer is not uptodate and will not be
1100 * written to disk. Ensure that we will put any
1101 * subsequent writeable buffers into a new
1102 * ioend.
1103 */
1104 imap_valid = 0;
1105 continue;
1106 }
1107
1108 if (imap_valid)
1109 imap_valid = xfs_imap_valid(inode, &imap, offset);
1110 if (!imap_valid) {
1111 /*
1112 * If we didn't have a valid mapping then we need to
1113 * put the new mapping into a separate ioend structure.
1114 * This ensures non-contiguous extents always have
1115 * separate ioends, which is particularly important
1116 * for unwritten extent conversion at I/O completion
1117 * time.
1118 */
1119 new_ioend = 1;
1120 err = xfs_map_blocks(inode, offset, &imap, type,
1121 nonblocking);
1122 if (err)
1123 goto error;
1124 imap_valid = xfs_imap_valid(inode, &imap, offset);
1125 }
1126 if (imap_valid) {
1127 lock_buffer(bh);
1128 if (type != XFS_IO_OVERWRITE)
1129 xfs_map_at_offset(inode, bh, &imap, offset);
1130 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1131 new_ioend);
1132 count++;
1133 }
1134
1135 if (!iohead)
1136 iohead = ioend;
1137
1138 } while (offset += len, ((bh = bh->b_this_page) != head));
1139
1140 if (uptodate && bh == head)
1141 SetPageUptodate(page);
1142
1143 xfs_start_page_writeback(page, 1, count);
1144
1145 /* if there is no IO to be submitted for this page, we are done */
1146 if (!ioend)
1147 return 0;
1148
1149 ASSERT(iohead);
1150
1151 /*
1152 * Any errors from this point onwards need tobe reported through the IO
1153 * completion path as we have marked the initial page as under writeback
1154 * and unlocked it.
1155 */
1156 if (imap_valid) {
1157 xfs_off_t end_index;
1158
1159 end_index = imap.br_startoff + imap.br_blockcount;
1160
1161 /* to bytes */
1162 end_index <<= inode->i_blkbits;
1163
1164 /* to pages */
1165 end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1166
1167 /* check against file size */
1168 if (end_index > last_index)
1169 end_index = last_index;
1170
1171 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1172 wbc, end_index);
1173 }
1174
1175
1176 /*
1177 * Reserve log space if we might write beyond the on-disk inode size.
1178 */
1179 err = 0;
1180 if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
1181 err = xfs_setfilesize_trans_alloc(ioend);
1182
1183 xfs_submit_ioend(wbc, iohead, err);
1184
1185 return 0;
1186
1187error:
1188 if (iohead)
1189 xfs_cancel_ioend(iohead);
1190
1191 if (err == -EAGAIN)
1192 goto redirty;
1193
1194 xfs_aops_discard_page(page);
1195 ClearPageUptodate(page);
1196 unlock_page(page);
1197 return err;
1198 981
1199redirty: 982redirty:
1200 redirty_page_for_writepage(wbc, page); 983 redirty_page_for_writepage(wbc, page);
@@ -1203,16 +986,40 @@ redirty:
1203} 986}
1204 987
1205STATIC int 988STATIC int
989xfs_vm_writepage(
990 struct page *page,
991 struct writeback_control *wbc)
992{
993 struct xfs_writepage_ctx wpc = {
994 .io_type = XFS_IO_INVALID,
995 };
996 int ret;
997
998 ret = xfs_do_writepage(page, wbc, &wpc);
999 if (wpc.ioend)
1000 ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1001 return ret;
1002}
1003
1004STATIC int
1206xfs_vm_writepages( 1005xfs_vm_writepages(
1207 struct address_space *mapping, 1006 struct address_space *mapping,
1208 struct writeback_control *wbc) 1007 struct writeback_control *wbc)
1209{ 1008{
1009 struct xfs_writepage_ctx wpc = {
1010 .io_type = XFS_IO_INVALID,
1011 };
1012 int ret;
1013
1210 xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); 1014 xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1211 if (dax_mapping(mapping)) 1015 if (dax_mapping(mapping))
1212 return dax_writeback_mapping_range(mapping, 1016 return dax_writeback_mapping_range(mapping,
1213 xfs_find_bdev_for_inode(mapping->host), wbc); 1017 xfs_find_bdev_for_inode(mapping->host), wbc);
1214 1018
1215 return generic_writepages(mapping, wbc); 1019 ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
1020 if (wpc.ioend)
1021 ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1022 return ret;
1216} 1023}
1217 1024
1218/* 1025/*
@@ -1242,27 +1049,8 @@ xfs_vm_releasepage(
1242} 1049}
1243 1050
1244/* 1051/*
1245 * When we map a DIO buffer, we may need to attach an ioend that describes the 1052 * When we map a DIO buffer, we may need to pass flags to
1246 * type of write IO we are doing. This passes to the completion function the 1053 * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
1247 * operations it needs to perform. If the mapping is for an overwrite wholly
1248 * within the EOF then we don't need an ioend and so we don't allocate one.
1249 * This avoids the unnecessary overhead of allocating and freeing ioends for
1250 * workloads that don't require transactions on IO completion.
1251 *
1252 * If we get multiple mappings in a single IO, we might be mapping different
1253 * types. But because the direct IO can only have a single private pointer, we
1254 * need to ensure that:
1255 *
1256 * a) i) the ioend spans the entire region of unwritten mappings; or
1257 * ii) the ioend spans all the mappings that cross or are beyond EOF; and
1258 * b) if it contains unwritten extents, it is *permanently* marked as such
1259 *
1260 * We could do this by chaining ioends like buffered IO does, but we only
1261 * actually get one IO completion callback from the direct IO, and that spans
1262 * the entire IO regardless of how many mappings and IOs are needed to complete
1263 * the DIO. There is only going to be one reference to the ioend and its life
1264 * cycle is constrained by the DIO completion code. hence we don't need
1265 * reference counting here.
1266 * 1054 *
1267 * Note that for DIO, an IO to the highest supported file block offset (i.e. 1055 * Note that for DIO, an IO to the highest supported file block offset (i.e.
1268 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 1056 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
@@ -1270,68 +1058,26 @@ xfs_vm_releasepage(
1270 * extending the file size. We won't know for sure until IO completion is run 1058 * extending the file size. We won't know for sure until IO completion is run
1271 * and the actual max write offset is communicated to the IO completion 1059 * and the actual max write offset is communicated to the IO completion
1272 * routine. 1060 * routine.
1273 *
1274 * For DAX page faults, we are preparing to never see unwritten extents here,
1275 * nor should we ever extend the inode size. Hence we will soon have nothing to
1276 * do here for this case, ensuring we don't have to provide an IO completion
1277 * callback to free an ioend that we don't actually need for a fault into the
1278 * page at offset (2^63 - 1FSB) bytes.
1279 */ 1061 */
1280
1281static void 1062static void
1282xfs_map_direct( 1063xfs_map_direct(
1283 struct inode *inode, 1064 struct inode *inode,
1284 struct buffer_head *bh_result, 1065 struct buffer_head *bh_result,
1285 struct xfs_bmbt_irec *imap, 1066 struct xfs_bmbt_irec *imap,
1286 xfs_off_t offset, 1067 xfs_off_t offset)
1287 bool dax_fault)
1288{ 1068{
1289 struct xfs_ioend *ioend; 1069 uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
1290 xfs_off_t size = bh_result->b_size; 1070 xfs_off_t size = bh_result->b_size;
1291 int type;
1292
1293 if (ISUNWRITTEN(imap))
1294 type = XFS_IO_UNWRITTEN;
1295 else
1296 type = XFS_IO_OVERWRITE;
1297 1071
1298 trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); 1072 trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
1299 1073 ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
1300 if (dax_fault) {
1301 ASSERT(type == XFS_IO_OVERWRITE);
1302 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1303 imap);
1304 return;
1305 }
1306 1074
1307 if (bh_result->b_private) { 1075 if (ISUNWRITTEN(imap)) {
1308 ioend = bh_result->b_private; 1076 *flags |= XFS_DIO_FLAG_UNWRITTEN;
1309 ASSERT(ioend->io_size > 0); 1077 set_buffer_defer_completion(bh_result);
1310 ASSERT(offset >= ioend->io_offset); 1078 } else if (offset + size > i_size_read(inode) || offset + size < 0) {
1311 if (offset + size > ioend->io_offset + ioend->io_size) 1079 *flags |= XFS_DIO_FLAG_APPEND;
1312 ioend->io_size = offset - ioend->io_offset + size;
1313
1314 if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
1315 ioend->io_type = XFS_IO_UNWRITTEN;
1316
1317 trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
1318 ioend->io_size, ioend->io_type,
1319 imap);
1320 } else if (type == XFS_IO_UNWRITTEN ||
1321 offset + size > i_size_read(inode) ||
1322 offset + size < 0) {
1323 ioend = xfs_alloc_ioend(inode, type);
1324 ioend->io_offset = offset;
1325 ioend->io_size = size;
1326
1327 bh_result->b_private = ioend;
1328 set_buffer_defer_completion(bh_result); 1080 set_buffer_defer_completion(bh_result);
1329
1330 trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
1331 imap);
1332 } else {
1333 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1334 imap);
1335 } 1081 }
1336} 1082}
1337 1083
@@ -1502,9 +1248,12 @@ __xfs_get_blocks(
1502 if (ISUNWRITTEN(&imap)) 1248 if (ISUNWRITTEN(&imap))
1503 set_buffer_unwritten(bh_result); 1249 set_buffer_unwritten(bh_result);
1504 /* direct IO needs special help */ 1250 /* direct IO needs special help */
1505 if (create && direct) 1251 if (create && direct) {
1506 xfs_map_direct(inode, bh_result, &imap, offset, 1252 if (dax_fault)
1507 dax_fault); 1253 ASSERT(!ISUNWRITTEN(&imap));
1254 else
1255 xfs_map_direct(inode, bh_result, &imap, offset);
1256 }
1508 } 1257 }
1509 1258
1510 /* 1259 /*
@@ -1574,42 +1323,50 @@ xfs_get_blocks_dax_fault(
1574 return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); 1323 return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
1575} 1324}
1576 1325
1577static void 1326/*
1578__xfs_end_io_direct_write( 1327 * Complete a direct I/O write request.
1579 struct inode *inode, 1328 *
1580 struct xfs_ioend *ioend, 1329 * xfs_map_direct passes us some flags in the private data to tell us what to
1330 * do. If no flags are set, then the write IO is an overwrite wholly within
1331 * the existing allocated file size and so there is nothing for us to do.
1332 *
1333 * Note that in this case the completion can be called in interrupt context,
1334 * whereas if we have flags set we will always be called in task context
1335 * (i.e. from a workqueue).
1336 */
1337STATIC int
1338xfs_end_io_direct_write(
1339 struct kiocb *iocb,
1581 loff_t offset, 1340 loff_t offset,
1582 ssize_t size) 1341 ssize_t size,
1342 void *private)
1583{ 1343{
1584 struct xfs_mount *mp = XFS_I(inode)->i_mount; 1344 struct inode *inode = file_inode(iocb->ki_filp);
1345 struct xfs_inode *ip = XFS_I(inode);
1346 struct xfs_mount *mp = ip->i_mount;
1347 uintptr_t flags = (uintptr_t)private;
1348 int error = 0;
1585 1349
1586 if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) 1350 trace_xfs_end_io_direct_write(ip, offset, size);
1587 goto out_end_io;
1588 1351
1589 /* 1352 if (XFS_FORCED_SHUTDOWN(mp))
1590 * dio completion end_io functions are only called on writes if more 1353 return -EIO;
1591 * than 0 bytes was written.
1592 */
1593 ASSERT(size > 0);
1594 1354
1595 /* 1355 if (size <= 0)
1596 * The ioend only maps whole blocks, while the IO may be sector aligned. 1356 return size;
1597 * Hence the ioend offset/size may not match the IO offset/size exactly.
1598 * Because we don't map overwrites within EOF into the ioend, the offset
1599 * may not match, but only if the endio spans EOF. Either way, write
1600 * the IO sizes into the ioend so that completion processing does the
1601 * right thing.
1602 */
1603 ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
1604 ioend->io_size = size;
1605 ioend->io_offset = offset;
1606 1357
1607 /* 1358 /*
1608 * The ioend tells us whether we are doing unwritten extent conversion 1359 * The flags tell us whether we are doing unwritten extent conversions
1609 * or an append transaction that updates the on-disk file size. These 1360 * or an append transaction that updates the on-disk file size. These
1610 * cases are the only cases where we should *potentially* be needing 1361 * cases are the only cases where we should *potentially* be needing
1611 * to update the VFS inode size. 1362 * to update the VFS inode size.
1612 * 1363 */
1364 if (flags == 0) {
1365 ASSERT(offset + size <= i_size_read(inode));
1366 return 0;
1367 }
1368
1369 /*
1613 * We need to update the in-core inode size here so that we don't end up 1370 * We need to update the in-core inode size here so that we don't end up
1614 * with the on-disk inode size being outside the in-core inode size. We 1371 * with the on-disk inode size being outside the in-core inode size. We
1615 * have no other method of updating EOF for AIO, so always do it here 1372 * have no other method of updating EOF for AIO, so always do it here
@@ -1620,91 +1377,56 @@ __xfs_end_io_direct_write(
1620 * here can result in EOF moving backwards and Bad Things Happen when 1377 * here can result in EOF moving backwards and Bad Things Happen when
1621 * that occurs. 1378 * that occurs.
1622 */ 1379 */
1623 spin_lock(&XFS_I(inode)->i_flags_lock); 1380 spin_lock(&ip->i_flags_lock);
1624 if (offset + size > i_size_read(inode)) 1381 if (offset + size > i_size_read(inode))
1625 i_size_write(inode, offset + size); 1382 i_size_write(inode, offset + size);
1626 spin_unlock(&XFS_I(inode)->i_flags_lock); 1383 spin_unlock(&ip->i_flags_lock);
1627 1384
1628 /* 1385 if (flags & XFS_DIO_FLAG_UNWRITTEN) {
1629 * If we are doing an append IO that needs to update the EOF on disk, 1386 trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
1630 * do the transaction reserve now so we can use common end io
1631 * processing. Stashing the error (if there is one) in the ioend will
1632 * result in the ioend processing passing on the error if it is
1633 * possible as we can't return it from here.
1634 */
1635 if (ioend->io_type == XFS_IO_OVERWRITE)
1636 ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
1637 1387
1638out_end_io: 1388 error = xfs_iomap_write_unwritten(ip, offset, size);
1639 xfs_end_io(&ioend->io_work); 1389 } else if (flags & XFS_DIO_FLAG_APPEND) {
1640 return; 1390 struct xfs_trans *tp;
1641}
1642 1391
1643/* 1392 trace_xfs_end_io_direct_write_append(ip, offset, size);
1644 * Complete a direct I/O write request.
1645 *
1646 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
1647 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
1648 * wholly within the EOF and so there is nothing for us to do. Note that in this
1649 * case the completion can be called in interrupt context, whereas if we have an
1650 * ioend we will always be called in task context (i.e. from a workqueue).
1651 */
1652STATIC void
1653xfs_end_io_direct_write(
1654 struct kiocb *iocb,
1655 loff_t offset,
1656 ssize_t size,
1657 void *private)
1658{
1659 struct inode *inode = file_inode(iocb->ki_filp);
1660 struct xfs_ioend *ioend = private;
1661
1662 trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
1663 ioend ? ioend->io_type : 0, NULL);
1664 1393
1665 if (!ioend) { 1394 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
1666 ASSERT(offset + size <= i_size_read(inode)); 1395 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
1667 return; 1396 if (error) {
1397 xfs_trans_cancel(tp);
1398 return error;
1399 }
1400 error = xfs_setfilesize(ip, tp, offset, size);
1668 } 1401 }
1669 1402
1670 __xfs_end_io_direct_write(inode, ioend, offset, size); 1403 return error;
1671} 1404}
1672 1405
1673static inline ssize_t 1406STATIC ssize_t
1674xfs_vm_do_dio( 1407xfs_vm_direct_IO(
1675 struct inode *inode,
1676 struct kiocb *iocb, 1408 struct kiocb *iocb,
1677 struct iov_iter *iter, 1409 struct iov_iter *iter,
1678 loff_t offset, 1410 loff_t offset)
1679 void (*endio)(struct kiocb *iocb,
1680 loff_t offset,
1681 ssize_t size,
1682 void *private),
1683 int flags)
1684{ 1411{
1412 struct inode *inode = iocb->ki_filp->f_mapping->host;
1413 dio_iodone_t *endio = NULL;
1414 int flags = 0;
1685 struct block_device *bdev; 1415 struct block_device *bdev;
1686 1416
1687 if (IS_DAX(inode)) 1417 if (iov_iter_rw(iter) == WRITE) {
1418 endio = xfs_end_io_direct_write;
1419 flags = DIO_ASYNC_EXTEND;
1420 }
1421
1422 if (IS_DAX(inode)) {
1688 return dax_do_io(iocb, inode, iter, offset, 1423 return dax_do_io(iocb, inode, iter, offset,
1689 xfs_get_blocks_direct, endio, 0); 1424 xfs_get_blocks_direct, endio, 0);
1425 }
1690 1426
1691 bdev = xfs_find_bdev_for_inode(inode); 1427 bdev = xfs_find_bdev_for_inode(inode);
1692 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, 1428 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
1693 xfs_get_blocks_direct, endio, NULL, flags); 1429 xfs_get_blocks_direct, endio, NULL, flags);
1694}
1695
1696STATIC ssize_t
1697xfs_vm_direct_IO(
1698 struct kiocb *iocb,
1699 struct iov_iter *iter,
1700 loff_t offset)
1701{
1702 struct inode *inode = iocb->ki_filp->f_mapping->host;
1703
1704 if (iov_iter_rw(iter) == WRITE)
1705 return xfs_vm_do_dio(inode, iocb, iter, offset,
1706 xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
1707 return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
1708} 1430}
1709 1431
1710/* 1432/*
@@ -1756,6 +1478,7 @@ xfs_vm_write_failed(
1756 loff_t from = pos & (PAGE_CACHE_SIZE - 1); 1478 loff_t from = pos & (PAGE_CACHE_SIZE - 1);
1757 loff_t to = from + len; 1479 loff_t to = from + len;
1758 struct buffer_head *bh, *head; 1480 struct buffer_head *bh, *head;
1481 struct xfs_mount *mp = XFS_I(inode)->i_mount;
1759 1482
1760 /* 1483 /*
1761 * The request pos offset might be 32 or 64 bit, this is all fine 1484 * The request pos offset might be 32 or 64 bit, this is all fine
@@ -1787,14 +1510,23 @@ xfs_vm_write_failed(
1787 if (block_start >= to) 1510 if (block_start >= to)
1788 break; 1511 break;
1789 1512
1790 if (!buffer_delay(bh)) 1513 /*
1514 * Process delalloc and unwritten buffers beyond EOF. We can
1515 * encounter unwritten buffers in the event that a file has
1516 * post-EOF unwritten extents and an extending write happens to
1517 * fail (e.g., an unaligned write that also involves a delalloc
1518 * to the same page).
1519 */
1520 if (!buffer_delay(bh) && !buffer_unwritten(bh))
1791 continue; 1521 continue;
1792 1522
1793 if (!buffer_new(bh) && block_offset < i_size_read(inode)) 1523 if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
1524 block_offset < i_size_read(inode))
1794 continue; 1525 continue;
1795 1526
1796 xfs_vm_kill_delalloc_range(inode, block_offset, 1527 if (buffer_delay(bh))
1797 block_offset + bh->b_size); 1528 xfs_vm_kill_delalloc_range(inode, block_offset,
1529 block_offset + bh->b_size);
1798 1530
1799 /* 1531 /*
1800 * This buffer does not contain data anymore. make sure anyone 1532 * This buffer does not contain data anymore. make sure anyone
@@ -1805,6 +1537,7 @@ xfs_vm_write_failed(
1805 clear_buffer_mapped(bh); 1537 clear_buffer_mapped(bh);
1806 clear_buffer_new(bh); 1538 clear_buffer_new(bh);
1807 clear_buffer_dirty(bh); 1539 clear_buffer_dirty(bh);
1540 clear_buffer_unwritten(bh);
1808 } 1541 }
1809 1542
1810} 1543}
@@ -1828,6 +1561,7 @@ xfs_vm_write_begin(
1828 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1561 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1829 struct page *page; 1562 struct page *page;
1830 int status; 1563 int status;
1564 struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
1831 1565
1832 ASSERT(len <= PAGE_CACHE_SIZE); 1566 ASSERT(len <= PAGE_CACHE_SIZE);
1833 1567
@@ -1836,6 +1570,8 @@ xfs_vm_write_begin(
1836 return -ENOMEM; 1570 return -ENOMEM;
1837 1571
1838 status = __block_write_begin(page, pos, len, xfs_get_blocks); 1572 status = __block_write_begin(page, pos, len, xfs_get_blocks);
1573 if (xfs_mp_fail_writes(mp))
1574 status = -EIO;
1839 if (unlikely(status)) { 1575 if (unlikely(status)) {
1840 struct inode *inode = mapping->host; 1576 struct inode *inode = mapping->host;
1841 size_t isize = i_size_read(inode); 1577 size_t isize = i_size_read(inode);
@@ -1848,6 +1584,8 @@ xfs_vm_write_begin(
1848 * allocated in this write, not blocks that were previously 1584 * allocated in this write, not blocks that were previously
1849 * written successfully. 1585 * written successfully.
1850 */ 1586 */
1587 if (xfs_mp_fail_writes(mp))
1588 isize = 0;
1851 if (pos + len > isize) { 1589 if (pos + len > isize) {
1852 ssize_t start = max_t(ssize_t, pos, isize); 1590 ssize_t start = max_t(ssize_t, pos, isize);
1853 1591
@@ -1957,7 +1695,6 @@ xfs_vm_set_page_dirty(
1957 loff_t end_offset; 1695 loff_t end_offset;
1958 loff_t offset; 1696 loff_t offset;
1959 int newly_dirty; 1697 int newly_dirty;
1960 struct mem_cgroup *memcg;
1961 1698
1962 if (unlikely(!mapping)) 1699 if (unlikely(!mapping))
1963 return !TestSetPageDirty(page); 1700 return !TestSetPageDirty(page);
@@ -1978,10 +1715,10 @@ xfs_vm_set_page_dirty(
1978 } while (bh != head); 1715 } while (bh != head);
1979 } 1716 }
1980 /* 1717 /*
1981 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with 1718 * Lock out page->mem_cgroup migration to keep PageDirty
1982 * per-memcg dirty page counters. 1719 * synchronized with per-memcg dirty page counters.
1983 */ 1720 */
1984 memcg = mem_cgroup_begin_page_stat(page); 1721 lock_page_memcg(page);
1985 newly_dirty = !TestSetPageDirty(page); 1722 newly_dirty = !TestSetPageDirty(page);
1986 spin_unlock(&mapping->private_lock); 1723 spin_unlock(&mapping->private_lock);
1987 1724
@@ -1992,13 +1729,13 @@ xfs_vm_set_page_dirty(
1992 spin_lock_irqsave(&mapping->tree_lock, flags); 1729 spin_lock_irqsave(&mapping->tree_lock, flags);
1993 if (page->mapping) { /* Race with truncate? */ 1730 if (page->mapping) { /* Race with truncate? */
1994 WARN_ON_ONCE(!PageUptodate(page)); 1731 WARN_ON_ONCE(!PageUptodate(page));
1995 account_page_dirtied(page, mapping, memcg); 1732 account_page_dirtied(page, mapping);
1996 radix_tree_tag_set(&mapping->page_tree, 1733 radix_tree_tag_set(&mapping->page_tree,
1997 page_index(page), PAGECACHE_TAG_DIRTY); 1734 page_index(page), PAGECACHE_TAG_DIRTY);
1998 } 1735 }
1999 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1736 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2000 } 1737 }
2001 mem_cgroup_end_page_stat(memcg); 1738 unlock_page_memcg(page);
2002 if (newly_dirty) 1739 if (newly_dirty)
2003 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1740 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
2004 return newly_dirty; 1741 return newly_dirty;