-rw-r--r--	fs/direct-io.c           | 20
-rw-r--r--	fs/iomap.c               | 41
-rw-r--r--	fs/xfs/libxfs/xfs_bmap.c | 11
-rw-r--r--	fs/xfs/libxfs/xfs_bmap.h |  1
-rw-r--r--	fs/xfs/xfs_aops.c        | 47
-rw-r--r--	fs/xfs/xfs_fsmap.c       | 48
6 files changed, 107 insertions, 61 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 563254869e2f..b53e66d9abd7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -265,12 +265,24 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
 	if (ret == 0)
 		ret = transferred;
 
+	if (dio->end_io) {
+		// XXX: ki_pos??
+		err = dio->end_io(dio->iocb, offset, ret, dio->private);
+		if (err)
+			ret = err;
+	}
+
 	/*
 	 * Try again to invalidate clean pages which might have been cached by
 	 * non-direct readahead, or faulted in by get_user_pages() if the source
 	 * of the write was an mmap'ed region of the file we're writing. Either
 	 * one is a pretty crazy thing to do, so we don't support it 100%. If
 	 * this invalidation fails, tough, the write still worked...
+	 *
+	 * And this page cache invalidation has to be after dio->end_io(), as
+	 * some filesystems convert unwritten extents to real allocations in
+	 * end_io() when necessary, otherwise a racing buffer read would cache
+	 * zeros from unwritten extents.
 	 */
 	if (flags & DIO_COMPLETE_INVALIDATE &&
 	    ret > 0 && dio->op == REQ_OP_WRITE &&
@@ -281,14 +293,6 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
 		WARN_ON_ONCE(err);
 	}
 
-	if (dio->end_io) {
-
-		// XXX: ki_pos??
-		err = dio->end_io(dio->iocb, offset, ret, dio->private);
-		if (err)
-			ret = err;
-	}
-
 	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
 		inode_dio_end(dio->inode);
 
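The reordering above matters because ->end_io is where a filesystem may convert unwritten extents into real allocations. Below is a minimal user-space toy model of the race the new comment describes; all names (extent_written, cached_page, buffered_read) are invented stand-ins, not kernel code.

#include <stdio.h>

/* Invented stand-ins: extent_written == 0 means reads see zeros. */
static int extent_written;
static int cached_page = -1;	/* -1: nothing cached, else cached value */

static void buffered_read(void)	/* a racing reader fills the cache */
{
	if (cached_page < 0)
		cached_page = extent_written ? 42 : 0;	/* 42 = real data */
}

int main(void)
{
	/* dio_complete(), patched order: */
	extent_written = 1;	/* 1. dio->end_io(): convert unwritten */
	buffered_read();	/* racing read in the window caches 42  */
	cached_page = -1;	/* 2. invalidate_inode_pages2_range()   */

	/*
	 * With the old order (invalidate first, end_io second), a read
	 * racing between the two steps would cache 0 and completion
	 * would leave that stale zero page in place.
	 */
	buffered_read();
	printf("read sees %d\n", cached_page);	/* 42, never stale zeros */
	return 0;
}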
diff --git a/fs/iomap.c b/fs/iomap.c
index be61cf742b5e..d4801f8dd4fd 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -714,23 +714,9 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 {
 	struct kiocb *iocb = dio->iocb;
 	struct inode *inode = file_inode(iocb->ki_filp);
+	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
-	/*
-	 * Try again to invalidate clean pages which might have been cached by
-	 * non-direct readahead, or faulted in by get_user_pages() if the source
-	 * of the write was an mmap'ed region of the file we're writing. Either
-	 * one is a pretty crazy thing to do, so we don't support it 100%. If
-	 * this invalidation fails, tough, the write still worked...
-	 */
-	if (!dio->error &&
-	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
-		ret = invalidate_inode_pages2_range(inode->i_mapping,
-				iocb->ki_pos >> PAGE_SHIFT,
-				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
-	}
-
 	if (dio->end_io) {
 		ret = dio->end_io(iocb,
 				dio->error ? dio->error : dio->size,
@@ -742,12 +728,33 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 	if (likely(!ret)) {
 		ret = dio->size;
 		/* check for short read */
-		if (iocb->ki_pos + ret > dio->i_size &&
+		if (offset + ret > dio->i_size &&
 		    !(dio->flags & IOMAP_DIO_WRITE))
-			ret = dio->i_size - iocb->ki_pos;
+			ret = dio->i_size - offset;
 		iocb->ki_pos += ret;
 	}
 
+	/*
+	 * Try again to invalidate clean pages which might have been cached by
+	 * non-direct readahead, or faulted in by get_user_pages() if the source
+	 * of the write was an mmap'ed region of the file we're writing. Either
+	 * one is a pretty crazy thing to do, so we don't support it 100%. If
+	 * this invalidation fails, tough, the write still worked...
+	 *
+	 * And this page cache invalidation has to be after dio->end_io(), as
+	 * some filesystems convert unwritten extents to real allocations in
+	 * end_io() when necessary, otherwise a racing buffer read would cache
+	 * zeros from unwritten extents.
+	 */
+	if (!dio->error &&
+	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
+		int err;
+		err = invalidate_inode_pages2_range(inode->i_mapping,
+				offset >> PAGE_SHIFT,
+				(offset + dio->size - 1) >> PAGE_SHIFT);
+		WARN_ON_ONCE(err);
+	}
+
 	inode_dio_end(file_inode(iocb->ki_filp));
 	kfree(dio);
 
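A detail worth noting in the hunk above: iocb->ki_pos is saved in offset before completion advances it, and the invalidation range is computed from the saved value. The page indexes passed to invalidate_inode_pages2_range() come from simple shift arithmetic; here is a standalone sketch in plain C, assuming 4 KiB pages (PAGE_SHIFT of 12).

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4 KiB pages, for illustration only */

int main(void)
{
	long long offset = 5000;	/* byte offset of the dio write */
	long long size = 10000;		/* bytes transferred */

	/* first and last page indexes covered by [offset, offset + size) */
	long long first = offset >> PAGE_SHIFT;
	long long last = (offset + size - 1) >> PAGE_SHIFT;

	/* prints "1 3": pages 1 through 3 must be invalidated */
	printf("%lld %lld\n", first, last);
	return 0;
}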
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index def32fa1c225..89263797cf32 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3852,6 +3852,17 @@ xfs_trim_extent(
 	}
 }
 
+/* trim extent to within eof */
+void
+xfs_trim_extent_eof(
+	struct xfs_bmbt_irec	*irec,
+	struct xfs_inode	*ip)
+
+{
+	xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
+					      i_size_read(VFS_I(ip))));
+}
+
 /*
  * Trim the returned map to the required bounds
  */
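For readers unfamiliar with xfs_trim_extent(), the new wrapper clamps a mapping to [0, EOF in filesystem blocks). Below is a standalone sketch of that clamping with illustrative names (struct extent, trim_extent) standing in for the kernel's xfs_bmbt_irec API; the arithmetic differs in form from the real helper but has the same effect.

#include <stdio.h>

struct extent {
	long long startoff;	/* first file offset (in fs blocks) */
	long long blockcount;	/* length (in fs blocks) */
};

static void trim_extent(struct extent *e, long long bno, long long len)
{
	long long end = bno + len;		/* exclusive upper bound */
	long long e_end = e->startoff + e->blockcount;

	if (e->startoff < bno) {		/* clip the front */
		e->blockcount -= bno - e->startoff;
		e->startoff = bno;
	}
	if (e_end > end)			/* clip the tail */
		e->blockcount -= e_end - end;
	if (e->blockcount < 0)			/* no overlap at all */
		e->blockcount = 0;
}

int main(void)
{
	/* extent mapping blocks 90..109, with EOF at block 100 */
	struct extent e = { .startoff = 90, .blockcount = 20 };

	trim_extent(&e, 0, 100);	/* what xfs_trim_extent_eof does */
	printf("%lld %lld\n", e.startoff, e.blockcount);	/* 90 10 */
	return 0;
}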
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 851982a5dfbc..502e0d8fb4ff 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -208,6 +208,7 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
 
 void	xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
 		xfs_filblks_t len);
+void	xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
 int	xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
 void	xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
 void	xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f18e5932aec4..a3eeaba156c5 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -446,6 +446,19 @@ xfs_imap_valid(
 {
 	offset >>= inode->i_blkbits;
 
+	/*
+	 * We have to make sure the cached mapping is within EOF to protect
+	 * against eofblocks trimming on file release leaving us with a stale
+	 * mapping. Otherwise, a page for a subsequent file extending buffered
+	 * write could get picked up by this writeback cycle and written to the
+	 * wrong blocks.
+	 *
+	 * Note that what we really want here is a generic mapping invalidation
+	 * mechanism to protect us from arbitrary extent modifying contexts, not
+	 * just eofblocks.
+	 */
+	xfs_trim_extent_eof(imap, XFS_I(inode));
+
 	return offset >= imap->br_startoff &&
 		offset < imap->br_startoff + imap->br_blockcount;
 }
@@ -735,6 +748,14 @@ xfs_vm_invalidatepage(
 {
 	trace_xfs_invalidatepage(page->mapping->host, page, offset,
 				 length);
+
+	/*
+	 * If we are invalidating the entire page, clear the dirty state from it
+	 * so that we can check for attempts to release dirty cached pages in
+	 * xfs_vm_releasepage().
+	 */
+	if (offset == 0 && length >= PAGE_SIZE)
+		cancel_dirty_page(page);
 	block_invalidatepage(page, offset, length);
 }
 
@@ -1190,25 +1211,27 @@ xfs_vm_releasepage(
 	 * mm accommodates an old ext3 case where clean pages might not have had
 	 * the dirty bit cleared. Thus, it can send actual dirty pages to
 	 * ->releasepage() via shrink_active_list(). Conversely,
-	 * block_invalidatepage() can send pages that are still marked dirty
-	 * but otherwise have invalidated buffers.
+	 * block_invalidatepage() can send pages that are still marked dirty but
+	 * otherwise have invalidated buffers.
 	 *
 	 * We want to release the latter to avoid unnecessary buildup of the
-	 * LRU, skip the former and warn if we've left any lingering
-	 * delalloc/unwritten buffers on clean pages. Skip pages with delalloc
-	 * or unwritten buffers and warn if the page is not dirty. Otherwise
-	 * try to release the buffers.
+	 * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
+	 * that are entirely invalidated and need to be released. Hence the
+	 * only time we should get dirty pages here is through
+	 * shrink_active_list() and so we can simply skip those now.
+	 *
+	 * warn if we've left any lingering delalloc/unwritten buffers on clean
+	 * or invalidated pages we are about to release.
 	 */
+	if (PageDirty(page))
+		return 0;
+
 	xfs_count_page_state(page, &delalloc, &unwritten);
 
-	if (delalloc) {
-		WARN_ON_ONCE(!PageDirty(page));
+	if (WARN_ON_ONCE(delalloc))
 		return 0;
-	}
-	if (unwritten) {
-		WARN_ON_ONCE(!PageDirty(page));
+	if (WARN_ON_ONCE(unwritten))
 		return 0;
-	}
 
 	return try_to_free_buffers(page);
 }
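Two things in the xfs_aops.c hunks deserve a note. First, xfs_vm_invalidatepage() now cancels dirty state on fully invalidated pages, so the only dirty pages reaching xfs_vm_releasepage() arrive via shrink_active_list() and can be skipped up front. Second, the collapsed checks rely on WARN_ON_ONCE() evaluating to its condition. The user-space stand-in below mimics, rather than reproduces, the kernel macro, and releasepage() here is a toy model of the patched function's tail.

#include <stdio.h>

/*
 * Stand-in for the kernel macro: warns at most once per call site and,
 * crucially, evaluates to the condition so it can drive an if().
 * Uses GNU C statement expressions, as the kernel does.
 */
#define WARN_ON_ONCE(cond) ({						\
	static int __warned;						\
	int __cond = !!(cond);						\
	if (__cond && !__warned) {					\
		__warned = 1;						\
		fprintf(stderr, "WARNING at %s:%d\n", __FILE__, __LINE__); \
	}								\
	__cond;								\
})

static int releasepage(int dirty, int delalloc, int unwritten)
{
	if (dirty)			/* only shrink_active_list() sends these */
		return 0;
	if (WARN_ON_ONCE(delalloc))	/* stale delalloc state: warn + refuse */
		return 0;
	if (WARN_ON_ONCE(unwritten))	/* stale unwritten state: warn + refuse */
		return 0;
	return 1;			/* try_to_free_buffers() stand-in */
}

int main(void)
{
	printf("%d\n", releasepage(0, 0, 0));	/* 1: clean page, released */
	printf("%d\n", releasepage(1, 0, 0));	/* 0: dirty, skipped quietly */
	printf("%d\n", releasepage(0, 1, 0));	/* 0: warns once, skipped */
	return 0;
}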
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 560e0b40ac1b..43cfc07996a4 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -367,29 +367,6 @@ xfs_getfsmap_datadev_helper(
 	return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
 }
 
-/* Transform a rtbitmap "record" into a fsmap */
-STATIC int
-xfs_getfsmap_rtdev_rtbitmap_helper(
-	struct xfs_trans		*tp,
-	struct xfs_rtalloc_rec		*rec,
-	void				*priv)
-{
-	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_getfsmap_info	*info = priv;
-	struct xfs_rmap_irec		irec;
-	xfs_daddr_t			rec_daddr;
-
-	rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock);
-
-	irec.rm_startblock = rec->ar_startblock;
-	irec.rm_blockcount = rec->ar_blockcount;
-	irec.rm_owner = XFS_RMAP_OWN_NULL;	/* "free" */
-	irec.rm_offset = 0;
-	irec.rm_flags = 0;
-
-	return xfs_getfsmap_helper(tp, info, &irec, rec_daddr);
-}
-
 /* Transform a bnobt irec into a fsmap */
 STATIC int
 xfs_getfsmap_datadev_bnobt_helper(
@@ -475,6 +452,30 @@ xfs_getfsmap_logdev(
 	return xfs_getfsmap_helper(tp, info, &rmap, 0);
 }
 
+#ifdef CONFIG_XFS_RT
+/* Transform a rtbitmap "record" into a fsmap */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_helper(
+	struct xfs_trans		*tp,
+	struct xfs_rtalloc_rec		*rec,
+	void				*priv)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_getfsmap_info	*info = priv;
+	struct xfs_rmap_irec		irec;
+	xfs_daddr_t			rec_daddr;
+
+	rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock);
+
+	irec.rm_startblock = rec->ar_startblock;
+	irec.rm_blockcount = rec->ar_blockcount;
+	irec.rm_owner = XFS_RMAP_OWN_NULL;	/* "free" */
+	irec.rm_offset = 0;
+	irec.rm_flags = 0;
+
+	return xfs_getfsmap_helper(tp, info, &irec, rec_daddr);
+}
+
 /* Execute a getfsmap query against the realtime device. */
 STATIC int
 __xfs_getfsmap_rtdev(
@@ -521,7 +522,6 @@ __xfs_getfsmap_rtdev(
 	return query_fn(tp, info);
 }
 
-#ifdef CONFIG_XFS_RT
 /* Actually query the realtime bitmap. */
 STATIC int
 xfs_getfsmap_rtdev_rtbitmap_query(
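The fs/xfs/xfs_fsmap.c hunks only move xfs_getfsmap_rtdev_rtbitmap_helper() under the existing CONFIG_XFS_RT guard: a static helper whose only callers are realtime code would otherwise be unused in !CONFIG_XFS_RT builds and trigger -Wunused-function. The sketch below shows the general guard pattern with an invented helper name (rt_only_helper) and a stub alternative that this particular patch does not need, since its callers sit under the same guard.

#include <errno.h>
#include <stdio.h>

/* toggle to mimic a kernel config option; illustrative only */
#define CONFIG_XFS_RT 1

#ifdef CONFIG_XFS_RT
static int rt_only_helper(void)
{
	return 0;			/* real work would happen here */
}
#else
static inline int rt_only_helper(void)
{
	return -EOPNOTSUPP;		/* stub for builds without RT support */
}
#endif

int main(void)
{
	printf("%d\n", rt_only_helper());
	return 0;
}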