author		Christoph Hellwig <hch@lst.de>		2016-06-20 19:53:44 -0400
committer	Dave Chinner <david@fromorbit.com>	2016-06-20 19:53:44 -0400
commit		68a9f5e7007c1afa2cf6830b690a90d0187c0684
tree		986de78ca7e20e49604faecccc95685ce52b4090
parent		f0c6bcba74ac51cb77aadb33ad35cb2dc1ad1506
xfs: implement iomap based buffered write path
Convert XFS to use the new iomap based multipage write path. This involves
implementing the ->iomap_begin and ->iomap_end methods, and switching the
buffered file write, page_mkwrite and xfs_iozero paths to the new iomap
helpers.
With this change __xfs_get_blocks will never be used for buffered writes,
and the code handling them can be removed.
Based on earlier code from Dave Chinner.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
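
For orientation before reading the diff: the patch adds an iomap_ops instance in fs/xfs/xfs_iomap.c and points the buffered write paths at the generic iomap helpers. The following is a condensed sketch assembled from the hunks below, not standalone-compilable kernel code; every name in it comes from this patch:

	/* new ops vector handed to the generic iomap code (fs/xfs/xfs_iomap.c) */
	struct iomap_ops xfs_iomap_ops = {
		.iomap_begin	= xfs_file_iomap_begin,	/* map, or allocate, blocks for the range */
		.iomap_end	= xfs_file_iomap_end,	/* punch out delalloc blocks left unwritten */
	};

	/* call sites converted by this patch */
	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);	/* was generic_perform_write() */
	ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);		/* was block_page_mkwrite() */
	ret = iomap_zero_range(inode, pos, count, NULL, &xfs_iomap_ops);	/* xfs_iozero(), non-DAX case */
	error = iomap_truncate_page(inode, newsize, &did_zeroing, &xfs_iomap_ops);	/* was block_truncate_page() */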
-rw-r--r--	fs/xfs/Kconfig		|    1
-rw-r--r--	fs/xfs/xfs_aops.c	|  212
-rw-r--r--	fs/xfs/xfs_file.c	|   71
-rw-r--r--	fs/xfs/xfs_iomap.c	|  144
-rw-r--r--	fs/xfs/xfs_iomap.h	|    5
-rw-r--r--	fs/xfs/xfs_iops.c	|    9
-rw-r--r--	fs/xfs/xfs_trace.h	|    3
7 files changed, 187 insertions(+), 258 deletions(-)
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 5d47b4df61ea..35faf128f36d 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -4,6 +4,7 @@ config XFS_FS
 	depends on (64BIT || LBDAF)
 	select EXPORTFS
 	select LIBCRC32C
+	select FS_IOMAP
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4c463b99fe57..2ac9f7e5f504 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1427,216 +1427,6 @@ xfs_vm_direct_IO(
 			xfs_get_blocks_direct, endio, NULL, flags);
 }
 
-/*
- * Punch out the delalloc blocks we have already allocated.
- *
- * Don't bother with xfs_setattr given that nothing can have made it to disk yet
- * as the page is still locked at this point.
- */
-STATIC void
-xfs_vm_kill_delalloc_range(
-	struct inode		*inode,
-	loff_t			start,
-	loff_t			end)
-{
-	struct xfs_inode	*ip = XFS_I(inode);
-	xfs_fileoff_t		start_fsb;
-	xfs_fileoff_t		end_fsb;
-	int			error;
-
-	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
-	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
-	if (end_fsb <= start_fsb)
-		return;
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-						end_fsb - start_fsb);
-	if (error) {
-		/* something screwed, just bail */
-		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-			xfs_alert(ip->i_mount,
-		"xfs_vm_write_failed: unable to clean up ino %lld",
-					ip->i_ino);
-		}
-	}
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-}
-
-STATIC void
-xfs_vm_write_failed(
-	struct inode		*inode,
-	struct page		*page,
-	loff_t			pos,
-	unsigned		len)
-{
-	loff_t			block_offset;
-	loff_t			block_start;
-	loff_t			block_end;
-	loff_t			from = pos & (PAGE_SIZE - 1);
-	loff_t			to = from + len;
-	struct buffer_head	*bh, *head;
-	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
-
-	/*
-	 * The request pos offset might be 32 or 64 bit, this is all fine
-	 * on 64-bit platform. However, for 64-bit pos request on 32-bit
-	 * platform, the high 32-bit will be masked off if we evaluate the
-	 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
-	 * 0xfffff000 as an unsigned long, hence the result is incorrect
-	 * which could cause the following ASSERT failed in most cases.
-	 * In order to avoid this, we can evaluate the block_offset of the
-	 * start of the page by using shifts rather than masks the mismatch
-	 * problem.
-	 */
-	block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
-
-	ASSERT(block_offset + from == pos);
-
-	head = page_buffers(page);
-	block_start = 0;
-	for (bh = head; bh != head || !block_start;
-	     bh = bh->b_this_page, block_start = block_end,
-				   block_offset += bh->b_size) {
-		block_end = block_start + bh->b_size;
-
-		/* skip buffers before the write */
-		if (block_end <= from)
-			continue;
-
-		/* if the buffer is after the write, we're done */
-		if (block_start >= to)
-			break;
-
-		/*
-		 * Process delalloc and unwritten buffers beyond EOF. We can
-		 * encounter unwritten buffers in the event that a file has
-		 * post-EOF unwritten extents and an extending write happens to
-		 * fail (e.g., an unaligned write that also involves a delalloc
-		 * to the same page).
-		 */
-		if (!buffer_delay(bh) && !buffer_unwritten(bh))
-			continue;
-
-		if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
-		    block_offset < i_size_read(inode))
-			continue;
-
-		if (buffer_delay(bh))
-			xfs_vm_kill_delalloc_range(inode, block_offset,
-						   block_offset + bh->b_size);
-
-		/*
-		 * This buffer does not contain data anymore. make sure anyone
-		 * who finds it knows that for certain.
-		 */
-		clear_buffer_delay(bh);
-		clear_buffer_uptodate(bh);
-		clear_buffer_mapped(bh);
-		clear_buffer_new(bh);
-		clear_buffer_dirty(bh);
-		clear_buffer_unwritten(bh);
-	}
-
-}
-
-/*
- * This used to call block_write_begin(), but it unlocks and releases the page
- * on error, and we need that page to be able to punch stale delalloc blocks out
- * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
- * the appropriate point.
- */
-STATIC int
-xfs_vm_write_begin(
-	struct file		*file,
-	struct address_space	*mapping,
-	loff_t			pos,
-	unsigned		len,
-	unsigned		flags,
-	struct page		**pagep,
-	void			**fsdata)
-{
-	pgoff_t			index = pos >> PAGE_SHIFT;
-	struct page		*page;
-	int			status;
-	struct xfs_mount	*mp = XFS_I(mapping->host)->i_mount;
-
-	ASSERT(len <= PAGE_SIZE);
-
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	if (!page)
-		return -ENOMEM;
-
-	status = __block_write_begin(page, pos, len, xfs_get_blocks);
-	if (xfs_mp_fail_writes(mp))
-		status = -EIO;
-	if (unlikely(status)) {
-		struct inode	*inode = mapping->host;
-		size_t		isize = i_size_read(inode);
-
-		xfs_vm_write_failed(inode, page, pos, len);
-		unlock_page(page);
-
-		/*
-		 * If the write is beyond EOF, we only want to kill blocks
-		 * allocated in this write, not blocks that were previously
-		 * written successfully.
-		 */
-		if (xfs_mp_fail_writes(mp))
-			isize = 0;
-		if (pos + len > isize) {
-			ssize_t start = max_t(ssize_t, pos, isize);
-
-			truncate_pagecache_range(inode, start, pos + len);
-		}
-
-		put_page(page);
-		page = NULL;
-	}
-
-	*pagep = page;
-	return status;
-}
-
-/*
- * On failure, we only need to kill delalloc blocks beyond EOF in the range of
- * this specific write because they will never be written. Previous writes
- * beyond EOF where block allocation succeeded do not need to be trashed, so
- * only new blocks from this write should be trashed. For blocks within
- * EOF, generic_write_end() zeros them so they are safe to leave alone and be
- * written with all the other valid data.
- */
-STATIC int
-xfs_vm_write_end(
-	struct file		*file,
-	struct address_space	*mapping,
-	loff_t			pos,
-	unsigned		len,
-	unsigned		copied,
-	struct page		*page,
-	void			*fsdata)
-{
-	int			ret;
-
-	ASSERT(len <= PAGE_SIZE);
-
-	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-	if (unlikely(ret < len)) {
-		struct inode	*inode = mapping->host;
-		size_t		isize = i_size_read(inode);
-		loff_t		to = pos + len;
-
-		if (to > isize) {
-			/* only kill blocks in this write beyond EOF */
-			if (pos > isize)
-				isize = pos;
-			xfs_vm_kill_delalloc_range(inode, isize, to);
-			truncate_pagecache_range(inode, isize, to);
-		}
-	}
-	return ret;
-}
-
 STATIC sector_t
 xfs_vm_bmap(
 	struct address_space	*mapping,
@@ -1747,8 +1537,6 @@ const struct address_space_operations xfs_address_space_operations = {
 	.set_page_dirty		= xfs_vm_set_page_dirty,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
-	.write_begin		= xfs_vm_write_begin,
-	.write_end		= xfs_vm_write_end,
 	.bmap			= xfs_vm_bmap,
 	.direct_IO		= xfs_vm_direct_IO,
 	.migratepage		= buffer_migrate_page,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 47fc63295422..7316d3841c53 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -37,6 +37,7 @@
 #include "xfs_log.h"
 #include "xfs_icache.h"
 #include "xfs_pnfs.h"
+#include "xfs_iomap.h"
 
 #include <linux/dcache.h>
 #include <linux/falloc.h>
@@ -79,57 +80,27 @@ xfs_rw_ilock_demote(
 		inode_unlock(VFS_I(ip));
 }
 
-/*
- * xfs_iozero clears the specified range supplied via the page cache (except in
- * the DAX case). Writes through the page cache will allocate blocks over holes,
- * though the callers usually map the holes first and avoid them. If a block is
- * not completely zeroed, then it will be read from disk before being partially
- * zeroed.
- *
- * In the DAX case, we can just directly write to the underlying pages. This
- * will not allocate blocks, but will avoid holes and unwritten extents and so
- * not do unnecessary work.
- */
-int
-xfs_iozero(
-	struct xfs_inode	*ip,	/* inode			*/
-	loff_t			pos,	/* offset in file		*/
-	size_t			count)	/* size of data to zero		*/
+static int
+xfs_dax_zero_range(
+	struct inode		*inode,
+	loff_t			pos,
+	size_t			count)
 {
-	struct page		*page;
-	struct address_space	*mapping;
 	int			status = 0;
 
-
-	mapping = VFS_I(ip)->i_mapping;
 	do {
 		unsigned offset, bytes;
-		void *fsdata;
 
 		offset = (pos & (PAGE_SIZE -1)); /* Within page */
 		bytes = PAGE_SIZE - offset;
 		if (bytes > count)
 			bytes = count;
 
-		if (IS_DAX(VFS_I(ip))) {
-			status = dax_zero_page_range(VFS_I(ip), pos, bytes,
-						     xfs_get_blocks_direct);
-			if (status)
-				break;
-		} else {
-			status = pagecache_write_begin(NULL, mapping, pos, bytes,
-						AOP_FLAG_UNINTERRUPTIBLE,
-						&page, &fsdata);
-			if (status)
-				break;
-
-			zero_user(page, offset, bytes);
+		status = dax_zero_page_range(inode, pos, bytes,
+				xfs_get_blocks_direct);
+		if (status)
+			break;
 
-			status = pagecache_write_end(NULL, mapping, pos, bytes,
-						bytes, page, fsdata);
-			WARN_ON(status <= 0); /* can't return less than zero! */
-			status = 0;
-		}
 		pos += bytes;
 		count -= bytes;
 	} while (count);
@@ -137,6 +108,24 @@ xfs_iozero(
 	return status;
 }
 
+/*
+ * Clear the specified ranges to zero through either the pagecache or DAX.
+ * Holes and unwritten extents will be left as-is as they already are zeroed.
+ */
+int
+xfs_iozero(
+	struct xfs_inode	*ip,
+	loff_t			pos,
+	size_t			count)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	if (IS_DAX(VFS_I(ip)))
+		return xfs_dax_zero_range(inode, pos, count);
+	else
+		return iomap_zero_range(inode, pos, count, NULL, &xfs_iomap_ops);
+}
+
 int
 xfs_update_prealloc_flags(
 	struct xfs_inode	*ip,
@@ -841,7 +830,7 @@ xfs_file_buffered_aio_write(
 write_retry:
 	trace_xfs_file_buffered_write(ip, iov_iter_count(from),
 				      iocb->ki_pos, 0);
-	ret = generic_perform_write(file, from, iocb->ki_pos);
+	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
 	if (likely(ret >= 0))
 		iocb->ki_pos += ret;
 
@@ -1553,7 +1542,7 @@ xfs_filemap_page_mkwrite(
 	if (IS_DAX(inode)) {
 		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
 	} else {
-		ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
 	}
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2f3719461cbd..620fc9120444 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -967,3 +967,147 @@ xfs_bmbt_to_iomap(
 	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
 	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
 }
+
+static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+{
+	return !nimaps ||
+		imap->br_startblock == HOLESTARTBLOCK ||
+		imap->br_startblock == DELAYSTARTBLOCK;
+}
+
+static int
+xfs_file_iomap_begin(
+	struct inode		*inode,
+	loff_t			offset,
+	loff_t			length,
+	unsigned		flags,
+	struct iomap		*iomap)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	imap;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	int			nimaps = 1, error = 0;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	ASSERT(offset <= mp->m_super->s_maxbytes);
+	if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
+		length = mp->m_super->s_maxbytes - offset;
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+			       &nimaps, XFS_BMAPI_ENTIRE);
+	if (error) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		return error;
+	}
+
+	if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+		/*
+		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+		 * pages to keep the chunks of work done where somewhat symmetric
+		 * with the work writeback does. This is a completely arbitrary
+		 * number pulled out of thin air as a best guess for initial
+		 * testing.
+		 *
+		 * Note that the values needs to be less than 32-bits wide until
+		 * the lower level functions are updated.
+		 */
+		length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+		if (xfs_get_extsz_hint(ip)) {
+			/*
+			 * xfs_iomap_write_direct() expects the shared lock. It
+			 * is unlocked on return.
+			 */
+			xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+			error = xfs_iomap_write_direct(ip, offset, length, &imap,
+					nimaps);
+		} else {
+			error = xfs_iomap_write_delay(ip, offset, length, &imap);
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		}
+
+		if (error)
+			return error;
+
+		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+		xfs_bmbt_to_iomap(ip, iomap, &imap);
+	} else if (nimaps) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+		xfs_bmbt_to_iomap(ip, iomap, &imap);
+	} else {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->type = IOMAP_HOLE;
+		iomap->offset = offset;
+		iomap->length = length;
+	}
+
+	return 0;
+}
+
+static int
+xfs_file_iomap_end_delalloc(
+	struct xfs_inode	*ip,
+	loff_t			offset,
+	loff_t			length,
+	ssize_t			written)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		start_fsb;
+	xfs_fileoff_t		end_fsb;
+	int			error = 0;
+
+	start_fsb = XFS_B_TO_FSB(mp, offset + written);
+	end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+	/*
+	 * Trim back delalloc blocks if we didn't manage to write the whole
+	 * range reserved.
+	 *
+	 * We don't need to care about racing delalloc as we hold i_mutex
+	 * across the reserve/allocate/unreserve calls. If there are delalloc
+	 * blocks in the range, they are ours.
+	 */
+	if (start_fsb < end_fsb) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+					       end_fsb - start_fsb);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+		if (error && !XFS_FORCED_SHUTDOWN(mp)) {
+			xfs_alert(mp, "%s: unable to clean up ino %lld",
+				__func__, ip->i_ino);
+			return error;
+		}
+	}
+
+	return 0;
+}
+
+static int
+xfs_file_iomap_end(
+	struct inode		*inode,
+	loff_t			offset,
+	loff_t			length,
+	ssize_t			written,
+	unsigned		flags,
+	struct iomap		*iomap)
+{
+	if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
+		return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
+				length, written);
+	return 0;
+}
+
+struct iomap_ops xfs_iomap_ops = {
+	.iomap_begin		= xfs_file_iomap_begin,
+	.iomap_end		= xfs_file_iomap_end,
+};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 718f07c5c0d2..e066d045e2ff 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,7 +18,8 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
 
-struct iomap;
+#include <linux/iomap.h>
+
 struct xfs_inode;
 struct xfs_bmbt_irec;
 
@@ -33,4 +34,6 @@ int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
 void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
 		struct xfs_bmbt_irec *);
 
+extern struct iomap_ops xfs_iomap_ops;
+
 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 1a5ca4b4a866..5d1fdae4e39b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,6 +38,7 @@
 #include "xfs_dir2.h"
 #include "xfs_trans_space.h"
 #include "xfs_pnfs.h"
+#include "xfs_iomap.h"
 
 #include <linux/capability.h>
 #include <linux/xattr.h>
@@ -822,8 +823,8 @@ xfs_setattr_size(
 			error = dax_truncate_page(inode, newsize,
 					xfs_get_blocks_direct);
 		} else {
-			error = block_truncate_page(inode->i_mapping, newsize,
-					xfs_get_blocks);
+			error = iomap_truncate_page(inode, newsize,
+					&did_zeroing, &xfs_iomap_ops);
 		}
 	}
 
@@ -838,8 +839,8 @@ xfs_setattr_size(
 	 * problem. Note that this includes any block zeroing we did above;
 	 * otherwise those blocks may not be zeroed after a crash.
	 */
-	if (newsize > ip->i_d.di_size &&
-	    (oldsize != ip->i_d.di_size || did_zeroing)) {
+	if (did_zeroing ||
+	    (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
 		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
 						      ip->i_d.di_size, newsize);
 		if (error)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ea94ee0fe5ea..bb24ce7b0280 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1295,6 +1295,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
+DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IOMAP_EVENT(xfs_iomap_found);
+DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),