author     Linus Torvalds <torvalds@linux-foundation.org>  2016-07-27 12:53:35 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-07-27 12:53:35 -0400
commit     0e6acf0204da5b8705722a5f6806a4f55ed379d6
tree       4a8a9bf9daba9c734a0fdde417ae1cb472ca396d
parent     0e06f5c0deeef0332a5da2ecb8f1fcf3e024d958
parent     f2bdfda9a1c668539bc85baf5625f6f14bc510b1
Merge tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs
Pull xfs updates from Dave Chinner:
 "The major addition is the new iomap based block mapping infrastructure.
  We've been kicking this about locally for years, but there are other
  filesystems that want to use it too (e.g. gfs2). Now it is fully
  working, reviewed and ready to be merged and used by other filesystems.

  There are a lot of other fixes and cleanups in the tree, but those are
  XFS internal things and none are of the scale or visibility of the
  iomap changes. See below for details.

  I am likely to send another pull request next week - we're just about
  ready to merge some new functionality (on disk block->owner reverse
  mapping infrastructure), but that's a huge chunk of code (74 files
  changed, 7283 insertions(+), 1114 deletions(-)) so I'm keeping that
  separate to all the "normal" pull request changes so they don't get
  lost in the noise.

  Summary of changes in this update:

   - generic iomap based IO path infrastructure
   - generic iomap based fiemap implementation
   - xfs iomap based IO path implementation
   - buffer error handling fixes
   - tracking of in flight buffer IO for unmount serialisation
   - direct IO and DAX IO path separation and simplification
   - shortform directory format definition changes for wider platform
     compatibility
   - various buffer cache fixes
   - cleanups in preparation for rmap merge
   - error injection cleanups and fixes
   - log item format buffer memory allocation restructuring to prevent
     rare OOM reclaim deadlocks
   - sparse inode chunks are now fully supported"

* tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (53 commits)
  xfs: remove EXPERIMENTAL tag from sparse inode feature
  xfs: bufferhead chains are invalid after end_page_writeback
  xfs: allocate log vector buffers outside CIL context lock
  libxfs: directory node splitting does not have an extra block
  xfs: remove dax code from object file when disabled
  xfs: skip dirty pages in ->releasepage()
  xfs: remove __arch_pack
  xfs: kill xfs_dir2_inou_t
  xfs: kill xfs_dir2_sf_off_t
  xfs: split direct I/O and DAX path
  xfs: direct calls in the direct I/O path
  xfs: stop using generic_file_read_iter for direct I/O
  xfs: split xfs_file_read_iter into buffered and direct I/O helpers
  xfs: remove s_maxbytes enforcement in xfs_file_read_iter
  xfs: kill ioflags
  xfs: don't pass ioflags around in the ioctl path
  xfs: track and serialize in-flight async buffers against unmount
  xfs: exclude never-released buffers from buftarg I/O accounting
  xfs: don't reset b_retries to 0 on every failure
  xfs: remove extraneous buffer flag changes
  ...
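At the core of the new infrastructure is the pair of callbacks that iomap_apply() (in the new fs/iomap.c below) wraps around every operation: ->iomap_begin() maps, or for writes reserves, one contiguous range and fills in a struct iomap, and ->iomap_end() lets the filesystem clean up afterwards. A rough sketch of what a filesystem supplies is shown here; the "foofs" names and its extent helpers are invented for illustration, and only the callback signatures and struct iomap fields come from the code below:

/*
 * Illustrative only: a minimal iomap_ops implementation for an imaginary
 * filesystem "foofs".  foofs_lookup_extent(), foofs_unreserve() and
 * struct foofs_extent are hypothetical helpers.
 */
static int
foofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
		unsigned flags, struct iomap *iomap)
{
	struct foofs_extent	ext;
	int			error;

	/* Map (or, for IOMAP_WRITE, reserve) one contiguous extent. */
	error = foofs_lookup_extent(inode, offset, length,
				    (flags & IOMAP_WRITE), &ext);
	if (error)
		return error;

	iomap->type = ext.is_hole ? IOMAP_HOLE :
		      ext.is_unwritten ? IOMAP_UNWRITTEN : IOMAP_MAPPED;
	iomap->blkno = ext.daddr;		/* in 512-byte units */
	iomap->offset = ext.file_offset;	/* where the mapping starts */
	iomap->length = ext.len;		/* may be shorter than asked for */
	iomap->bdev = inode->i_sb->s_bdev;
	return 0;
}

static int
foofs_iomap_end(struct inode *inode, loff_t offset, loff_t length,
		ssize_t written, unsigned flags, struct iomap *iomap)
{
	/* Trim back any reservation beyond what was actually written. */
	if ((flags & IOMAP_WRITE) && written < length)
		return foofs_unreserve(inode, offset + written,
				       length - written);
	return 0;
}

static struct iomap_ops foofs_iomap_ops = {
	.iomap_begin	= foofs_iomap_begin,
	.iomap_end	= foofs_iomap_end,
};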
-rw-r--r--  fs/Kconfig  3
-rw-r--r--  fs/Makefile  1
-rw-r--r--  fs/buffer.c  76
-rw-r--r--  fs/internal.h  3
-rw-r--r--  fs/iomap.c  497
-rw-r--r--  fs/nfsd/blocklayout.c  1
-rw-r--r--  fs/nfsd/blocklayoutxdr.c  1
-rw-r--r--  fs/xfs/Kconfig  1
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c  101
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h  9
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.h  3
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c  51
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h  18
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c  2
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c  27
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h  2
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c  59
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.c  31
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h  43
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_sf.c  38
-rw-r--r--  fs/xfs/libxfs/xfs_format.h  66
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h  8
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c  28
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.c  2
-rw-r--r--  fs/xfs/xfs_aops.c  332
-rw-r--r--  fs/xfs/xfs_aops.h  3
-rw-r--r--  fs/xfs/xfs_attr_inactive.c  2
-rw-r--r--  fs/xfs/xfs_attr_list.c  2
-rw-r--r--  fs/xfs/xfs_bmap_util.c  381
-rw-r--r--  fs/xfs/xfs_bmap_util.h  3
-rw-r--r--  fs/xfs/xfs_buf.c  236
-rw-r--r--  fs/xfs/xfs_buf.h  7
-rw-r--r--  fs/xfs/xfs_buf_item.c  31
-rw-r--r--  fs/xfs/xfs_dquot.c  1
-rw-r--r--  fs/xfs/xfs_dquot_item.c  2
-rw-r--r--  fs/xfs/xfs_error.c  5
-rw-r--r--  fs/xfs/xfs_error.h  2
-rw-r--r--  fs/xfs/xfs_extfree_item.c  2
-rw-r--r--  fs/xfs/xfs_file.c  425
-rw-r--r--  fs/xfs/xfs_fsops.c  105
-rw-r--r--  fs/xfs/xfs_icache.c  2
-rw-r--r--  fs/xfs/xfs_icache.h  1
-rw-r--r--  fs/xfs/xfs_inode.c  16
-rw-r--r--  fs/xfs/xfs_inode.h  20
-rw-r--r--  fs/xfs/xfs_inode_item.c  1
-rw-r--r--  fs/xfs/xfs_ioctl.c  27
-rw-r--r--  fs/xfs/xfs_ioctl.h  3
-rw-r--r--  fs/xfs/xfs_ioctl32.c  6
-rw-r--r--  fs/xfs/xfs_iomap.c  171
-rw-r--r--  fs/xfs/xfs_iomap.h  7
-rw-r--r--  fs/xfs/xfs_iops.c  113
-rw-r--r--  fs/xfs/xfs_linux.h  7
-rw-r--r--  fs/xfs/xfs_log.c  13
-rw-r--r--  fs/xfs/xfs_log.h  5
-rw-r--r--  fs/xfs/xfs_log_cil.c  258
-rw-r--r--  fs/xfs/xfs_mount.c  10
-rw-r--r--  fs/xfs/xfs_ondisk.h  31
-rw-r--r--  fs/xfs/xfs_pnfs.c  27
-rw-r--r--  fs/xfs/xfs_rtalloc.h  2
-rw-r--r--  fs/xfs/xfs_super.c  19
-rw-r--r--  fs/xfs/xfs_super.h  2
-rw-r--r--  fs/xfs/xfs_sysfs.c  3
-rw-r--r--  fs/xfs/xfs_trace.h  25
-rw-r--r--  fs/xfs/xfs_trans.h  1
-rw-r--r--  include/linux/exportfs.h  16
-rw-r--r--  include/linux/iomap.h  70
66 files changed, 2026 insertions, 1440 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index b8fcb416be72..4524916fa200 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS
 
 if BLOCK
 
+config FS_IOMAP
+	bool
+
 source "fs/ext2/Kconfig"
 source "fs/ext4/Kconfig"
 source "fs/jbd2/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 85b6e13b62d3..ed2b63257ba9 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o
 
 obj-$(CONFIG_FHANDLE)		+= fhandle.o
+obj-$(CONFIG_FS_IOMAP)		+= iomap.o
 
 obj-y				+= quota/
 
diff --git a/fs/buffer.c b/fs/buffer.c
index b9fa1be75e69..9c8eb9b6db6a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -21,6 +21,7 @@
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
+#include <linux/iomap.h>
 #include <linux/mm.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
@@ -1892,8 +1893,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
 
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-		get_block_t *get_block)
+static void
+iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
+		struct iomap *iomap)
+{
+	loff_t offset = block << inode->i_blkbits;
+
+	bh->b_bdev = iomap->bdev;
+
+	/*
+	 * Block points to offset in file we need to map, iomap contains
+	 * the offset at which the map starts. If the map ends before the
+	 * current block, then do not map the buffer and let the caller
+	 * handle it.
+	 */
+	BUG_ON(offset >= iomap->offset + iomap->length);
+
+	switch (iomap->type) {
+	case IOMAP_HOLE:
+		/*
+		 * If the buffer is not up to date or beyond the current EOF,
+		 * we need to mark it as new to ensure sub-block zeroing is
+		 * executed if necessary.
+		 */
+		if (!buffer_uptodate(bh) ||
+		    (offset >= i_size_read(inode)))
+			set_buffer_new(bh);
+		break;
+	case IOMAP_DELALLOC:
+		if (!buffer_uptodate(bh) ||
+		    (offset >= i_size_read(inode)))
+			set_buffer_new(bh);
+		set_buffer_uptodate(bh);
+		set_buffer_mapped(bh);
+		set_buffer_delay(bh);
+		break;
+	case IOMAP_UNWRITTEN:
+		/*
+		 * For unwritten regions, we always need to ensure that
+		 * sub-block writes cause the regions in the block we are not
+		 * writing to are zeroed. Set the buffer as new to ensure this.
+		 */
+		set_buffer_new(bh);
+		set_buffer_unwritten(bh);
+		/* FALLTHRU */
+	case IOMAP_MAPPED:
+		if (offset >= i_size_read(inode))
+			set_buffer_new(bh);
+		bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
+				((offset - iomap->offset) >> inode->i_blkbits);
+		set_buffer_mapped(bh);
+		break;
+	}
+}
+
+int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block, struct iomap *iomap)
 {
 	unsigned from = pos & (PAGE_SIZE - 1);
 	unsigned to = from + len;
@@ -1929,9 +1984,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 			clear_buffer_new(bh);
 		if (!buffer_mapped(bh)) {
 			WARN_ON(bh->b_size != blocksize);
-			err = get_block(inode, block, bh, 1);
-			if (err)
-				break;
+			if (get_block) {
+				err = get_block(inode, block, bh, 1);
+				if (err)
+					break;
+			} else {
+				iomap_to_bh(inode, block, bh, iomap);
+			}
+
 			if (buffer_new(bh)) {
 				unmap_underlying_metadata(bh->b_bdev,
 							  bh->b_blocknr);
@@ -1972,6 +2032,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 		page_zero_new_buffers(page, from, to);
 	return err;
 }
+
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block)
+{
+	return __block_write_begin_int(page, pos, len, get_block, NULL);
+}
 EXPORT_SYMBOL(__block_write_begin);
 
 static int __block_commit_write(struct inode *inode, struct page *page,
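The b_blocknr arithmetic in iomap_to_bh() above converts iomap->blkno, which is expressed in 512-byte units, into filesystem blocks and then adds the buffer's distance from the start of the mapping. As a worked example with invented numbers: for 4096-byte blocks (i_blkbits = 12), iomap->blkno = 2048, iomap->offset = 1 MiB and a buffer at file offset 1 MiB + 8 KiB, the result is (2048 >> 3) + (8192 >> 12) = 256 + 2, i.e. block 258.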
diff --git a/fs/internal.h b/fs/internal.h
index f57ced528cde..cef0913e5d41 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
 
 struct super_block;
 struct file_system_type;
+struct iomap;
 struct linux_binprm;
 struct path;
 struct mount;
@@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
  * buffer.c
  */
 extern void guard_bio_eod(int rw, struct bio *bio);
+extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block, struct iomap *iomap);
 
 /*
  * char_dev.c
diff --git a/fs/iomap.c b/fs/iomap.c
new file mode 100644
index 000000000000..48141b8eff5f
--- /dev/null
+++ b/fs/iomap.c
@@ -0,0 +1,497 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc.
3 * Copyright (c) 2016 Christoph Hellwig.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#include <linux/module.h>
15#include <linux/compiler.h>
16#include <linux/fs.h>
17#include <linux/iomap.h>
18#include <linux/uaccess.h>
19#include <linux/gfp.h>
20#include <linux/mm.h>
21#include <linux/swap.h>
22#include <linux/pagemap.h>
23#include <linux/file.h>
24#include <linux/uio.h>
25#include <linux/backing-dev.h>
26#include <linux/buffer_head.h>
27#include <linux/dax.h>
28#include "internal.h"
29
30typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
31 void *data, struct iomap *iomap);
32
33/*
34 * Execute a iomap write on a segment of the mapping that spans a
35 * contiguous range of pages that have identical block mapping state.
36 *
37 * This avoids the need to map pages individually, do individual allocations
38 * for each page and most importantly avoid the need for filesystem specific
39 * locking per page. Instead, all the operations are amortised over the entire
40 * range of pages. It is assumed that the filesystems will lock whatever
41 * resources they require in the iomap_begin call, and release them in the
42 * iomap_end call.
43 */
44static loff_t
45iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
46 struct iomap_ops *ops, void *data, iomap_actor_t actor)
47{
48 struct iomap iomap = { 0 };
49 loff_t written = 0, ret;
50
51 /*
52 * Need to map a range from start position for length bytes. This can
53 * span multiple pages - it is only guaranteed to return a range of a
54 * single type of pages (e.g. all into a hole, all mapped or all
55 * unwritten). Failure at this point has nothing to undo.
56 *
57 * If allocation is required for this range, reserve the space now so
58 * that the allocation is guaranteed to succeed later on. Once we copy
59 * the data into the page cache pages, then we cannot fail otherwise we
60 * expose transient stale data. If the reserve fails, we can safely
61 * back out at this point as there is nothing to undo.
62 */
63 ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
64 if (ret)
65 return ret;
66 if (WARN_ON(iomap.offset > pos))
67 return -EIO;
68
69 /*
70 * Cut down the length to the one actually provided by the filesystem,
71 * as it might not be able to give us the whole size that we requested.
72 */
73 if (iomap.offset + iomap.length < pos + length)
74 length = iomap.offset + iomap.length - pos;
75
76 /*
77 * Now that we have guaranteed that the space allocation will succeed.
78 * we can do the copy-in page by page without having to worry about
79 * failures exposing transient data.
80 */
81 written = actor(inode, pos, length, data, &iomap);
82
83 /*
84 * Now the data has been copied, commit the range we've copied. This
85 * should not fail unless the filesystem has had a fatal error.
86 */
87 ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
88 flags, &iomap);
89
90 return written ? written : ret;
91}
92
93static void
94iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
95{
96 loff_t i_size = i_size_read(inode);
97
98 /*
99 * Only truncate newly allocated pages beyoned EOF, even if the
100 * write started inside the existing inode size.
101 */
102 if (pos + len > i_size)
103 truncate_pagecache_range(inode, max(pos, i_size), pos + len);
104}
105
106static int
107iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
108 struct page **pagep, struct iomap *iomap)
109{
110 pgoff_t index = pos >> PAGE_SHIFT;
111 struct page *page;
112 int status = 0;
113
114 BUG_ON(pos + len > iomap->offset + iomap->length);
115
116 page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
117 if (!page)
118 return -ENOMEM;
119
120 status = __block_write_begin_int(page, pos, len, NULL, iomap);
121 if (unlikely(status)) {
122 unlock_page(page);
123 put_page(page);
124 page = NULL;
125
126 iomap_write_failed(inode, pos, len);
127 }
128
129 *pagep = page;
130 return status;
131}
132
133static int
134iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
135 unsigned copied, struct page *page)
136{
137 int ret;
138
139 ret = generic_write_end(NULL, inode->i_mapping, pos, len,
140 copied, page, NULL);
141 if (ret < len)
142 iomap_write_failed(inode, pos, len);
143 return ret;
144}
145
146static loff_t
147iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
148 struct iomap *iomap)
149{
150 struct iov_iter *i = data;
151 long status = 0;
152 ssize_t written = 0;
153 unsigned int flags = AOP_FLAG_NOFS;
154
155 /*
156 * Copies from kernel address space cannot fail (NFSD is a big user).
157 */
158 if (!iter_is_iovec(i))
159 flags |= AOP_FLAG_UNINTERRUPTIBLE;
160
161 do {
162 struct page *page;
163 unsigned long offset; /* Offset into pagecache page */
164 unsigned long bytes; /* Bytes to write to page */
165 size_t copied; /* Bytes copied from user */
166
167 offset = (pos & (PAGE_SIZE - 1));
168 bytes = min_t(unsigned long, PAGE_SIZE - offset,
169 iov_iter_count(i));
170again:
171 if (bytes > length)
172 bytes = length;
173
174 /*
175 * Bring in the user page that we will copy from _first_.
176 * Otherwise there's a nasty deadlock on copying from the
177 * same page as we're writing to, without it being marked
178 * up-to-date.
179 *
180 * Not only is this an optimisation, but it is also required
181 * to check that the address is actually valid, when atomic
182 * usercopies are used, below.
183 */
184 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
185 status = -EFAULT;
186 break;
187 }
188
189 status = iomap_write_begin(inode, pos, bytes, flags, &page,
190 iomap);
191 if (unlikely(status))
192 break;
193
194 if (mapping_writably_mapped(inode->i_mapping))
195 flush_dcache_page(page);
196
197 pagefault_disable();
198 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
199 pagefault_enable();
200
201 flush_dcache_page(page);
202 mark_page_accessed(page);
203
204 status = iomap_write_end(inode, pos, bytes, copied, page);
205 if (unlikely(status < 0))
206 break;
207 copied = status;
208
209 cond_resched();
210
211 iov_iter_advance(i, copied);
212 if (unlikely(copied == 0)) {
213 /*
214 * If we were unable to copy any data at all, we must
215 * fall back to a single segment length write.
216 *
217 * If we didn't fallback here, we could livelock
218 * because not all segments in the iov can be copied at
219 * once without a pagefault.
220 */
221 bytes = min_t(unsigned long, PAGE_SIZE - offset,
222 iov_iter_single_seg_count(i));
223 goto again;
224 }
225 pos += copied;
226 written += copied;
227 length -= copied;
228
229 balance_dirty_pages_ratelimited(inode->i_mapping);
230 } while (iov_iter_count(i) && length);
231
232 return written ? written : status;
233}
234
235ssize_t
236iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
237 struct iomap_ops *ops)
238{
239 struct inode *inode = iocb->ki_filp->f_mapping->host;
240 loff_t pos = iocb->ki_pos, ret = 0, written = 0;
241
242 while (iov_iter_count(iter)) {
243 ret = iomap_apply(inode, pos, iov_iter_count(iter),
244 IOMAP_WRITE, ops, iter, iomap_write_actor);
245 if (ret <= 0)
246 break;
247 pos += ret;
248 written += ret;
249 }
250
251 return written ? written : ret;
252}
253EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
254
255static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
256 unsigned bytes, struct iomap *iomap)
257{
258 struct page *page;
259 int status;
260
261 status = iomap_write_begin(inode, pos, bytes,
262 AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
263 if (status)
264 return status;
265
266 zero_user(page, offset, bytes);
267 mark_page_accessed(page);
268
269 return iomap_write_end(inode, pos, bytes, bytes, page);
270}
271
272static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
273 struct iomap *iomap)
274{
275 sector_t sector = iomap->blkno +
276 (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
277
278 return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
279}
280
281static loff_t
282iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
283 void *data, struct iomap *iomap)
284{
285 bool *did_zero = data;
286 loff_t written = 0;
287 int status;
288
289 /* already zeroed? we're done. */
290 if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
291 return count;
292
293 do {
294 unsigned offset, bytes;
295
296 offset = pos & (PAGE_SIZE - 1); /* Within page */
297 bytes = min_t(unsigned, PAGE_SIZE - offset, count);
298
299 if (IS_DAX(inode))
300 status = iomap_dax_zero(pos, offset, bytes, iomap);
301 else
302 status = iomap_zero(inode, pos, offset, bytes, iomap);
303 if (status < 0)
304 return status;
305
306 pos += bytes;
307 count -= bytes;
308 written += bytes;
309 if (did_zero)
310 *did_zero = true;
311 } while (count > 0);
312
313 return written;
314}
315
316int
317iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
318 struct iomap_ops *ops)
319{
320 loff_t ret;
321
322 while (len > 0) {
323 ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
324 ops, did_zero, iomap_zero_range_actor);
325 if (ret <= 0)
326 return ret;
327
328 pos += ret;
329 len -= ret;
330 }
331
332 return 0;
333}
334EXPORT_SYMBOL_GPL(iomap_zero_range);
335
336int
337iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
338 struct iomap_ops *ops)
339{
340 unsigned blocksize = (1 << inode->i_blkbits);
341 unsigned off = pos & (blocksize - 1);
342
343 /* Block boundary? Nothing to do */
344 if (!off)
345 return 0;
346 return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
347}
348EXPORT_SYMBOL_GPL(iomap_truncate_page);
349
350static loff_t
351iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
352 void *data, struct iomap *iomap)
353{
354 struct page *page = data;
355 int ret;
356
357 ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
358 NULL, iomap);
359 if (ret)
360 return ret;
361
362 block_commit_write(page, 0, length);
363 return length;
364}
365
366int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
367 struct iomap_ops *ops)
368{
369 struct page *page = vmf->page;
370 struct inode *inode = file_inode(vma->vm_file);
371 unsigned long length;
372 loff_t offset, size;
373 ssize_t ret;
374
375 lock_page(page);
376 size = i_size_read(inode);
377 if ((page->mapping != inode->i_mapping) ||
378 (page_offset(page) > size)) {
379 /* We overload EFAULT to mean page got truncated */
380 ret = -EFAULT;
381 goto out_unlock;
382 }
383
384 /* page is wholly or partially inside EOF */
385 if (((page->index + 1) << PAGE_SHIFT) > size)
386 length = size & ~PAGE_MASK;
387 else
388 length = PAGE_SIZE;
389
390 offset = page_offset(page);
391 while (length > 0) {
392 ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
393 ops, page, iomap_page_mkwrite_actor);
394 if (unlikely(ret <= 0))
395 goto out_unlock;
396 offset += ret;
397 length -= ret;
398 }
399
400 set_page_dirty(page);
401 wait_for_stable_page(page);
402 return 0;
403out_unlock:
404 unlock_page(page);
405 return ret;
406}
407EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
408
409struct fiemap_ctx {
410 struct fiemap_extent_info *fi;
411 struct iomap prev;
412};
413
414static int iomap_to_fiemap(struct fiemap_extent_info *fi,
415 struct iomap *iomap, u32 flags)
416{
417 switch (iomap->type) {
418 case IOMAP_HOLE:
419 /* skip holes */
420 return 0;
421 case IOMAP_DELALLOC:
422 flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
423 break;
424 case IOMAP_UNWRITTEN:
425 flags |= FIEMAP_EXTENT_UNWRITTEN;
426 break;
427 case IOMAP_MAPPED:
428 break;
429 }
430
431 return fiemap_fill_next_extent(fi, iomap->offset,
432 iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
433 iomap->length, flags | FIEMAP_EXTENT_MERGED);
434
435}
436
437static loff_t
438iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
439 struct iomap *iomap)
440{
441 struct fiemap_ctx *ctx = data;
442 loff_t ret = length;
443
444 if (iomap->type == IOMAP_HOLE)
445 return length;
446
447 ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
448 ctx->prev = *iomap;
449 switch (ret) {
450 case 0: /* success */
451 return length;
452 case 1: /* extent array full */
453 return 0;
454 default:
455 return ret;
456 }
457}
458
459int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
460 loff_t start, loff_t len, struct iomap_ops *ops)
461{
462 struct fiemap_ctx ctx;
463 loff_t ret;
464
465 memset(&ctx, 0, sizeof(ctx));
466 ctx.fi = fi;
467 ctx.prev.type = IOMAP_HOLE;
468
469 ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
470 if (ret)
471 return ret;
472
473 ret = filemap_write_and_wait(inode->i_mapping);
474 if (ret)
475 return ret;
476
477 while (len > 0) {
478 ret = iomap_apply(inode, start, len, 0, ops, &ctx,
479 iomap_fiemap_actor);
480 if (ret < 0)
481 return ret;
482 if (ret == 0)
483 break;
484
485 start += ret;
486 len -= ret;
487 }
488
489 if (ctx.prev.type != IOMAP_HOLE) {
490 ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
491 if (ret < 0)
492 return ret;
493 }
494
495 return 0;
496}
497EXPORT_SYMBOL_GPL(iomap_fiemap);
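A filesystem that supplies an iomap_ops table then calls these exported helpers from its own file and inode operations. A minimal sketch follows, again using the invented "foofs" and omitting most error handling; only iomap_file_buffered_write() and iomap_fiemap() and their argument lists are taken from the code above:

/* Illustrative wiring only; foofs and foofs_iomap_ops are hypothetical. */
static ssize_t
foofs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode	*inode = file_inode(iocb->ki_filp);
	ssize_t		ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = iomap_file_buffered_write(iocb, from, &foofs_iomap_ops);
	if (ret > 0)
		iocb->ki_pos += ret;
	inode_unlock(inode);

	return ret;
}

static int
foofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
	     u64 start, u64 len)
{
	/* iomap_fiemap() flushes dirty data and walks the mappings itself. */
	return iomap_fiemap(inode, fieinfo, start, len, &foofs_iomap_ops);
}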
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 31f3df193bdb..ad2c05e80a83 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2014-2016 Christoph Hellwig.
  */
 #include <linux/exportfs.h>
+#include <linux/iomap.h>
 #include <linux/genhd.h>
 #include <linux/slab.h>
 #include <linux/pr.h>
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 6c3b316f932e..4ebaaf4b8d8a 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -3,6 +3,7 @@
  */
 #include <linux/sunrpc/svc.h>
 #include <linux/exportfs.h>
+#include <linux/iomap.h>
 #include <linux/nfs4.h>
 
 #include "nfsd.h"
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 5d47b4df61ea..35faf128f36d 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -4,6 +4,7 @@ config XFS_FS
 	depends on (64BIT || LBDAF)
 	select EXPORTFS
 	select LIBCRC32C
+	select FS_IOMAP
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index a708e38b494c..88c26b827a2d 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -84,7 +84,7 @@ xfs_alloc_lookup_ge(
84 * Lookup the first record less than or equal to [bno, len] 84 * Lookup the first record less than or equal to [bno, len]
85 * in the btree given by cur. 85 * in the btree given by cur.
86 */ 86 */
87int /* error */ 87static int /* error */
88xfs_alloc_lookup_le( 88xfs_alloc_lookup_le(
89 struct xfs_btree_cur *cur, /* btree cursor */ 89 struct xfs_btree_cur *cur, /* btree cursor */
90 xfs_agblock_t bno, /* starting block of extent */ 90 xfs_agblock_t bno, /* starting block of extent */
@@ -1839,19 +1839,8 @@ void
1839xfs_alloc_compute_maxlevels( 1839xfs_alloc_compute_maxlevels(
1840 xfs_mount_t *mp) /* file system mount structure */ 1840 xfs_mount_t *mp) /* file system mount structure */
1841{ 1841{
1842 int level; 1842 mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr,
1843 uint maxblocks; 1843 (mp->m_sb.sb_agblocks + 1) / 2);
1844 uint maxleafents;
1845 int minleafrecs;
1846 int minnoderecs;
1847
1848 maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
1849 minleafrecs = mp->m_alloc_mnr[0];
1850 minnoderecs = mp->m_alloc_mnr[1];
1851 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
1852 for (level = 1; maxblocks > 1; level++)
1853 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
1854 mp->m_ag_maxlevels = level;
1855} 1844}
1856 1845
1857/* 1846/*
@@ -2658,55 +2647,79 @@ error0:
2658 return error; 2647 return error;
2659} 2648}
2660 2649
2661/* 2650/* Ensure that the freelist is at full capacity. */
2662 * Free an extent. 2651int
2663 * Just break up the extent address and hand off to xfs_free_ag_extent 2652xfs_free_extent_fix_freelist(
2664 * after fixing up the freelist. 2653 struct xfs_trans *tp,
2665 */ 2654 xfs_agnumber_t agno,
2666int /* error */ 2655 struct xfs_buf **agbp)
2667xfs_free_extent(
2668 xfs_trans_t *tp, /* transaction pointer */
2669 xfs_fsblock_t bno, /* starting block number of extent */
2670 xfs_extlen_t len) /* length of extent */
2671{ 2656{
2672 xfs_alloc_arg_t args; 2657 struct xfs_alloc_arg args;
2673 int error; 2658 int error;
2674 2659
2675 ASSERT(len != 0); 2660 memset(&args, 0, sizeof(struct xfs_alloc_arg));
2676 memset(&args, 0, sizeof(xfs_alloc_arg_t));
2677 args.tp = tp; 2661 args.tp = tp;
2678 args.mp = tp->t_mountp; 2662 args.mp = tp->t_mountp;
2663 args.agno = agno;
2679 2664
2680 /* 2665 /*
2681 * validate that the block number is legal - the enables us to detect 2666 * validate that the block number is legal - the enables us to detect
2682 * and handle a silent filesystem corruption rather than crashing. 2667 * and handle a silent filesystem corruption rather than crashing.
2683 */ 2668 */
2684 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2685 if (args.agno >= args.mp->m_sb.sb_agcount) 2669 if (args.agno >= args.mp->m_sb.sb_agcount)
2686 return -EFSCORRUPTED; 2670 return -EFSCORRUPTED;
2687 2671
2688 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2689 if (args.agbno >= args.mp->m_sb.sb_agblocks)
2690 return -EFSCORRUPTED;
2691
2692 args.pag = xfs_perag_get(args.mp, args.agno); 2672 args.pag = xfs_perag_get(args.mp, args.agno);
2693 ASSERT(args.pag); 2673 ASSERT(args.pag);
2694 2674
2695 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); 2675 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
2696 if (error) 2676 if (error)
2697 goto error0; 2677 goto out;
2678
2679 *agbp = args.agbp;
2680out:
2681 xfs_perag_put(args.pag);
2682 return error;
2683}
2684
2685/*
2686 * Free an extent.
2687 * Just break up the extent address and hand off to xfs_free_ag_extent
2688 * after fixing up the freelist.
2689 */
2690int /* error */
2691xfs_free_extent(
2692 struct xfs_trans *tp, /* transaction pointer */
2693 xfs_fsblock_t bno, /* starting block number of extent */
2694 xfs_extlen_t len) /* length of extent */
2695{
2696 struct xfs_mount *mp = tp->t_mountp;
2697 struct xfs_buf *agbp;
2698 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
2699 xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
2700 int error;
2701
2702 ASSERT(len != 0);
2703
2704 error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
2705 if (error)
2706 return error;
2707
2708 XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err);
2698 2709
2699 /* validate the extent size is legal now we have the agf locked */ 2710 /* validate the extent size is legal now we have the agf locked */
2700 if (args.agbno + len > 2711 XFS_WANT_CORRUPTED_GOTO(mp,
2701 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { 2712 agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
2702 error = -EFSCORRUPTED; 2713 err);
2703 goto error0;
2704 }
2705 2714
2706 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2715 error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0);
2707 if (!error) 2716 if (error)
2708 xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); 2717 goto err;
2709error0: 2718
2710 xfs_perag_put(args.pag); 2719 xfs_extent_busy_insert(tp, agno, agbno, len, 0);
2720 return 0;
2721
2722err:
2723 xfs_trans_brelse(tp, agbp);
2711 return error; 2724 return error;
2712} 2725}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 135eb3d24db7..cf268b2d0b6c 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -212,13 +212,6 @@ xfs_free_extent(
 	xfs_fsblock_t	bno,	/* starting block number of extent */
 	xfs_extlen_t	len);	/* length of extent */
 
-int				/* error */
-xfs_alloc_lookup_le(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agblock_t		bno,	/* starting block of extent */
-	xfs_extlen_t		len,	/* length of extent */
-	int			*stat);	/* success/failure */
-
 int				/* error */
 xfs_alloc_lookup_ge(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
@@ -236,5 +229,7 @@ xfs_alloc_get_rec(
 int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
 			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
 int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
+int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
+		struct xfs_buf **agbp);
 
 #endif	/* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 882c8d338891..4f2aed04f827 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -50,7 +50,6 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args);
 int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
 int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
 int xfs_attr_shortform_remove(struct xfs_da_args *args);
-int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
 int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
 int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
 void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
@@ -88,8 +87,6 @@ int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
 void xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
 			struct xfs_da_state_blk *drop_blk,
 			struct xfs_da_state_blk *save_blk);
-int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
-
 /*
  * Utility routines.
  */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 932381caef1b..2f2c85cc8117 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -570,14 +570,12 @@ xfs_bmap_validate_ret(
570 */ 570 */
571void 571void
572xfs_bmap_add_free( 572xfs_bmap_add_free(
573 struct xfs_mount *mp, /* mount point structure */
574 struct xfs_bmap_free *flist, /* list of extents */
573 xfs_fsblock_t bno, /* fs block number of extent */ 575 xfs_fsblock_t bno, /* fs block number of extent */
574 xfs_filblks_t len, /* length of extent */ 576 xfs_filblks_t len) /* length of extent */
575 xfs_bmap_free_t *flist, /* list of extents */
576 xfs_mount_t *mp) /* mount point structure */
577{ 577{
578 xfs_bmap_free_item_t *cur; /* current (next) element */ 578 struct xfs_bmap_free_item *new; /* new element */
579 xfs_bmap_free_item_t *new; /* new element */
580 xfs_bmap_free_item_t *prev; /* previous element */
581#ifdef DEBUG 579#ifdef DEBUG
582 xfs_agnumber_t agno; 580 xfs_agnumber_t agno;
583 xfs_agblock_t agbno; 581 xfs_agblock_t agbno;
@@ -597,17 +595,7 @@ xfs_bmap_add_free(
597 new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); 595 new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
598 new->xbfi_startblock = bno; 596 new->xbfi_startblock = bno;
599 new->xbfi_blockcount = (xfs_extlen_t)len; 597 new->xbfi_blockcount = (xfs_extlen_t)len;
600 for (prev = NULL, cur = flist->xbf_first; 598 list_add(&new->xbfi_list, &flist->xbf_flist);
601 cur != NULL;
602 prev = cur, cur = cur->xbfi_next) {
603 if (cur->xbfi_startblock >= bno)
604 break;
605 }
606 if (prev)
607 prev->xbfi_next = new;
608 else
609 flist->xbf_first = new;
610 new->xbfi_next = cur;
611 flist->xbf_count++; 599 flist->xbf_count++;
612} 600}
613 601
@@ -617,14 +605,10 @@ xfs_bmap_add_free(
617 */ 605 */
618void 606void
619xfs_bmap_del_free( 607xfs_bmap_del_free(
620 xfs_bmap_free_t *flist, /* free item list header */ 608 struct xfs_bmap_free *flist, /* free item list header */
621 xfs_bmap_free_item_t *prev, /* previous item on list, if any */ 609 struct xfs_bmap_free_item *free) /* list item to be freed */
622 xfs_bmap_free_item_t *free) /* list item to be freed */
623{ 610{
624 if (prev) 611 list_del(&free->xbfi_list);
625 prev->xbfi_next = free->xbfi_next;
626 else
627 flist->xbf_first = free->xbfi_next;
628 flist->xbf_count--; 612 flist->xbf_count--;
629 kmem_zone_free(xfs_bmap_free_item_zone, free); 613 kmem_zone_free(xfs_bmap_free_item_zone, free);
630} 614}
@@ -634,17 +618,16 @@ xfs_bmap_del_free(
634 */ 618 */
635void 619void
636xfs_bmap_cancel( 620xfs_bmap_cancel(
637 xfs_bmap_free_t *flist) /* list of bmap_free_items */ 621 struct xfs_bmap_free *flist) /* list of bmap_free_items */
638{ 622{
639 xfs_bmap_free_item_t *free; /* free list item */ 623 struct xfs_bmap_free_item *free; /* free list item */
640 xfs_bmap_free_item_t *next;
641 624
642 if (flist->xbf_count == 0) 625 if (flist->xbf_count == 0)
643 return; 626 return;
644 ASSERT(flist->xbf_first != NULL); 627 while (!list_empty(&flist->xbf_flist)) {
645 for (free = flist->xbf_first; free; free = next) { 628 free = list_first_entry(&flist->xbf_flist,
646 next = free->xbfi_next; 629 struct xfs_bmap_free_item, xbfi_list);
647 xfs_bmap_del_free(flist, NULL, free); 630 xfs_bmap_del_free(flist, free);
648 } 631 }
649 ASSERT(flist->xbf_count == 0); 632 ASSERT(flist->xbf_count == 0);
650} 633}
@@ -699,7 +682,7 @@ xfs_bmap_btree_to_extents(
699 cblock = XFS_BUF_TO_BLOCK(cbp); 682 cblock = XFS_BUF_TO_BLOCK(cbp);
700 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) 683 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
701 return error; 684 return error;
702 xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); 685 xfs_bmap_add_free(mp, cur->bc_private.b.flist, cbno, 1);
703 ip->i_d.di_nblocks--; 686 ip->i_d.di_nblocks--;
704 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); 687 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
705 xfs_trans_binval(tp, cbp); 688 xfs_trans_binval(tp, cbp);
@@ -5073,8 +5056,8 @@ xfs_bmap_del_extent(
5073 * If we need to, add to list of extents to delete. 5056 * If we need to, add to list of extents to delete.
5074 */ 5057 */
5075 if (do_fx) 5058 if (do_fx)
5076 xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, 5059 xfs_bmap_add_free(mp, flist, del->br_startblock,
5077 mp); 5060 del->br_blockcount);
5078 /* 5061 /*
5079 * Adjust inode # blocks in the file. 5062 * Adjust inode # blocks in the file.
5080 */ 5063 */
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 423a34e832bd..f1f3ae6c0a3f 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -62,12 +62,12 @@ struct xfs_bmalloca {
  * List of extents to be free "later".
  * The list is kept sorted on xbf_startblock.
  */
-typedef struct xfs_bmap_free_item
+struct xfs_bmap_free_item
 {
 	xfs_fsblock_t		xbfi_startblock;/* starting fs block number */
 	xfs_extlen_t		xbfi_blockcount;/* number of blocks in extent */
-	struct xfs_bmap_free_item *xbfi_next;	/* link to next entry */
-} xfs_bmap_free_item_t;
+	struct list_head	xbfi_list;
+};
 
 /*
  * Header for free extent list.
@@ -85,7 +85,7 @@ typedef struct xfs_bmap_free_item
  */
 typedef	struct xfs_bmap_free
 {
-	xfs_bmap_free_item_t	*xbf_first;	/* list of to-be-free extents */
+	struct list_head	xbf_flist;	/* list of to-be-free extents */
 	int			xbf_count;	/* count of items on list */
 	int			xbf_low;	/* alloc in low mode */
 } xfs_bmap_free_t;
@@ -141,8 +141,10 @@ static inline int xfs_bmapi_aflag(int w)
 
 static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
 {
-	((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
-		(flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK);
+	INIT_LIST_HEAD(&flp->xbf_flist);
+	flp->xbf_count = 0;
+	flp->xbf_low = 0;
+	*fbp = NULLFSBLOCK;
 }
 
 /*
@@ -191,8 +193,8 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
 
 int	xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
 void	xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
-void	xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
-		struct xfs_bmap_free *flist, struct xfs_mount *mp);
+void	xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_bmap_free *flist,
+		xfs_fsblock_t bno, xfs_filblks_t len);
 void	xfs_bmap_cancel(struct xfs_bmap_free *flist);
 int	xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
 		struct xfs_inode *ip);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 6282f6e708af..db0c71e470c9 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -526,7 +526,7 @@ xfs_bmbt_free_block(
 	struct xfs_trans	*tp = cur->bc_tp;
 	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
 
-	xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+	xfs_bmap_add_free(mp, cur->bc_private.b.flist, fsbno, 1);
 	ip->i_d.di_nblocks--;
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 1f88e1ce770f..07eeb0b4ca74 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -543,12 +543,12 @@ xfs_btree_ptr_addr(
543 */ 543 */
544STATIC struct xfs_btree_block * 544STATIC struct xfs_btree_block *
545xfs_btree_get_iroot( 545xfs_btree_get_iroot(
546 struct xfs_btree_cur *cur) 546 struct xfs_btree_cur *cur)
547{ 547{
548 struct xfs_ifork *ifp; 548 struct xfs_ifork *ifp;
549 549
550 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); 550 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
551 return (struct xfs_btree_block *)ifp->if_broot; 551 return (struct xfs_btree_block *)ifp->if_broot;
552} 552}
553 553
554/* 554/*
@@ -4152,3 +4152,22 @@ xfs_btree_sblock_verify(
 
 	return true;
 }
+
+/*
+ * Calculate the number of btree levels needed to store a given number of
+ * records in a short-format btree.
+ */
+uint
+xfs_btree_compute_maxlevels(
+	struct xfs_mount	*mp,
+	uint			*limits,
+	unsigned long		len)
+{
+	uint			level;
+	unsigned long		maxblocks;
+
+	maxblocks = (len + limits[0] - 1) / limits[0];
+	for (level = 1; maxblocks > 1; level++)
+		maxblocks = (maxblocks + limits[1] - 1) / limits[1];
+	return level;
+}
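To get a feel for the computation (with invented limits, not values from this patch): taking limits[0] = 37 minimum records per leaf block and limits[1] = 19 per node block, storing len = 100000 records needs ceil(100000/37) = 2703 leaf blocks, which collapse to 143, then 8, then 1 block at successive levels, so xfs_btree_compute_maxlevels() returns 4.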
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 2e874be70209..785a99682159 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -474,5 +474,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
 
 bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
 bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
+uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
+		unsigned long len);
 
 #endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 097bf7717d80..0f1f165f4048 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -356,7 +356,6 @@ xfs_da3_split(
356 struct xfs_da_state_blk *newblk; 356 struct xfs_da_state_blk *newblk;
357 struct xfs_da_state_blk *addblk; 357 struct xfs_da_state_blk *addblk;
358 struct xfs_da_intnode *node; 358 struct xfs_da_intnode *node;
359 struct xfs_buf *bp;
360 int max; 359 int max;
361 int action = 0; 360 int action = 0;
362 int error; 361 int error;
@@ -397,7 +396,9 @@ xfs_da3_split(
397 break; 396 break;
398 } 397 }
399 /* 398 /*
400 * Entry wouldn't fit, split the leaf again. 399 * Entry wouldn't fit, split the leaf again. The new
400 * extrablk will be consumed by xfs_da3_node_split if
401 * the node is split.
401 */ 402 */
402 state->extravalid = 1; 403 state->extravalid = 1;
403 if (state->inleaf) { 404 if (state->inleaf) {
@@ -446,6 +447,14 @@ xfs_da3_split(
446 return 0; 447 return 0;
447 448
448 /* 449 /*
450 * xfs_da3_node_split() should have consumed any extra blocks we added
451 * during a double leaf split in the attr fork. This is guaranteed as
452 * we can't be here if the attr fork only has a single leaf block.
453 */
454 ASSERT(state->extravalid == 0 ||
455 state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
456
457 /*
449 * Split the root node. 458 * Split the root node.
450 */ 459 */
451 ASSERT(state->path.active == 0); 460 ASSERT(state->path.active == 0);
@@ -457,43 +466,33 @@ xfs_da3_split(
457 } 466 }
458 467
459 /* 468 /*
460 * Update pointers to the node which used to be block 0 and 469 * Update pointers to the node which used to be block 0 and just got
461 * just got bumped because of the addition of a new root node. 470 * bumped because of the addition of a new root node. Note that the
462 * There might be three blocks involved if a double split occurred, 471 * original block 0 could be at any position in the list of blocks in
463 * and the original block 0 could be at any position in the list. 472 * the tree.
464 * 473 *
465 * Note: the magic numbers and sibling pointers are in the same 474 * Note: the magic numbers and sibling pointers are in the same physical
466 * physical place for both v2 and v3 headers (by design). Hence it 475 * place for both v2 and v3 headers (by design). Hence it doesn't matter
467 * doesn't matter which version of the xfs_da_intnode structure we use 476 * which version of the xfs_da_intnode structure we use here as the
468 * here as the result will be the same using either structure. 477 * result will be the same using either structure.
469 */ 478 */
470 node = oldblk->bp->b_addr; 479 node = oldblk->bp->b_addr;
471 if (node->hdr.info.forw) { 480 if (node->hdr.info.forw) {
472 if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { 481 ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
473 bp = addblk->bp; 482 node = addblk->bp->b_addr;
474 } else {
475 ASSERT(state->extravalid);
476 bp = state->extrablk.bp;
477 }
478 node = bp->b_addr;
479 node->hdr.info.back = cpu_to_be32(oldblk->blkno); 483 node->hdr.info.back = cpu_to_be32(oldblk->blkno);
480 xfs_trans_log_buf(state->args->trans, bp, 484 xfs_trans_log_buf(state->args->trans, addblk->bp,
481 XFS_DA_LOGRANGE(node, &node->hdr.info, 485 XFS_DA_LOGRANGE(node, &node->hdr.info,
482 sizeof(node->hdr.info))); 486 sizeof(node->hdr.info)));
483 } 487 }
484 node = oldblk->bp->b_addr; 488 node = oldblk->bp->b_addr;
485 if (node->hdr.info.back) { 489 if (node->hdr.info.back) {
486 if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { 490 ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
487 bp = addblk->bp; 491 node = addblk->bp->b_addr;
488 } else {
489 ASSERT(state->extravalid);
490 bp = state->extrablk.bp;
491 }
492 node = bp->b_addr;
493 node->hdr.info.forw = cpu_to_be32(oldblk->blkno); 492 node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
494 xfs_trans_log_buf(state->args->trans, bp, 493 xfs_trans_log_buf(state->args->trans, addblk->bp,
495 XFS_DA_LOGRANGE(node, &node->hdr.info, 494 XFS_DA_LOGRANGE(node, &node->hdr.info,
496 sizeof(node->hdr.info))); 495 sizeof(node->hdr.info)));
497 } 496 }
498 addblk->bp = NULL; 497 addblk->bp = NULL;
499 return 0; 498 return 0;
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index 9d624a622946..f1e8d4dbb600 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -40,8 +40,7 @@ xfs_dir2_sf_entsize(
40 int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ 40 int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
41 41
42 count += len; /* name */ 42 count += len; /* name */
43 count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : 43 count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
44 sizeof(xfs_dir2_ino4_t); /* ino # */
45 return count; 44 return count;
46} 45}
47 46
@@ -125,33 +124,33 @@ xfs_dir3_sfe_put_ftype(
125static xfs_ino_t 124static xfs_ino_t
126xfs_dir2_sf_get_ino( 125xfs_dir2_sf_get_ino(
127 struct xfs_dir2_sf_hdr *hdr, 126 struct xfs_dir2_sf_hdr *hdr,
128 xfs_dir2_inou_t *from) 127 __uint8_t *from)
129{ 128{
130 if (hdr->i8count) 129 if (hdr->i8count)
131 return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; 130 return get_unaligned_be64(from) & 0x00ffffffffffffffULL;
132 else 131 else
133 return get_unaligned_be32(&from->i4.i); 132 return get_unaligned_be32(from);
134} 133}
135 134
136static void 135static void
137xfs_dir2_sf_put_ino( 136xfs_dir2_sf_put_ino(
138 struct xfs_dir2_sf_hdr *hdr, 137 struct xfs_dir2_sf_hdr *hdr,
139 xfs_dir2_inou_t *to, 138 __uint8_t *to,
140 xfs_ino_t ino) 139 xfs_ino_t ino)
141{ 140{
142 ASSERT((ino & 0xff00000000000000ULL) == 0); 141 ASSERT((ino & 0xff00000000000000ULL) == 0);
143 142
144 if (hdr->i8count) 143 if (hdr->i8count)
145 put_unaligned_be64(ino, &to->i8.i); 144 put_unaligned_be64(ino, to);
146 else 145 else
147 put_unaligned_be32(ino, &to->i4.i); 146 put_unaligned_be32(ino, to);
148} 147}
149 148
150static xfs_ino_t 149static xfs_ino_t
151xfs_dir2_sf_get_parent_ino( 150xfs_dir2_sf_get_parent_ino(
152 struct xfs_dir2_sf_hdr *hdr) 151 struct xfs_dir2_sf_hdr *hdr)
153{ 152{
154 return xfs_dir2_sf_get_ino(hdr, &hdr->parent); 153 return xfs_dir2_sf_get_ino(hdr, hdr->parent);
155} 154}
156 155
157static void 156static void
@@ -159,7 +158,7 @@ xfs_dir2_sf_put_parent_ino(
159 struct xfs_dir2_sf_hdr *hdr, 158 struct xfs_dir2_sf_hdr *hdr,
160 xfs_ino_t ino) 159 xfs_ino_t ino)
161{ 160{
162 xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); 161 xfs_dir2_sf_put_ino(hdr, hdr->parent, ino);
163} 162}
164 163
165/* 164/*
@@ -173,8 +172,7 @@ xfs_dir2_sfe_get_ino(
173 struct xfs_dir2_sf_hdr *hdr, 172 struct xfs_dir2_sf_hdr *hdr,
174 struct xfs_dir2_sf_entry *sfep) 173 struct xfs_dir2_sf_entry *sfep)
175{ 174{
176 return xfs_dir2_sf_get_ino(hdr, 175 return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]);
177 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
178} 176}
179 177
180static void 178static void
@@ -183,8 +181,7 @@ xfs_dir2_sfe_put_ino(
183 struct xfs_dir2_sf_entry *sfep, 181 struct xfs_dir2_sf_entry *sfep,
184 xfs_ino_t ino) 182 xfs_ino_t ino)
185{ 183{
186 xfs_dir2_sf_put_ino(hdr, 184 xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino);
187 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
188} 185}
189 186
190static xfs_ino_t 187static xfs_ino_t
@@ -192,8 +189,7 @@ xfs_dir3_sfe_get_ino(
192 struct xfs_dir2_sf_hdr *hdr, 189 struct xfs_dir2_sf_hdr *hdr,
193 struct xfs_dir2_sf_entry *sfep) 190 struct xfs_dir2_sf_entry *sfep)
194{ 191{
195 return xfs_dir2_sf_get_ino(hdr, 192 return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]);
196 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
197} 193}
198 194
199static void 195static void
@@ -202,8 +198,7 @@ xfs_dir3_sfe_put_ino(
202 struct xfs_dir2_sf_entry *sfep, 198 struct xfs_dir2_sf_entry *sfep,
203 xfs_ino_t ino) 199 xfs_ino_t ino)
204{ 200{
205 xfs_dir2_sf_put_ino(hdr, 201 xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino);
206 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
207} 202}
208 203
209 204
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 8d4d8bce41bf..685f23b67056 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -192,12 +192,6 @@ typedef __uint16_t xfs_dir2_data_off_t;
192typedef uint xfs_dir2_data_aoff_t; /* argument form */ 192typedef uint xfs_dir2_data_aoff_t; /* argument form */
193 193
194/* 194/*
195 * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
196 * Only need 16 bits, this is the byte offset into the single block form.
197 */
198typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
199
200/*
201 * Offset in data space of a data entry. 195 * Offset in data space of a data entry.
202 */ 196 */
203typedef __uint32_t xfs_dir2_dataptr_t; 197typedef __uint32_t xfs_dir2_dataptr_t;
@@ -214,22 +208,10 @@ typedef xfs_off_t xfs_dir2_off_t;
214 */ 208 */
215typedef __uint32_t xfs_dir2_db_t; 209typedef __uint32_t xfs_dir2_db_t;
216 210
217/* 211#define XFS_INO32_SIZE 4
218 * Inode number stored as 8 8-bit values. 212#define XFS_INO64_SIZE 8
219 */ 213#define XFS_INO64_DIFF (XFS_INO64_SIZE - XFS_INO32_SIZE)
220typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
221
222/*
223 * Inode number stored as 4 8-bit values.
224 * Works a lot of the time, when all the inode numbers in a directory
225 * fit in 32 bits.
226 */
227typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
228 214
229typedef union {
230 xfs_dir2_ino8_t i8;
231 xfs_dir2_ino4_t i4;
232} xfs_dir2_inou_t;
233#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) 215#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
234 216
235/* 217/*
@@ -246,39 +228,38 @@ typedef union {
246typedef struct xfs_dir2_sf_hdr { 228typedef struct xfs_dir2_sf_hdr {
247 __uint8_t count; /* count of entries */ 229 __uint8_t count; /* count of entries */
248 __uint8_t i8count; /* count of 8-byte inode #s */ 230 __uint8_t i8count; /* count of 8-byte inode #s */
249 xfs_dir2_inou_t parent; /* parent dir inode number */ 231 __uint8_t parent[8]; /* parent dir inode number */
250} __arch_pack xfs_dir2_sf_hdr_t; 232} __packed xfs_dir2_sf_hdr_t;
251 233
252typedef struct xfs_dir2_sf_entry { 234typedef struct xfs_dir2_sf_entry {
253 __u8 namelen; /* actual name length */ 235 __u8 namelen; /* actual name length */
254 xfs_dir2_sf_off_t offset; /* saved offset */ 236 __u8 offset[2]; /* saved offset */
255 __u8 name[]; /* name, variable size */ 237 __u8 name[]; /* name, variable size */
256 /* 238 /*
257 * A single byte containing the file type field follows the inode 239 * A single byte containing the file type field follows the inode
258 * number for version 3 directory entries. 240 * number for version 3 directory entries.
259 * 241 *
260 * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a 242 * A 64-bit or 32-bit inode number follows here, at a variable offset
261 * variable offset after the name. 243 * after the name.
262 */ 244 */
263} __arch_pack xfs_dir2_sf_entry_t; 245} xfs_dir2_sf_entry_t;
264 246
265static inline int xfs_dir2_sf_hdr_size(int i8count) 247static inline int xfs_dir2_sf_hdr_size(int i8count)
266{ 248{
267 return sizeof(struct xfs_dir2_sf_hdr) - 249 return sizeof(struct xfs_dir2_sf_hdr) -
268 (i8count == 0) * 250 (i8count == 0) * XFS_INO64_DIFF;
269 (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
270} 251}
271 252
272static inline xfs_dir2_data_aoff_t 253static inline xfs_dir2_data_aoff_t
273xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) 254xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
274{ 255{
275 return get_unaligned_be16(&sfep->offset.i); 256 return get_unaligned_be16(sfep->offset);
276} 257}
277 258
278static inline void 259static inline void
279xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) 260xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
280{ 261{
281 put_unaligned_be16(off, &sfep->offset.i); 262 put_unaligned_be16(off, sfep->offset);
282} 263}
283 264
284static inline struct xfs_dir2_sf_entry * 265static inline struct xfs_dir2_sf_entry *
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index e5bb9cc3b243..c6809ff41197 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -126,13 +126,12 @@ xfs_dir2_block_sfsize(
126 /* 126 /*
127 * Calculate the new size, see if we should give up yet. 127 * Calculate the new size, see if we should give up yet.
128 */ 128 */
129 size = xfs_dir2_sf_hdr_size(i8count) + /* header */ 129 size = xfs_dir2_sf_hdr_size(i8count) + /* header */
130 count + /* namelen */ 130 count * 3 * sizeof(u8) + /* namelen + offset */
131 count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ 131 namelen + /* name */
132 namelen + /* name */ 132 (i8count ? /* inumber */
133 (i8count ? /* inumber */ 133 count * XFS_INO64_SIZE :
134 (uint)sizeof(xfs_dir2_ino8_t) * count : 134 count * XFS_INO32_SIZE);
135 (uint)sizeof(xfs_dir2_ino4_t) * count);
136 if (size > XFS_IFORK_DSIZE(dp)) 135 if (size > XFS_IFORK_DSIZE(dp))
137 return size; /* size value is a failure */ 136 return size; /* size value is a failure */
138 } 137 }
@@ -319,10 +318,7 @@ xfs_dir2_sf_addname(
319 /* 318 /*
320 * Yes, adjust the inode size. old count + (parent + new) 319 * Yes, adjust the inode size. old count + (parent + new)
321 */ 320 */
322 incr_isize += 321 incr_isize += (sfp->count + 2) * XFS_INO64_DIFF;
323 (sfp->count + 2) *
324 ((uint)sizeof(xfs_dir2_ino8_t) -
325 (uint)sizeof(xfs_dir2_ino4_t));
326 objchange = 1; 322 objchange = 1;
327 } 323 }
328 324
@@ -897,11 +893,7 @@ xfs_dir2_sf_replace(
897 int error; /* error return value */ 893 int error; /* error return value */
898 int newsize; /* new inode size */ 894 int newsize; /* new inode size */
899 895
900 newsize = 896 newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
901 dp->i_df.if_bytes +
902 (sfp->count + 1) *
903 ((uint)sizeof(xfs_dir2_ino8_t) -
904 (uint)sizeof(xfs_dir2_ino4_t));
905 /* 897 /*
906 * Won't fit as shortform, convert to block then do replace. 898 * Won't fit as shortform, convert to block then do replace.
907 */ 899 */
@@ -1022,10 +1014,7 @@ xfs_dir2_sf_toino4(
1022 /* 1014 /*
1023 * Compute the new inode size. 1015 * Compute the new inode size.
1024 */ 1016 */
1025 newsize = 1017 newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF;
1026 oldsize -
1027 (oldsfp->count + 1) *
1028 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
1029 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); 1018 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
1030 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); 1019 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
1031 /* 1020 /*
@@ -1048,7 +1037,7 @@ xfs_dir2_sf_toino4(
1048 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), 1037 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
1049 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { 1038 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
1050 sfep->namelen = oldsfep->namelen; 1039 sfep->namelen = oldsfep->namelen;
1051 sfep->offset = oldsfep->offset; 1040 memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
1052 memcpy(sfep->name, oldsfep->name, sfep->namelen); 1041 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1053 dp->d_ops->sf_put_ino(sfp, sfep, 1042 dp->d_ops->sf_put_ino(sfp, sfep,
1054 dp->d_ops->sf_get_ino(oldsfp, oldsfep)); 1043 dp->d_ops->sf_get_ino(oldsfp, oldsfep));
@@ -1098,10 +1087,7 @@ xfs_dir2_sf_toino8(
1098 /* 1087 /*
1099 * Compute the new inode size (nb: entry count + 1 for parent) 1088 * Compute the new inode size (nb: entry count + 1 for parent)
1100 */ 1089 */
1101 newsize = 1090 newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF;
1102 oldsize +
1103 (oldsfp->count + 1) *
1104 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
1105 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); 1091 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
1106 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); 1092 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
1107 /* 1093 /*
@@ -1124,7 +1110,7 @@ xfs_dir2_sf_toino8(
1124 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), 1110 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
1125 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { 1111 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
1126 sfep->namelen = oldsfep->namelen; 1112 sfep->namelen = oldsfep->namelen;
1127 sfep->offset = oldsfep->offset; 1113 memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
1128 memcpy(sfep->name, oldsfep->name, sfep->namelen); 1114 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1129 dp->d_ops->sf_put_ino(sfp, sfep, 1115 dp->d_ops->sf_put_ino(sfp, sfep,
1130 dp->d_ops->sf_get_ino(oldsfp, oldsfep)); 1116 dp->d_ops->sf_get_ino(oldsfp, oldsfep));
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index dc97eb21af07..adb204d40f22 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1435,41 +1435,57 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
1435 * with the crc feature bit, and all accesses to them must be conditional on 1435 * with the crc feature bit, and all accesses to them must be conditional on
1436 * that flag. 1436 * that flag.
1437 */ 1437 */
1438/* short form block header */
1439struct xfs_btree_block_shdr {
1440 __be32 bb_leftsib;
1441 __be32 bb_rightsib;
1442
1443 __be64 bb_blkno;
1444 __be64 bb_lsn;
1445 uuid_t bb_uuid;
1446 __be32 bb_owner;
1447 __le32 bb_crc;
1448};
1449
1450/* long form block header */
1451struct xfs_btree_block_lhdr {
1452 __be64 bb_leftsib;
1453 __be64 bb_rightsib;
1454
1455 __be64 bb_blkno;
1456 __be64 bb_lsn;
1457 uuid_t bb_uuid;
1458 __be64 bb_owner;
1459 __le32 bb_crc;
1460 __be32 bb_pad; /* padding for alignment */
1461};
1462
1438struct xfs_btree_block { 1463struct xfs_btree_block {
1439 __be32 bb_magic; /* magic number for block type */ 1464 __be32 bb_magic; /* magic number for block type */
1440 __be16 bb_level; /* 0 is a leaf */ 1465 __be16 bb_level; /* 0 is a leaf */
1441 __be16 bb_numrecs; /* current # of data records */ 1466 __be16 bb_numrecs; /* current # of data records */
1442 union { 1467 union {
1443 struct { 1468 struct xfs_btree_block_shdr s;
1444 __be32 bb_leftsib; 1469 struct xfs_btree_block_lhdr l;
1445 __be32 bb_rightsib;
1446
1447 __be64 bb_blkno;
1448 __be64 bb_lsn;
1449 uuid_t bb_uuid;
1450 __be32 bb_owner;
1451 __le32 bb_crc;
1452 } s; /* short form pointers */
1453 struct {
1454 __be64 bb_leftsib;
1455 __be64 bb_rightsib;
1456
1457 __be64 bb_blkno;
1458 __be64 bb_lsn;
1459 uuid_t bb_uuid;
1460 __be64 bb_owner;
1461 __le32 bb_crc;
1462 __be32 bb_pad; /* padding for alignment */
1463 } l; /* long form pointers */
1464 } bb_u; /* rest */ 1470 } bb_u; /* rest */
1465}; 1471};
1466 1472
1467#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ 1473/* size of a short form block */
1468#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ 1474#define XFS_BTREE_SBLOCK_LEN \
1475 (offsetof(struct xfs_btree_block, bb_u) + \
1476 offsetof(struct xfs_btree_block_shdr, bb_blkno))
1477/* size of a long form block */
1478#define XFS_BTREE_LBLOCK_LEN \
1479 (offsetof(struct xfs_btree_block, bb_u) + \
1480 offsetof(struct xfs_btree_block_lhdr, bb_blkno))
1469 1481
1470/* sizes of CRC enabled btree blocks */ 1482/* sizes of CRC enabled btree blocks */
1471#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) 1483#define XFS_BTREE_SBLOCK_CRC_LEN \
1472#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) 1484 (offsetof(struct xfs_btree_block, bb_u) + \
1485 sizeof(struct xfs_btree_block_shdr))
1486#define XFS_BTREE_LBLOCK_CRC_LEN \
1487 (offsetof(struct xfs_btree_block, bb_u) + \
1488 sizeof(struct xfs_btree_block_lhdr))
1473 1489
1474#define XFS_BTREE_SBLOCK_CRC_OFF \ 1490#define XFS_BTREE_SBLOCK_CRC_OFF \
1475 offsetof(struct xfs_btree_block, bb_u.s.bb_crc) 1491 offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
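
The btree block header split above turns the hard-coded 16/24-byte block lengths (and their +40/+48 CRC variants) into offsetof/sizeof expressions over the new xfs_btree_block_shdr and xfs_btree_block_lhdr structs. A stand-alone sketch that checks the two formulations agree (not part of the patch; __beXX/__leXX are stood in by fixed-width integers, uuid_t by a 16-byte array, and a typical LP64 ABI with natural alignment is assumed):

	#include <assert.h>
	#include <stddef.h>
	#include <stdint.h>

	typedef uint8_t uuid_t[16];

	struct xfs_btree_block_shdr {
		uint32_t bb_leftsib, bb_rightsib;
		uint64_t bb_blkno, bb_lsn;
		uuid_t	 bb_uuid;
		uint32_t bb_owner, bb_crc;
	};

	struct xfs_btree_block_lhdr {
		uint64_t bb_leftsib, bb_rightsib;
		uint64_t bb_blkno, bb_lsn;
		uuid_t	 bb_uuid;
		uint64_t bb_owner;
		uint32_t bb_crc, bb_pad;
	};

	struct xfs_btree_block {
		uint32_t bb_magic;
		uint16_t bb_level, bb_numrecs;
		union {
			struct xfs_btree_block_shdr s;
			struct xfs_btree_block_lhdr l;
		} bb_u;
	};

	int main(void)
	{
		size_t hdr = offsetof(struct xfs_btree_block, bb_u);

		/* non-CRC lengths stop at bb_blkno, matching the old 16/24 constants */
		assert(hdr + offsetof(struct xfs_btree_block_shdr, bb_blkno) == 16);
		assert(hdr + offsetof(struct xfs_btree_block_lhdr, bb_blkno) == 24);

		/* CRC-enabled lengths cover the whole header: old 16+40 and 24+48 */
		assert(hdr + sizeof(struct xfs_btree_block_shdr) == 56);
		assert(hdr + sizeof(struct xfs_btree_block_lhdr) == 72);
		return 0;
	}
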
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index fffe3d01bd9f..f5ec9c5ccae6 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -521,12 +521,8 @@ typedef struct xfs_swapext
521#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) 521#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
522/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ 522/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
523 523
524/* XFS_IOC_FREEZE -- FIFREEZE 119 */ 524#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */
525/* XFS_IOC_THAW -- FITHAW 120 */ 525#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */
526#ifndef FIFREEZE
527#define XFS_IOC_FREEZE _IOWR('X', 119, int)
528#define XFS_IOC_THAW _IOWR('X', 120, int)
529#endif
530 526
531#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) 527#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
532#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) 528#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 22297f9b0fd5..4b1e408169a8 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1828,9 +1828,8 @@ xfs_difree_inode_chunk(
1828 1828
1829 if (!xfs_inobt_issparse(rec->ir_holemask)) { 1829 if (!xfs_inobt_issparse(rec->ir_holemask)) {
1830 /* not sparse, calculate extent info directly */ 1830 /* not sparse, calculate extent info directly */
1831 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, 1831 xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, sagbno),
1832 XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)), 1832 mp->m_ialloc_blks);
1833 mp->m_ialloc_blks, flist, mp);
1834 return; 1833 return;
1835 } 1834 }
1836 1835
@@ -1873,8 +1872,8 @@ xfs_difree_inode_chunk(
1873 1872
1874 ASSERT(agbno % mp->m_sb.sb_spino_align == 0); 1873 ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
1875 ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); 1874 ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
1876 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, 1875 xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, agbno),
1877 flist, mp); 1876 contigblk);
1878 1877
1879 /* reset range to current bit and carry on... */ 1878 /* reset range to current bit and carry on... */
1880 startidx = endidx = nextbit; 1879 startidx = endidx = nextbit;
@@ -2395,20 +2394,11 @@ void
2395xfs_ialloc_compute_maxlevels( 2394xfs_ialloc_compute_maxlevels(
2396 xfs_mount_t *mp) /* file system mount structure */ 2395 xfs_mount_t *mp) /* file system mount structure */
2397{ 2396{
2398 int level; 2397 uint inodes;
2399 uint maxblocks; 2398
2400 uint maxleafents; 2399 inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
2401 int minleafrecs; 2400 mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr,
2402 int minnoderecs; 2401 inodes);
2403
2404 maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
2405 XFS_INODES_PER_CHUNK_LOG;
2406 minleafrecs = mp->m_inobt_mnr[0];
2407 minnoderecs = mp->m_inobt_mnr[1];
2408 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
2409 for (level = 1; maxblocks > 1; level++)
2410 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
2411 mp->m_in_maxlevels = level;
2412} 2402}
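
The open-coded loop removed above is the usual "how many btree levels can this many records need" calculation; the new call defers it to xfs_btree_compute_maxlevels(), which is expected to perform the same arithmetic. A sketch of that computation, lifted from the deleted lines (not part of the patch; the function name is illustrative):

	/* ceil-divide the leaf entries by min records per leaf, then keep
	 * ceil-dividing by min records per node until one block remains */
	unsigned int max_btree_levels(unsigned long long maxleafents,
				      unsigned int minleafrecs,
				      unsigned int minnoderecs)
	{
		unsigned long long maxblocks;
		unsigned int level;

		maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
		for (level = 1; maxblocks > 1; level++)
			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
		return level;
	}
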
2413 2403
2414/* 2404/*
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 951c044e24e4..e2e1106c9fad 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
70 * Get a buffer for the bitmap or summary file block specified. 70 * Get a buffer for the bitmap or summary file block specified.
71 * The buffer is returned read and locked. 71 * The buffer is returned read and locked.
72 */ 72 */
73int 73static int
74xfs_rtbuf_get( 74xfs_rtbuf_get(
75 xfs_mount_t *mp, /* file system mount structure */ 75 xfs_mount_t *mp, /* file system mount structure */
76 xfs_trans_t *tp, /* transaction pointer */ 76 xfs_trans_t *tp, /* transaction pointer */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 87d2b215cbbd..7575cfc3ad15 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -87,6 +87,12 @@ xfs_find_bdev_for_inode(
87 * We're now finished for good with this page. Update the page state via the 87 * We're now finished for good with this page. Update the page state via the
88 * associated buffer_heads, paying attention to the start and end offsets that 88 * associated buffer_heads, paying attention to the start and end offsets that
89 * we need to process on the page. 89 * we need to process on the page.
90 *
91 * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
92 * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
93 * the page at all, as we may be racing with memory reclaim and it can free both
94 * the bufferhead chain and the page as it will see the page as clean and
95 * unused.
90 */ 96 */
91static void 97static void
92xfs_finish_page_writeback( 98xfs_finish_page_writeback(
@@ -95,8 +101,9 @@ xfs_finish_page_writeback(
95 int error) 101 int error)
96{ 102{
97 unsigned int end = bvec->bv_offset + bvec->bv_len - 1; 103 unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
98 struct buffer_head *head, *bh; 104 struct buffer_head *head, *bh, *next;
99 unsigned int off = 0; 105 unsigned int off = 0;
106 unsigned int bsize;
100 107
101 ASSERT(bvec->bv_offset < PAGE_SIZE); 108 ASSERT(bvec->bv_offset < PAGE_SIZE);
102 ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0); 109 ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
@@ -105,15 +112,17 @@ xfs_finish_page_writeback(
105 112
106 bh = head = page_buffers(bvec->bv_page); 113 bh = head = page_buffers(bvec->bv_page);
107 114
115 bsize = bh->b_size;
108 do { 116 do {
117 next = bh->b_this_page;
109 if (off < bvec->bv_offset) 118 if (off < bvec->bv_offset)
110 goto next_bh; 119 goto next_bh;
111 if (off > end) 120 if (off > end)
112 break; 121 break;
113 bh->b_end_io(bh, !error); 122 bh->b_end_io(bh, !error);
114next_bh: 123next_bh:
115 off += bh->b_size; 124 off += bsize;
116 } while ((bh = bh->b_this_page) != head); 125 } while ((bh = next) != head);
117} 126}
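
The "landmine" fix above boils down to a general rule for walking a chain whose completion callback may free the nodes (and, on the last buffer, the page holding them): cache the forward pointer and any other field you still need before invoking the callback. A reduced sketch of the pattern (not part of the patch; struct and function names are illustrative):

	struct chain_node {
		struct chain_node *next;
		void (*end_io)(struct chain_node *node);
	};

	void complete_chain(struct chain_node *head)
	{
		struct chain_node *node = head, *next;

		do {
			next = node->next;	/* node may be freed by end_io() */
			node->end_io(node);
		} while ((node = next) != head);
	}

As in the hunk above, the final comparison only uses the saved pointer value and never dereferences a possibly freed node.
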
118 127
119/* 128/*
@@ -1041,6 +1050,20 @@ xfs_vm_releasepage(
1041 1050
1042 trace_xfs_releasepage(page->mapping->host, page, 0, 0); 1051 trace_xfs_releasepage(page->mapping->host, page, 0, 0);
1043 1052
1053 /*
1054 * mm accommodates an old ext3 case where clean pages might not have had
1055 * the dirty bit cleared. Thus, it can send actual dirty pages to
1056 * ->releasepage() via shrink_active_list(). Conversely,
1057 * block_invalidatepage() can send pages that are still marked dirty
1058 * but otherwise have invalidated buffers.
1059 *
1060 * We've historically freed buffers on the latter. Instead, quietly
1061 * filter out all dirty pages to avoid spurious buffer state warnings.
1062 * This can likely be removed once shrink_active_list() is fixed.
1063 */
1064 if (PageDirty(page))
1065 return 0;
1066
1044 xfs_count_page_state(page, &delalloc, &unwritten); 1067 xfs_count_page_state(page, &delalloc, &unwritten);
1045 1068
1046 if (WARN_ON_ONCE(delalloc)) 1069 if (WARN_ON_ONCE(delalloc))
@@ -1144,6 +1167,8 @@ __xfs_get_blocks(
1144 ssize_t size; 1167 ssize_t size;
1145 int new = 0; 1168 int new = 0;
1146 1169
1170 BUG_ON(create && !direct);
1171
1147 if (XFS_FORCED_SHUTDOWN(mp)) 1172 if (XFS_FORCED_SHUTDOWN(mp))
1148 return -EIO; 1173 return -EIO;
1149 1174
@@ -1151,22 +1176,14 @@ __xfs_get_blocks(
1151 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1176 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1152 size = bh_result->b_size; 1177 size = bh_result->b_size;
1153 1178
1154 if (!create && direct && offset >= i_size_read(inode)) 1179 if (!create && offset >= i_size_read(inode))
1155 return 0; 1180 return 0;
1156 1181
1157 /* 1182 /*
1158 * Direct I/O is usually done on preallocated files, so try getting 1183 * Direct I/O is usually done on preallocated files, so try getting
1159 * a block mapping without an exclusive lock first. For buffered 1184 * a block mapping without an exclusive lock first.
1160 * writes we already have the exclusive iolock anyway, so avoiding
1161 * a lock roundtrip here by taking the ilock exclusive from the
1162 * beginning is a useful micro optimization.
1163 */ 1185 */
1164 if (create && !direct) { 1186 lockmode = xfs_ilock_data_map_shared(ip);
1165 lockmode = XFS_ILOCK_EXCL;
1166 xfs_ilock(ip, lockmode);
1167 } else {
1168 lockmode = xfs_ilock_data_map_shared(ip);
1169 }
1170 1187
1171 ASSERT(offset <= mp->m_super->s_maxbytes); 1188 ASSERT(offset <= mp->m_super->s_maxbytes);
1172 if (offset + size > mp->m_super->s_maxbytes) 1189 if (offset + size > mp->m_super->s_maxbytes)
@@ -1185,37 +1202,19 @@ __xfs_get_blocks(
1185 (imap.br_startblock == HOLESTARTBLOCK || 1202 (imap.br_startblock == HOLESTARTBLOCK ||
1186 imap.br_startblock == DELAYSTARTBLOCK) || 1203 imap.br_startblock == DELAYSTARTBLOCK) ||
1187 (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { 1204 (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
1188 if (direct || xfs_get_extsz_hint(ip)) { 1205 /*
1189 /* 1206 * xfs_iomap_write_direct() expects the shared lock. It
1190 * xfs_iomap_write_direct() expects the shared lock. It 1207 * is unlocked on return.
1191 * is unlocked on return. 1208 */
1192 */ 1209 if (lockmode == XFS_ILOCK_EXCL)
1193 if (lockmode == XFS_ILOCK_EXCL) 1210 xfs_ilock_demote(ip, lockmode);
1194 xfs_ilock_demote(ip, lockmode);
1195
1196 error = xfs_iomap_write_direct(ip, offset, size,
1197 &imap, nimaps);
1198 if (error)
1199 return error;
1200 new = 1;
1201 1211
1202 } else { 1212 error = xfs_iomap_write_direct(ip, offset, size,
1203 /* 1213 &imap, nimaps);
1204 * Delalloc reservations do not require a transaction, 1214 if (error)
1205 * we can go on without dropping the lock here. If we 1215 return error;
1206 * are allocating a new delalloc block, make sure that 1216 new = 1;
1207 * we set the new flag so that we mark the buffer new so
1208 * that we know that it is newly allocated if the write
1209 * fails.
1210 */
1211 if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
1212 new = 1;
1213 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1214 if (error)
1215 goto out_unlock;
1216 1217
1217 xfs_iunlock(ip, lockmode);
1218 }
1219 trace_xfs_get_blocks_alloc(ip, offset, size, 1218 trace_xfs_get_blocks_alloc(ip, offset, size,
1220 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN 1219 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1221 : XFS_IO_DELALLOC, &imap); 1220 : XFS_IO_DELALLOC, &imap);
@@ -1236,9 +1235,7 @@ __xfs_get_blocks(
1236 } 1235 }
1237 1236
1238 /* trim mapping down to size requested */ 1237 /* trim mapping down to size requested */
1239 if (direct || size > (1 << inode->i_blkbits)) 1238 xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
1240 xfs_map_trim_size(inode, iblock, bh_result,
1241 &imap, offset, size);
1242 1239
1243 /* 1240 /*
1244 * For unwritten extents do not report a disk address in the buffered 1241 * For unwritten extents do not report a disk address in the buffered
@@ -1251,7 +1248,7 @@ __xfs_get_blocks(
1251 if (ISUNWRITTEN(&imap)) 1248 if (ISUNWRITTEN(&imap))
1252 set_buffer_unwritten(bh_result); 1249 set_buffer_unwritten(bh_result);
1253 /* direct IO needs special help */ 1250 /* direct IO needs special help */
1254 if (create && direct) { 1251 if (create) {
1255 if (dax_fault) 1252 if (dax_fault)
1256 ASSERT(!ISUNWRITTEN(&imap)); 1253 ASSERT(!ISUNWRITTEN(&imap));
1257 else 1254 else
@@ -1280,14 +1277,7 @@ __xfs_get_blocks(
1280 (new || ISUNWRITTEN(&imap)))) 1277 (new || ISUNWRITTEN(&imap))))
1281 set_buffer_new(bh_result); 1278 set_buffer_new(bh_result);
1282 1279
1283 if (imap.br_startblock == DELAYSTARTBLOCK) { 1280 BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
1284 BUG_ON(direct);
1285 if (create) {
1286 set_buffer_uptodate(bh_result);
1287 set_buffer_mapped(bh_result);
1288 set_buffer_delay(bh_result);
1289 }
1290 }
1291 1281
1292 return 0; 1282 return 0;
1293 1283
@@ -1337,7 +1327,7 @@ xfs_get_blocks_dax_fault(
1337 * whereas if we have flags set we will always be called in task context 1327 * whereas if we have flags set we will always be called in task context
1338 * (i.e. from a workqueue). 1328 * (i.e. from a workqueue).
1339 */ 1329 */
1340STATIC int 1330int
1341xfs_end_io_direct_write( 1331xfs_end_io_direct_write(
1342 struct kiocb *iocb, 1332 struct kiocb *iocb,
1343 loff_t offset, 1333 loff_t offset,
@@ -1408,234 +1398,10 @@ xfs_vm_direct_IO(
1408 struct kiocb *iocb, 1398 struct kiocb *iocb,
1409 struct iov_iter *iter) 1399 struct iov_iter *iter)
1410{ 1400{
1411 struct inode *inode = iocb->ki_filp->f_mapping->host;
1412 dio_iodone_t *endio = NULL;
1413 int flags = 0;
1414 struct block_device *bdev;
1415
1416 if (iov_iter_rw(iter) == WRITE) {
1417 endio = xfs_end_io_direct_write;
1418 flags = DIO_ASYNC_EXTEND;
1419 }
1420
1421 if (IS_DAX(inode)) {
1422 return dax_do_io(iocb, inode, iter,
1423 xfs_get_blocks_direct, endio, 0);
1424 }
1425
1426 bdev = xfs_find_bdev_for_inode(inode);
1427 return __blockdev_direct_IO(iocb, inode, bdev, iter,
1428 xfs_get_blocks_direct, endio, NULL, flags);
1429}
1430
1431/*
1432 * Punch out the delalloc blocks we have already allocated.
1433 *
1434 * Don't bother with xfs_setattr given that nothing can have made it to disk yet
1435 * as the page is still locked at this point.
1436 */
1437STATIC void
1438xfs_vm_kill_delalloc_range(
1439 struct inode *inode,
1440 loff_t start,
1441 loff_t end)
1442{
1443 struct xfs_inode *ip = XFS_I(inode);
1444 xfs_fileoff_t start_fsb;
1445 xfs_fileoff_t end_fsb;
1446 int error;
1447
1448 start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
1449 end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
1450 if (end_fsb <= start_fsb)
1451 return;
1452
1453 xfs_ilock(ip, XFS_ILOCK_EXCL);
1454 error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1455 end_fsb - start_fsb);
1456 if (error) {
1457 /* something screwed, just bail */
1458 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1459 xfs_alert(ip->i_mount,
1460 "xfs_vm_write_failed: unable to clean up ino %lld",
1461 ip->i_ino);
1462 }
1463 }
1464 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1465}
1466
1467STATIC void
1468xfs_vm_write_failed(
1469 struct inode *inode,
1470 struct page *page,
1471 loff_t pos,
1472 unsigned len)
1473{
1474 loff_t block_offset;
1475 loff_t block_start;
1476 loff_t block_end;
1477 loff_t from = pos & (PAGE_SIZE - 1);
1478 loff_t to = from + len;
1479 struct buffer_head *bh, *head;
1480 struct xfs_mount *mp = XFS_I(inode)->i_mount;
1481
1482 /* 1401 /*
1483 * The request pos offset might be 32 or 64 bit, this is all fine 1402 * We just need the method present so that open/fcntl allow direct I/O.
1484 * on 64-bit platform. However, for 64-bit pos request on 32-bit
1485 * platform, the high 32-bit will be masked off if we evaluate the
1486 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
1487 * 0xfffff000 as an unsigned long, hence the result is incorrect
1488 * which could cause the following ASSERT failed in most cases.
1489 * In order to avoid this, we can evaluate the block_offset of the
1490 * start of the page by using shifts rather than masks the mismatch
1491 * problem.
1492 */ 1403 */
1493 block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT; 1404 return -EINVAL;
1494
1495 ASSERT(block_offset + from == pos);
1496
1497 head = page_buffers(page);
1498 block_start = 0;
1499 for (bh = head; bh != head || !block_start;
1500 bh = bh->b_this_page, block_start = block_end,
1501 block_offset += bh->b_size) {
1502 block_end = block_start + bh->b_size;
1503
1504 /* skip buffers before the write */
1505 if (block_end <= from)
1506 continue;
1507
1508 /* if the buffer is after the write, we're done */
1509 if (block_start >= to)
1510 break;
1511
1512 /*
1513 * Process delalloc and unwritten buffers beyond EOF. We can
1514 * encounter unwritten buffers in the event that a file has
1515 * post-EOF unwritten extents and an extending write happens to
1516 * fail (e.g., an unaligned write that also involves a delalloc
1517 * to the same page).
1518 */
1519 if (!buffer_delay(bh) && !buffer_unwritten(bh))
1520 continue;
1521
1522 if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
1523 block_offset < i_size_read(inode))
1524 continue;
1525
1526 if (buffer_delay(bh))
1527 xfs_vm_kill_delalloc_range(inode, block_offset,
1528 block_offset + bh->b_size);
1529
1530 /*
1531 * This buffer does not contain data anymore. make sure anyone
1532 * who finds it knows that for certain.
1533 */
1534 clear_buffer_delay(bh);
1535 clear_buffer_uptodate(bh);
1536 clear_buffer_mapped(bh);
1537 clear_buffer_new(bh);
1538 clear_buffer_dirty(bh);
1539 clear_buffer_unwritten(bh);
1540 }
1541
1542}
1543
1544/*
1545 * This used to call block_write_begin(), but it unlocks and releases the page
1546 * on error, and we need that page to be able to punch stale delalloc blocks out
1547 * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
1548 * the appropriate point.
1549 */
1550STATIC int
1551xfs_vm_write_begin(
1552 struct file *file,
1553 struct address_space *mapping,
1554 loff_t pos,
1555 unsigned len,
1556 unsigned flags,
1557 struct page **pagep,
1558 void **fsdata)
1559{
1560 pgoff_t index = pos >> PAGE_SHIFT;
1561 struct page *page;
1562 int status;
1563 struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
1564
1565 ASSERT(len <= PAGE_SIZE);
1566
1567 page = grab_cache_page_write_begin(mapping, index, flags);
1568 if (!page)
1569 return -ENOMEM;
1570
1571 status = __block_write_begin(page, pos, len, xfs_get_blocks);
1572 if (xfs_mp_fail_writes(mp))
1573 status = -EIO;
1574 if (unlikely(status)) {
1575 struct inode *inode = mapping->host;
1576 size_t isize = i_size_read(inode);
1577
1578 xfs_vm_write_failed(inode, page, pos, len);
1579 unlock_page(page);
1580
1581 /*
1582 * If the write is beyond EOF, we only want to kill blocks
1583 * allocated in this write, not blocks that were previously
1584 * written successfully.
1585 */
1586 if (xfs_mp_fail_writes(mp))
1587 isize = 0;
1588 if (pos + len > isize) {
1589 ssize_t start = max_t(ssize_t, pos, isize);
1590
1591 truncate_pagecache_range(inode, start, pos + len);
1592 }
1593
1594 put_page(page);
1595 page = NULL;
1596 }
1597
1598 *pagep = page;
1599 return status;
1600}
1601
1602/*
1603 * On failure, we only need to kill delalloc blocks beyond EOF in the range of
1604 * this specific write because they will never be written. Previous writes
1605 * beyond EOF where block allocation succeeded do not need to be trashed, so
1606 * only new blocks from this write should be trashed. For blocks within
1607 * EOF, generic_write_end() zeros them so they are safe to leave alone and be
1608 * written with all the other valid data.
1609 */
1610STATIC int
1611xfs_vm_write_end(
1612 struct file *file,
1613 struct address_space *mapping,
1614 loff_t pos,
1615 unsigned len,
1616 unsigned copied,
1617 struct page *page,
1618 void *fsdata)
1619{
1620 int ret;
1621
1622 ASSERT(len <= PAGE_SIZE);
1623
1624 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
1625 if (unlikely(ret < len)) {
1626 struct inode *inode = mapping->host;
1627 size_t isize = i_size_read(inode);
1628 loff_t to = pos + len;
1629
1630 if (to > isize) {
1631 /* only kill blocks in this write beyond EOF */
1632 if (pos > isize)
1633 isize = pos;
1634 xfs_vm_kill_delalloc_range(inode, isize, to);
1635 truncate_pagecache_range(inode, isize, to);
1636 }
1637 }
1638 return ret;
1639} 1405}
1640 1406
1641STATIC sector_t 1407STATIC sector_t
@@ -1748,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = {
1748 .set_page_dirty = xfs_vm_set_page_dirty, 1514 .set_page_dirty = xfs_vm_set_page_dirty,
1749 .releasepage = xfs_vm_releasepage, 1515 .releasepage = xfs_vm_releasepage,
1750 .invalidatepage = xfs_vm_invalidatepage, 1516 .invalidatepage = xfs_vm_invalidatepage,
1751 .write_begin = xfs_vm_write_begin,
1752 .write_end = xfs_vm_write_end,
1753 .bmap = xfs_vm_bmap, 1517 .bmap = xfs_vm_bmap,
1754 .direct_IO = xfs_vm_direct_IO, 1518 .direct_IO = xfs_vm_direct_IO,
1755 .migratepage = buffer_migrate_page, 1519 .migratepage = buffer_migrate_page,
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 814aab790713..bf2d9a141a73 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -60,6 +60,9 @@ int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
60int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, 60int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
61 struct buffer_head *map_bh, int create); 61 struct buffer_head *map_bh, int create);
62 62
63int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
64 ssize_t size, void *private);
65
63extern void xfs_count_page_state(struct page *, int *, int *); 66extern void xfs_count_page_state(struct page *, int *, int *);
64extern struct block_device *xfs_find_bdev_for_inode(struct inode *); 67extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
65 68
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 55d214981ed2..be0b79d8900f 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -322,7 +322,7 @@ xfs_attr3_node_inactive(
322 * Recurse (gasp!) through the attribute nodes until we find leaves. 322 * Recurse (gasp!) through the attribute nodes until we find leaves.
323 * We're doing a depth-first traversal in order to invalidate everything. 323 * We're doing a depth-first traversal in order to invalidate everything.
324 */ 324 */
325int 325static int
326xfs_attr3_root_inactive( 326xfs_attr3_root_inactive(
327 struct xfs_trans **trans, 327 struct xfs_trans **trans,
328 struct xfs_inode *dp) 328 struct xfs_inode *dp)
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index d25f26b22ac9..25e76cd6c053 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -65,7 +65,7 @@ xfs_attr_shortform_compare(const void *a, const void *b)
65 * we have to calculate each entries' hashvalue and sort them before 65 * we have to calculate each entries' hashvalue and sort them before
66 * we can begin returning them to the user. 66 * we can begin returning them to the user.
67 */ 67 */
68int 68static int
69xfs_attr_shortform_list(xfs_attr_list_context_t *context) 69xfs_attr_shortform_list(xfs_attr_list_context_t *context)
70{ 70{
71 attrlist_cursor_kern_t *cursor; 71 attrlist_cursor_kern_t *cursor;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 586bb64e674b..cd4a850564f2 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -79,6 +79,23 @@ xfs_zero_extent(
79 GFP_NOFS, true); 79 GFP_NOFS, true);
80} 80}
81 81
82/* Sort bmap items by AG. */
83static int
84xfs_bmap_free_list_cmp(
85 void *priv,
86 struct list_head *a,
87 struct list_head *b)
88{
89 struct xfs_mount *mp = priv;
90 struct xfs_bmap_free_item *ra;
91 struct xfs_bmap_free_item *rb;
92
93 ra = container_of(a, struct xfs_bmap_free_item, xbfi_list);
94 rb = container_of(b, struct xfs_bmap_free_item, xbfi_list);
95 return XFS_FSB_TO_AGNO(mp, ra->xbfi_startblock) -
96 XFS_FSB_TO_AGNO(mp, rb->xbfi_startblock);
97}
98
82/* 99/*
83 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi 100 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
84 * caller. Frees all the extents that need freeing, which must be done 101 * caller. Frees all the extents that need freeing, which must be done
@@ -99,14 +116,15 @@ xfs_bmap_finish(
99 int error; /* error return value */ 116 int error; /* error return value */
100 int committed;/* xact committed or not */ 117 int committed;/* xact committed or not */
101 struct xfs_bmap_free_item *free; /* free extent item */ 118 struct xfs_bmap_free_item *free; /* free extent item */
102 struct xfs_bmap_free_item *next; /* next item on free list */
103 119
104 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 120 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
105 if (flist->xbf_count == 0) 121 if (flist->xbf_count == 0)
106 return 0; 122 return 0;
107 123
124 list_sort((*tp)->t_mountp, &flist->xbf_flist, xfs_bmap_free_list_cmp);
125
108 efi = xfs_trans_get_efi(*tp, flist->xbf_count); 126 efi = xfs_trans_get_efi(*tp, flist->xbf_count);
109 for (free = flist->xbf_first; free; free = free->xbfi_next) 127 list_for_each_entry(free, &flist->xbf_flist, xbfi_list)
110 xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, 128 xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
111 free->xbfi_blockcount); 129 free->xbfi_blockcount);
112 130
@@ -125,9 +143,7 @@ xfs_bmap_finish(
125 if (committed) { 143 if (committed) {
126 xfs_efi_release(efi); 144 xfs_efi_release(efi);
127 xfs_force_shutdown((*tp)->t_mountp, 145 xfs_force_shutdown((*tp)->t_mountp,
128 (error == -EFSCORRUPTED) ? 146 SHUTDOWN_META_IO_ERROR);
129 SHUTDOWN_CORRUPT_INCORE :
130 SHUTDOWN_META_IO_ERROR);
131 } 147 }
132 return error; 148 return error;
133 } 149 }
@@ -138,15 +154,15 @@ xfs_bmap_finish(
138 * on error. 154 * on error.
139 */ 155 */
140 efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); 156 efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
141 for (free = flist->xbf_first; free != NULL; free = next) { 157 while (!list_empty(&flist->xbf_flist)) {
142 next = free->xbfi_next; 158 free = list_first_entry(&flist->xbf_flist,
143 159 struct xfs_bmap_free_item, xbfi_list);
144 error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock, 160 error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
145 free->xbfi_blockcount); 161 free->xbfi_blockcount);
146 if (error) 162 if (error)
147 return error; 163 return error;
148 164
149 xfs_bmap_del_free(flist, NULL, free); 165 xfs_bmap_del_free(flist, free);
150 } 166 }
151 167
152 return 0; 168 return 0;
@@ -409,7 +425,7 @@ xfs_bmap_count_tree(
409/* 425/*
410 * Count fsblocks of the given fork. 426 * Count fsblocks of the given fork.
411 */ 427 */
412int /* error */ 428static int /* error */
413xfs_bmap_count_blocks( 429xfs_bmap_count_blocks(
414 xfs_trans_t *tp, /* transaction pointer */ 430 xfs_trans_t *tp, /* transaction pointer */
415 xfs_inode_t *ip, /* incore inode */ 431 xfs_inode_t *ip, /* incore inode */
@@ -799,7 +815,7 @@ xfs_bmap_punch_delalloc_range(
799 if (error) 815 if (error)
800 break; 816 break;
801 817
802 ASSERT(!flist.xbf_count && !flist.xbf_first); 818 ASSERT(!flist.xbf_count && list_empty(&flist.xbf_flist));
803next_block: 819next_block:
804 start_fsb++; 820 start_fsb++;
805 remaining--; 821 remaining--;
@@ -1089,99 +1105,120 @@ error1: /* Just cancel transaction */
1089 return error; 1105 return error;
1090} 1106}
1091 1107
1092/* 1108static int
1093 * Zero file bytes between startoff and endoff inclusive. 1109xfs_unmap_extent(
1094 * The iolock is held exclusive and no blocks are buffered. 1110 struct xfs_inode *ip,
1095 * 1111 xfs_fileoff_t startoffset_fsb,
1096 * This function is used by xfs_free_file_space() to zero 1112 xfs_filblks_t len_fsb,
1097 * partial blocks when the range to free is not block aligned. 1113 int *done)
1098 * When unreserving space with boundaries that are not block
1099 * aligned we round up the start and round down the end
1100 * boundaries and then use this function to zero the parts of
1101 * the blocks that got dropped during the rounding.
1102 */
1103STATIC int
1104xfs_zero_remaining_bytes(
1105 xfs_inode_t *ip,
1106 xfs_off_t startoff,
1107 xfs_off_t endoff)
1108{ 1114{
1109 xfs_bmbt_irec_t imap; 1115 struct xfs_mount *mp = ip->i_mount;
1110 xfs_fileoff_t offset_fsb; 1116 struct xfs_trans *tp;
1111 xfs_off_t lastoffset; 1117 struct xfs_bmap_free free_list;
1112 xfs_off_t offset; 1118 xfs_fsblock_t firstfsb;
1113 xfs_buf_t *bp; 1119 uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1114 xfs_mount_t *mp = ip->i_mount; 1120 int error;
1115 int nimap;
1116 int error = 0;
1117 1121
1118 /* 1122 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
1119 * Avoid doing I/O beyond eof - it's not necessary 1123 if (error) {
1120 * since nothing can read beyond eof. The space will 1124 ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1121 * be zeroed when the file is extended anyway. 1125 return error;
1122 */ 1126 }
1123 if (startoff >= XFS_ISIZE(ip))
1124 return 0;
1125 1127
1126 if (endoff > XFS_ISIZE(ip)) 1128 xfs_ilock(ip, XFS_ILOCK_EXCL);
1127 endoff = XFS_ISIZE(ip); 1129 error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
1130 ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
1131 if (error)
1132 goto out_trans_cancel;
1128 1133
1129 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { 1134 xfs_trans_ijoin(tp, ip, 0);
1130 uint lock_mode;
1131 1135
1132 offset_fsb = XFS_B_TO_FSBT(mp, offset); 1136 xfs_bmap_init(&free_list, &firstfsb);
1133 nimap = 1; 1137 error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb,
1138 &free_list, done);
1139 if (error)
1140 goto out_bmap_cancel;
1134 1141
1135 lock_mode = xfs_ilock_data_map_shared(ip); 1142 error = xfs_bmap_finish(&tp, &free_list, NULL);
1136 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); 1143 if (error)
1137 xfs_iunlock(ip, lock_mode); 1144 goto out_bmap_cancel;
1138 1145
1139 if (error || nimap < 1) 1146 error = xfs_trans_commit(tp);
1140 break; 1147out_unlock:
1141 ASSERT(imap.br_blockcount >= 1); 1148 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1142 ASSERT(imap.br_startoff == offset_fsb); 1149 return error;
1143 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1144 1150
1145 if (imap.br_startblock == HOLESTARTBLOCK || 1151out_bmap_cancel:
1146 imap.br_state == XFS_EXT_UNWRITTEN) { 1152 xfs_bmap_cancel(&free_list);
1147 /* skip the entire extent */ 1153out_trans_cancel:
1148 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1154 xfs_trans_cancel(tp);
1149 imap.br_blockcount) - 1; 1155 goto out_unlock;
1150 continue; 1156}
1151 }
1152 1157
1153 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; 1158static int
1154 if (lastoffset > endoff) 1159xfs_adjust_extent_unmap_boundaries(
1155 lastoffset = endoff; 1160 struct xfs_inode *ip,
1161 xfs_fileoff_t *startoffset_fsb,
1162 xfs_fileoff_t *endoffset_fsb)
1163{
1164 struct xfs_mount *mp = ip->i_mount;
1165 struct xfs_bmbt_irec imap;
1166 int nimap, error;
1167 xfs_extlen_t mod = 0;
1156 1168
1157 /* DAX can just zero the backing device directly */ 1169 nimap = 1;
1158 if (IS_DAX(VFS_I(ip))) { 1170 error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
1159 error = dax_zero_page_range(VFS_I(ip), offset, 1171 if (error)
1160 lastoffset - offset + 1, 1172 return error;
1161 xfs_get_blocks_direct);
1162 if (error)
1163 return error;
1164 continue;
1165 }
1166 1173
1167 error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? 1174 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1168 mp->m_rtdev_targp : mp->m_ddev_targp, 1175 xfs_daddr_t block;
1169 xfs_fsb_to_db(ip, imap.br_startblock),
1170 BTOBB(mp->m_sb.sb_blocksize),
1171 0, &bp, NULL);
1172 if (error)
1173 return error;
1174 1176
1175 memset(bp->b_addr + 1177 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1176 (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), 1178 block = imap.br_startblock;
1177 0, lastoffset - offset + 1); 1179 mod = do_div(block, mp->m_sb.sb_rextsize);
1180 if (mod)
1181 *startoffset_fsb += mp->m_sb.sb_rextsize - mod;
1182 }
1178 1183
1179 error = xfs_bwrite(bp); 1184 nimap = 1;
1180 xfs_buf_relse(bp); 1185 error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
1181 if (error) 1186 if (error)
1182 return error; 1187 return error;
1188
1189 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1190 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1191 mod++;
1192 if (mod && mod != mp->m_sb.sb_rextsize)
1193 *endoffset_fsb -= mod;
1183 } 1194 }
1184 return error; 1195
1196 return 0;
1197}
1198
1199static int
1200xfs_flush_unmap_range(
1201 struct xfs_inode *ip,
1202 xfs_off_t offset,
1203 xfs_off_t len)
1204{
1205 struct xfs_mount *mp = ip->i_mount;
1206 struct inode *inode = VFS_I(ip);
1207 xfs_off_t rounding, start, end;
1208 int error;
1209
1210 /* wait for the completion of any pending DIOs */
1211 inode_dio_wait(inode);
1212
1213 rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
1214 start = round_down(offset, rounding);
1215 end = round_up(offset + len, rounding) - 1;
1216
1217 error = filemap_write_and_wait_range(inode->i_mapping, start, end);
1218 if (error)
1219 return error;
1220 truncate_pagecache_range(inode, start, end);
1221 return 0;
1185} 1222}
1186 1223
1187int 1224int
@@ -1190,24 +1227,10 @@ xfs_free_file_space(
1190 xfs_off_t offset, 1227 xfs_off_t offset,
1191 xfs_off_t len) 1228 xfs_off_t len)
1192{ 1229{
1193 int done; 1230 struct xfs_mount *mp = ip->i_mount;
1194 xfs_fileoff_t endoffset_fsb;
1195 int error;
1196 xfs_fsblock_t firstfsb;
1197 xfs_bmap_free_t free_list;
1198 xfs_bmbt_irec_t imap;
1199 xfs_off_t ioffset;
1200 xfs_off_t iendoffset;
1201 xfs_extlen_t mod=0;
1202 xfs_mount_t *mp;
1203 int nimap;
1204 uint resblks;
1205 xfs_off_t rounding;
1206 int rt;
1207 xfs_fileoff_t startoffset_fsb; 1231 xfs_fileoff_t startoffset_fsb;
1208 xfs_trans_t *tp; 1232 xfs_fileoff_t endoffset_fsb;
1209 1233 int done = 0, error;
1210 mp = ip->i_mount;
1211 1234
1212 trace_xfs_free_file_space(ip); 1235 trace_xfs_free_file_space(ip);
1213 1236
@@ -1215,135 +1238,45 @@ xfs_free_file_space(
1215 if (error) 1238 if (error)
1216 return error; 1239 return error;
1217 1240
1218 error = 0;
1219 if (len <= 0) /* if nothing being freed */ 1241 if (len <= 0) /* if nothing being freed */
1220 return error; 1242 return 0;
1221 rt = XFS_IS_REALTIME_INODE(ip);
1222 startoffset_fsb = XFS_B_TO_FSB(mp, offset);
1223 endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
1224
1225 /* wait for the completion of any pending DIOs */
1226 inode_dio_wait(VFS_I(ip));
1227 1243
1228 rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); 1244 error = xfs_flush_unmap_range(ip, offset, len);
1229 ioffset = round_down(offset, rounding);
1230 iendoffset = round_up(offset + len, rounding) - 1;
1231 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
1232 iendoffset);
1233 if (error) 1245 if (error)
1234 goto out; 1246 return error;
1235 truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset); 1247
1248 startoffset_fsb = XFS_B_TO_FSB(mp, offset);
1249 endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
1236 1250
1237 /* 1251 /*
1238 * Need to zero the stuff we're not freeing, on disk. 1252 * Need to zero the stuff we're not freeing, on disk. If it's a RT file
1239 * If it's a realtime file & can't use unwritten extents then we 1253 * and we can't use unwritten extents then we actually need to ensure
 1240 * actually need to zero the extent edges. Otherwise xfs_bunmapi 1254 * to zero the whole extent, otherwise we just need to take care of block
1241 * will take care of it for us. 1255 * boundaries, and xfs_bunmapi will handle the rest.
1242 */ 1256 */
1243 if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { 1257 if (XFS_IS_REALTIME_INODE(ip) &&
1244 nimap = 1; 1258 !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
1245 error = xfs_bmapi_read(ip, startoffset_fsb, 1, 1259 error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
1246 &imap, &nimap, 0); 1260 &endoffset_fsb);
1247 if (error)
1248 goto out;
1249 ASSERT(nimap == 0 || nimap == 1);
1250 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1251 xfs_daddr_t block;
1252
1253 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1254 block = imap.br_startblock;
1255 mod = do_div(block, mp->m_sb.sb_rextsize);
1256 if (mod)
1257 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
1258 }
1259 nimap = 1;
1260 error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
1261 &imap, &nimap, 0);
1262 if (error) 1261 if (error)
1263 goto out; 1262 return error;
1264 ASSERT(nimap == 0 || nimap == 1);
1265 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1266 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1267 mod++;
1268 if (mod && (mod != mp->m_sb.sb_rextsize))
1269 endoffset_fsb -= mod;
1270 }
1271 }
1272 if ((done = (endoffset_fsb <= startoffset_fsb)))
1273 /*
1274 * One contiguous piece to clear
1275 */
1276 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
1277 else {
1278 /*
1279 * Some full blocks, possibly two pieces to clear
1280 */
1281 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
1282 error = xfs_zero_remaining_bytes(ip, offset,
1283 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
1284 if (!error &&
1285 XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
1286 error = xfs_zero_remaining_bytes(ip,
1287 XFS_FSB_TO_B(mp, endoffset_fsb),
1288 offset + len - 1);
1289 } 1263 }
1290 1264
1291 /* 1265 if (endoffset_fsb > startoffset_fsb) {
1292 * free file space until done or until there is an error 1266 while (!done) {
1293 */ 1267 error = xfs_unmap_extent(ip, startoffset_fsb,
1294 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); 1268 endoffset_fsb - startoffset_fsb, &done);
1295 while (!error && !done) { 1269 if (error)
1296 1270 return error;
1297 /*
1298 * allocate and setup the transaction. Allow this
1299 * transaction to dip into the reserve blocks to ensure
1300 * the freeing of the space succeeds at ENOSPC.
1301 */
1302 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
1303 &tp);
1304 if (error) {
1305 ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1306 break;
1307 } 1271 }
1308 xfs_ilock(ip, XFS_ILOCK_EXCL);
1309 error = xfs_trans_reserve_quota(tp, mp,
1310 ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
1311 resblks, 0, XFS_QMOPT_RES_REGBLKS);
1312 if (error)
1313 goto error1;
1314
1315 xfs_trans_ijoin(tp, ip, 0);
1316
1317 /*
1318 * issue the bunmapi() call to free the blocks
1319 */
1320 xfs_bmap_init(&free_list, &firstfsb);
1321 error = xfs_bunmapi(tp, ip, startoffset_fsb,
1322 endoffset_fsb - startoffset_fsb,
1323 0, 2, &firstfsb, &free_list, &done);
1324 if (error)
1325 goto error0;
1326
1327 /*
1328 * complete the transaction
1329 */
1330 error = xfs_bmap_finish(&tp, &free_list, NULL);
1331 if (error)
1332 goto error0;
1333
1334 error = xfs_trans_commit(tp);
1335 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1336 } 1272 }
1337 1273
1338 out: 1274 /*
 1339 return error; 1275 * Now that we've unmapped all full blocks we'll have to zero out any
1340 1276 * partial block at the beginning and/or end. xfs_zero_range is
1341 error0: 1277 * smart enough to skip any holes, including those we just created.
1342 xfs_bmap_cancel(&free_list); 1278 */
1343 error1: 1279 return xfs_zero_range(ip, offset, len, NULL);
1344 xfs_trans_cancel(tp);
1345 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1346 goto out;
1347} 1280}
1348 1281
1349/* 1282/*
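
The rewritten xfs_free_file_space() above now only unmaps whole blocks (the start offset rounded up with XFS_B_TO_FSB, the end truncated with XFS_B_TO_FSBT) and leaves the partial edges to xfs_zero_range(). A worked example of that split, assuming a 4096-byte block size (not part of the patch):

	#include <stdio.h>

	#define BLKSZ 4096ULL

	int main(void)
	{
		unsigned long long offset = 6000, len = 10000;	/* bytes 6000..15999 */
		unsigned long long start_fsb = (offset + BLKSZ - 1) / BLKSZ;	/* 2 */
		unsigned long long end_fsb   = (offset + len) / BLKSZ;		/* 3 */

		printf("unmap whole blocks [%llu, %llu)\n", start_fsb, end_fsb);
		printf("zero partial edges [%llu, %llu) and [%llu, %llu)\n",
		       offset, start_fsb * BLKSZ, end_fsb * BLKSZ, offset + len);
		return 0;
	}

Here blocks 2..2 (bytes 8192..12287) are unmapped, while bytes 6000..8191 and 12288..15999 are zeroed in place.
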
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index af97d9a1dfb4..f20071432ca6 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -31,8 +31,6 @@ struct xfs_bmalloca;
31int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); 31int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
32int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, 32int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
33 int whichfork, int *eof); 33 int whichfork, int *eof);
34int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
35 int whichfork, int *count);
36int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, 34int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
37 xfs_fileoff_t start_fsb, xfs_fileoff_t length); 35 xfs_fileoff_t start_fsb, xfs_fileoff_t length);
38 36
@@ -43,7 +41,6 @@ int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
43 41
44/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ 42/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
45void xfs_bmap_del_free(struct xfs_bmap_free *flist, 43void xfs_bmap_del_free(struct xfs_bmap_free *flist,
46 struct xfs_bmap_free_item *prev,
47 struct xfs_bmap_free_item *free); 44 struct xfs_bmap_free_item *free);
48int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, 45int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
49 struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, 46 struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index a87a0d5477bd..47a318ce82e0 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -80,6 +80,47 @@ xfs_buf_vmap_len(
80} 80}
81 81
82/* 82/*
83 * Bump the I/O in flight count on the buftarg if we haven't yet done so for
84 * this buffer. The count is incremented once per buffer (per hold cycle)
85 * because the corresponding decrement is deferred to buffer release. Buffers
86 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
 87 * tracking adds unnecessary overhead. This is used for synchronization purposes
88 * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
89 * in-flight buffers.
90 *
91 * Buffers that are never released (e.g., superblock, iclog buffers) must set
92 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
93 * never reaches zero and unmount hangs indefinitely.
94 */
95static inline void
96xfs_buf_ioacct_inc(
97 struct xfs_buf *bp)
98{
99 if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT))
100 return;
101
102 ASSERT(bp->b_flags & XBF_ASYNC);
103 bp->b_flags |= _XBF_IN_FLIGHT;
104 percpu_counter_inc(&bp->b_target->bt_io_count);
105}
106
107/*
108 * Clear the in-flight state on a buffer about to be released to the LRU or
109 * freed and unaccount from the buftarg.
110 */
111static inline void
112xfs_buf_ioacct_dec(
113 struct xfs_buf *bp)
114{
115 if (!(bp->b_flags & _XBF_IN_FLIGHT))
116 return;
117
118 ASSERT(bp->b_flags & XBF_ASYNC);
119 bp->b_flags &= ~_XBF_IN_FLIGHT;
120 percpu_counter_dec(&bp->b_target->bt_io_count);
121}
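
The two helpers above implement a "count once per hold cycle" scheme: a flag bit on the buffer guards the shared buftarg counter, so repeated I/O submissions of the same held buffer do not inflate it. Reduced to its essentials (not part of the patch; the percpu counter is stood in by a plain integer):

	#define IN_FLIGHT	0x1
	#define NO_IOACCT	0x2

	struct buf {
		unsigned int flags;
	};

	static long io_count;	/* stand-in for the buftarg percpu counter */

	void ioacct_inc(struct buf *bp)
	{
		if (bp->flags & (NO_IOACCT | IN_FLIGHT))
			return;		/* never-released buf, or already counted */
		bp->flags |= IN_FLIGHT;
		io_count++;
	}

	void ioacct_dec(struct buf *bp)
	{
		if (!(bp->flags & IN_FLIGHT))
			return;
		bp->flags &= ~IN_FLIGHT;
		io_count--;
	}
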
122
123/*
83 * When we mark a buffer stale, we remove the buffer from the LRU and clear the 124 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
84 * b_lru_ref count so that the buffer is freed immediately when the buffer 125 * b_lru_ref count so that the buffer is freed immediately when the buffer
85 * reference count falls to zero. If the buffer is already on the LRU, we need 126 * reference count falls to zero. If the buffer is already on the LRU, we need
@@ -102,6 +143,14 @@ xfs_buf_stale(
102 */ 143 */
103 bp->b_flags &= ~_XBF_DELWRI_Q; 144 bp->b_flags &= ~_XBF_DELWRI_Q;
104 145
146 /*
147 * Once the buffer is marked stale and unlocked, a subsequent lookup
148 * could reset b_flags. There is no guarantee that the buffer is
149 * unaccounted (released to LRU) before that occurs. Drop in-flight
150 * status now to preserve accounting consistency.
151 */
152 xfs_buf_ioacct_dec(bp);
153
105 spin_lock(&bp->b_lock); 154 spin_lock(&bp->b_lock);
106 atomic_set(&bp->b_lru_ref, 0); 155 atomic_set(&bp->b_lru_ref, 0);
107 if (!(bp->b_state & XFS_BSTATE_DISPOSE) && 156 if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
@@ -815,7 +864,8 @@ xfs_buf_get_uncached(
815 struct xfs_buf *bp; 864 struct xfs_buf *bp;
816 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); 865 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
817 866
818 bp = _xfs_buf_alloc(target, &map, 1, 0); 867 /* flags might contain irrelevant bits, pass only what we care about */
868 bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
819 if (unlikely(bp == NULL)) 869 if (unlikely(bp == NULL))
820 goto fail; 870 goto fail;
821 871
@@ -866,63 +916,85 @@ xfs_buf_hold(
866} 916}
867 917
868/* 918/*
869 * Releases a hold on the specified buffer. If the 919 * Release a hold on the specified buffer. If the hold count is 1, the buffer is
870 * the hold count is 1, calls xfs_buf_free. 920 * placed on LRU or freed (depending on b_lru_ref).
871 */ 921 */
872void 922void
873xfs_buf_rele( 923xfs_buf_rele(
874 xfs_buf_t *bp) 924 xfs_buf_t *bp)
875{ 925{
876 struct xfs_perag *pag = bp->b_pag; 926 struct xfs_perag *pag = bp->b_pag;
927 bool release;
928 bool freebuf = false;
877 929
878 trace_xfs_buf_rele(bp, _RET_IP_); 930 trace_xfs_buf_rele(bp, _RET_IP_);
879 931
880 if (!pag) { 932 if (!pag) {
881 ASSERT(list_empty(&bp->b_lru)); 933 ASSERT(list_empty(&bp->b_lru));
882 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); 934 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
883 if (atomic_dec_and_test(&bp->b_hold)) 935 if (atomic_dec_and_test(&bp->b_hold)) {
936 xfs_buf_ioacct_dec(bp);
884 xfs_buf_free(bp); 937 xfs_buf_free(bp);
938 }
885 return; 939 return;
886 } 940 }
887 941
888 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); 942 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
889 943
890 ASSERT(atomic_read(&bp->b_hold) > 0); 944 ASSERT(atomic_read(&bp->b_hold) > 0);
891 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
892 spin_lock(&bp->b_lock);
893 if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
894 /*
895 * If the buffer is added to the LRU take a new
896 * reference to the buffer for the LRU and clear the
897 * (now stale) dispose list state flag
898 */
899 if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
900 bp->b_state &= ~XFS_BSTATE_DISPOSE;
901 atomic_inc(&bp->b_hold);
902 }
903 spin_unlock(&bp->b_lock);
904 spin_unlock(&pag->pag_buf_lock);
905 } else {
906 /*
907 * most of the time buffers will already be removed from
908 * the LRU, so optimise that case by checking for the
909 * XFS_BSTATE_DISPOSE flag indicating the last list the
910 * buffer was on was the disposal list
911 */
912 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
913 list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
914 } else {
915 ASSERT(list_empty(&bp->b_lru));
916 }
917 spin_unlock(&bp->b_lock);
918 945
919 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 946 release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
920 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); 947 spin_lock(&bp->b_lock);
921 spin_unlock(&pag->pag_buf_lock); 948 if (!release) {
922 xfs_perag_put(pag); 949 /*
923 xfs_buf_free(bp); 950 * Drop the in-flight state if the buffer is already on the LRU
951 * and it holds the only reference. This is racy because we
952 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
953 * ensures the decrement occurs only once per-buf.
954 */
955 if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
956 xfs_buf_ioacct_dec(bp);
957 goto out_unlock;
958 }
959
960 /* the last reference has been dropped ... */
961 xfs_buf_ioacct_dec(bp);
962 if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
963 /*
964 * If the buffer is added to the LRU take a new reference to the
965 * buffer for the LRU and clear the (now stale) dispose list
966 * state flag
967 */
968 if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
969 bp->b_state &= ~XFS_BSTATE_DISPOSE;
970 atomic_inc(&bp->b_hold);
971 }
972 spin_unlock(&pag->pag_buf_lock);
973 } else {
974 /*
975 * most of the time buffers will already be removed from the
976 * LRU, so optimise that case by checking for the
977 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
978 * was on was the disposal list
979 */
980 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
981 list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
982 } else {
983 ASSERT(list_empty(&bp->b_lru));
924 } 984 }
985
986 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
987 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
988 spin_unlock(&pag->pag_buf_lock);
989 xfs_perag_put(pag);
990 freebuf = true;
925 } 991 }
992
993out_unlock:
994 spin_unlock(&bp->b_lock);
995
996 if (freebuf)
997 xfs_buf_free(bp);
926} 998}
927 999
928 1000
@@ -944,10 +1016,12 @@ xfs_buf_trylock(
944 int locked; 1016 int locked;
945 1017
946 locked = down_trylock(&bp->b_sema) == 0; 1018 locked = down_trylock(&bp->b_sema) == 0;
947 if (locked) 1019 if (locked) {
948 XB_SET_OWNER(bp); 1020 XB_SET_OWNER(bp);
949 1021 trace_xfs_buf_trylock(bp, _RET_IP_);
950 trace_xfs_buf_trylock(bp, _RET_IP_); 1022 } else {
1023 trace_xfs_buf_trylock_fail(bp, _RET_IP_);
1024 }
951 return locked; 1025 return locked;
952} 1026}
953 1027
@@ -1339,6 +1413,7 @@ xfs_buf_submit(
1339 * xfs_buf_ioend too early. 1413 * xfs_buf_ioend too early.
1340 */ 1414 */
1341 atomic_set(&bp->b_io_remaining, 1); 1415 atomic_set(&bp->b_io_remaining, 1);
1416 xfs_buf_ioacct_inc(bp);
1342 _xfs_buf_ioapply(bp); 1417 _xfs_buf_ioapply(bp);
1343 1418
1344 /* 1419 /*
@@ -1524,13 +1599,19 @@ xfs_wait_buftarg(
1524 int loop = 0; 1599 int loop = 0;
1525 1600
1526 /* 1601 /*
1527 * We need to flush the buffer workqueue to ensure that all IO 1602 * First wait on the buftarg I/O count for all in-flight buffers to be
1528 * completion processing is 100% done. Just waiting on buffer locks is 1603 * released. This is critical as new buffers do not make the LRU until
1529 * not sufficient for async IO as the reference count held over IO is 1604 * they are released.
1530 * not released until after the buffer lock is dropped. Hence we need to 1605 *
1531 * ensure here that all reference counts have been dropped before we 1606 * Next, flush the buffer workqueue to ensure all completion processing
1532 * start walking the LRU list. 1607 * has finished. Just waiting on buffer locks is not sufficient for
1608 * async IO as the reference count held over IO is not released until
1609 * after the buffer lock is dropped. Hence we need to ensure here that
1610 * all reference counts have been dropped before we start walking the
1611 * LRU list.
1533 */ 1612 */
1613 while (percpu_counter_sum(&btp->bt_io_count))
1614 delay(100);
1534 drain_workqueue(btp->bt_mount->m_buf_workqueue); 1615 drain_workqueue(btp->bt_mount->m_buf_workqueue);
1535 1616
1536 /* loop until there is nothing left on the lru list. */ 1617 /* loop until there is nothing left on the lru list. */
@@ -1627,6 +1708,8 @@ xfs_free_buftarg(
1627 struct xfs_buftarg *btp) 1708 struct xfs_buftarg *btp)
1628{ 1709{
1629 unregister_shrinker(&btp->bt_shrinker); 1710 unregister_shrinker(&btp->bt_shrinker);
1711 ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
1712 percpu_counter_destroy(&btp->bt_io_count);
1630 list_lru_destroy(&btp->bt_lru); 1713 list_lru_destroy(&btp->bt_lru);
1631 1714
1632 if (mp->m_flags & XFS_MOUNT_BARRIER) 1715 if (mp->m_flags & XFS_MOUNT_BARRIER)
@@ -1691,6 +1774,9 @@ xfs_alloc_buftarg(
1691 if (list_lru_init(&btp->bt_lru)) 1774 if (list_lru_init(&btp->bt_lru))
1692 goto error; 1775 goto error;
1693 1776
1777 if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
1778 goto error;
1779
1694 btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; 1780 btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
1695 btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; 1781 btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
1696 btp->bt_shrinker.seeks = DEFAULT_SEEKS; 1782 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@@ -1774,18 +1860,33 @@ xfs_buf_cmp(
1774 return 0; 1860 return 0;
1775} 1861}
1776 1862
1863/*
1864 * submit buffers for write.
1865 *
1866 * When we have a large buffer list, we do not want to hold all the buffers
1867 * locked while we block on the request queue waiting for IO dispatch. To avoid
1868 * this problem, we lock and submit buffers in groups of 50, thereby minimising
1869 * the lock hold times for lists which may contain thousands of objects.
1870 *
1871 * To do this, we sort the buffer list before we walk the list to lock and
1872 * submit buffers, and we plug and unplug around each group of buffers we
1873 * submit.
1874 */
1777static int 1875static int
1778__xfs_buf_delwri_submit( 1876xfs_buf_delwri_submit_buffers(
1779 struct list_head *buffer_list, 1877 struct list_head *buffer_list,
1780 struct list_head *io_list, 1878 struct list_head *wait_list)
1781 bool wait)
1782{ 1879{
1783 struct blk_plug plug;
1784 struct xfs_buf *bp, *n; 1880 struct xfs_buf *bp, *n;
1881 LIST_HEAD (submit_list);
1785 int pinned = 0; 1882 int pinned = 0;
1883 struct blk_plug plug;
1786 1884
1885 list_sort(NULL, buffer_list, xfs_buf_cmp);
1886
1887 blk_start_plug(&plug);
1787 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 1888 list_for_each_entry_safe(bp, n, buffer_list, b_list) {
1788 if (!wait) { 1889 if (!wait_list) {
1789 if (xfs_buf_ispinned(bp)) { 1890 if (xfs_buf_ispinned(bp)) {
1790 pinned++; 1891 pinned++;
1791 continue; 1892 continue;
@@ -1808,25 +1909,21 @@ __xfs_buf_delwri_submit(
1808 continue; 1909 continue;
1809 } 1910 }
1810 1911
1811 list_move_tail(&bp->b_list, io_list);
1812 trace_xfs_buf_delwri_split(bp, _RET_IP_); 1912 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1813 }
1814
1815 list_sort(NULL, io_list, xfs_buf_cmp);
1816
1817 blk_start_plug(&plug);
1818 list_for_each_entry_safe(bp, n, io_list, b_list) {
1819 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
1820 bp->b_flags |= XBF_WRITE | XBF_ASYNC;
1821 1913
1822 /* 1914 /*
1823 * we do all Io submission async. This means if we need to wait 1915 * We do all IO submission async. This means if we need
1824 * for IO completion we need to take an extra reference so the 1916 * to wait for IO completion we need to take an extra
1825 * buffer is still valid on the other side. 1917 * reference so the buffer is still valid on the other
1918 * side. We need to move the buffer onto the io_list
1919 * at this point so the caller can still access it.
1826 */ 1920 */
1827 if (wait) 1921 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
1922 bp->b_flags |= XBF_WRITE | XBF_ASYNC;
1923 if (wait_list) {
1828 xfs_buf_hold(bp); 1924 xfs_buf_hold(bp);
1829 else 1925 list_move_tail(&bp->b_list, wait_list);
1926 } else
1830 list_del_init(&bp->b_list); 1927 list_del_init(&bp->b_list);
1831 1928
1832 xfs_buf_submit(bp); 1929 xfs_buf_submit(bp);
@@ -1849,8 +1946,7 @@ int
1849xfs_buf_delwri_submit_nowait( 1946xfs_buf_delwri_submit_nowait(
1850 struct list_head *buffer_list) 1947 struct list_head *buffer_list)
1851{ 1948{
1852 LIST_HEAD (io_list); 1949 return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
1853 return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
1854} 1950}
1855 1951
1856/* 1952/*
@@ -1865,15 +1961,15 @@ int
1865xfs_buf_delwri_submit( 1961xfs_buf_delwri_submit(
1866 struct list_head *buffer_list) 1962 struct list_head *buffer_list)
1867{ 1963{
1868 LIST_HEAD (io_list); 1964 LIST_HEAD (wait_list);
1869 int error = 0, error2; 1965 int error = 0, error2;
1870 struct xfs_buf *bp; 1966 struct xfs_buf *bp;
1871 1967
1872 __xfs_buf_delwri_submit(buffer_list, &io_list, true); 1968 xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
1873 1969
1874 /* Wait for IO to complete. */ 1970 /* Wait for IO to complete. */
1875 while (!list_empty(&io_list)) { 1971 while (!list_empty(&wait_list)) {
1876 bp = list_first_entry(&io_list, struct xfs_buf, b_list); 1972 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1877 1973
1878 list_del_init(&bp->b_list); 1974 list_del_init(&bp->b_list);
1879 1975
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8bfb974f0772..1c2e52b2d926 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -43,6 +43,7 @@ typedef enum {
43#define XBF_READ (1 << 0) /* buffer intended for reading from device */ 43#define XBF_READ (1 << 0) /* buffer intended for reading from device */
44#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ 44#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
45#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ 45#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
46#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
46#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ 47#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
47#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ 48#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
48#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ 49#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
@@ -62,6 +63,7 @@ typedef enum {
62#define _XBF_KMEM (1 << 21)/* backed by heap memory */ 63#define _XBF_KMEM (1 << 21)/* backed by heap memory */
63#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ 64#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
64#define _XBF_COMPOUND (1 << 23)/* compound buffer */ 65#define _XBF_COMPOUND (1 << 23)/* compound buffer */
66#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */
65 67
66typedef unsigned int xfs_buf_flags_t; 68typedef unsigned int xfs_buf_flags_t;
67 69
@@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t;
81 { _XBF_PAGES, "PAGES" }, \ 83 { _XBF_PAGES, "PAGES" }, \
82 { _XBF_KMEM, "KMEM" }, \ 84 { _XBF_KMEM, "KMEM" }, \
83 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 85 { _XBF_DELWRI_Q, "DELWRI_Q" }, \
84 { _XBF_COMPOUND, "COMPOUND" } 86 { _XBF_COMPOUND, "COMPOUND" }, \
87 { _XBF_IN_FLIGHT, "IN_FLIGHT" }
85 88
86 89
87/* 90/*
@@ -115,6 +118,8 @@ typedef struct xfs_buftarg {
115 /* LRU control structures */ 118 /* LRU control structures */
116 struct shrinker bt_shrinker; 119 struct shrinker bt_shrinker;
117 struct list_lru bt_lru; 120 struct list_lru bt_lru;
121
122 struct percpu_counter bt_io_count;
118} xfs_buftarg_t; 123} xfs_buftarg_t;
119 124
120struct xfs_buf; 125struct xfs_buf;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 34257992934c..e455f9098d49 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -359,7 +359,7 @@ xfs_buf_item_format(
359 for (i = 0; i < bip->bli_format_count; i++) { 359 for (i = 0; i < bip->bli_format_count; i++) {
360 xfs_buf_item_format_segment(bip, lv, &vecp, offset, 360 xfs_buf_item_format_segment(bip, lv, &vecp, offset,
361 &bip->bli_formats[i]); 361 &bip->bli_formats[i]);
362 offset += bp->b_maps[i].bm_len; 362 offset += BBTOB(bp->b_maps[i].bm_len);
363 } 363 }
364 364
365 /* 365 /*
@@ -915,20 +915,28 @@ xfs_buf_item_log(
915 for (i = 0; i < bip->bli_format_count; i++) { 915 for (i = 0; i < bip->bli_format_count; i++) {
916 if (start > last) 916 if (start > last)
917 break; 917 break;
918 end = start + BBTOB(bp->b_maps[i].bm_len); 918 end = start + BBTOB(bp->b_maps[i].bm_len) - 1;
919
920 /* skip to the map that includes the first byte to log */
919 if (first > end) { 921 if (first > end) {
920 start += BBTOB(bp->b_maps[i].bm_len); 922 start += BBTOB(bp->b_maps[i].bm_len);
921 continue; 923 continue;
922 } 924 }
925
926 /*
927 * Trim the range to this segment and mark it in the bitmap.
928 * Note that we must convert buffer offsets to segment relative
929 * offsets (e.g., the first byte of each segment is byte 0 of
930 * that segment).
931 */
923 if (first < start) 932 if (first < start)
924 first = start; 933 first = start;
925 if (end > last) 934 if (end > last)
926 end = last; 935 end = last;
927 936 xfs_buf_item_log_segment(first - start, end - start,
928 xfs_buf_item_log_segment(first, end,
929 &bip->bli_formats[i].blf_data_map[0]); 937 &bip->bli_formats[i].blf_data_map[0]);
930 938
931 start += bp->b_maps[i].bm_len; 939 start += BBTOB(bp->b_maps[i].bm_len);
932 } 940 }
933} 941}
934 942
@@ -949,6 +957,7 @@ xfs_buf_item_free(
949 xfs_buf_log_item_t *bip) 957 xfs_buf_log_item_t *bip)
950{ 958{
951 xfs_buf_item_free_format(bip); 959 xfs_buf_item_free_format(bip);
960 kmem_free(bip->bli_item.li_lv_shadow);
952 kmem_zone_free(xfs_buf_item_zone, bip); 961 kmem_zone_free(xfs_buf_item_zone, bip);
953} 962}
954 963
@@ -1073,6 +1082,8 @@ xfs_buf_iodone_callback_error(
1073 trace_xfs_buf_item_iodone_async(bp, _RET_IP_); 1082 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1074 ASSERT(bp->b_iodone != NULL); 1083 ASSERT(bp->b_iodone != NULL);
1075 1084
1085 cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
1086
1076 /* 1087 /*
1077 * If the write was asynchronous then no one will be looking for the 1088 * If the write was asynchronous then no one will be looking for the
1078 * error. If this is the first failure of this type, clear the error 1089 * error. If this is the first failure of this type, clear the error
@@ -1080,13 +1091,12 @@ xfs_buf_iodone_callback_error(
1080 * async write failure at least once, but we also need to set the buffer 1091 * async write failure at least once, but we also need to set the buffer
1081 * up to behave correctly now for repeated failures. 1092 * up to behave correctly now for repeated failures.
1082 */ 1093 */
1083 if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) || 1094 if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
1084 bp->b_last_error != bp->b_error) { 1095 bp->b_last_error != bp->b_error) {
1085 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | 1096 bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
1086 XBF_DONE | XBF_WRITE_FAIL);
1087 bp->b_last_error = bp->b_error; 1097 bp->b_last_error = bp->b_error;
1088 bp->b_retries = 0; 1098 if (cfg->retry_timeout && !bp->b_first_retry_time)
1089 bp->b_first_retry_time = jiffies; 1099 bp->b_first_retry_time = jiffies;
1090 1100
1091 xfs_buf_ioerror(bp, 0); 1101 xfs_buf_ioerror(bp, 0);
1092 xfs_buf_submit(bp); 1102 xfs_buf_submit(bp);
@@ -1097,7 +1107,6 @@ xfs_buf_iodone_callback_error(
1097 * Repeated failure on an async write. Take action according to the 1107 * Repeated failure on an async write. Take action according to the
1098 * error configuration we have been set up to use. 1108 * error configuration we have been set up to use.
1099 */ 1109 */
1100 cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
1101 1110
1102 if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && 1111 if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
1103 ++bp->b_retries > cfg->max_retries) 1112 ++bp->b_retries > cfg->max_retries)
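
The xfs_buf_item_log hunk above fixes two related issues: map lengths are kept in 512-byte basic blocks and must be converted to bytes (BBTOB) before being compared with the byte range to log, and the range handed to each segment must be made relative to that segment's start. The standalone sketch below walks two 4 KiB segments with made-up numbers to show the clamping and offset conversion; it is an illustration of the arithmetic, not the kernel code.

/*
 * Worked example of the segment walk: a byte range of 5000..6000 falls
 * entirely in the second 4 KiB segment and is logged there as the
 * segment-relative range 904..1904.
 */
#include <stdio.h>
#include <stdint.h>

#define BBTOB(bb) ((uint64_t)(bb) << 9)         /* basic blocks (512 bytes) to bytes */

static void log_segment(int seg, uint64_t first, uint64_t last)
{
        printf("segment %d: mark bytes %llu..%llu\n", seg,
               (unsigned long long)first, (unsigned long long)last);
}

int main(void)
{
        uint64_t seg_len_bb[] = { 8, 8 };       /* two segments of 8 basic blocks each */
        uint64_t first = 5000, last = 6000;     /* byte range to log */
        uint64_t start = 0;

        for (int i = 0; i < 2; i++) {
                uint64_t end = start + BBTOB(seg_len_bb[i]) - 1;  /* inclusive end */

                if (first > end) {              /* skip segments before the range */
                        start += BBTOB(seg_len_bb[i]);
                        continue;
                }
                uint64_t f = first < start ? start : first;
                uint64_t l = last > end ? end : last;

                log_segment(i, f - start, l - start);   /* segment-relative offsets */
                start += BBTOB(seg_len_bb[i]);
        }
        return 0;
}
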
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index e0646659ce16..ccb0811963b2 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -74,6 +74,7 @@ xfs_qm_dqdestroy(
74{ 74{
75 ASSERT(list_empty(&dqp->q_lru)); 75 ASSERT(list_empty(&dqp->q_lru));
76 76
77 kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
77 mutex_destroy(&dqp->q_qlock); 78 mutex_destroy(&dqp->q_qlock);
78 79
79 XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); 80 XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 814cff94e78f..2c7a1629e064 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed(
370 spin_lock(&ailp->xa_lock); 370 spin_lock(&ailp->xa_lock);
371 xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); 371 xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
372 372
373 kmem_free(qfs->qql_item.li_lv_shadow);
374 kmem_free(lip->li_lv_shadow);
373 kmem_free(qfs); 375 kmem_free(qfs);
374 kmem_free(qfe); 376 kmem_free(qfe);
375 return (xfs_lsn_t)-1; 377 return (xfs_lsn_t)-1;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 88693a98fac5..ed7ee4e8af73 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -55,12 +55,15 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
55} 55}
56 56
57int 57int
58xfs_errortag_add(int error_tag, xfs_mount_t *mp) 58xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp)
59{ 59{
60 int i; 60 int i;
61 int len; 61 int len;
62 int64_t fsid; 62 int64_t fsid;
63 63
64 if (error_tag >= XFS_ERRTAG_MAX)
65 return -EINVAL;
66
64 memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); 67 memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
65 68
66 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 69 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 4ed3042a0f16..2e4f67f68856 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -128,7 +128,7 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
128 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 128 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
129 (rf)))) 129 (rf))))
130 130
131extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); 131extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp);
132extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); 132extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
133#else 133#else
134#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 134#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 4aa0153214f9..ab779460ecbf 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -40,6 +40,7 @@ void
40xfs_efi_item_free( 40xfs_efi_item_free(
41 struct xfs_efi_log_item *efip) 41 struct xfs_efi_log_item *efip)
42{ 42{
43 kmem_free(efip->efi_item.li_lv_shadow);
43 if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) 44 if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
44 kmem_free(efip); 45 kmem_free(efip);
45 else 46 else
@@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
300STATIC void 301STATIC void
301xfs_efd_item_free(struct xfs_efd_log_item *efdp) 302xfs_efd_item_free(struct xfs_efd_log_item *efdp)
302{ 303{
304 kmem_free(efdp->efd_item.li_lv_shadow);
303 if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) 305 if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
304 kmem_free(efdp); 306 kmem_free(efdp);
305 else 307 else
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1b3dc9dd8861..ed95e5bb04e6 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -37,6 +37,7 @@
37#include "xfs_log.h" 37#include "xfs_log.h"
38#include "xfs_icache.h" 38#include "xfs_icache.h"
39#include "xfs_pnfs.h" 39#include "xfs_pnfs.h"
40#include "xfs_iomap.h"
40 41
41#include <linux/dcache.h> 42#include <linux/dcache.h>
42#include <linux/falloc.h> 43#include <linux/falloc.h>
@@ -80,61 +81,17 @@ xfs_rw_ilock_demote(
80} 81}
81 82
82/* 83/*
83 * xfs_iozero clears the specified range supplied via the page cache (except in 84 * Clear the specified ranges to zero through either the pagecache or DAX.
84 * the DAX case). Writes through the page cache will allocate blocks over holes, 85 * Holes and unwritten extents will be left as-is as they already are zeroed.
85 * though the callers usually map the holes first and avoid them. If a block is
86 * not completely zeroed, then it will be read from disk before being partially
87 * zeroed.
88 *
89 * In the DAX case, we can just directly write to the underlying pages. This
90 * will not allocate blocks, but will avoid holes and unwritten extents and so
91 * not do unnecessary work.
92 */ 86 */
93int 87int
94xfs_iozero( 88xfs_zero_range(
95 struct xfs_inode *ip, /* inode */ 89 struct xfs_inode *ip,
96 loff_t pos, /* offset in file */ 90 xfs_off_t pos,
97 size_t count) /* size of data to zero */ 91 xfs_off_t count,
92 bool *did_zero)
98{ 93{
99 struct page *page; 94 return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
100 struct address_space *mapping;
101 int status = 0;
102
103
104 mapping = VFS_I(ip)->i_mapping;
105 do {
106 unsigned offset, bytes;
107 void *fsdata;
108
109 offset = (pos & (PAGE_SIZE -1)); /* Within page */
110 bytes = PAGE_SIZE - offset;
111 if (bytes > count)
112 bytes = count;
113
114 if (IS_DAX(VFS_I(ip))) {
115 status = dax_zero_page_range(VFS_I(ip), pos, bytes,
116 xfs_get_blocks_direct);
117 if (status)
118 break;
119 } else {
120 status = pagecache_write_begin(NULL, mapping, pos, bytes,
121 AOP_FLAG_UNINTERRUPTIBLE,
122 &page, &fsdata);
123 if (status)
124 break;
125
126 zero_user(page, offset, bytes);
127
128 status = pagecache_write_end(NULL, mapping, pos, bytes,
129 bytes, page, fsdata);
130 WARN_ON(status <= 0); /* can't return less than zero! */
131 status = 0;
132 }
133 pos += bytes;
134 count -= bytes;
135 } while (count);
136
137 return status;
138} 95}
139 96
140int 97int
@@ -282,48 +239,35 @@ xfs_file_fsync(
282} 239}
283 240
284STATIC ssize_t 241STATIC ssize_t
285xfs_file_read_iter( 242xfs_file_dio_aio_read(
286 struct kiocb *iocb, 243 struct kiocb *iocb,
287 struct iov_iter *to) 244 struct iov_iter *to)
288{ 245{
289 struct file *file = iocb->ki_filp; 246 struct address_space *mapping = iocb->ki_filp->f_mapping;
290 struct inode *inode = file->f_mapping->host; 247 struct inode *inode = mapping->host;
291 struct xfs_inode *ip = XFS_I(inode); 248 struct xfs_inode *ip = XFS_I(inode);
292 struct xfs_mount *mp = ip->i_mount; 249 loff_t isize = i_size_read(inode);
293 size_t size = iov_iter_count(to); 250 size_t count = iov_iter_count(to);
251 struct iov_iter data;
252 struct xfs_buftarg *target;
294 ssize_t ret = 0; 253 ssize_t ret = 0;
295 int ioflags = 0;
296 xfs_fsize_t n;
297 loff_t pos = iocb->ki_pos;
298 254
299 XFS_STATS_INC(mp, xs_read_calls); 255 trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
300
301 if (unlikely(iocb->ki_flags & IOCB_DIRECT))
302 ioflags |= XFS_IO_ISDIRECT;
303 if (file->f_mode & FMODE_NOCMTIME)
304 ioflags |= XFS_IO_INVIS;
305
306 if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
307 xfs_buftarg_t *target =
308 XFS_IS_REALTIME_INODE(ip) ?
309 mp->m_rtdev_targp : mp->m_ddev_targp;
310 /* DIO must be aligned to device logical sector size */
311 if ((pos | size) & target->bt_logical_sectormask) {
312 if (pos == i_size_read(inode))
313 return 0;
314 return -EINVAL;
315 }
316 }
317 256
318 n = mp->m_super->s_maxbytes - pos; 257 if (!count)
319 if (n <= 0 || size == 0) 258 return 0; /* skip atime */
320 return 0;
321 259
322 if (n < size) 260 if (XFS_IS_REALTIME_INODE(ip))
323 size = n; 261 target = ip->i_mount->m_rtdev_targp;
262 else
263 target = ip->i_mount->m_ddev_targp;
324 264
325 if (XFS_FORCED_SHUTDOWN(mp)) 265 /* DIO must be aligned to device logical sector size */
326 return -EIO; 266 if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
267 if (iocb->ki_pos == isize)
268 return 0;
269 return -EINVAL;
270 }
327 271
328 /* 272 /*
329 * Locking is a bit tricky here. If we take an exclusive lock for direct 273 * Locking is a bit tricky here. If we take an exclusive lock for direct
@@ -336,7 +280,7 @@ xfs_file_read_iter(
336 * serialisation. 280 * serialisation.
337 */ 281 */
338 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); 282 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
339 if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { 283 if (mapping->nrpages) {
340 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); 284 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
341 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); 285 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
342 286
@@ -351,8 +295,8 @@ xfs_file_read_iter(
351 * flush and reduce the chances of repeated iolock cycles going 295 * flush and reduce the chances of repeated iolock cycles going
352 * forward. 296 * forward.
353 */ 297 */
354 if (inode->i_mapping->nrpages) { 298 if (mapping->nrpages) {
355 ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); 299 ret = filemap_write_and_wait(mapping);
356 if (ret) { 300 if (ret) {
357 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); 301 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
358 return ret; 302 return ret;
@@ -363,20 +307,95 @@ xfs_file_read_iter(
363 * we fail to invalidate a page, but this should never 307 * we fail to invalidate a page, but this should never
364 * happen on XFS. Warn if it does fail. 308 * happen on XFS. Warn if it does fail.
365 */ 309 */
366 ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); 310 ret = invalidate_inode_pages2(mapping);
367 WARN_ON_ONCE(ret); 311 WARN_ON_ONCE(ret);
368 ret = 0; 312 ret = 0;
369 } 313 }
370 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 314 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
371 } 315 }
372 316
373 trace_xfs_file_read(ip, size, pos, ioflags); 317 data = *to;
318 ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
319 xfs_get_blocks_direct, NULL, NULL, 0);
320 if (ret > 0) {
321 iocb->ki_pos += ret;
322 iov_iter_advance(to, ret);
323 }
324 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
374 325
326 file_accessed(iocb->ki_filp);
327 return ret;
328}
329
330static noinline ssize_t
331xfs_file_dax_read(
332 struct kiocb *iocb,
333 struct iov_iter *to)
334{
335 struct address_space *mapping = iocb->ki_filp->f_mapping;
336 struct inode *inode = mapping->host;
337 struct xfs_inode *ip = XFS_I(inode);
338 struct iov_iter data = *to;
339 size_t count = iov_iter_count(to);
340 ssize_t ret = 0;
341
342 trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
343
344 if (!count)
345 return 0; /* skip atime */
346
347 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
348 ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
349 if (ret > 0) {
350 iocb->ki_pos += ret;
351 iov_iter_advance(to, ret);
352 }
353 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
354
355 file_accessed(iocb->ki_filp);
356 return ret;
357}
358
359STATIC ssize_t
360xfs_file_buffered_aio_read(
361 struct kiocb *iocb,
362 struct iov_iter *to)
363{
364 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
365 ssize_t ret;
366
367 trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
368
369 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
375 ret = generic_file_read_iter(iocb, to); 370 ret = generic_file_read_iter(iocb, to);
371 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
372
373 return ret;
374}
375
376STATIC ssize_t
377xfs_file_read_iter(
378 struct kiocb *iocb,
379 struct iov_iter *to)
380{
381 struct inode *inode = file_inode(iocb->ki_filp);
382 struct xfs_mount *mp = XFS_I(inode)->i_mount;
383 ssize_t ret = 0;
384
385 XFS_STATS_INC(mp, xs_read_calls);
386
387 if (XFS_FORCED_SHUTDOWN(mp))
388 return -EIO;
389
390 if (IS_DAX(inode))
391 ret = xfs_file_dax_read(iocb, to);
392 else if (iocb->ki_flags & IOCB_DIRECT)
393 ret = xfs_file_dio_aio_read(iocb, to);
394 else
395 ret = xfs_file_buffered_aio_read(iocb, to);
396
376 if (ret > 0) 397 if (ret > 0)
377 XFS_STATS_ADD(mp, xs_read_bytes, ret); 398 XFS_STATS_ADD(mp, xs_read_bytes, ret);
378
379 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
380 return ret; 399 return ret;
381} 400}
382 401
@@ -389,18 +408,14 @@ xfs_file_splice_read(
389 unsigned int flags) 408 unsigned int flags)
390{ 409{
391 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); 410 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
392 int ioflags = 0;
393 ssize_t ret; 411 ssize_t ret;
394 412
395 XFS_STATS_INC(ip->i_mount, xs_read_calls); 413 XFS_STATS_INC(ip->i_mount, xs_read_calls);
396 414
397 if (infilp->f_mode & FMODE_NOCMTIME)
398 ioflags |= XFS_IO_INVIS;
399
400 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 415 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
401 return -EIO; 416 return -EIO;
402 417
403 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 418 trace_xfs_file_splice_read(ip, count, *ppos);
404 419
405 /* 420 /*
406 * DAX inodes cannot use the page cache for splice, so we have to push 421
@@ -424,49 +439,6 @@ out:
424} 439}
425 440
426/* 441/*
427 * This routine is called to handle zeroing any space in the last block of the
428 * file that is beyond the EOF. We do this since the size is being increased
429 * without writing anything to that block and we don't want to read the
430 * garbage on the disk.
431 */
432STATIC int /* error (positive) */
433xfs_zero_last_block(
434 struct xfs_inode *ip,
435 xfs_fsize_t offset,
436 xfs_fsize_t isize,
437 bool *did_zeroing)
438{
439 struct xfs_mount *mp = ip->i_mount;
440 xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
441 int zero_offset = XFS_B_FSB_OFFSET(mp, isize);
442 int zero_len;
443 int nimaps = 1;
444 int error = 0;
445 struct xfs_bmbt_irec imap;
446
447 xfs_ilock(ip, XFS_ILOCK_EXCL);
448 error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
449 xfs_iunlock(ip, XFS_ILOCK_EXCL);
450 if (error)
451 return error;
452
453 ASSERT(nimaps > 0);
454
455 /*
456 * If the block underlying isize is just a hole, then there
457 * is nothing to zero.
458 */
459 if (imap.br_startblock == HOLESTARTBLOCK)
460 return 0;
461
462 zero_len = mp->m_sb.sb_blocksize - zero_offset;
463 if (isize + zero_len > offset)
464 zero_len = offset - isize;
465 *did_zeroing = true;
466 return xfs_iozero(ip, isize, zero_len);
467}
468
469/*
470 * Zero any on disk space between the current EOF and the new, larger EOF. 442 * Zero any on disk space between the current EOF and the new, larger EOF.
471 * 443 *
472 * This handles the normal case of zeroing the remainder of the last block in 444 * This handles the normal case of zeroing the remainder of the last block in
@@ -484,94 +456,11 @@ xfs_zero_eof(
484 xfs_fsize_t isize, /* current inode size */ 456 xfs_fsize_t isize, /* current inode size */
485 bool *did_zeroing) 457 bool *did_zeroing)
486{ 458{
487 struct xfs_mount *mp = ip->i_mount;
488 xfs_fileoff_t start_zero_fsb;
489 xfs_fileoff_t end_zero_fsb;
490 xfs_fileoff_t zero_count_fsb;
491 xfs_fileoff_t last_fsb;
492 xfs_fileoff_t zero_off;
493 xfs_fsize_t zero_len;
494 int nimaps;
495 int error = 0;
496 struct xfs_bmbt_irec imap;
497
498 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 459 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
499 ASSERT(offset > isize); 460 ASSERT(offset > isize);
500 461
501 trace_xfs_zero_eof(ip, isize, offset - isize); 462 trace_xfs_zero_eof(ip, isize, offset - isize);
502 463 return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
503 /*
504 * First handle zeroing the block on which isize resides.
505 *
506 * We only zero a part of that block so it is handled specially.
507 */
508 if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
509 error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
510 if (error)
511 return error;
512 }
513
514 /*
515 * Calculate the range between the new size and the old where blocks
516 * needing to be zeroed may exist.
517 *
518 * To get the block where the last byte in the file currently resides,
519 * we need to subtract one from the size and truncate back to a block
520 * boundary. We subtract 1 in case the size is exactly on a block
521 * boundary.
522 */
523 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
524 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
525 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
526 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
527 if (last_fsb == end_zero_fsb) {
528 /*
529 * The size was only incremented on its last block.
530 * We took care of that above, so just return.
531 */
532 return 0;
533 }
534
535 ASSERT(start_zero_fsb <= end_zero_fsb);
536 while (start_zero_fsb <= end_zero_fsb) {
537 nimaps = 1;
538 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
539
540 xfs_ilock(ip, XFS_ILOCK_EXCL);
541 error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
542 &imap, &nimaps, 0);
543 xfs_iunlock(ip, XFS_ILOCK_EXCL);
544 if (error)
545 return error;
546
547 ASSERT(nimaps > 0);
548
549 if (imap.br_state == XFS_EXT_UNWRITTEN ||
550 imap.br_startblock == HOLESTARTBLOCK) {
551 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
552 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
553 continue;
554 }
555
556 /*
557 * There are blocks we need to zero.
558 */
559 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
560 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
561
562 if ((zero_off + zero_len) > offset)
563 zero_len = offset - zero_off;
564
565 error = xfs_iozero(ip, zero_off, zero_len);
566 if (error)
567 return error;
568
569 *did_zeroing = true;
570 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
571 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
572 }
573
574 return 0;
575} 464}
576 465
577/* 466/*
@@ -722,8 +611,7 @@ xfs_file_dio_aio_write(
722 mp->m_rtdev_targp : mp->m_ddev_targp; 611 mp->m_rtdev_targp : mp->m_ddev_targp;
723 612
724 /* DIO must be aligned to device logical sector size */ 613 /* DIO must be aligned to device logical sector size */
725 if (!IS_DAX(inode) && 614 if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
726 ((iocb->ki_pos | count) & target->bt_logical_sectormask))
727 return -EINVAL; 615 return -EINVAL;
728 616
729 /* "unaligned" here means not aligned to a filesystem block */ 617 /* "unaligned" here means not aligned to a filesystem block */
@@ -762,7 +650,7 @@ xfs_file_dio_aio_write(
762 end = iocb->ki_pos + count - 1; 650 end = iocb->ki_pos + count - 1;
763 651
764 /* 652 /*
765 * See xfs_file_read_iter() for why we do a full-file flush here. 653 * See xfs_file_dio_aio_read() for why we do a full-file flush here.
766 */ 654 */
767 if (mapping->nrpages) { 655 if (mapping->nrpages) {
768 ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); 656 ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
@@ -789,10 +677,12 @@ xfs_file_dio_aio_write(
789 iolock = XFS_IOLOCK_SHARED; 677 iolock = XFS_IOLOCK_SHARED;
790 } 678 }
791 679
792 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); 680 trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
793 681
794 data = *from; 682 data = *from;
795 ret = mapping->a_ops->direct_IO(iocb, &data); 683 ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
684 xfs_get_blocks_direct, xfs_end_io_direct_write,
685 NULL, DIO_ASYNC_EXTEND);
796 686
797 /* see generic_file_direct_write() for why this is necessary */ 687 /* see generic_file_direct_write() for why this is necessary */
798 if (mapping->nrpages) { 688 if (mapping->nrpages) {
@@ -809,10 +699,70 @@ out:
809 xfs_rw_iunlock(ip, iolock); 699 xfs_rw_iunlock(ip, iolock);
810 700
811 /* 701 /*
812 * No fallback to buffered IO on errors for XFS. DAX can result in 702 * No fallback to buffered IO on errors for XFS, direct IO will either
813 * partial writes, but direct IO will either complete fully or fail. 703 * complete fully or fail.
814 */ 704 */
815 ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); 705 ASSERT(ret < 0 || ret == count);
706 return ret;
707}
708
709static noinline ssize_t
710xfs_file_dax_write(
711 struct kiocb *iocb,
712 struct iov_iter *from)
713{
714 struct address_space *mapping = iocb->ki_filp->f_mapping;
715 struct inode *inode = mapping->host;
716 struct xfs_inode *ip = XFS_I(inode);
717 struct xfs_mount *mp = ip->i_mount;
718 ssize_t ret = 0;
719 int unaligned_io = 0;
720 int iolock;
721 struct iov_iter data;
722
723 /* "unaligned" here means not aligned to a filesystem block */
724 if ((iocb->ki_pos & mp->m_blockmask) ||
725 ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
726 unaligned_io = 1;
727 iolock = XFS_IOLOCK_EXCL;
728 } else if (mapping->nrpages) {
729 iolock = XFS_IOLOCK_EXCL;
730 } else {
731 iolock = XFS_IOLOCK_SHARED;
732 }
733 xfs_rw_ilock(ip, iolock);
734
735 ret = xfs_file_aio_write_checks(iocb, from, &iolock);
736 if (ret)
737 goto out;
738
739 /*
740 * Yes, even DAX files can have page cache attached to them: A zeroed
741 * page is inserted into the pagecache when we have to serve a write
742 * fault on a hole. It should never be dirtied and can simply be
743 * dropped from the pagecache once we get real data for the page.
744 */
745 if (mapping->nrpages) {
746 ret = invalidate_inode_pages2(mapping);
747 WARN_ON_ONCE(ret);
748 }
749
750 if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
751 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
752 iolock = XFS_IOLOCK_SHARED;
753 }
754
755 trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
756
757 data = *from;
758 ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
759 xfs_end_io_direct_write, 0);
760 if (ret > 0) {
761 iocb->ki_pos += ret;
762 iov_iter_advance(from, ret);
763 }
764out:
765 xfs_rw_iunlock(ip, iolock);
816 return ret; 766 return ret;
817} 767}
818 768
@@ -839,9 +789,8 @@ xfs_file_buffered_aio_write(
839 current->backing_dev_info = inode_to_bdi(inode); 789 current->backing_dev_info = inode_to_bdi(inode);
840 790
841write_retry: 791write_retry:
842 trace_xfs_file_buffered_write(ip, iov_iter_count(from), 792 trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
843 iocb->ki_pos, 0); 793 ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
844 ret = generic_perform_write(file, from, iocb->ki_pos);
845 if (likely(ret >= 0)) 794 if (likely(ret >= 0))
846 iocb->ki_pos += ret; 795 iocb->ki_pos += ret;
847 796
@@ -895,7 +844,9 @@ xfs_file_write_iter(
895 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 844 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
896 return -EIO; 845 return -EIO;
897 846
898 if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) 847 if (IS_DAX(inode))
848 ret = xfs_file_dax_write(iocb, from);
849 else if (iocb->ki_flags & IOCB_DIRECT)
899 ret = xfs_file_dio_aio_write(iocb, from); 850 ret = xfs_file_dio_aio_write(iocb, from);
900 else 851 else
901 ret = xfs_file_buffered_aio_write(iocb, from); 852 ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1553,7 +1504,7 @@ xfs_filemap_page_mkwrite(
1553 if (IS_DAX(inode)) { 1504 if (IS_DAX(inode)) {
1554 ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); 1505 ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
1555 } else { 1506 } else {
1556 ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); 1507 ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
1557 ret = block_page_mkwrite_return(ret); 1508 ret = block_page_mkwrite_return(ret);
1558 } 1509 }
1559 1510
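
The split read and write paths above both reject misaligned direct I/O with the same bit trick: for a power-of-two logical sector size, OR-ing the file position and the length and masking with (sector size - 1) is non-zero whenever either value is misaligned. The small sketch below demonstrates the check with made-up values; the sector size and function name are illustrative, not taken from the patch.

/*
 * (pos | count) & (sector_size - 1) is zero only if both pos and count are
 * multiples of sector_size (sector_size must be a power of two).
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool dio_aligned(uint64_t pos, uint64_t count, uint64_t sector_size)
{
        uint64_t mask = sector_size - 1;        /* e.g. 512 -> 0x1ff */

        return ((pos | count) & mask) == 0;
}

int main(void)
{
        printf("%d\n", dio_aligned(4096, 8192, 512));   /* 1: both aligned     */
        printf("%d\n", dio_aligned(4096, 100,  512));   /* 0: length unaligned */
        printf("%d\n", dio_aligned(100,  512,  512));   /* 0: offset unaligned */
        return 0;
}
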
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index b4d75825ae37..7191c3878b4a 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -667,8 +667,11 @@ xfs_reserve_blocks(
667 __uint64_t *inval, 667 __uint64_t *inval,
668 xfs_fsop_resblks_t *outval) 668 xfs_fsop_resblks_t *outval)
669{ 669{
670 __int64_t lcounter, delta, fdblks_delta; 670 __int64_t lcounter, delta;
671 __int64_t fdblks_delta = 0;
671 __uint64_t request; 672 __uint64_t request;
673 __int64_t free;
674 int error = 0;
672 675
673 /* If inval is null, report current values and return */ 676 /* If inval is null, report current values and return */
674 if (inval == (__uint64_t *)NULL) { 677 if (inval == (__uint64_t *)NULL) {
@@ -682,24 +685,23 @@ xfs_reserve_blocks(
682 request = *inval; 685 request = *inval;
683 686
684 /* 687 /*
685 * With per-cpu counters, this becomes an interesting 688 * With per-cpu counters, this becomes an interesting problem. We need
686 * problem. We need to work out if we are freeing or allocating 689 * to work out if we are freeing or allocating blocks first, then we can
687 * blocks first, then we can do the modification as necessary. 690 * do the modification as necessary.
688 * 691 *
689 * We do this under the m_sb_lock so that if we are near 692 * We do this under the m_sb_lock so that if we are near ENOSPC, we will
690 * ENOSPC, we will hold out any changes while we work out 693 * hold out any changes while we work out what to do. This means that
691 * what to do. This means that the amount of free space can 694 * the amount of free space can change while we do this, so we need to
692 * change while we do this, so we need to retry if we end up 695 * retry if we end up trying to reserve more space than is available.
693 * trying to reserve more space than is available.
694 */ 696 */
695retry:
696 spin_lock(&mp->m_sb_lock); 697 spin_lock(&mp->m_sb_lock);
697 698
698 /* 699 /*
699 * If our previous reservation was larger than the current value, 700 * If our previous reservation was larger than the current value,
700 * then move any unused blocks back to the free pool. 701 * then move any unused blocks back to the free pool. Modify the resblks
702 * counters directly since we shouldn't have any problems unreserving
703 * space.
701 */ 704 */
702 fdblks_delta = 0;
703 if (mp->m_resblks > request) { 705 if (mp->m_resblks > request) {
704 lcounter = mp->m_resblks_avail - request; 706 lcounter = mp->m_resblks_avail - request;
705 if (lcounter > 0) { /* release unused blocks */ 707 if (lcounter > 0) { /* release unused blocks */
@@ -707,54 +709,67 @@ retry:
707 mp->m_resblks_avail -= lcounter; 709 mp->m_resblks_avail -= lcounter;
708 } 710 }
709 mp->m_resblks = request; 711 mp->m_resblks = request;
710 } else { 712 if (fdblks_delta) {
711 __int64_t free; 713 spin_unlock(&mp->m_sb_lock);
714 error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
715 spin_lock(&mp->m_sb_lock);
716 }
717
718 goto out;
719 }
712 720
721 /*
722 * If the request is larger than the current reservation, reserve the
723 * blocks before we update the reserve counters. Sample m_fdblocks and
724 * perform a partial reservation if the request exceeds free space.
725 */
726 error = -ENOSPC;
727 do {
713 free = percpu_counter_sum(&mp->m_fdblocks) - 728 free = percpu_counter_sum(&mp->m_fdblocks) -
714 XFS_ALLOC_SET_ASIDE(mp); 729 XFS_ALLOC_SET_ASIDE(mp);
715 if (!free) 730 if (!free)
716 goto out; /* ENOSPC and fdblks_delta = 0 */ 731 break;
717 732
718 delta = request - mp->m_resblks; 733 delta = request - mp->m_resblks;
719 lcounter = free - delta; 734 lcounter = free - delta;
720 if (lcounter < 0) { 735 if (lcounter < 0)
721 /* We can't satisfy the request, just get what we can */ 736 /* We can't satisfy the request, just get what we can */
722 mp->m_resblks += free; 737 fdblks_delta = free;
723 mp->m_resblks_avail += free; 738 else
724 fdblks_delta = -free; 739 fdblks_delta = delta;
725 } else {
726 fdblks_delta = -delta;
727 mp->m_resblks = request;
728 mp->m_resblks_avail += delta;
729 }
730 }
731out:
732 if (outval) {
733 outval->resblks = mp->m_resblks;
734 outval->resblks_avail = mp->m_resblks_avail;
735 }
736 spin_unlock(&mp->m_sb_lock);
737 740
738 if (fdblks_delta) {
739 /* 741 /*
740 * If we are putting blocks back here, m_resblks_avail is 742 * We'll either succeed in getting space from the free block
741 * already at its max so this will put it in the free pool. 743 * count or we'll get an ENOSPC. If we get an ENOSPC, it means
742 * 744 * things changed while we were calculating fdblks_delta and so
743 * If we need space, we'll either succeed in getting it 745 * we should try again to see if there is anything left to
744 * from the free block count or we'll get an enospc. If 746 * reserve.
745 * we get a ENOSPC, it means things changed while we were
746 * calculating fdblks_delta and so we should try again to
747 * see if there is anything left to reserve.
748 * 747 *
749 * Don't set the reserved flag here - we don't want to reserve 748 * Don't set the reserved flag here - we don't want to reserve
750 * the extra reserve blocks from the reserve..... 749 * the extra reserve blocks from the reserve.....
751 */ 750 */
752 int error; 751 spin_unlock(&mp->m_sb_lock);
753 error = xfs_mod_fdblocks(mp, fdblks_delta, 0); 752 error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
754 if (error == -ENOSPC) 753 spin_lock(&mp->m_sb_lock);
755 goto retry; 754 } while (error == -ENOSPC);
755
756 /*
757 * Update the reserve counters if blocks have been successfully
758 * allocated.
759 */
760 if (!error && fdblks_delta) {
761 mp->m_resblks += fdblks_delta;
762 mp->m_resblks_avail += fdblks_delta;
756 } 763 }
757 return 0; 764
765out:
766 if (outval) {
767 outval->resblks = mp->m_resblks;
768 outval->resblks_avail = mp->m_resblks_avail;
769 }
770
771 spin_unlock(&mp->m_sb_lock);
772 return error;
758} 773}
759 774
760int 775int
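
The xfs_reserve_blocks rework above samples the summed per-cpu free-space counter, computes a delta, drops the lock to attempt the modification, and retries on ENOSPC because the free space may have changed in the meantime. The user-space sketch below shows that sample-then-take-then-retry shape; take_blocks() is a made-up stand-in for xfs_mod_fdblocks(), and the single atomic stands in for the per-cpu counter.

/*
 * Sample the (racy) free-space count, clamp the request to what looked
 * available, attempt to take it, and retry if the attempt races and fails.
 */
#include <stdio.h>
#include <stdatomic.h>
#include <errno.h>

static atomic_long free_blocks = 1000;

static int take_blocks(long delta)              /* fails rather than going negative */
{
        long old = atomic_load(&free_blocks);

        do {
                if (old < delta)
                        return -ENOSPC;
        } while (!atomic_compare_exchange_weak(&free_blocks, &old, old - delta));
        return 0;
}

int main(void)
{
        long request = 800, reserved = 0;
        int error = 0;

        do {
                long avail = atomic_load(&free_blocks); /* racy sample */
                long delta = request - reserved;

                if (!avail)
                        break;                          /* truly out of space */
                if (delta > avail)
                        delta = avail;                  /* partial reservation */

                error = take_blocks(delta);             /* may race and fail */
                if (!error)
                        reserved += delta;
        } while (error == -ENOSPC);

        printf("reserved %ld of %ld blocks\n", reserved, request);
        return 0;
}
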
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 99ee6eee5e0b..fb39a66914dd 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -765,7 +765,7 @@ restart:
765 * Background scanning to trim post-EOF preallocated space. This is queued 765 * Background scanning to trim post-EOF preallocated space. This is queued
766 * based on the 'speculative_prealloc_lifetime' tunable (5m by default). 766 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
767 */ 767 */
768STATIC void 768void
769xfs_queue_eofblocks( 769xfs_queue_eofblocks(
770 struct xfs_mount *mp) 770 struct xfs_mount *mp)
771{ 771{
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 62f1f91c32cb..05bac99bef75 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -68,6 +68,7 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
68int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); 68int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
69int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); 69int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
70void xfs_eofblocks_worker(struct work_struct *); 70void xfs_eofblocks_worker(struct work_struct *);
71void xfs_queue_eofblocks(struct xfs_mount *);
71 72
72int xfs_inode_ag_iterator(struct xfs_mount *mp, 73int xfs_inode_ag_iterator(struct xfs_mount *mp,
73 int (*execute)(struct xfs_inode *ip, int flags, void *args), 74 int (*execute)(struct xfs_inode *ip, int flags, void *args),
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ee6799e0476f..8825bcfd314c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -431,7 +431,7 @@ xfs_lock_inumorder(int lock_mode, int subclass)
431 * lock more than one at a time, lockdep will report false positives saying we 431 * lock more than one at a time, lockdep will report false positives saying we
432 * have violated locking orders. 432 * have violated locking orders.
433 */ 433 */
434void 434static void
435xfs_lock_inodes( 435xfs_lock_inodes(
436 xfs_inode_t **ips, 436 xfs_inode_t **ips,
437 int inodes, 437 int inodes,
@@ -667,14 +667,6 @@ xfs_ip2xflags(
667 return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); 667 return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
668} 668}
669 669
670uint
671xfs_dic2xflags(
672 struct xfs_dinode *dip)
673{
674 return _xfs_dic2xflags(be16_to_cpu(dip->di_flags),
675 be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip));
676}
677
678/* 670/*
679 * Lookups up an inode from "name". If ci_name is not NULL, then a CI match 671 * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
680 * is allowed, otherwise it has to be an exact match. If a CI match is found, 672 * is allowed, otherwise it has to be an exact match. If a CI match is found,
@@ -748,7 +740,7 @@ out_unlock:
748 * are not linked into the directory structure - they are attached 740 * are not linked into the directory structure - they are attached
749 * directly to the superblock - and so have no parent. 741 * directly to the superblock - and so have no parent.
750 */ 742 */
751int 743static int
752xfs_ialloc( 744xfs_ialloc(
753 xfs_trans_t *tp, 745 xfs_trans_t *tp,
754 xfs_inode_t *pip, 746 xfs_inode_t *pip,
@@ -1085,7 +1077,7 @@ xfs_dir_ialloc(
1085 * link count to go to zero, move the inode to AGI unlinked list so that it can 1077 * link count to go to zero, move the inode to AGI unlinked list so that it can
1086 * be freed when the last active reference goes away via xfs_inactive(). 1078 * be freed when the last active reference goes away via xfs_inactive().
1087 */ 1079 */
1088int /* error */ 1080static int /* error */
1089xfs_droplink( 1081xfs_droplink(
1090 xfs_trans_t *tp, 1082 xfs_trans_t *tp,
1091 xfs_inode_t *ip) 1083 xfs_inode_t *ip)
@@ -1104,7 +1096,7 @@ xfs_droplink(
1104/* 1096/*
1105 * Increment the link count on an inode & log the change. 1097 * Increment the link count on an inode & log the change.
1106 */ 1098 */
1107int 1099static int
1108xfs_bumplink( 1100xfs_bumplink(
1109 xfs_trans_t *tp, 1101 xfs_trans_t *tp,
1110 xfs_inode_t *ip) 1102 xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e52d7c7aeb5b..8eb78ec4a6e2 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -395,12 +395,8 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
395int xfs_isilocked(xfs_inode_t *, uint); 395int xfs_isilocked(xfs_inode_t *, uint);
396uint xfs_ilock_data_map_shared(struct xfs_inode *); 396uint xfs_ilock_data_map_shared(struct xfs_inode *);
397uint xfs_ilock_attr_map_shared(struct xfs_inode *); 397uint xfs_ilock_attr_map_shared(struct xfs_inode *);
398int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
399 xfs_nlink_t, xfs_dev_t, prid_t, int,
400 struct xfs_buf **, xfs_inode_t **);
401 398
402uint xfs_ip2xflags(struct xfs_inode *); 399uint xfs_ip2xflags(struct xfs_inode *);
403uint xfs_dic2xflags(struct xfs_dinode *);
404int xfs_ifree(struct xfs_trans *, xfs_inode_t *, 400int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
405 struct xfs_bmap_free *); 401 struct xfs_bmap_free *);
406int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, 402int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
@@ -411,7 +407,6 @@ void xfs_iunpin_wait(xfs_inode_t *);
411#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) 407#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
412 408
413int xfs_iflush(struct xfs_inode *, struct xfs_buf **); 409int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
414void xfs_lock_inodes(xfs_inode_t **, int, uint);
415void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 410void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
416 411
417xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); 412xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
@@ -419,8 +414,6 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
419int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, 414int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
420 xfs_nlink_t, xfs_dev_t, prid_t, int, 415 xfs_nlink_t, xfs_dev_t, prid_t, int,
421 struct xfs_inode **, int *); 416 struct xfs_inode **, int *);
422int xfs_droplink(struct xfs_trans *, struct xfs_inode *);
423int xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
424 417
425/* from xfs_file.c */ 418/* from xfs_file.c */
426enum xfs_prealloc_flags { 419enum xfs_prealloc_flags {
@@ -434,7 +427,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip,
434 enum xfs_prealloc_flags flags); 427 enum xfs_prealloc_flags flags);
435int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, 428int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
436 xfs_fsize_t isize, bool *did_zeroing); 429 xfs_fsize_t isize, bool *did_zeroing);
437int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); 430int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count,
431 bool *did_zero);
438loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, 432loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start,
439 loff_t eof, int whence); 433 loff_t eof, int whence);
440 434
@@ -479,14 +473,4 @@ do { \
479 473
480extern struct kmem_zone *xfs_inode_zone; 474extern struct kmem_zone *xfs_inode_zone;
481 475
482/*
483 * Flags for read/write calls
484 */
485#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */
486#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */
487
488#define XFS_IO_FLAGS \
489 { XFS_IO_ISDIRECT, "DIRECT" }, \
490 { XFS_IO_INVIS, "INVIS"}
491
492#endif /* __XFS_INODE_H__ */ 476#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index a1b07612224c..892c2aced207 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -651,6 +651,7 @@ void
651xfs_inode_item_destroy( 651xfs_inode_item_destroy(
652 xfs_inode_t *ip) 652 xfs_inode_t *ip)
653{ 653{
654 kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
654 kmem_zone_free(xfs_ili_zone, ip->i_itemp); 655 kmem_zone_free(xfs_ili_zone, ip->i_itemp);
655} 656}
656 657
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 63a6ff2cfc68..9a7c87809d3b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -595,13 +595,12 @@ xfs_attrmulti_by_handle(
595 595
596int 596int
597xfs_ioc_space( 597xfs_ioc_space(
598 struct xfs_inode *ip,
599 struct inode *inode,
600 struct file *filp, 598 struct file *filp,
601 int ioflags,
602 unsigned int cmd, 599 unsigned int cmd,
603 xfs_flock64_t *bf) 600 xfs_flock64_t *bf)
604{ 601{
602 struct inode *inode = file_inode(filp);
603 struct xfs_inode *ip = XFS_I(inode);
605 struct iattr iattr; 604 struct iattr iattr;
606 enum xfs_prealloc_flags flags = 0; 605 enum xfs_prealloc_flags flags = 0;
607 uint iolock = XFS_IOLOCK_EXCL; 606 uint iolock = XFS_IOLOCK_EXCL;
@@ -626,7 +625,7 @@ xfs_ioc_space(
626 625
627 if (filp->f_flags & O_DSYNC) 626 if (filp->f_flags & O_DSYNC)
628 flags |= XFS_PREALLOC_SYNC; 627 flags |= XFS_PREALLOC_SYNC;
629 if (ioflags & XFS_IO_INVIS) 628 if (filp->f_mode & FMODE_NOCMTIME)
630 flags |= XFS_PREALLOC_INVISIBLE; 629 flags |= XFS_PREALLOC_INVISIBLE;
631 630
632 error = mnt_want_write_file(filp); 631 error = mnt_want_write_file(filp);
@@ -1464,8 +1463,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
1464 1463
1465STATIC int 1464STATIC int
1466xfs_ioc_getbmap( 1465xfs_ioc_getbmap(
1467 struct xfs_inode *ip, 1466 struct file *file,
1468 int ioflags,
1469 unsigned int cmd, 1467 unsigned int cmd,
1470 void __user *arg) 1468 void __user *arg)
1471{ 1469{
@@ -1479,10 +1477,10 @@ xfs_ioc_getbmap(
1479 return -EINVAL; 1477 return -EINVAL;
1480 1478
1481 bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); 1479 bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
1482 if (ioflags & XFS_IO_INVIS) 1480 if (file->f_mode & FMODE_NOCMTIME)
1483 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; 1481 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
1484 1482
1485 error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, 1483 error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format,
1486 (__force struct getbmap *)arg+1); 1484 (__force struct getbmap *)arg+1);
1487 if (error) 1485 if (error)
1488 return error; 1486 return error;
@@ -1575,6 +1573,11 @@ xfs_ioc_swapext(
1575 goto out_put_tmp_file; 1573 goto out_put_tmp_file;
1576 } 1574 }
1577 1575
1576 /*
1577 * We need to ensure that the fds passed in point to XFS inodes
1578 * before we cast and access them as XFS structures as we have no
1579 * control over what the user passes us here.
1580 */
1578 if (f.file->f_op != &xfs_file_operations || 1581 if (f.file->f_op != &xfs_file_operations ||
1579 tmp.file->f_op != &xfs_file_operations) { 1582 tmp.file->f_op != &xfs_file_operations) {
1580 error = -EINVAL; 1583 error = -EINVAL;
@@ -1625,12 +1628,8 @@ xfs_file_ioctl(
1625 struct xfs_inode *ip = XFS_I(inode); 1628 struct xfs_inode *ip = XFS_I(inode);
1626 struct xfs_mount *mp = ip->i_mount; 1629 struct xfs_mount *mp = ip->i_mount;
1627 void __user *arg = (void __user *)p; 1630 void __user *arg = (void __user *)p;
1628 int ioflags = 0;
1629 int error; 1631 int error;
1630 1632
1631 if (filp->f_mode & FMODE_NOCMTIME)
1632 ioflags |= XFS_IO_INVIS;
1633
1634 trace_xfs_file_ioctl(ip); 1633 trace_xfs_file_ioctl(ip);
1635 1634
1636 switch (cmd) { 1635 switch (cmd) {
@@ -1649,7 +1648,7 @@ xfs_file_ioctl(
1649 1648
1650 if (copy_from_user(&bf, arg, sizeof(bf))) 1649 if (copy_from_user(&bf, arg, sizeof(bf)))
1651 return -EFAULT; 1650 return -EFAULT;
1652 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); 1651 return xfs_ioc_space(filp, cmd, &bf);
1653 } 1652 }
1654 case XFS_IOC_DIOINFO: { 1653 case XFS_IOC_DIOINFO: {
1655 struct dioattr da; 1654 struct dioattr da;
@@ -1708,7 +1707,7 @@ xfs_file_ioctl(
1708 1707
1709 case XFS_IOC_GETBMAP: 1708 case XFS_IOC_GETBMAP:
1710 case XFS_IOC_GETBMAPA: 1709 case XFS_IOC_GETBMAPA:
1711 return xfs_ioc_getbmap(ip, ioflags, cmd, arg); 1710 return xfs_ioc_getbmap(filp, cmd, arg);
1712 1711
1713 case XFS_IOC_GETBMAPX: 1712 case XFS_IOC_GETBMAPX:
1714 return xfs_ioc_getbmapx(ip, arg); 1713 return xfs_ioc_getbmapx(ip, arg);
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 77c02c7900b6..8b52881bfd90 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -20,10 +20,7 @@
20 20
21extern int 21extern int
22xfs_ioc_space( 22xfs_ioc_space(
23 struct xfs_inode *ip,
24 struct inode *inode,
25 struct file *filp, 23 struct file *filp,
26 int ioflags,
27 unsigned int cmd, 24 unsigned int cmd,
28 xfs_flock64_t *bf); 25 xfs_flock64_t *bf);
29 26
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 1a05d8ae327d..321f57721b92 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -532,12 +532,8 @@ xfs_file_compat_ioctl(
532 struct xfs_inode *ip = XFS_I(inode); 532 struct xfs_inode *ip = XFS_I(inode);
533 struct xfs_mount *mp = ip->i_mount; 533 struct xfs_mount *mp = ip->i_mount;
534 void __user *arg = (void __user *)p; 534 void __user *arg = (void __user *)p;
535 int ioflags = 0;
536 int error; 535 int error;
537 536
538 if (filp->f_mode & FMODE_NOCMTIME)
539 ioflags |= XFS_IO_INVIS;
540
541 trace_xfs_file_compat_ioctl(ip); 537 trace_xfs_file_compat_ioctl(ip);
542 538
543 switch (cmd) { 539 switch (cmd) {
@@ -589,7 +585,7 @@ xfs_file_compat_ioctl(
589 if (xfs_compat_flock64_copyin(&bf, arg)) 585 if (xfs_compat_flock64_copyin(&bf, arg))
590 return -EFAULT; 586 return -EFAULT;
591 cmd = _NATIVE_IOC(cmd, struct xfs_flock64); 587 cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
592 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); 588 return xfs_ioc_space(filp, cmd, &bf);
593 } 589 }
594 case XFS_IOC_FSGEOMETRY_V1_32: 590 case XFS_IOC_FSGEOMETRY_V1_32:
595 return xfs_compat_ioc_fsgeometry_v1(mp, arg); 591 return xfs_compat_ioc_fsgeometry_v1(mp, arg);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 58391355a44d..620fc9120444 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -15,6 +15,7 @@
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include <linux/iomap.h>
18#include "xfs.h" 19#include "xfs.h"
19#include "xfs_fs.h" 20#include "xfs_fs.h"
20#include "xfs_shared.h" 21#include "xfs_shared.h"
@@ -940,3 +941,173 @@ error_on_bmapi_transaction:
940 xfs_iunlock(ip, XFS_ILOCK_EXCL); 941 xfs_iunlock(ip, XFS_ILOCK_EXCL);
941 return error; 942 return error;
942} 943}
944
945void
946xfs_bmbt_to_iomap(
947 struct xfs_inode *ip,
948 struct iomap *iomap,
949 struct xfs_bmbt_irec *imap)
950{
951 struct xfs_mount *mp = ip->i_mount;
952
953 if (imap->br_startblock == HOLESTARTBLOCK) {
954 iomap->blkno = IOMAP_NULL_BLOCK;
955 iomap->type = IOMAP_HOLE;
956 } else if (imap->br_startblock == DELAYSTARTBLOCK) {
957 iomap->blkno = IOMAP_NULL_BLOCK;
958 iomap->type = IOMAP_DELALLOC;
959 } else {
960 iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
961 if (imap->br_state == XFS_EXT_UNWRITTEN)
962 iomap->type = IOMAP_UNWRITTEN;
963 else
964 iomap->type = IOMAP_MAPPED;
965 }
966 iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
967 iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
968 iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
969}
970
971static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
972{
973 return !nimaps ||
974 imap->br_startblock == HOLESTARTBLOCK ||
975 imap->br_startblock == DELAYSTARTBLOCK;
976}
977
978static int
979xfs_file_iomap_begin(
980 struct inode *inode,
981 loff_t offset,
982 loff_t length,
983 unsigned flags,
984 struct iomap *iomap)
985{
986 struct xfs_inode *ip = XFS_I(inode);
987 struct xfs_mount *mp = ip->i_mount;
988 struct xfs_bmbt_irec imap;
989 xfs_fileoff_t offset_fsb, end_fsb;
990 int nimaps = 1, error = 0;
991
992 if (XFS_FORCED_SHUTDOWN(mp))
993 return -EIO;
994
995 xfs_ilock(ip, XFS_ILOCK_EXCL);
996
997 ASSERT(offset <= mp->m_super->s_maxbytes);
998 if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
999 length = mp->m_super->s_maxbytes - offset;
1000 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1001 end_fsb = XFS_B_TO_FSB(mp, offset + length);
1002
1003 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
1004 &nimaps, XFS_BMAPI_ENTIRE);
1005 if (error) {
1006 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1007 return error;
1008 }
1009
1010 if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
1011 /*
1012 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
1013	 * pages to keep the chunks of work done here somewhat symmetric
1014 * with the work writeback does. This is a completely arbitrary
1015 * number pulled out of thin air as a best guess for initial
1016 * testing.
1017 *
1018	 * Note that the value needs to be less than 32 bits wide until
1019 * the lower level functions are updated.
1020 */
1021 length = min_t(loff_t, length, 1024 * PAGE_SIZE);
1022 if (xfs_get_extsz_hint(ip)) {
1023 /*
1024 * xfs_iomap_write_direct() expects the shared lock. It
1025 * is unlocked on return.
1026 */
1027 xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
1028 error = xfs_iomap_write_direct(ip, offset, length, &imap,
1029 nimaps);
1030 } else {
1031 error = xfs_iomap_write_delay(ip, offset, length, &imap);
1032 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1033 }
1034
1035 if (error)
1036 return error;
1037
1038 trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
1039 xfs_bmbt_to_iomap(ip, iomap, &imap);
1040 } else if (nimaps) {
1041 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1042 trace_xfs_iomap_found(ip, offset, length, 0, &imap);
1043 xfs_bmbt_to_iomap(ip, iomap, &imap);
1044 } else {
1045 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1046 trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
1047 iomap->blkno = IOMAP_NULL_BLOCK;
1048 iomap->type = IOMAP_HOLE;
1049 iomap->offset = offset;
1050 iomap->length = length;
1051 }
1052
1053 return 0;
1054}
1055
1056static int
1057xfs_file_iomap_end_delalloc(
1058 struct xfs_inode *ip,
1059 loff_t offset,
1060 loff_t length,
1061 ssize_t written)
1062{
1063 struct xfs_mount *mp = ip->i_mount;
1064 xfs_fileoff_t start_fsb;
1065 xfs_fileoff_t end_fsb;
1066 int error = 0;
1067
1068 start_fsb = XFS_B_TO_FSB(mp, offset + written);
1069 end_fsb = XFS_B_TO_FSB(mp, offset + length);
1070
1071 /*
1072 * Trim back delalloc blocks if we didn't manage to write the whole
1073 * range reserved.
1074 *
1075 * We don't need to care about racing delalloc as we hold i_mutex
1076 * across the reserve/allocate/unreserve calls. If there are delalloc
1077 * blocks in the range, they are ours.
1078 */
1079 if (start_fsb < end_fsb) {
1080 xfs_ilock(ip, XFS_ILOCK_EXCL);
1081 error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1082 end_fsb - start_fsb);
1083 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1084
1085 if (error && !XFS_FORCED_SHUTDOWN(mp)) {
1086 xfs_alert(mp, "%s: unable to clean up ino %lld",
1087 __func__, ip->i_ino);
1088 return error;
1089 }
1090 }
1091
1092 return 0;
1093}
1094
1095static int
1096xfs_file_iomap_end(
1097 struct inode *inode,
1098 loff_t offset,
1099 loff_t length,
1100 ssize_t written,
1101 unsigned flags,
1102 struct iomap *iomap)
1103{
1104 if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
1105 return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
1106 length, written);
1107 return 0;
1108}
1109
1110struct iomap_ops xfs_iomap_ops = {
1111 .iomap_begin = xfs_file_iomap_begin,
1112 .iomap_end = xfs_file_iomap_end,
1113};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 8688e663d744..e066d045e2ff 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,6 +18,8 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21#include <linux/iomap.h>
22
21struct xfs_inode; 23struct xfs_inode;
22struct xfs_bmbt_irec; 24struct xfs_bmbt_irec;
23 25
@@ -29,4 +31,9 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
29 struct xfs_bmbt_irec *); 31 struct xfs_bmbt_irec *);
30int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); 32int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
31 33
34void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
35 struct xfs_bmbt_irec *);
36
37extern struct iomap_ops xfs_iomap_ops;
38
32#endif /* __XFS_IOMAP_H__*/ 39#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c5d4eba6972e..ab820f84ed50 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,12 +38,13 @@
38#include "xfs_dir2.h" 38#include "xfs_dir2.h"
39#include "xfs_trans_space.h" 39#include "xfs_trans_space.h"
40#include "xfs_pnfs.h" 40#include "xfs_pnfs.h"
41#include "xfs_iomap.h"
41 42
42#include <linux/capability.h> 43#include <linux/capability.h>
43#include <linux/xattr.h> 44#include <linux/xattr.h>
44#include <linux/posix_acl.h> 45#include <linux/posix_acl.h>
45#include <linux/security.h> 46#include <linux/security.h>
46#include <linux/fiemap.h> 47#include <linux/iomap.h>
47#include <linux/slab.h> 48#include <linux/slab.h>
48 49
49/* 50/*
@@ -801,20 +802,30 @@ xfs_setattr_size(
801 return error; 802 return error;
802 803
803 /* 804 /*
805 * Wait for all direct I/O to complete.
806 */
807 inode_dio_wait(inode);
808
809 /*
804 * File data changes must be complete before we start the transaction to 810 * File data changes must be complete before we start the transaction to
805 * modify the inode. This needs to be done before joining the inode to 811 * modify the inode. This needs to be done before joining the inode to
806 * the transaction because the inode cannot be unlocked once it is a 812 * the transaction because the inode cannot be unlocked once it is a
807 * part of the transaction. 813 * part of the transaction.
808 * 814 *
809 * Start with zeroing any data block beyond EOF that we may expose on 815 * Start with zeroing any data beyond EOF that we may expose on file
810 * file extension. 816 * extension, or zeroing out the rest of the block on a downward
817 * truncate.
811 */ 818 */
812 if (newsize > oldsize) { 819 if (newsize > oldsize) {
813 error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); 820 error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
814 if (error) 821 } else {
815 return error; 822 error = iomap_truncate_page(inode, newsize, &did_zeroing,
823 &xfs_iomap_ops);
816 } 824 }
817 825
826 if (error)
827 return error;
828
818 /* 829 /*
819 * We are going to log the inode size change in this transaction so 830 * We are going to log the inode size change in this transaction so
820 * any previous writes that are beyond the on disk EOF and the new 831 * any previous writes that are beyond the on disk EOF and the new
@@ -823,17 +834,14 @@ xfs_setattr_size(
823 * problem. Note that this includes any block zeroing we did above; 834 * problem. Note that this includes any block zeroing we did above;
824 * otherwise those blocks may not be zeroed after a crash. 835 * otherwise those blocks may not be zeroed after a crash.
825 */ 836 */
826 if (newsize > ip->i_d.di_size && 837 if (did_zeroing ||
827 (oldsize != ip->i_d.di_size || did_zeroing)) { 838 (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
828 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 839 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
829 ip->i_d.di_size, newsize); 840 ip->i_d.di_size, newsize);
830 if (error) 841 if (error)
831 return error; 842 return error;
832 } 843 }
833 844
834 /* Now wait for all direct I/O to complete. */
835 inode_dio_wait(inode);
836
837 /* 845 /*
838 * We've already locked out new page faults, so now we can safely remove 846 * We've already locked out new page faults, so now we can safely remove
839 * pages from the page cache knowing they won't get refaulted until we 847 * pages from the page cache knowing they won't get refaulted until we
@@ -851,13 +859,6 @@ xfs_setattr_size(
851 * to hope that the caller sees ENOMEM and retries the truncate 859 * to hope that the caller sees ENOMEM and retries the truncate
852 * operation. 860 * operation.
853 */ 861 */
854 if (IS_DAX(inode))
855 error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
856 else
857 error = block_truncate_page(inode->i_mapping, newsize,
858 xfs_get_blocks);
859 if (error)
860 return error;
861 truncate_setsize(inode, newsize); 862 truncate_setsize(inode, newsize);
862 863
863 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); 864 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
@@ -998,51 +999,6 @@ xfs_vn_update_time(
998 return xfs_trans_commit(tp); 999 return xfs_trans_commit(tp);
999} 1000}
1000 1001
1001#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
1002
1003/*
1004 * Call fiemap helper to fill in user data.
1005 * Returns positive errors to xfs_getbmap.
1006 */
1007STATIC int
1008xfs_fiemap_format(
1009 void **arg,
1010 struct getbmapx *bmv,
1011 int *full)
1012{
1013 int error;
1014 struct fiemap_extent_info *fieinfo = *arg;
1015 u32 fiemap_flags = 0;
1016 u64 logical, physical, length;
1017
1018 /* Do nothing for a hole */
1019 if (bmv->bmv_block == -1LL)
1020 return 0;
1021
1022 logical = BBTOB(bmv->bmv_offset);
1023 physical = BBTOB(bmv->bmv_block);
1024 length = BBTOB(bmv->bmv_length);
1025
1026 if (bmv->bmv_oflags & BMV_OF_PREALLOC)
1027 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
1028 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
1029 fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
1030 FIEMAP_EXTENT_UNKNOWN);
1031 physical = 0; /* no block yet */
1032 }
1033 if (bmv->bmv_oflags & BMV_OF_LAST)
1034 fiemap_flags |= FIEMAP_EXTENT_LAST;
1035
1036 error = fiemap_fill_next_extent(fieinfo, logical, physical,
1037 length, fiemap_flags);
1038 if (error > 0) {
1039 error = 0;
1040 *full = 1; /* user array now full */
1041 }
1042
1043 return error;
1044}
1045
1046STATIC int 1002STATIC int
1047xfs_vn_fiemap( 1003xfs_vn_fiemap(
1048 struct inode *inode, 1004 struct inode *inode,
@@ -1050,38 +1006,13 @@ xfs_vn_fiemap(
1050 u64 start, 1006 u64 start,
1051 u64 length) 1007 u64 length)
1052{ 1008{
1053 xfs_inode_t *ip = XFS_I(inode);
1054 struct getbmapx bm;
1055 int error; 1009 int error;
1056 1010
1057 error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); 1011 xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
1058 if (error) 1012 error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
1059 return error; 1013 xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
1060
1061 /* Set up bmap header for xfs internal routine */
1062 bm.bmv_offset = BTOBBT(start);
1063 /* Special case for whole file */
1064 if (length == FIEMAP_MAX_OFFSET)
1065 bm.bmv_length = -1LL;
1066 else
1067 bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
1068
1069 /* We add one because in getbmap world count includes the header */
1070 bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
1071 fieinfo->fi_extents_max + 1;
1072 bm.bmv_count = min_t(__s32, bm.bmv_count,
1073 (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
1074 bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
1075 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
1076 bm.bmv_iflags |= BMV_IF_ATTRFORK;
1077 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
1078 bm.bmv_iflags |= BMV_IF_DELALLOC;
1079
1080 error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
1081 if (error)
1082 return error;
1083 1014
1084 return 0; 1015 return error;
1085} 1016}
1086 1017
1087STATIC int 1018STATIC int
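With xfs_fiemap_format() removed, translating mapping state into FIEMAP extent flags is now the generic helper's job. Below is a hedged sketch of that translation, inferred from the deleted XFS code above; the exact behaviour lives in fs/iomap.c, which is not part of this hunk.

/*
 * Hedged sketch: the extent-flag translation the removed
 * xfs_fiemap_format() performed and that is now the generic helper's
 * responsibility.  The mapping is inferred from the deleted XFS code
 * above; the generic code works from struct iomap types rather than
 * getbmapx flags.
 */
static u32 example_iomap_type_to_fiemap_flags(int type)
{
	switch (type) {
	case IOMAP_UNWRITTEN:
		/* preallocated but never written */
		return FIEMAP_EXTENT_UNWRITTEN;
	case IOMAP_DELALLOC:
		/* reserved in memory, no physical block yet */
		return FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
	default:
		return 0;
	}
}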
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index a8192dc797dc..b8d64d520e12 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -328,13 +328,6 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
328 return x; 328 return x;
329} 329}
330 330
331/* ARM old ABI has some weird alignment/padding */
332#if defined(__arm__) && !defined(__ARM_EABI__)
333#define __arch_pack __attribute__((packed))
334#else
335#define __arch_pack
336#endif
337
338#define ASSERT_ALWAYS(expr) \ 331#define ASSERT_ALWAYS(expr) \
339 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) 332 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
340 333
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index bde02f1fba73..3b74fa011bb1 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -788,7 +788,7 @@ xfs_log_mount_cancel(
788 * As far as I know, there weren't any dependencies on the old behaviour. 788 * As far as I know, there weren't any dependencies on the old behaviour.
789 */ 789 */
790 790
791int 791static int
792xfs_log_unmount_write(xfs_mount_t *mp) 792xfs_log_unmount_write(xfs_mount_t *mp)
793{ 793{
794 struct xlog *log = mp->m_log; 794 struct xlog *log = mp->m_log;
@@ -1036,7 +1036,7 @@ xfs_log_space_wake(
1036 * there's no point in running a dummy transaction at this point because we 1036 * there's no point in running a dummy transaction at this point because we
1037 * can't start trying to idle the log until both the CIL and AIL are empty. 1037 * can't start trying to idle the log until both the CIL and AIL are empty.
1038 */ 1038 */
1039int 1039static int
1040xfs_log_need_covered(xfs_mount_t *mp) 1040xfs_log_need_covered(xfs_mount_t *mp)
1041{ 1041{
1042 struct xlog *log = mp->m_log; 1042 struct xlog *log = mp->m_log;
@@ -1177,7 +1177,7 @@ xlog_space_left(
1177 * The log manager needs its own routine, in order to control what 1177 * The log manager needs its own routine, in order to control what
1178 * happens with the buffer after the write completes. 1178 * happens with the buffer after the write completes.
1179 */ 1179 */
1180void 1180static void
1181xlog_iodone(xfs_buf_t *bp) 1181xlog_iodone(xfs_buf_t *bp)
1182{ 1182{
1183 struct xlog_in_core *iclog = bp->b_fspriv; 1183 struct xlog_in_core *iclog = bp->b_fspriv;
@@ -1302,7 +1302,7 @@ xfs_log_work_queue(
1302 * disk. If there is nothing dirty, then we might need to cover the log to 1302 * disk. If there is nothing dirty, then we might need to cover the log to
1303 * indicate that the filesystem is idle. 1303 * indicate that the filesystem is idle.
1304 */ 1304 */
1305void 1305static void
1306xfs_log_worker( 1306xfs_log_worker(
1307 struct work_struct *work) 1307 struct work_struct *work)
1308{ 1308{
@@ -1415,7 +1415,7 @@ xlog_alloc_log(
1415 */ 1415 */
1416 error = -ENOMEM; 1416 error = -ENOMEM;
1417 bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, 1417 bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
1418 BTOBB(log->l_iclog_size), 0); 1418 BTOBB(log->l_iclog_size), XBF_NO_IOACCT);
1419 if (!bp) 1419 if (!bp)
1420 goto out_free_log; 1420 goto out_free_log;
1421 1421
@@ -1454,7 +1454,8 @@ xlog_alloc_log(
1454 prev_iclog = iclog; 1454 prev_iclog = iclog;
1455 1455
1456 bp = xfs_buf_get_uncached(mp->m_logdev_targp, 1456 bp = xfs_buf_get_uncached(mp->m_logdev_targp,
1457 BTOBB(log->l_iclog_size), 0); 1457 BTOBB(log->l_iclog_size),
1458 XBF_NO_IOACCT);
1458 if (!bp) 1459 if (!bp)
1459 goto out_free_iclog; 1460 goto out_free_iclog;
1460 1461
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 80ba0c047090..b5e71072fde5 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -163,12 +163,8 @@ int xfs_log_reserve(struct xfs_mount *mp,
163 __uint8_t clientid, 163 __uint8_t clientid,
164 bool permanent); 164 bool permanent);
165int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); 165int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
166int xfs_log_unmount_write(struct xfs_mount *mp);
167void xfs_log_unmount(struct xfs_mount *mp); 166void xfs_log_unmount(struct xfs_mount *mp);
168int xfs_log_force_umount(struct xfs_mount *mp, int logerror); 167int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
169int xfs_log_need_covered(struct xfs_mount *mp);
170
171void xlog_iodone(struct xfs_buf *);
172 168
173struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); 169struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
174void xfs_log_ticket_put(struct xlog_ticket *ticket); 170void xfs_log_ticket_put(struct xlog_ticket *ticket);
@@ -178,7 +174,6 @@ void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
178bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 174bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
179 175
180void xfs_log_work_queue(struct xfs_mount *mp); 176void xfs_log_work_queue(struct xfs_mount *mp);
181void xfs_log_worker(struct work_struct *work);
182void xfs_log_quiesce(struct xfs_mount *mp); 177void xfs_log_quiesce(struct xfs_mount *mp);
183bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); 178bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
184 179
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 5e54e7955ea6..a4ab192e1792 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -78,6 +78,157 @@ xlog_cil_init_post_recovery(
78 log->l_cilp->xc_ctx->sequence = 1; 78 log->l_cilp->xc_ctx->sequence = 1;
79} 79}
80 80
81static inline int
82xlog_cil_iovec_space(
83 uint niovecs)
84{
85 return round_up((sizeof(struct xfs_log_vec) +
86 niovecs * sizeof(struct xfs_log_iovec)),
87 sizeof(uint64_t));
88}
89
90/*
91 * Allocate or pin log vector buffers for CIL insertion.
92 *
93 * The CIL currently uses disposable buffers for copying a snapshot of the
94 * modified items into the log during a push. The biggest problem with this is
95 * the requirement to allocate the disposable buffer during the commit if:
96	 * a) it does not exist; or
97 * b) it is too small
98 *
99 * If we do this allocation within xlog_cil_insert_format_items(), it is done
100 * under the xc_ctx_lock, which means that a CIL push cannot occur during
101 * the memory allocation. This means that we have a potential deadlock situation
102 * under low memory conditions when we have lots of dirty metadata pinned in
103 * the CIL and we need a CIL commit to occur to free memory.
104 *
105 * To avoid this, we need to move the memory allocation outside the
106 * xc_ctx_lock, but because the log vector buffers are disposable, that opens
107 * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
108 * vector buffers between the check and the formatting of the item into the
109 * log vector buffer within the xc_ctx_lock.
110 *
111 * Because the log vector buffer needs to be unchanged during the CIL push
112 * process, we cannot share the buffer between the transaction commit (which
113 * modifies the buffer) and the CIL push context that is writing the changes
114 * into the log. This means skipping preallocation of buffer space is
115 * unreliable, but we most definitely do not want to be allocating and freeing
116 * buffers unnecessarily during commits when overwrites can be done safely.
117 *
118 * The simplest solution to this problem is to allocate a shadow buffer when a
119 * log item is committed for the second time, and then to only use this buffer
120 * if necessary. The buffer can remain attached to the log item until such time
121 * it is needed, and this is the buffer that is reallocated to match the size of
122 * the incoming modification. Then during the formatting of the item we can swap
123 * the active buffer with the new one if we can't reuse the existing buffer. We
124 * don't free the old buffer as it may be reused on the next modification if
125 * it's size is right, otherwise we'll free and reallocate it at that point.
126 *
127 * This function builds a vector for the changes in each log item in the
128 * transaction. It then works out the length of the buffer needed for each log
129 * item, allocates them and attaches the vector to the log item in preparation
130 * for the formatting step which occurs under the xc_ctx_lock.
131 *
132 * While this means the memory footprint goes up, it avoids the repeated
133 * alloc/free pattern that repeated modifications of an item would otherwise
134 * cause, and hence minimises the CPU overhead of such behaviour.
135 */
136static void
137xlog_cil_alloc_shadow_bufs(
138 struct xlog *log,
139 struct xfs_trans *tp)
140{
141 struct xfs_log_item_desc *lidp;
142
143 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
144 struct xfs_log_item *lip = lidp->lid_item;
145 struct xfs_log_vec *lv;
146 int niovecs = 0;
147 int nbytes = 0;
148 int buf_size;
149 bool ordered = false;
150
151 /* Skip items which aren't dirty in this transaction. */
152 if (!(lidp->lid_flags & XFS_LID_DIRTY))
153 continue;
154
155 /* get number of vecs and size of data to be stored */
156 lip->li_ops->iop_size(lip, &niovecs, &nbytes);
157
158 /*
159 * Ordered items need to be tracked but we do not wish to write
160 * them. We need a logvec to track the object, but we do not
161 * need an iovec or buffer to be allocated for copying data.
162 */
163 if (niovecs == XFS_LOG_VEC_ORDERED) {
164 ordered = true;
165 niovecs = 0;
166 nbytes = 0;
167 }
168
169 /*
170 * We 64-bit align the length of each iovec so that the start
171 * of the next one is naturally aligned. We'll need to
172 * account for that slack space here. Then round nbytes up
173 * to 64-bit alignment so that the initial buffer alignment is
174 * easy to calculate and verify.
175 */
176 nbytes += niovecs * sizeof(uint64_t);
177 nbytes = round_up(nbytes, sizeof(uint64_t));
178
179 /*
180 * The data buffer needs to start 64-bit aligned, so round up
181 * that space to ensure we can align it appropriately and not
182 * overrun the buffer.
183 */
184 buf_size = nbytes + xlog_cil_iovec_space(niovecs);
185
186 /*
187 * if we have no shadow buffer, or it is too small, we need to
188 * reallocate it.
189 */
190 if (!lip->li_lv_shadow ||
191 buf_size > lip->li_lv_shadow->lv_size) {
192
193 /*
194 * We free and allocate here as a realloc would copy
195	 * unnecessary data. We don't use kmem_zalloc() for the
196 * same reason - we don't need to zero the data area in
197 * the buffer, only the log vector header and the iovec
198 * storage.
199 */
200 kmem_free(lip->li_lv_shadow);
201
202 lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS);
203 memset(lv, 0, xlog_cil_iovec_space(niovecs));
204
205 lv->lv_item = lip;
206 lv->lv_size = buf_size;
207 if (ordered)
208 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
209 else
210 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
211 lip->li_lv_shadow = lv;
212 } else {
213 /* same or smaller, optimise common overwrite case */
214 lv = lip->li_lv_shadow;
215 if (ordered)
216 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
217 else
218 lv->lv_buf_len = 0;
219 lv->lv_bytes = 0;
220 lv->lv_next = NULL;
221 }
222
223 /* Ensure the lv is set up according to ->iop_size */
224 lv->lv_niovecs = niovecs;
225
226 /* The allocated data region lies beyond the iovec region */
227 lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
228 }
229
230}
231
81/* 232/*
82 * Prepare the log item for insertion into the CIL. Calculate the difference in 233 * Prepare the log item for insertion into the CIL. Calculate the difference in
83 * log space and vectors it will consume, and if it is a new item pin it as 234 * log space and vectors it will consume, and if it is a new item pin it as
@@ -100,16 +251,19 @@ xfs_cil_prepare_item(
100 /* 251 /*
101 * If there is no old LV, this is the first time we've seen the item in 252 * If there is no old LV, this is the first time we've seen the item in
102 * this CIL context and so we need to pin it. If we are replacing the 253 * this CIL context and so we need to pin it. If we are replacing the
103 * old_lv, then remove the space it accounts for and free it. 254 * old_lv, then remove the space it accounts for and make it the shadow
255 * buffer for later freeing. In both cases we are now switching to the
256	 * shadow buffer, so update the pointer to it appropriately.
104 */ 257 */
105 if (!old_lv) 258 if (!old_lv) {
106 lv->lv_item->li_ops->iop_pin(lv->lv_item); 259 lv->lv_item->li_ops->iop_pin(lv->lv_item);
107 else if (old_lv != lv) { 260 lv->lv_item->li_lv_shadow = NULL;
261 } else if (old_lv != lv) {
108 ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); 262 ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
109 263
110 *diff_len -= old_lv->lv_bytes; 264 *diff_len -= old_lv->lv_bytes;
111 *diff_iovecs -= old_lv->lv_niovecs; 265 *diff_iovecs -= old_lv->lv_niovecs;
112 kmem_free(old_lv); 266 lv->lv_item->li_lv_shadow = old_lv;
113 } 267 }
114 268
115 /* attach new log vector to log item */ 269 /* attach new log vector to log item */
@@ -133,11 +287,13 @@ xfs_cil_prepare_item(
133 * write it out asynchronously without needing to relock the object that was 287 * write it out asynchronously without needing to relock the object that was
134 * modified at the time it gets written into the iclog. 288 * modified at the time it gets written into the iclog.
135 * 289 *
136 * This function builds a vector for the changes in each log item in the 290 * This function takes the prepared log vectors attached to each log item, and
137 * transaction. It then works out the length of the buffer needed for each log 291 * formats the changes into the log vector buffer. The buffer it uses is
138 * item, allocates them and formats the vector for the item into the buffer. 292 * dependent on the current state of the vector in the CIL - the shadow lv is
139 * The buffer is then attached to the log item are then inserted into the 293 * guaranteed to be large enough for the current modification, but we will only
140 * Committed Item List for tracking until the next checkpoint is written out. 294 * use that if we can't reuse the existing lv. If we can't reuse the existing
295	 * lv, then simply swap it out for the shadow lv. We don't free it - that is
296	 * done lazily either by the next modification or the freeing of the log item.
141 * 297 *
142 * We don't set up region headers during this process; we simply copy the 298 * We don't set up region headers during this process; we simply copy the
143 * regions into the flat buffer. We can do this because we still have to do a 299 * regions into the flat buffer. We can do this because we still have to do a
@@ -170,59 +326,29 @@ xlog_cil_insert_format_items(
170 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 326 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
171 struct xfs_log_item *lip = lidp->lid_item; 327 struct xfs_log_item *lip = lidp->lid_item;
172 struct xfs_log_vec *lv; 328 struct xfs_log_vec *lv;
173 struct xfs_log_vec *old_lv; 329 struct xfs_log_vec *old_lv = NULL;
174 int niovecs = 0; 330 struct xfs_log_vec *shadow;
175 int nbytes = 0;
176 int buf_size;
177 bool ordered = false; 331 bool ordered = false;
178 332
179 /* Skip items which aren't dirty in this transaction. */ 333 /* Skip items which aren't dirty in this transaction. */
180 if (!(lidp->lid_flags & XFS_LID_DIRTY)) 334 if (!(lidp->lid_flags & XFS_LID_DIRTY))
181 continue; 335 continue;
182 336
183 /* get number of vecs and size of data to be stored */
184 lip->li_ops->iop_size(lip, &niovecs, &nbytes);
185
186 /* Skip items that do not have any vectors for writing */
187 if (!niovecs)
188 continue;
189
190 /* 337 /*
191 * Ordered items need to be tracked but we do not wish to write 338 * The formatting size information is already attached to
192 * them. We need a logvec to track the object, but we do not 339 * the shadow lv on the log item.
193 * need an iovec or buffer to be allocated for copying data.
194 */ 340 */
195 if (niovecs == XFS_LOG_VEC_ORDERED) { 341 shadow = lip->li_lv_shadow;
342 if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
196 ordered = true; 343 ordered = true;
197 niovecs = 0;
198 nbytes = 0;
199 }
200 344
201 /* 345 /* Skip items that do not have any vectors for writing */
202 * We 64-bit align the length of each iovec so that the start 346 if (!shadow->lv_niovecs && !ordered)
203 * of the next one is naturally aligned. We'll need to 347 continue;
204 * account for that slack space here. Then round nbytes up
205 * to 64-bit alignment so that the initial buffer alignment is
206 * easy to calculate and verify.
207 */
208 nbytes += niovecs * sizeof(uint64_t);
209 nbytes = round_up(nbytes, sizeof(uint64_t));
210
211 /* grab the old item if it exists for reservation accounting */
212 old_lv = lip->li_lv;
213
214 /*
215 * The data buffer needs to start 64-bit aligned, so round up
216 * that space to ensure we can align it appropriately and not
217 * overrun the buffer.
218 */
219 buf_size = nbytes +
220 round_up((sizeof(struct xfs_log_vec) +
221 niovecs * sizeof(struct xfs_log_iovec)),
222 sizeof(uint64_t));
223 348
224 /* compare to existing item size */ 349 /* compare to existing item size */
225 if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { 350 old_lv = lip->li_lv;
351 if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
226 /* same or smaller, optimise common overwrite case */ 352 /* same or smaller, optimise common overwrite case */
227 lv = lip->li_lv; 353 lv = lip->li_lv;
228 lv->lv_next = NULL; 354 lv->lv_next = NULL;
@@ -236,32 +362,29 @@ xlog_cil_insert_format_items(
236 */ 362 */
237 *diff_iovecs -= lv->lv_niovecs; 363 *diff_iovecs -= lv->lv_niovecs;
238 *diff_len -= lv->lv_bytes; 364 *diff_len -= lv->lv_bytes;
365
366 /* Ensure the lv is set up according to ->iop_size */
367 lv->lv_niovecs = shadow->lv_niovecs;
368
369 /* reset the lv buffer information for new formatting */
370 lv->lv_buf_len = 0;
371 lv->lv_bytes = 0;
372 lv->lv_buf = (char *)lv +
373 xlog_cil_iovec_space(lv->lv_niovecs);
239 } else { 374 } else {
240 /* allocate new data chunk */ 375 /* switch to shadow buffer! */
241 lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); 376 lv = shadow;
242 lv->lv_item = lip; 377 lv->lv_item = lip;
243 lv->lv_size = buf_size;
244 if (ordered) { 378 if (ordered) {
245 /* track as an ordered logvec */ 379 /* track as an ordered logvec */
246 ASSERT(lip->li_lv == NULL); 380 ASSERT(lip->li_lv == NULL);
247 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
248 goto insert; 381 goto insert;
249 } 382 }
250 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
251 } 383 }
252 384
253 /* Ensure the lv is set up according to ->iop_size */
254 lv->lv_niovecs = niovecs;
255
256 /* The allocated data region lies beyond the iovec region */
257 lv->lv_buf_len = 0;
258 lv->lv_bytes = 0;
259 lv->lv_buf = (char *)lv + buf_size - nbytes;
260 ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); 385 ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
261
262 lip->li_ops->iop_format(lip, lv); 386 lip->li_ops->iop_format(lip, lv);
263insert: 387insert:
264 ASSERT(lv->lv_buf_len <= nbytes);
265 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); 388 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
266 } 389 }
267} 390}
@@ -783,6 +906,13 @@ xfs_log_commit_cil(
783 struct xlog *log = mp->m_log; 906 struct xlog *log = mp->m_log;
784 struct xfs_cil *cil = log->l_cilp; 907 struct xfs_cil *cil = log->l_cilp;
785 908
909 /*
910 * Do all necessary memory allocation before we lock the CIL.
911 * This ensures the allocation does not deadlock with a CIL
912 * push in memory reclaim (e.g. from kswapd).
913 */
914 xlog_cil_alloc_shadow_bufs(log, tp);
915
786 /* lock out background commit */ 916 /* lock out background commit */
787 down_read(&cil->xc_ctx_lock); 917 down_read(&cil->xc_ctx_lock);
788 918
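The shadow-buffer change above is, at its core, a double-buffering pattern: size and allocate a standby buffer with no CIL locks held, then under xc_ctx_lock either reuse the active buffer or swap the standby in. A condensed, hypothetical sketch of that pattern follows (names invented for illustration; error handling omitted because the real code uses a sleeping, non-failing allocation).

/*
 * Hypothetical illustration of the commit-side pattern used above.
 * "item" stands in for xfs_log_item; the real code sizes the buffer
 * from ->iop_size and uses kmem_alloc(KM_SLEEP | KM_NOFS).
 */
struct item {
	void	*active;	/* buffer currently owned by the CIL push */
	void	*shadow;	/* standby buffer owned by the committer */
	size_t	 active_size;
	size_t	 shadow_size;
};

static void example_commit(struct item *it, size_t need,
		struct rw_semaphore *ctx_lock)
{
	/* step 1: allocate outside the lock so reclaim can still push the CIL */
	if (!it->shadow || it->shadow_size < need) {
		kfree(it->shadow);
		it->shadow = kmalloc(need, GFP_NOFS);
		it->shadow_size = need;
	}

	down_read(ctx_lock);
	if (it->active && it->active_size >= need) {
		/* step 2a: common overwrite case, reuse the active buffer */
	} else {
		/*
		 * step 2b: switch to the standby; the old buffer becomes the
		 * standby and is freed or reused on a later modification.
		 */
		swap(it->active, it->shadow);
		swap(it->active_size, it->shadow_size);
	}
	/* ... format the item into it->active ... */
	up_read(ctx_lock);
}

The key property is that the allocation is never made with the ctx lock held, so a CIL push triggered from memory reclaim can always make progress.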
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e39b02351b4a..970c19ba2f56 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -272,13 +272,15 @@ xfs_readsb(
272 buf_ops = NULL; 272 buf_ops = NULL;
273 273
274 /* 274 /*
275 * Allocate a (locked) buffer to hold the superblock. 275 * Allocate a (locked) buffer to hold the superblock. This will be kept
276 * This will be kept around at all times to optimize 276 * around at all times to optimize access to the superblock. Therefore,
277 * access to the superblock. 277 * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
278 * elevated.
278 */ 279 */
279reread: 280reread:
280 error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, 281 error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
281 BTOBB(sector_size), 0, &bp, buf_ops); 282 BTOBB(sector_size), XBF_NO_IOACCT, &bp,
283 buf_ops);
282 if (error) { 284 if (error) {
283 if (loud) 285 if (loud)
284 xfs_warn(mp, "SB validate failed with error %d.", error); 286 xfs_warn(mp, "SB validate failed with error %d.", error);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 184c44effdd5..0cc8d8f74356 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -22,6 +22,11 @@
22 BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ 22 BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
23 #structname ") is wrong, expected " #size) 23 #structname ") is wrong, expected " #size)
24 24
25#define XFS_CHECK_OFFSET(structname, member, off) \
26 BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \
27 "XFS: offsetof(" #structname ", " #member ") is wrong, " \
28 "expected " #off)
29
25static inline void __init 30static inline void __init
26xfs_check_ondisk_structs(void) 31xfs_check_ondisk_structs(void)
27{ 32{
@@ -34,6 +39,8 @@ xfs_check_ondisk_structs(void)
34 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); 39 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
35 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); 40 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
36 XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); 41 XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
42 XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48);
43 XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64);
37 XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); 44 XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72);
38 XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176); 45 XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176);
39 XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104); 46 XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104);
@@ -75,27 +82,39 @@ xfs_check_ondisk_structs(void)
75 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); 82 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12);
76 */ 83 */
77 84
85 XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0);
86 XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2);
87 XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3);
88 XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0);
89 XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4);
90 XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
91 XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
78 XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); 92 XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
79 XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8); 93 XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0);
94 XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2);
95 XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4);
96 XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5);
97 XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6);
98 XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7);
80 XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); 99 XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
81 XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); 100 XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
82 XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); 101 XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
83 XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16); 102 XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16);
84 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4); 103 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4);
85 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16); 104 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16);
86 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6); 105 XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0);
106 XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2);
87 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); 107 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16);
88 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); 108 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16);
89 XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4);
90 XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8);
91 XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8);
92 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8); 109 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8);
93 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16); 110 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16);
94 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); 111 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16);
95 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); 112 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4);
96 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); 113 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3);
114 XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0);
115 XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1);
116 XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3);
97 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); 117 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
98 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2);
99 118
100 /* log structures */ 119 /* log structures */
101 XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); 120 XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
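These offset assertions are what allow the old-ABI ARM __arch_pack workaround to be removed in xfs_linux.h above: a compiler that padded the short-form directory or attribute structures differently would now break the build rather than corrupt the on-disk format. The same kind of check can be written with plain C11 _Static_assert; the structure below is an illustrative stand-in for xfs_dir2_sf_entry_t, not the real definition.

/*
 * Illustration only: a compile-time layout check in the spirit of
 * XFS_CHECK_OFFSET, written with C11 _Static_assert.
 */
#include <stddef.h>

struct example_sf_entry {
	unsigned char	namelen;	/* must stay at offset 0 */
	unsigned char	offset[2];	/* must stay at offset 1 */
	unsigned char	name[];		/* must stay at offset 3 */
};

_Static_assert(offsetof(struct example_sf_entry, namelen) == 0,
	       "namelen moved");
_Static_assert(offsetof(struct example_sf_entry, offset) == 1,
	       "offset field padded by the compiler");
_Static_assert(offsetof(struct example_sf_entry, name) == 3,
	       "name padded by the compiler");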
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index d5b756669fb5..0f14b2e4bf6c 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * Copyright (c) 2014 Christoph Hellwig. 2 * Copyright (c) 2014 Christoph Hellwig.
3 */ 3 */
4#include <linux/iomap.h>
4#include "xfs.h" 5#include "xfs.h"
5#include "xfs_format.h" 6#include "xfs_format.h"
6#include "xfs_log_format.h" 7#include "xfs_log_format.h"
@@ -79,32 +80,6 @@ xfs_fs_get_uuid(
79 return 0; 80 return 0;
80} 81}
81 82
82static void
83xfs_bmbt_to_iomap(
84 struct xfs_inode *ip,
85 struct iomap *iomap,
86 struct xfs_bmbt_irec *imap)
87{
88 struct xfs_mount *mp = ip->i_mount;
89
90 if (imap->br_startblock == HOLESTARTBLOCK) {
91 iomap->blkno = IOMAP_NULL_BLOCK;
92 iomap->type = IOMAP_HOLE;
93 } else if (imap->br_startblock == DELAYSTARTBLOCK) {
94 iomap->blkno = IOMAP_NULL_BLOCK;
95 iomap->type = IOMAP_DELALLOC;
96 } else {
97 iomap->blkno =
98 XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
99 if (imap->br_state == XFS_EXT_UNWRITTEN)
100 iomap->type = IOMAP_UNWRITTEN;
101 else
102 iomap->type = IOMAP_MAPPED;
103 }
104 iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
105 iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
106}
107
108/* 83/*
109 * Get a layout for the pNFS client. 84 * Get a layout for the pNFS client.
110 */ 85 */
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 76c0a4a9bb17..355dd9e1cb64 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -98,8 +98,6 @@ xfs_growfs_rt(
98/* 98/*
99 * From xfs_rtbitmap.c 99 * From xfs_rtbitmap.c
100 */ 100 */
101int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
102 xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
103int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, 101int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
104 xfs_rtblock_t start, xfs_extlen_t len, int val, 102 xfs_rtblock_t start, xfs_extlen_t len, int val,
105 xfs_rtblock_t *new, int *stat); 103 xfs_rtblock_t *new, int *stat);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 11ea5d51db56..0303f1005f88 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -546,7 +546,7 @@ xfs_showargs(
546 546
547 return 0; 547 return 0;
548} 548}
549__uint64_t 549static __uint64_t
550xfs_max_file_offset( 550xfs_max_file_offset(
551 unsigned int blockshift) 551 unsigned int blockshift)
552{ 552{
@@ -1294,6 +1294,7 @@ xfs_fs_remount(
1294 */ 1294 */
1295 xfs_restore_resvblks(mp); 1295 xfs_restore_resvblks(mp);
1296 xfs_log_work_queue(mp); 1296 xfs_log_work_queue(mp);
1297 xfs_queue_eofblocks(mp);
1297 } 1298 }
1298 1299
1299 /* rw -> ro */ 1300 /* rw -> ro */
@@ -1306,6 +1307,13 @@ xfs_fs_remount(
1306 * return it to the same size. 1307 * return it to the same size.
1307 */ 1308 */
1308 xfs_save_resvblks(mp); 1309 xfs_save_resvblks(mp);
1310
1311 /*
1312 * Cancel background eofb scanning so it cannot race with the
1313 * final log force+buftarg wait and deadlock the remount.
1314 */
1315 cancel_delayed_work_sync(&mp->m_eofblocks_work);
1316
1309 xfs_quiesce_attr(mp); 1317 xfs_quiesce_attr(mp);
1310 mp->m_flags |= XFS_MOUNT_RDONLY; 1318 mp->m_flags |= XFS_MOUNT_RDONLY;
1311 } 1319 }
@@ -1565,10 +1573,6 @@ xfs_fs_fill_super(
1565 } 1573 }
1566 } 1574 }
1567 1575
1568 if (xfs_sb_version_hassparseinodes(&mp->m_sb))
1569 xfs_alert(mp,
1570 "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
1571
1572 error = xfs_mountfs(mp); 1576 error = xfs_mountfs(mp);
1573 if (error) 1577 if (error)
1574 goto out_filestream_unmount; 1578 goto out_filestream_unmount;
@@ -1692,8 +1696,9 @@ xfs_init_zones(void)
1692 if (!xfs_log_ticket_zone) 1696 if (!xfs_log_ticket_zone)
1693 goto out_free_ioend_bioset; 1697 goto out_free_ioend_bioset;
1694 1698
1695 xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), 1699 xfs_bmap_free_item_zone = kmem_zone_init(
1696 "xfs_bmap_free_item"); 1700 sizeof(struct xfs_bmap_free_item),
1701 "xfs_bmap_free_item");
1697 if (!xfs_bmap_free_item_zone) 1702 if (!xfs_bmap_free_item_zone)
1698 goto out_destroy_log_ticket_zone; 1703 goto out_destroy_log_ticket_zone;
1699 1704
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 2dfb1ce4585f..529bce9fc37e 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -61,8 +61,6 @@ struct xfs_mount;
61struct xfs_buftarg; 61struct xfs_buftarg;
62struct block_device; 62struct block_device;
63 63
64extern __uint64_t xfs_max_file_offset(unsigned int);
65
66extern void xfs_flush_inodes(struct xfs_mount *mp); 64extern void xfs_flush_inodes(struct xfs_mount *mp);
67extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 65extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
68extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, 66extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 4c2c55086208..79cfd3fc5324 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -634,6 +634,9 @@ xfs_error_get_cfg(
634{ 634{
635 struct xfs_error_cfg *cfg; 635 struct xfs_error_cfg *cfg;
636 636
637 if (error < 0)
638 error = -error;
639
637 switch (error) { 640 switch (error) {
638 case EIO: 641 case EIO:
639 cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO]; 642 cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ea94ee0fe5ea..145169093fe0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -354,6 +354,7 @@ DEFINE_BUF_EVENT(xfs_buf_submit_wait);
354DEFINE_BUF_EVENT(xfs_buf_bawrite); 354DEFINE_BUF_EVENT(xfs_buf_bawrite);
355DEFINE_BUF_EVENT(xfs_buf_lock); 355DEFINE_BUF_EVENT(xfs_buf_lock);
356DEFINE_BUF_EVENT(xfs_buf_lock_done); 356DEFINE_BUF_EVENT(xfs_buf_lock_done);
357DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
357DEFINE_BUF_EVENT(xfs_buf_trylock); 358DEFINE_BUF_EVENT(xfs_buf_trylock);
358DEFINE_BUF_EVENT(xfs_buf_unlock); 359DEFINE_BUF_EVENT(xfs_buf_unlock);
359DEFINE_BUF_EVENT(xfs_buf_iowait); 360DEFINE_BUF_EVENT(xfs_buf_iowait);
@@ -1134,15 +1135,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
1134) 1135)
1135 1136
1136DECLARE_EVENT_CLASS(xfs_file_class, 1137DECLARE_EVENT_CLASS(xfs_file_class,
1137 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), 1138 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
1138 TP_ARGS(ip, count, offset, flags), 1139 TP_ARGS(ip, count, offset),
1139 TP_STRUCT__entry( 1140 TP_STRUCT__entry(
1140 __field(dev_t, dev) 1141 __field(dev_t, dev)
1141 __field(xfs_ino_t, ino) 1142 __field(xfs_ino_t, ino)
1142 __field(xfs_fsize_t, size) 1143 __field(xfs_fsize_t, size)
1143 __field(loff_t, offset) 1144 __field(loff_t, offset)
1144 __field(size_t, count) 1145 __field(size_t, count)
1145 __field(int, flags)
1146 ), 1146 ),
1147 TP_fast_assign( 1147 TP_fast_assign(
1148 __entry->dev = VFS_I(ip)->i_sb->s_dev; 1148 __entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -1150,25 +1150,25 @@ DECLARE_EVENT_CLASS(xfs_file_class,
1150 __entry->size = ip->i_d.di_size; 1150 __entry->size = ip->i_d.di_size;
1151 __entry->offset = offset; 1151 __entry->offset = offset;
1152 __entry->count = count; 1152 __entry->count = count;
1153 __entry->flags = flags;
1154 ), 1153 ),
1155 TP_printk("dev %d:%d ino 0x%llx size 0x%llx " 1154 TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
1156 "offset 0x%llx count 0x%zx ioflags %s",
1157 MAJOR(__entry->dev), MINOR(__entry->dev), 1155 MAJOR(__entry->dev), MINOR(__entry->dev),
1158 __entry->ino, 1156 __entry->ino,
1159 __entry->size, 1157 __entry->size,
1160 __entry->offset, 1158 __entry->offset,
1161 __entry->count, 1159 __entry->count)
1162 __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
1163) 1160)
1164 1161
1165#define DEFINE_RW_EVENT(name) \ 1162#define DEFINE_RW_EVENT(name) \
1166DEFINE_EVENT(xfs_file_class, name, \ 1163DEFINE_EVENT(xfs_file_class, name, \
1167 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ 1164 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \
1168 TP_ARGS(ip, count, offset, flags)) 1165 TP_ARGS(ip, count, offset))
1169DEFINE_RW_EVENT(xfs_file_read); 1166DEFINE_RW_EVENT(xfs_file_buffered_read);
1167DEFINE_RW_EVENT(xfs_file_direct_read);
1168DEFINE_RW_EVENT(xfs_file_dax_read);
1170DEFINE_RW_EVENT(xfs_file_buffered_write); 1169DEFINE_RW_EVENT(xfs_file_buffered_write);
1171DEFINE_RW_EVENT(xfs_file_direct_write); 1170DEFINE_RW_EVENT(xfs_file_direct_write);
1171DEFINE_RW_EVENT(xfs_file_dax_write);
1172DEFINE_RW_EVENT(xfs_file_splice_read); 1172DEFINE_RW_EVENT(xfs_file_splice_read);
1173 1173
1174DECLARE_EVENT_CLASS(xfs_page_class, 1174DECLARE_EVENT_CLASS(xfs_page_class,
@@ -1295,6 +1295,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
1295DEFINE_IOMAP_EVENT(xfs_get_blocks_found); 1295DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
1296DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); 1296DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
1297DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct); 1297DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
1298DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
1299DEFINE_IOMAP_EVENT(xfs_iomap_found);
1300DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
1298 1301
1299DECLARE_EVENT_CLASS(xfs_simple_io_class, 1302DECLARE_EVENT_CLASS(xfs_simple_io_class,
1300 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 1303 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9a462e892e4f..9b2b9fa89331 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -52,6 +52,7 @@ typedef struct xfs_log_item {
52 /* delayed logging */ 52 /* delayed logging */
53 struct list_head li_cil; /* CIL pointers */ 53 struct list_head li_cil; /* CIL pointers */
54 struct xfs_log_vec *li_lv; /* active log vector */ 54 struct xfs_log_vec *li_lv; /* active log vector */
55 struct xfs_log_vec *li_lv_shadow; /* standby vector */
55 xfs_lsn_t li_seq; /* CIL commit seq */ 56 xfs_lsn_t li_seq; /* CIL commit seq */
56} xfs_log_item_t; 57} xfs_log_item_t;
57 58
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index d8414502edb4..b03c0625fa6e 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -6,6 +6,7 @@
6struct dentry; 6struct dentry;
7struct iattr; 7struct iattr;
8struct inode; 8struct inode;
9struct iomap;
9struct super_block; 10struct super_block;
10struct vfsmount; 11struct vfsmount;
11 12
@@ -187,21 +188,6 @@ struct fid {
187 * get_name is not (which is possibly inconsistent) 188 * get_name is not (which is possibly inconsistent)
188 */ 189 */
189 190
190/* types of block ranges for multipage write mappings. */
191#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
192#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
193#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
194#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
195
196#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
197
198struct iomap {
199 sector_t blkno; /* first sector of mapping */
200 loff_t offset; /* file offset of mapping, bytes */
201 u64 length; /* length of mapping, bytes */
202 int type; /* type of mapping */
203};
204
205struct export_operations { 191struct export_operations {
206 int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len, 192 int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
207 struct inode *parent); 193 struct inode *parent);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
new file mode 100644
index 000000000000..3267df461012
--- /dev/null
+++ b/include/linux/iomap.h
@@ -0,0 +1,70 @@
1#ifndef LINUX_IOMAP_H
2#define LINUX_IOMAP_H 1
3
4#include <linux/types.h>
5
6struct fiemap_extent_info;
7struct inode;
8struct iov_iter;
9struct kiocb;
10struct vm_area_struct;
11struct vm_fault;
12
13/*
14 * Types of block ranges for iomap mappings:
15 */
16#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
17#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
18#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
19#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
20
21/*
22 * Magic value for blkno:
23 */
24#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
25
26struct iomap {
27 sector_t blkno; /* 1st sector of mapping, 512b units */
28 loff_t offset; /* file offset of mapping, bytes */
29 u64 length; /* length of mapping, bytes */
30 int type; /* type of mapping */
31 struct block_device *bdev; /* block device for I/O */
32};
33
34/*
35 * Flags for iomap_begin / iomap_end. No flag implies a read.
36 */
37#define IOMAP_WRITE (1 << 0)
38#define IOMAP_ZERO (1 << 1)
39
40struct iomap_ops {
41 /*
42 * Return the existing mapping at pos, or reserve space starting at
43 * pos for up to length, as long as we can do it as a single mapping.
44 * The actual length is returned in iomap->length.
45 */
46 int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
47 unsigned flags, struct iomap *iomap);
48
49 /*
50 * Commit and/or unreserve space previous allocated using iomap_begin.
51 * Written indicates the length of the successful write operation which
52 * needs to be commited, while the rest needs to be unreserved.
53 * Written might be zero if no data was written.
54 */
55 int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length,
56 ssize_t written, unsigned flags, struct iomap *iomap);
57};
58
59ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
60 struct iomap_ops *ops);
61int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
62 bool *did_zero, struct iomap_ops *ops);
63int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
64 struct iomap_ops *ops);
65int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
66 struct iomap_ops *ops);
67int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
68 loff_t start, loff_t len, struct iomap_ops *ops);
69
70#endif /* LINUX_IOMAP_H */
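To make the new interface concrete, here is a hedged sketch of the smallest useful iomap_ops a filesystem could provide: every file is assumed to be a single, fully allocated, contiguous extent whose first 512-byte sector is stashed in inode->i_private. No filesystem in this series works this way; the sketch only shows which fields ->iomap_begin is expected to fill in and why ->iomap_end can be omitted when there is nothing to unreserve.

/*
 * Hypothetical filesystem: every file is one contiguous, allocated
 * extent whose first 512-byte sector is stored in inode->i_private.
 * Purely illustrative; only shows the fields iomap_begin must set.
 */
static int example_iomap_begin(struct inode *inode, loff_t pos,
		loff_t length, unsigned flags, struct iomap *iomap)
{
	sector_t first = (sector_t)(unsigned long)inode->i_private;

	if (pos >= i_size_read(inode)) {
		/* beyond EOF: report a hole, nothing is allocated here */
		iomap->blkno = IOMAP_NULL_BLOCK;
		iomap->type = IOMAP_HOLE;
	} else {
		/* data maps 1:1 onto disk, so just convert bytes to sectors */
		iomap->blkno = first + (pos >> 9);
		iomap->type = IOMAP_MAPPED;
	}
	iomap->offset = pos;
	iomap->length = length;
	iomap->bdev = inode->i_sb->s_bdev;
	return 0;
}

static struct iomap_ops example_iomap_ops = {
	.iomap_begin	= example_iomap_begin,
	/* no .iomap_end: nothing was reserved, nothing to undo */
};

Such a filesystem could then implement ->fiemap as a one-liner around iomap_fiemap(inode, fieinfo, start, len, &example_iomap_ops), exactly as xfs_vn_fiemap now does with xfs_iomap_ops.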