aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-09-18 21:32:43 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2019-09-18 21:32:43 -0400
commitb41dae061bbd722b9d7fa828f35d22035b218e18 (patch)
treea5c0bade0c3d221483b54204bfc47e4fdbf09316
parente6bc9de714972cac34daa1dc1567ee48a47a9342 (diff)
parent14e15f1bcd738dc13dd7c1e78e4800e8bc577980 (diff)
Merge tag 'xfs-5.4-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull xfs updates from Darrick Wong: "For this cycle we have the usual pile of cleanups and bug fixes, some performance improvements for online metadata scrubbing, massive speedups in the directory entry creation code, some performance improvement in the file ACL lookup code, a fix for a logging stall during mount, and fixes for concurrency problems. It has survived a couple of weeks of xfstests runs and merges cleanly. Summary: - Remove KM_SLEEP/KM_NOSLEEP. - Ensure that memory buffers for IO are properly sector-aligned to avoid problems that the block layer doesn't check. - Make the bmap scrubber more efficient in its record checking. - Don't crash xfs_db when superblock inode geometry is corrupt. - Fix btree key helper functions. - Remove unneeded error returns for things that can't fail. - Fix buffer logging bugs in repair. - Clean up iterator return values. - Speed up directory entry creation. - Enable allocation of xattr value memory buffer during lookup. - Fix readahead racing with truncate/punch hole. - Other minor cleanups. - Fix one AGI/AGF deadlock with RENAME_WHITEOUT. - More BUG -> WARN whackamole. 
- Fix various problems with the log failing to advance under certain circumstances, which results in stalls during mount" * tag 'xfs-5.4-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (45 commits) xfs: push the grant head when the log head moves forward xfs: push iclog state cleaning into xlog_state_clean_log xfs: factor iclog state processing out of xlog_state_do_callback() xfs: factor callbacks out of xlog_state_do_callback() xfs: factor debug code out of xlog_state_do_callback() xfs: prevent CIL push holdoff in log recovery xfs: fix missed wakeup on l_flush_wait xfs: push the AIL in xlog_grant_head_wake xfs: Use WARN_ON_ONCE for bailout mount-operation xfs: Fix deadlock between AGI and AGF with RENAME_WHITEOUT xfs: define a flags field for the AG geometry ioctl structure xfs: add a xfs_valid_startblock helper xfs: remove the unused XFS_ALLOC_USERDATA flag xfs: cleanup xfs_fsb_to_db xfs: fix the dax supported check in xfs_ioctl_setattr_dax_invalidate xfs: Fix stale data exposure when readahead races with hole punch fs: Export generic_fadvise() mm: Handle MADV_WILLNEED through vfs_fadvise() xfs: allocate xattr buffer on demand xfs: consolidate attribute value copying ...
-rw-r--r--fs/xfs/kmem.c79
-rw-r--r--fs/xfs/kmem.h15
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h7
-rw-r--r--fs/xfs/libxfs/xfs_attr.c79
-rw-r--r--fs/xfs/libxfs/xfs_attr.h6
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c130
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c85
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h11
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c16
-rw-r--r--fs/xfs/libxfs/xfs_btree.c14
-rw-r--r--fs/xfs/libxfs/xfs_btree.h10
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h4
-rw-r--r--fs/xfs/libxfs/xfs_defer.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c14
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c678
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c8
-rw-r--r--fs/xfs/libxfs/xfs_fs.h2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c9
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c8
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c16
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c50
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h12
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c59
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h11
-rw-r--r--fs/xfs/libxfs/xfs_shared.h6
-rw-r--r--fs/xfs/libxfs/xfs_types.h8
-rw-r--r--fs/xfs/scrub/agheader.c4
-rw-r--r--fs/xfs/scrub/attr.c6
-rw-r--r--fs/xfs/scrub/bmap.c81
-rw-r--r--fs/xfs/scrub/fscounters.c2
-rw-r--r--fs/xfs/scrub/repair.c6
-rw-r--r--fs/xfs/scrub/symlink.c2
-rw-r--r--fs/xfs/xfs_acl.c14
-rw-r--r--fs/xfs/xfs_attr_inactive.c2
-rw-r--r--fs/xfs/xfs_attr_list.c2
-rw-r--r--fs/xfs/xfs_bmap_item.c8
-rw-r--r--fs/xfs/xfs_bmap_util.c22
-rw-r--r--fs/xfs/xfs_buf.c7
-rw-r--r--fs/xfs/xfs_buf.h6
-rw-r--r--fs/xfs/xfs_buf_item.c4
-rw-r--r--fs/xfs/xfs_dquot.c4
-rw-r--r--fs/xfs/xfs_dquot_item.c2
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_extent_busy.c2
-rw-r--r--fs/xfs/xfs_extfree_item.c8
-rw-r--r--fs/xfs/xfs_file.c26
-rw-r--r--fs/xfs/xfs_fsmap.c12
-rw-r--r--fs/xfs/xfs_icache.c2
-rw-r--r--fs/xfs/xfs_icreate_item.c2
-rw-r--r--fs/xfs/xfs_inode.c85
-rw-r--r--fs/xfs/xfs_inode_item.c2
-rw-r--r--fs/xfs/xfs_ioctl.c25
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c6
-rw-r--r--fs/xfs/xfs_itable.c10
-rw-r--r--fs/xfs/xfs_itable.h13
-rw-r--r--fs/xfs/xfs_iwalk.c4
-rw-r--r--fs/xfs/xfs_iwalk.h13
-rw-r--r--fs/xfs/xfs_log.c466
-rw-r--r--fs/xfs/xfs_log_cil.c10
-rw-r--r--fs/xfs/xfs_log_recover.c50
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_mount.h7
-rw-r--r--fs/xfs/xfs_mru_cache.c4
-rw-r--r--fs/xfs/xfs_qm.c4
-rw-r--r--fs/xfs/xfs_refcount_item.c16
-rw-r--r--fs/xfs/xfs_reflink.c23
-rw-r--r--fs/xfs/xfs_rmap_item.c6
-rw-r--r--fs/xfs/xfs_rtalloc.c4
-rw-r--r--fs/xfs/xfs_super.c3
-rw-r--r--fs/xfs/xfs_trace.h34
-rw-r--r--fs/xfs/xfs_trans.c4
-rw-r--r--fs/xfs/xfs_trans_dquot.c2
-rw-r--r--fs/xfs/xfs_xattr.c2
-rw-r--r--include/linux/fs.h2
-rw-r--r--mm/fadvise.c4
-rw-r--r--mm/madvise.c22
81 files changed, 1315 insertions, 1089 deletions
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 16bb9a328678..da031b93e182 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -3,10 +3,10 @@
3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 * All Rights Reserved. 4 * All Rights Reserved.
5 */ 5 */
6#include <linux/sched/mm.h> 6#include "xfs.h"
7#include <linux/backing-dev.h> 7#include <linux/backing-dev.h>
8#include "kmem.h"
9#include "xfs_message.h" 8#include "xfs_message.h"
9#include "xfs_trace.h"
10 10
11void * 11void *
12kmem_alloc(size_t size, xfs_km_flags_t flags) 12kmem_alloc(size_t size, xfs_km_flags_t flags)
@@ -15,9 +15,11 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
15 gfp_t lflags = kmem_flags_convert(flags); 15 gfp_t lflags = kmem_flags_convert(flags);
16 void *ptr; 16 void *ptr;
17 17
18 trace_kmem_alloc(size, flags, _RET_IP_);
19
18 do { 20 do {
19 ptr = kmalloc(size, lflags); 21 ptr = kmalloc(size, lflags);
20 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 22 if (ptr || (flags & KM_MAYFAIL))
21 return ptr; 23 return ptr;
22 if (!(++retries % 100)) 24 if (!(++retries % 100))
23 xfs_err(NULL, 25 xfs_err(NULL,
@@ -28,28 +30,24 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
28 } while (1); 30 } while (1);
29} 31}
30 32
31void * 33
32kmem_alloc_large(size_t size, xfs_km_flags_t flags) 34/*
35 * __vmalloc() will allocate data pages and auxillary structures (e.g.
36 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence
37 * we need to tell memory reclaim that we are in such a context via
38 * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here
39 * and potentially deadlocking.
40 */
41static void *
42__kmem_vmalloc(size_t size, xfs_km_flags_t flags)
33{ 43{
34 unsigned nofs_flag = 0; 44 unsigned nofs_flag = 0;
35 void *ptr; 45 void *ptr;
36 gfp_t lflags; 46 gfp_t lflags = kmem_flags_convert(flags);
37
38 ptr = kmem_alloc(size, flags | KM_MAYFAIL);
39 if (ptr)
40 return ptr;
41 47
42 /*
43 * __vmalloc() will allocate data pages and auxillary structures (e.g.
44 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
45 * here. Hence we need to tell memory reclaim that we are in such a
46 * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
47 * the filesystem here and potentially deadlocking.
48 */
49 if (flags & KM_NOFS) 48 if (flags & KM_NOFS)
50 nofs_flag = memalloc_nofs_save(); 49 nofs_flag = memalloc_nofs_save();
51 50
52 lflags = kmem_flags_convert(flags);
53 ptr = __vmalloc(size, lflags, PAGE_KERNEL); 51 ptr = __vmalloc(size, lflags, PAGE_KERNEL);
54 52
55 if (flags & KM_NOFS) 53 if (flags & KM_NOFS)
@@ -58,6 +56,44 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags)
58 return ptr; 56 return ptr;
59} 57}
60 58
59/*
60 * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned
61 * to the @align_mask. We only guarantee alignment up to page size, we'll clamp
62 * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE
63 * aligned region.
64 */
65void *
66kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags)
67{
68 void *ptr;
69
70 trace_kmem_alloc_io(size, flags, _RET_IP_);
71
72 if (WARN_ON_ONCE(align_mask >= PAGE_SIZE))
73 align_mask = PAGE_SIZE - 1;
74
75 ptr = kmem_alloc(size, flags | KM_MAYFAIL);
76 if (ptr) {
77 if (!((uintptr_t)ptr & align_mask))
78 return ptr;
79 kfree(ptr);
80 }
81 return __kmem_vmalloc(size, flags);
82}
83
84void *
85kmem_alloc_large(size_t size, xfs_km_flags_t flags)
86{
87 void *ptr;
88
89 trace_kmem_alloc_large(size, flags, _RET_IP_);
90
91 ptr = kmem_alloc(size, flags | KM_MAYFAIL);
92 if (ptr)
93 return ptr;
94 return __kmem_vmalloc(size, flags);
95}
96
61void * 97void *
62kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) 98kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
63{ 99{
@@ -65,9 +101,11 @@ kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
65 gfp_t lflags = kmem_flags_convert(flags); 101 gfp_t lflags = kmem_flags_convert(flags);
66 void *ptr; 102 void *ptr;
67 103
104 trace_kmem_realloc(newsize, flags, _RET_IP_);
105
68 do { 106 do {
69 ptr = krealloc(old, newsize, lflags); 107 ptr = krealloc(old, newsize, lflags);
70 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 108 if (ptr || (flags & KM_MAYFAIL))
71 return ptr; 109 return ptr;
72 if (!(++retries % 100)) 110 if (!(++retries % 100))
73 xfs_err(NULL, 111 xfs_err(NULL,
@@ -85,9 +123,10 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
85 gfp_t lflags = kmem_flags_convert(flags); 123 gfp_t lflags = kmem_flags_convert(flags);
86 void *ptr; 124 void *ptr;
87 125
126 trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_);
88 do { 127 do {
89 ptr = kmem_cache_alloc(zone, lflags); 128 ptr = kmem_cache_alloc(zone, lflags);
90 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 129 if (ptr || (flags & KM_MAYFAIL))
91 return ptr; 130 return ptr;
92 if (!(++retries % 100)) 131 if (!(++retries % 100))
93 xfs_err(NULL, 132 xfs_err(NULL,
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 267655acd426..8170d95cf930 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -16,8 +16,6 @@
16 */ 16 */
17 17
18typedef unsigned __bitwise xfs_km_flags_t; 18typedef unsigned __bitwise xfs_km_flags_t;
19#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u)
20#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u)
21#define KM_NOFS ((__force xfs_km_flags_t)0x0004u) 19#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
22#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) 20#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
23#define KM_ZERO ((__force xfs_km_flags_t)0x0010u) 21#define KM_ZERO ((__force xfs_km_flags_t)0x0010u)
@@ -32,15 +30,11 @@ kmem_flags_convert(xfs_km_flags_t flags)
32{ 30{
33 gfp_t lflags; 31 gfp_t lflags;
34 32
35 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO)); 33 BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO));
36 34
37 if (flags & KM_NOSLEEP) { 35 lflags = GFP_KERNEL | __GFP_NOWARN;
38 lflags = GFP_ATOMIC | __GFP_NOWARN; 36 if (flags & KM_NOFS)
39 } else { 37 lflags &= ~__GFP_FS;
40 lflags = GFP_KERNEL | __GFP_NOWARN;
41 if (flags & KM_NOFS)
42 lflags &= ~__GFP_FS;
43 }
44 38
45 /* 39 /*
46 * Default page/slab allocator behavior is to retry for ever 40 * Default page/slab allocator behavior is to retry for ever
@@ -59,6 +53,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
59} 53}
60 54
61extern void *kmem_alloc(size_t, xfs_km_flags_t); 55extern void *kmem_alloc(size_t, xfs_km_flags_t);
56extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags);
62extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); 57extern void *kmem_alloc_large(size_t size, xfs_km_flags_t);
63extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); 58extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
64static inline void kmem_free(const void *ptr) 59static inline void kmem_free(const void *ptr)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 372ad55631fc..533b04aaf6f6 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2205,7 +2205,7 @@ xfs_defer_agfl_block(
2205 ASSERT(xfs_bmap_free_item_zone != NULL); 2205 ASSERT(xfs_bmap_free_item_zone != NULL);
2206 ASSERT(oinfo != NULL); 2206 ASSERT(oinfo != NULL);
2207 2207
2208 new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); 2208 new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
2209 new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); 2209 new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
2210 new->xefi_blockcount = 1; 2210 new->xefi_blockcount = 1;
2211 new->xefi_oinfo = *oinfo; 2211 new->xefi_oinfo = *oinfo;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d6ed5d2c07c2..58fa85cec325 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -81,10 +81,9 @@ typedef struct xfs_alloc_arg {
81/* 81/*
82 * Defines for datatype 82 * Defines for datatype
83 */ 83 */
84#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ 84#define XFS_ALLOC_INITIAL_USER_DATA (1 << 0)/* special case start of file */
85#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ 85#define XFS_ALLOC_USERDATA_ZERO (1 << 1)/* zero extent on allocation */
86#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ 86#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */
87#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */
88 87
89static inline bool 88static inline bool
90xfs_alloc_is_userdata(int datatype) 89xfs_alloc_is_userdata(int datatype)
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index d48fcf11cc35..510ca6974604 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -97,7 +97,10 @@ xfs_inode_hasattr(
97 * Overall external interface routines. 97 * Overall external interface routines.
98 *========================================================================*/ 98 *========================================================================*/
99 99
100/* Retrieve an extended attribute and its value. Must have ilock. */ 100/*
101 * Retrieve an extended attribute and its value. Must have ilock.
102 * Returns 0 on successful retrieval, otherwise an error.
103 */
101int 104int
102xfs_attr_get_ilocked( 105xfs_attr_get_ilocked(
103 struct xfs_inode *ip, 106 struct xfs_inode *ip,
@@ -115,12 +118,28 @@ xfs_attr_get_ilocked(
115 return xfs_attr_node_get(args); 118 return xfs_attr_node_get(args);
116} 119}
117 120
118/* Retrieve an extended attribute by name, and its value. */ 121/*
122 * Retrieve an extended attribute by name, and its value if requested.
123 *
124 * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value,
125 * just an indication whether the attribute exists and the size of the value if
126 * it exists. The size is returned in @valuelenp,
127 *
128 * If the attribute is found, but exceeds the size limit set by the caller in
129 * @valuelenp, return -ERANGE with the size of the attribute that was found in
130 * @valuelenp.
131 *
132 * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after
133 * existence of the attribute has been determined. On success, return that
134 * buffer to the caller and leave them to free it. On failure, free any
135 * allocated buffer and ensure the buffer pointer returned to the caller is
136 * null.
137 */
119int 138int
120xfs_attr_get( 139xfs_attr_get(
121 struct xfs_inode *ip, 140 struct xfs_inode *ip,
122 const unsigned char *name, 141 const unsigned char *name,
123 unsigned char *value, 142 unsigned char **value,
124 int *valuelenp, 143 int *valuelenp,
125 int flags) 144 int flags)
126{ 145{
@@ -128,6 +147,8 @@ xfs_attr_get(
128 uint lock_mode; 147 uint lock_mode;
129 int error; 148 int error;
130 149
150 ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value);
151
131 XFS_STATS_INC(ip->i_mount, xs_attr_get); 152 XFS_STATS_INC(ip->i_mount, xs_attr_get);
132 153
133 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 154 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -137,17 +158,29 @@ xfs_attr_get(
137 if (error) 158 if (error)
138 return error; 159 return error;
139 160
140 args.value = value;
141 args.valuelen = *valuelenp;
142 /* Entirely possible to look up a name which doesn't exist */ 161 /* Entirely possible to look up a name which doesn't exist */
143 args.op_flags = XFS_DA_OP_OKNOENT; 162 args.op_flags = XFS_DA_OP_OKNOENT;
163 if (flags & ATTR_ALLOC)
164 args.op_flags |= XFS_DA_OP_ALLOCVAL;
165 else
166 args.value = *value;
167 args.valuelen = *valuelenp;
144 168
145 lock_mode = xfs_ilock_attr_map_shared(ip); 169 lock_mode = xfs_ilock_attr_map_shared(ip);
146 error = xfs_attr_get_ilocked(ip, &args); 170 error = xfs_attr_get_ilocked(ip, &args);
147 xfs_iunlock(ip, lock_mode); 171 xfs_iunlock(ip, lock_mode);
148
149 *valuelenp = args.valuelen; 172 *valuelenp = args.valuelen;
150 return error == -EEXIST ? 0 : error; 173
174 /* on error, we have to clean up allocated value buffers */
175 if (error) {
176 if (flags & ATTR_ALLOC) {
177 kmem_free(args.value);
178 *value = NULL;
179 }
180 return error;
181 }
182 *value = args.value;
183 return 0;
151} 184}
152 185
153/* 186/*
@@ -768,6 +801,8 @@ xfs_attr_leaf_removename(
768 * 801 *
769 * This leaf block cannot have a "remote" value, we only call this routine 802 * This leaf block cannot have a "remote" value, we only call this routine
770 * if bmap_one_block() says there is only one block (ie: no remote blks). 803 * if bmap_one_block() says there is only one block (ie: no remote blks).
804 *
805 * Returns 0 on successful retrieval, otherwise an error.
771 */ 806 */
772STATIC int 807STATIC int
773xfs_attr_leaf_get(xfs_da_args_t *args) 808xfs_attr_leaf_get(xfs_da_args_t *args)
@@ -789,9 +824,6 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
789 } 824 }
790 error = xfs_attr3_leaf_getvalue(bp, args); 825 error = xfs_attr3_leaf_getvalue(bp, args);
791 xfs_trans_brelse(args->trans, bp); 826 xfs_trans_brelse(args->trans, bp);
792 if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
793 error = xfs_attr_rmtval_get(args);
794 }
795 return error; 827 return error;
796} 828}
797 829
@@ -1268,11 +1300,13 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1268} 1300}
1269 1301
1270/* 1302/*
1271 * Look up a filename in a node attribute list. 1303 * Retrieve the attribute data from a node attribute list.
1272 * 1304 *
1273 * This routine gets called for any attribute fork that has more than one 1305 * This routine gets called for any attribute fork that has more than one
1274 * block, ie: both true Btree attr lists and for single-leaf-blocks with 1306 * block, ie: both true Btree attr lists and for single-leaf-blocks with
1275 * "remote" values taking up more blocks. 1307 * "remote" values taking up more blocks.
1308 *
1309 * Returns 0 on successful retrieval, otherwise an error.
1276 */ 1310 */
1277STATIC int 1311STATIC int
1278xfs_attr_node_get(xfs_da_args_t *args) 1312xfs_attr_node_get(xfs_da_args_t *args)
@@ -1294,24 +1328,21 @@ xfs_attr_node_get(xfs_da_args_t *args)
1294 error = xfs_da3_node_lookup_int(state, &retval); 1328 error = xfs_da3_node_lookup_int(state, &retval);
1295 if (error) { 1329 if (error) {
1296 retval = error; 1330 retval = error;
1297 } else if (retval == -EEXIST) { 1331 goto out_release;
1298 blk = &state->path.blk[ state->path.active-1 ];
1299 ASSERT(blk->bp != NULL);
1300 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1301
1302 /*
1303 * Get the value, local or "remote"
1304 */
1305 retval = xfs_attr3_leaf_getvalue(blk->bp, args);
1306 if (!retval && (args->rmtblkno > 0)
1307 && !(args->flags & ATTR_KERNOVAL)) {
1308 retval = xfs_attr_rmtval_get(args);
1309 }
1310 } 1332 }
1333 if (retval != -EEXIST)
1334 goto out_release;
1335
1336 /*
1337 * Get the value, local or "remote"
1338 */
1339 blk = &state->path.blk[state->path.active - 1];
1340 retval = xfs_attr3_leaf_getvalue(blk->bp, args);
1311 1341
1312 /* 1342 /*
1313 * If not in a transaction, we have to release all the buffers. 1343 * If not in a transaction, we have to release all the buffers.
1314 */ 1344 */
1345out_release:
1315 for (i = 0; i < state->path.active; i++) { 1346 for (i = 0; i < state->path.active; i++) {
1316 xfs_trans_brelse(args->trans, state->path.blk[i].bp); 1347 xfs_trans_brelse(args->trans, state->path.blk[i].bp);
1317 state->path.blk[i].bp = NULL; 1348 state->path.blk[i].bp = NULL;
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index ff28ebf3b635..94badfa1743e 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -37,6 +37,7 @@ struct xfs_attr_list_context;
37#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ 37#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
38 38
39#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ 39#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */
40#define ATTR_ALLOC 0x8000 /* allocate xattr buffer on demand */
40 41
41#define XFS_ATTR_FLAGS \ 42#define XFS_ATTR_FLAGS \
42 { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ 43 { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \
@@ -47,7 +48,8 @@ struct xfs_attr_list_context;
47 { ATTR_REPLACE, "REPLACE" }, \ 48 { ATTR_REPLACE, "REPLACE" }, \
48 { ATTR_KERNOTIME, "KERNOTIME" }, \ 49 { ATTR_KERNOTIME, "KERNOTIME" }, \
49 { ATTR_KERNOVAL, "KERNOVAL" }, \ 50 { ATTR_KERNOVAL, "KERNOVAL" }, \
50 { ATTR_INCOMPLETE, "INCOMPLETE" } 51 { ATTR_INCOMPLETE, "INCOMPLETE" }, \
52 { ATTR_ALLOC, "ALLOC" }
51 53
52/* 54/*
53 * The maximum size (into the kernel or returned from the kernel) of an 55 * The maximum size (into the kernel or returned from the kernel) of an
@@ -143,7 +145,7 @@ int xfs_attr_list_int(struct xfs_attr_list_context *);
143int xfs_inode_hasattr(struct xfs_inode *ip); 145int xfs_inode_hasattr(struct xfs_inode *ip);
144int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); 146int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
145int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, 147int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
146 unsigned char *value, int *valuelenp, int flags); 148 unsigned char **value, int *valuelenp, int flags);
147int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, 149int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
148 unsigned char *value, int valuelen, int flags); 150 unsigned char *value, int valuelen, int flags);
149int xfs_attr_set_args(struct xfs_da_args *args); 151int xfs_attr_set_args(struct xfs_da_args *args);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 70eb941d02e4..b9f019603d0b 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -393,6 +393,50 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
393 return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); 393 return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
394} 394}
395 395
396static int
397xfs_attr_copy_value(
398 struct xfs_da_args *args,
399 unsigned char *value,
400 int valuelen)
401{
402 /*
403 * No copy if all we have to do is get the length
404 */
405 if (args->flags & ATTR_KERNOVAL) {
406 args->valuelen = valuelen;
407 return 0;
408 }
409
410 /*
411 * No copy if the length of the existing buffer is too small
412 */
413 if (args->valuelen < valuelen) {
414 args->valuelen = valuelen;
415 return -ERANGE;
416 }
417
418 if (args->op_flags & XFS_DA_OP_ALLOCVAL) {
419 args->value = kmem_alloc_large(valuelen, 0);
420 if (!args->value)
421 return -ENOMEM;
422 }
423 args->valuelen = valuelen;
424
425 /* remote block xattr requires IO for copy-in */
426 if (args->rmtblkno)
427 return xfs_attr_rmtval_get(args);
428
429 /*
430 * This is to prevent a GCC warning because the remote xattr case
431 * doesn't have a value to pass in. In that case, we never reach here,
432 * but GCC can't work that out and so throws a "passing NULL to
433 * memcpy" warning.
434 */
435 if (!value)
436 return -EINVAL;
437 memcpy(args->value, value, valuelen);
438 return 0;
439}
396 440
397/*======================================================================== 441/*========================================================================
398 * External routines when attribute fork size < XFS_LITINO(mp). 442 * External routines when attribute fork size < XFS_LITINO(mp).
@@ -720,15 +764,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
720} 764}
721 765
722/* 766/*
723 * Look up a name in a shortform attribute list structure. 767 * Retreive the attribute value and length.
768 *
769 * If ATTR_KERNOVAL is specified, only the length needs to be returned.
770 * Unlike a lookup, we only return an error if the attribute does not
771 * exist or we can't retrieve the value.
724 */ 772 */
725/*ARGSUSED*/
726int 773int
727xfs_attr_shortform_getvalue(xfs_da_args_t *args) 774xfs_attr_shortform_getvalue(
775 struct xfs_da_args *args)
728{ 776{
729 xfs_attr_shortform_t *sf; 777 struct xfs_attr_shortform *sf;
730 xfs_attr_sf_entry_t *sfe; 778 struct xfs_attr_sf_entry *sfe;
731 int i; 779 int i;
732 780
733 ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); 781 ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
734 sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; 782 sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
@@ -741,18 +789,8 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
741 continue; 789 continue;
742 if (!xfs_attr_namesp_match(args->flags, sfe->flags)) 790 if (!xfs_attr_namesp_match(args->flags, sfe->flags))
743 continue; 791 continue;
744 if (args->flags & ATTR_KERNOVAL) { 792 return xfs_attr_copy_value(args, &sfe->nameval[args->namelen],
745 args->valuelen = sfe->valuelen; 793 sfe->valuelen);
746 return -EEXIST;
747 }
748 if (args->valuelen < sfe->valuelen) {
749 args->valuelen = sfe->valuelen;
750 return -ERANGE;
751 }
752 args->valuelen = sfe->valuelen;
753 memcpy(args->value, &sfe->nameval[args->namelen],
754 args->valuelen);
755 return -EEXIST;
756 } 794 }
757 return -ENOATTR; 795 return -ENOATTR;
758} 796}
@@ -782,7 +820,7 @@ xfs_attr_shortform_to_leaf(
782 ifp = dp->i_afp; 820 ifp = dp->i_afp;
783 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; 821 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
784 size = be16_to_cpu(sf->hdr.totsize); 822 size = be16_to_cpu(sf->hdr.totsize);
785 tmpbuffer = kmem_alloc(size, KM_SLEEP); 823 tmpbuffer = kmem_alloc(size, 0);
786 ASSERT(tmpbuffer != NULL); 824 ASSERT(tmpbuffer != NULL);
787 memcpy(tmpbuffer, ifp->if_u1.if_data, size); 825 memcpy(tmpbuffer, ifp->if_u1.if_data, size);
788 sf = (xfs_attr_shortform_t *)tmpbuffer; 826 sf = (xfs_attr_shortform_t *)tmpbuffer;
@@ -985,7 +1023,7 @@ xfs_attr3_leaf_to_shortform(
985 1023
986 trace_xfs_attr_leaf_to_sf(args); 1024 trace_xfs_attr_leaf_to_sf(args);
987 1025
988 tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); 1026 tmpbuffer = kmem_alloc(args->geo->blksize, 0);
989 if (!tmpbuffer) 1027 if (!tmpbuffer)
990 return -ENOMEM; 1028 return -ENOMEM;
991 1029
@@ -1448,7 +1486,7 @@ xfs_attr3_leaf_compact(
1448 1486
1449 trace_xfs_attr_leaf_compact(args); 1487 trace_xfs_attr_leaf_compact(args);
1450 1488
1451 tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); 1489 tmpbuffer = kmem_alloc(args->geo->blksize, 0);
1452 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); 1490 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
1453 memset(bp->b_addr, 0, args->geo->blksize); 1491 memset(bp->b_addr, 0, args->geo->blksize);
1454 leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; 1492 leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -2167,7 +2205,7 @@ xfs_attr3_leaf_unbalance(
2167 struct xfs_attr_leafblock *tmp_leaf; 2205 struct xfs_attr_leafblock *tmp_leaf;
2168 struct xfs_attr3_icleaf_hdr tmphdr; 2206 struct xfs_attr3_icleaf_hdr tmphdr;
2169 2207
2170 tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP); 2208 tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0);
2171 2209
2172 /* 2210 /*
2173 * Copy the header into the temp leaf so that all the stuff 2211 * Copy the header into the temp leaf so that all the stuff
@@ -2350,6 +2388,10 @@ xfs_attr3_leaf_lookup_int(
2350/* 2388/*
2351 * Get the value associated with an attribute name from a leaf attribute 2389 * Get the value associated with an attribute name from a leaf attribute
2352 * list structure. 2390 * list structure.
2391 *
2392 * If ATTR_KERNOVAL is specified, only the length needs to be returned.
2393 * Unlike a lookup, we only return an error if the attribute does not
2394 * exist or we can't retrieve the value.
2353 */ 2395 */
2354int 2396int
2355xfs_attr3_leaf_getvalue( 2397xfs_attr3_leaf_getvalue(
@@ -2361,7 +2403,6 @@ xfs_attr3_leaf_getvalue(
2361 struct xfs_attr_leaf_entry *entry; 2403 struct xfs_attr_leaf_entry *entry;
2362 struct xfs_attr_leaf_name_local *name_loc; 2404 struct xfs_attr_leaf_name_local *name_loc;
2363 struct xfs_attr_leaf_name_remote *name_rmt; 2405 struct xfs_attr_leaf_name_remote *name_rmt;
2364 int valuelen;
2365 2406
2366 leaf = bp->b_addr; 2407 leaf = bp->b_addr;
2367 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); 2408 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
@@ -2373,36 +2414,19 @@ xfs_attr3_leaf_getvalue(
2373 name_loc = xfs_attr3_leaf_name_local(leaf, args->index); 2414 name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
2374 ASSERT(name_loc->namelen == args->namelen); 2415 ASSERT(name_loc->namelen == args->namelen);
2375 ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); 2416 ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
2376 valuelen = be16_to_cpu(name_loc->valuelen); 2417 return xfs_attr_copy_value(args,
2377 if (args->flags & ATTR_KERNOVAL) { 2418 &name_loc->nameval[args->namelen],
2378 args->valuelen = valuelen; 2419 be16_to_cpu(name_loc->valuelen));
2379 return 0; 2420 }
2380 } 2421
2381 if (args->valuelen < valuelen) { 2422 name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
2382 args->valuelen = valuelen; 2423 ASSERT(name_rmt->namelen == args->namelen);
2383 return -ERANGE; 2424 ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
2384 } 2425 args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
2385 args->valuelen = valuelen; 2426 args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
2386 memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); 2427 args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
2387 } else { 2428 args->rmtvaluelen);
2388 name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); 2429 return xfs_attr_copy_value(args, NULL, args->rmtvaluelen);
2389 ASSERT(name_rmt->namelen == args->namelen);
2390 ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
2391 args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
2392 args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
2393 args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
2394 args->rmtvaluelen);
2395 if (args->flags & ATTR_KERNOVAL) {
2396 args->valuelen = args->rmtvaluelen;
2397 return 0;
2398 }
2399 if (args->valuelen < args->rmtvaluelen) {
2400 args->valuelen = args->rmtvaluelen;
2401 return -ERANGE;
2402 }
2403 args->valuelen = args->rmtvaluelen;
2404 }
2405 return 0;
2406} 2430}
2407 2431
2408/*======================================================================== 2432/*========================================================================
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 4eb30d357045..3e39b7d40f25 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -358,6 +358,8 @@ xfs_attr_rmtval_copyin(
358/* 358/*
359 * Read the value associated with an attribute from the out-of-line buffer 359 * Read the value associated with an attribute from the out-of-line buffer
360 * that we stored it in. 360 * that we stored it in.
361 *
362 * Returns 0 on successful retrieval, otherwise an error.
361 */ 363 */
362int 364int
363xfs_attr_rmtval_get( 365xfs_attr_rmtval_get(
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 07aad70f3931..054b4ce30033 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -553,7 +553,7 @@ __xfs_bmap_add_free(
553#endif 553#endif
554 ASSERT(xfs_bmap_free_item_zone != NULL); 554 ASSERT(xfs_bmap_free_item_zone != NULL);
555 555
556 new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); 556 new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
557 new->xefi_startblock = bno; 557 new->xefi_startblock = bno;
558 new->xefi_blockcount = (xfs_extlen_t)len; 558 new->xefi_blockcount = (xfs_extlen_t)len;
559 if (oinfo) 559 if (oinfo)
@@ -1099,7 +1099,7 @@ xfs_bmap_add_attrfork(
1099 if (error) 1099 if (error)
1100 goto trans_cancel; 1100 goto trans_cancel;
1101 ASSERT(ip->i_afp == NULL); 1101 ASSERT(ip->i_afp == NULL);
1102 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 1102 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0);
1103 ip->i_afp->if_flags = XFS_IFEXTENTS; 1103 ip->i_afp->if_flags = XFS_IFEXTENTS;
1104 logflags = 0; 1104 logflags = 0;
1105 switch (ip->i_d.di_format) { 1105 switch (ip->i_d.di_format) {
@@ -1985,11 +1985,8 @@ xfs_bmap_add_extent_delay_real(
1985 } 1985 }
1986 1986
1987 /* add reverse mapping unless caller opted out */ 1987 /* add reverse mapping unless caller opted out */
1988 if (!(bma->flags & XFS_BMAPI_NORMAP)) { 1988 if (!(bma->flags & XFS_BMAPI_NORMAP))
1989 error = xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new); 1989 xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
1990 if (error)
1991 goto done;
1992 }
1993 1990
1994 /* convert to a btree if necessary */ 1991 /* convert to a btree if necessary */
1995 if (xfs_bmap_needs_btree(bma->ip, whichfork)) { 1992 if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -2471,9 +2468,7 @@ xfs_bmap_add_extent_unwritten_real(
2471 } 2468 }
2472 2469
2473 /* update reverse mappings */ 2470 /* update reverse mappings */
2474 error = xfs_rmap_convert_extent(mp, tp, ip, whichfork, new); 2471 xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
2475 if (error)
2476 goto done;
2477 2472
2478 /* convert to a btree if necessary */ 2473 /* convert to a btree if necessary */
2479 if (xfs_bmap_needs_btree(ip, whichfork)) { 2474 if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -2832,11 +2827,8 @@ xfs_bmap_add_extent_hole_real(
2832 } 2827 }
2833 2828
2834 /* add reverse mapping unless caller opted out */ 2829 /* add reverse mapping unless caller opted out */
2835 if (!(flags & XFS_BMAPI_NORMAP)) { 2830 if (!(flags & XFS_BMAPI_NORMAP))
2836 error = xfs_rmap_map_extent(tp, ip, whichfork, new); 2831 xfs_rmap_map_extent(tp, ip, whichfork, new);
2837 if (error)
2838 goto done;
2839 }
2840 2832
2841 /* convert to a btree if necessary */ 2833 /* convert to a btree if necessary */
2842 if (xfs_bmap_needs_btree(ip, whichfork)) { 2834 if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -4050,12 +4042,8 @@ xfs_bmapi_allocate(
4050 */ 4042 */
4051 if (!(bma->flags & XFS_BMAPI_METADATA)) { 4043 if (!(bma->flags & XFS_BMAPI_METADATA)) {
4052 bma->datatype = XFS_ALLOC_NOBUSY; 4044 bma->datatype = XFS_ALLOC_NOBUSY;
4053 if (whichfork == XFS_DATA_FORK) { 4045 if (whichfork == XFS_DATA_FORK && bma->offset == 0)
4054 if (bma->offset == 0) 4046 bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
4055 bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
4056 else
4057 bma->datatype |= XFS_ALLOC_USERDATA;
4058 }
4059 if (bma->flags & XFS_BMAPI_ZERO) 4047 if (bma->flags & XFS_BMAPI_ZERO)
4060 bma->datatype |= XFS_ALLOC_USERDATA_ZERO; 4048 bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
4061 } 4049 }
@@ -4401,12 +4389,9 @@ xfs_bmapi_write(
4401 * If this is a CoW allocation, record the data in 4389 * If this is a CoW allocation, record the data in
4402 * the refcount btree for orphan recovery. 4390 * the refcount btree for orphan recovery.
4403 */ 4391 */
4404 if (whichfork == XFS_COW_FORK) { 4392 if (whichfork == XFS_COW_FORK)
4405 error = xfs_refcount_alloc_cow_extent(tp, 4393 xfs_refcount_alloc_cow_extent(tp, bma.blkno,
4406 bma.blkno, bma.length); 4394 bma.length);
4407 if (error)
4408 goto error0;
4409 }
4410 } 4395 }
4411 4396
4412 /* Deal with the allocated space we found. */ 4397 /* Deal with the allocated space we found. */
@@ -4530,7 +4515,7 @@ xfs_bmapi_convert_delalloc(
4530 if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) 4515 if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
4531 goto out_finish; 4516 goto out_finish;
4532 error = -EFSCORRUPTED; 4517 error = -EFSCORRUPTED;
4533 if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip))) 4518 if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
4534 goto out_finish; 4519 goto out_finish;
4535 4520
4536 XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); 4521 XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
@@ -4540,12 +4525,8 @@ xfs_bmapi_convert_delalloc(
4540 *imap = bma.got; 4525 *imap = bma.got;
4541 *seq = READ_ONCE(ifp->if_seq); 4526 *seq = READ_ONCE(ifp->if_seq);
4542 4527
4543 if (whichfork == XFS_COW_FORK) { 4528 if (whichfork == XFS_COW_FORK)
4544 error = xfs_refcount_alloc_cow_extent(tp, bma.blkno, 4529 xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
4545 bma.length);
4546 if (error)
4547 goto out_finish;
4548 }
4549 4530
4550 error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, 4531 error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
4551 whichfork); 4532 whichfork);
@@ -5149,18 +5130,14 @@ xfs_bmap_del_extent_real(
5149 } 5130 }
5150 5131
5151 /* remove reverse mapping */ 5132 /* remove reverse mapping */
5152 error = xfs_rmap_unmap_extent(tp, ip, whichfork, del); 5133 xfs_rmap_unmap_extent(tp, ip, whichfork, del);
5153 if (error)
5154 goto done;
5155 5134
5156 /* 5135 /*
5157 * If we need to, add to list of extents to delete. 5136 * If we need to, add to list of extents to delete.
5158 */ 5137 */
5159 if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { 5138 if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
5160 if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { 5139 if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
5161 error = xfs_refcount_decrease_extent(tp, del); 5140 xfs_refcount_decrease_extent(tp, del);
5162 if (error)
5163 goto done;
5164 } else { 5141 } else {
5165 __xfs_bmap_add_free(tp, del->br_startblock, 5142 __xfs_bmap_add_free(tp, del->br_startblock,
5166 del->br_blockcount, NULL, 5143 del->br_blockcount, NULL,
@@ -5651,12 +5628,11 @@ done:
5651 &new); 5628 &new);
5652 5629
5653 /* update reverse mapping. rmap functions merge the rmaps for us */ 5630 /* update reverse mapping. rmap functions merge the rmaps for us */
5654 error = xfs_rmap_unmap_extent(tp, ip, whichfork, got); 5631 xfs_rmap_unmap_extent(tp, ip, whichfork, got);
5655 if (error)
5656 return error;
5657 memcpy(&new, got, sizeof(new)); 5632 memcpy(&new, got, sizeof(new));
5658 new.br_startoff = left->br_startoff + left->br_blockcount; 5633 new.br_startoff = left->br_startoff + left->br_blockcount;
5659 return xfs_rmap_map_extent(tp, ip, whichfork, &new); 5634 xfs_rmap_map_extent(tp, ip, whichfork, &new);
5635 return 0;
5660} 5636}
5661 5637
5662static int 5638static int
@@ -5695,10 +5671,9 @@ xfs_bmap_shift_update_extent(
5695 got); 5671 got);
5696 5672
5697 /* update reverse mapping */ 5673 /* update reverse mapping */
5698 error = xfs_rmap_unmap_extent(tp, ip, whichfork, &prev); 5674 xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
5699 if (error) 5675 xfs_rmap_map_extent(tp, ip, whichfork, got);
5700 return error; 5676 return 0;
5701 return xfs_rmap_map_extent(tp, ip, whichfork, got);
5702} 5677}
5703 5678
5704int 5679int
@@ -6094,7 +6069,7 @@ __xfs_bmap_add(
6094 bmap->br_blockcount, 6069 bmap->br_blockcount,
6095 bmap->br_state); 6070 bmap->br_state);
6096 6071
6097 bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS); 6072 bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS);
6098 INIT_LIST_HEAD(&bi->bi_list); 6073 INIT_LIST_HEAD(&bi->bi_list);
6099 bi->bi_type = type; 6074 bi->bi_type = type;
6100 bi->bi_owner = ip; 6075 bi->bi_owner = ip;
@@ -6106,29 +6081,29 @@ __xfs_bmap_add(
6106} 6081}
6107 6082
6108/* Map an extent into a file. */ 6083/* Map an extent into a file. */
6109int 6084void
6110xfs_bmap_map_extent( 6085xfs_bmap_map_extent(
6111 struct xfs_trans *tp, 6086 struct xfs_trans *tp,
6112 struct xfs_inode *ip, 6087 struct xfs_inode *ip,
6113 struct xfs_bmbt_irec *PREV) 6088 struct xfs_bmbt_irec *PREV)
6114{ 6089{
6115 if (!xfs_bmap_is_update_needed(PREV)) 6090 if (!xfs_bmap_is_update_needed(PREV))
6116 return 0; 6091 return;
6117 6092
6118 return __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); 6093 __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
6119} 6094}
6120 6095
6121/* Unmap an extent out of a file. */ 6096/* Unmap an extent out of a file. */
6122int 6097void
6123xfs_bmap_unmap_extent( 6098xfs_bmap_unmap_extent(
6124 struct xfs_trans *tp, 6099 struct xfs_trans *tp,
6125 struct xfs_inode *ip, 6100 struct xfs_inode *ip,
6126 struct xfs_bmbt_irec *PREV) 6101 struct xfs_bmbt_irec *PREV)
6127{ 6102{
6128 if (!xfs_bmap_is_update_needed(PREV)) 6103 if (!xfs_bmap_is_update_needed(PREV))
6129 return 0; 6104 return;
6130 6105
6131 return __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); 6106 __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
6132} 6107}
6133 6108
6134/* 6109/*
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 8f597f9abdbe..5bb446d80542 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -171,6 +171,13 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
171 !isnullstartblock(irec->br_startblock); 171 !isnullstartblock(irec->br_startblock);
172} 172}
173 173
174/*
175 * Check the mapping for obviously garbage allocations that could trash the
176 * filesystem immediately.
177 */
178#define xfs_valid_startblock(ip, startblock) \
179 ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip))
180
174void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, 181void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
175 xfs_filblks_t len); 182 xfs_filblks_t len);
176int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); 183int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
@@ -254,9 +261,9 @@ int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip,
254 enum xfs_bmap_intent_type type, int whichfork, 261 enum xfs_bmap_intent_type type, int whichfork,
255 xfs_fileoff_t startoff, xfs_fsblock_t startblock, 262 xfs_fileoff_t startoff, xfs_fsblock_t startblock,
256 xfs_filblks_t *blockcount, xfs_exntst_t state); 263 xfs_filblks_t *blockcount, xfs_exntst_t state);
257int xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, 264void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
258 struct xfs_bmbt_irec *imap); 265 struct xfs_bmbt_irec *imap);
259int xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, 266void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
260 struct xfs_bmbt_irec *imap); 267 struct xfs_bmbt_irec *imap);
261 268
262static inline int xfs_bmap_fork_to_state(int whichfork) 269static inline int xfs_bmap_fork_to_state(int whichfork)
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index fbb18ba5d905..ffe608d2a2d9 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -400,8 +400,20 @@ xfs_bmbt_diff_two_keys(
400 union xfs_btree_key *k1, 400 union xfs_btree_key *k1,
401 union xfs_btree_key *k2) 401 union xfs_btree_key *k2)
402{ 402{
403 return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) - 403 uint64_t a = be64_to_cpu(k1->bmbt.br_startoff);
404 be64_to_cpu(k2->bmbt.br_startoff); 404 uint64_t b = be64_to_cpu(k2->bmbt.br_startoff);
405
406 /*
407 * Note: This routine previously casted a and b to int64 and subtracted
408 * them to generate a result. This lead to problems if b was the
409 * "maximum" key value (all ones) being signed incorrectly, hence this
410 * somewhat less efficient version.
411 */
412 if (a > b)
413 return 1;
414 if (b > a)
415 return -1;
416 return 0;
405} 417}
406 418
407static xfs_failaddr_t 419static xfs_failaddr_t
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index f1048efa4268..71de937f9e64 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4466,8 +4466,6 @@ xfs_btree_lblock_verify(
4466 * btree block 4466 * btree block
4467 * 4467 *
4468 * @bp: buffer containing the btree block 4468 * @bp: buffer containing the btree block
4469 * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
4470 * @pag_max_level: pointer to the per-ag max level field
4471 */ 4469 */
4472xfs_failaddr_t 4470xfs_failaddr_t
4473xfs_btree_sblock_v5hdr_verify( 4471xfs_btree_sblock_v5hdr_verify(
@@ -4600,7 +4598,7 @@ xfs_btree_simple_query_range(
4600 4598
4601 /* Callback */ 4599 /* Callback */
4602 error = fn(cur, recp, priv); 4600 error = fn(cur, recp, priv);
4603 if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT) 4601 if (error)
4604 break; 4602 break;
4605 4603
4606advloop: 4604advloop:
@@ -4702,8 +4700,7 @@ pop_up:
4702 */ 4700 */
4703 if (ldiff >= 0 && hdiff >= 0) { 4701 if (ldiff >= 0 && hdiff >= 0) {
4704 error = fn(cur, recp, priv); 4702 error = fn(cur, recp, priv);
4705 if (error < 0 || 4703 if (error)
4706 error == XFS_BTREE_QUERY_RANGE_ABORT)
4707 break; 4704 break;
4708 } else if (hdiff < 0) { 4705 } else if (hdiff < 0) {
4709 /* Record is larger than high key; pop. */ 4706 /* Record is larger than high key; pop. */
@@ -4774,8 +4771,7 @@ out:
4774 * Query a btree for all records overlapping a given interval of keys. The 4771 * Query a btree for all records overlapping a given interval of keys. The
4775 * supplied function will be called with each record found; return one of the 4772 * supplied function will be called with each record found; return one of the
4776 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error 4773 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4777 * code. This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a 4774 * code. This function returns -ECANCELED, zero, or a negative error code.
4778 * negative error code.
4779 */ 4775 */
4780int 4776int
4781xfs_btree_query_range( 4777xfs_btree_query_range(
@@ -4891,7 +4887,7 @@ xfs_btree_has_record_helper(
4891 union xfs_btree_rec *rec, 4887 union xfs_btree_rec *rec,
4892 void *priv) 4888 void *priv)
4893{ 4889{
4894 return XFS_BTREE_QUERY_RANGE_ABORT; 4890 return -ECANCELED;
4895} 4891}
4896 4892
4897/* Is there a record covering a given range of keys? */ 4893/* Is there a record covering a given range of keys? */
@@ -4906,7 +4902,7 @@ xfs_btree_has_record(
4906 4902
4907 error = xfs_btree_query_range(cur, low, high, 4903 error = xfs_btree_query_range(cur, low, high,
4908 &xfs_btree_has_record_helper, NULL); 4904 &xfs_btree_has_record_helper, NULL);
4909 if (error == XFS_BTREE_QUERY_RANGE_ABORT) { 4905 if (error == -ECANCELED) {
4910 *exists = true; 4906 *exists = true;
4911 return 0; 4907 return 0;
4912 } 4908 }
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index fa3cd8ab9aba..ced1e65d1483 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -464,9 +464,13 @@ xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
464uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); 464uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
465unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); 465unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
466 466
467/* return codes */ 467/*
468#define XFS_BTREE_QUERY_RANGE_CONTINUE (XFS_ITER_CONTINUE) /* keep iterating */ 468 * Return codes for the query range iterator function are 0 to continue
469#define XFS_BTREE_QUERY_RANGE_ABORT (XFS_ITER_ABORT) /* stop iterating */ 469 * iterating, and non-zero to stop iterating. Any non-zero value will be
470 * passed up to the _query_range caller. The special value -ECANCELED can be
471 * used to stop iteration, because _query_range never generates that error
472 * code on its own.
473 */
470typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, 474typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
471 union xfs_btree_rec *rec, void *priv); 475 union xfs_btree_rec *rec, void *priv);
472 476
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 0bf56e94bfe9..4fd1223c1bd5 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2098,7 +2098,7 @@ xfs_da_grow_inode_int(
2098 * If we didn't get it and the block might work if fragmented, 2098 * If we didn't get it and the block might work if fragmented,
2099 * try without the CONTIG flag. Loop until we get it all. 2099 * try without the CONTIG flag. Loop until we get it all.
2100 */ 2100 */
2101 mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); 2101 mapp = kmem_alloc(sizeof(*mapp) * count, 0);
2102 for (b = *bno, mapi = 0; b < *bno + count; ) { 2102 for (b = *bno, mapi = 0; b < *bno + count; ) {
2103 nmap = min(XFS_BMAP_MAX_NMAP, count); 2103 nmap = min(XFS_BMAP_MAX_NMAP, count);
2104 c = (int)(*bno + count - b); 2104 c = (int)(*bno + count - b);
@@ -2480,7 +2480,7 @@ xfs_buf_map_from_irec(
2480 2480
2481 if (nirecs > 1) { 2481 if (nirecs > 1) {
2482 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), 2482 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
2483 KM_SLEEP | KM_NOFS); 2483 KM_NOFS);
2484 if (!map) 2484 if (!map)
2485 return -ENOMEM; 2485 return -ENOMEM;
2486 *mapp = map; 2486 *mapp = map;
@@ -2539,7 +2539,7 @@ xfs_dabuf_map(
2539 */ 2539 */
2540 if (nfsb != 1) 2540 if (nfsb != 1)
2541 irecs = kmem_zalloc(sizeof(irec) * nfsb, 2541 irecs = kmem_zalloc(sizeof(irec) * nfsb,
2542 KM_SLEEP | KM_NOFS); 2542 KM_NOFS);
2543 2543
2544 nirecs = nfsb; 2544 nirecs = nfsb;
2545 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, 2545 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 84dd865b6c3d..ae0bbd20d9ca 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -81,13 +81,15 @@ typedef struct xfs_da_args {
81#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ 81#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */
82#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ 82#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
83#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ 83#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
84#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */
84 85
85#define XFS_DA_OP_FLAGS \ 86#define XFS_DA_OP_FLAGS \
86 { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ 87 { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
87 { XFS_DA_OP_RENAME, "RENAME" }, \ 88 { XFS_DA_OP_RENAME, "RENAME" }, \
88 { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ 89 { XFS_DA_OP_ADDNAME, "ADDNAME" }, \
89 { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ 90 { XFS_DA_OP_OKNOENT, "OKNOENT" }, \
90 { XFS_DA_OP_CILOOKUP, "CILOOKUP" } 91 { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
92 { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }
91 93
92/* 94/*
93 * Storage for holding state during Btree searches and split/join ops. 95 * Storage for holding state during Btree searches and split/join ops.
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index eb2be2a6a25a..22557527cfdb 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -517,7 +517,7 @@ xfs_defer_add(
517 } 517 }
518 if (!dfp) { 518 if (!dfp) {
519 dfp = kmem_alloc(sizeof(struct xfs_defer_pending), 519 dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
520 KM_SLEEP | KM_NOFS); 520 KM_NOFS);
521 dfp->dfp_type = type; 521 dfp->dfp_type = type;
522 dfp->dfp_intent = NULL; 522 dfp->dfp_intent = NULL;
523 dfp->dfp_done = NULL; 523 dfp->dfp_done = NULL;
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 67840723edbb..867c5dee0751 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -110,9 +110,9 @@ xfs_da_mount(
110 110
111 nodehdr_size = mp->m_dir_inode_ops->node_hdr_size; 111 nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
112 mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), 112 mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
113 KM_SLEEP | KM_MAYFAIL); 113 KM_MAYFAIL);
114 mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), 114 mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
115 KM_SLEEP | KM_MAYFAIL); 115 KM_MAYFAIL);
116 if (!mp->m_dir_geo || !mp->m_attr_geo) { 116 if (!mp->m_dir_geo || !mp->m_attr_geo) {
117 kmem_free(mp->m_dir_geo); 117 kmem_free(mp->m_dir_geo);
118 kmem_free(mp->m_attr_geo); 118 kmem_free(mp->m_attr_geo);
@@ -217,7 +217,7 @@ xfs_dir_init(
217 if (error) 217 if (error)
218 return error; 218 return error;
219 219
220 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 220 args = kmem_zalloc(sizeof(*args), KM_NOFS);
221 if (!args) 221 if (!args)
222 return -ENOMEM; 222 return -ENOMEM;
223 223
@@ -254,7 +254,7 @@ xfs_dir_createname(
254 XFS_STATS_INC(dp->i_mount, xs_dir_create); 254 XFS_STATS_INC(dp->i_mount, xs_dir_create);
255 } 255 }
256 256
257 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 257 args = kmem_zalloc(sizeof(*args), KM_NOFS);
258 if (!args) 258 if (!args)
259 return -ENOMEM; 259 return -ENOMEM;
260 260
@@ -353,7 +353,7 @@ xfs_dir_lookup(
353 * lockdep Doing this avoids having to add a bunch of lockdep class 353 * lockdep Doing this avoids having to add a bunch of lockdep class
354 * annotations into the reclaim path for the ilock. 354 * annotations into the reclaim path for the ilock.
355 */ 355 */
356 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 356 args = kmem_zalloc(sizeof(*args), KM_NOFS);
357 args->geo = dp->i_mount->m_dir_geo; 357 args->geo = dp->i_mount->m_dir_geo;
358 args->name = name->name; 358 args->name = name->name;
359 args->namelen = name->len; 359 args->namelen = name->len;
@@ -422,7 +422,7 @@ xfs_dir_removename(
422 ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); 422 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
423 XFS_STATS_INC(dp->i_mount, xs_dir_remove); 423 XFS_STATS_INC(dp->i_mount, xs_dir_remove);
424 424
425 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 425 args = kmem_zalloc(sizeof(*args), KM_NOFS);
426 if (!args) 426 if (!args)
427 return -ENOMEM; 427 return -ENOMEM;
428 428
@@ -483,7 +483,7 @@ xfs_dir_replace(
483 if (rval) 483 if (rval)
484 return rval; 484 return rval;
485 485
486 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 486 args = kmem_zalloc(sizeof(*args), KM_NOFS);
487 if (!args) 487 if (!args)
488 return -ENOMEM; 488 return -ENOMEM;
489 489
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index a6fb0cc2085e..9595ced393dc 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -1092,7 +1092,7 @@ xfs_dir2_sf_to_block(
1092 * Copy the directory into a temporary buffer. 1092 * Copy the directory into a temporary buffer.
1093 * Then pitch the incore inode data so we can make extents. 1093 * Then pitch the incore inode data so we can make extents.
1094 */ 1094 */
1095 sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP); 1095 sfp = kmem_alloc(ifp->if_bytes, 0);
1096 memcpy(sfp, oldsfp, ifp->if_bytes); 1096 memcpy(sfp, oldsfp, ifp->if_bytes);
1097 1097
1098 xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); 1098 xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 1fc44efc344d..705c4f562758 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -32,8 +32,6 @@ static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
32static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, 32static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
33 int index, xfs_da_state_blk_t *dblk, 33 int index, xfs_da_state_blk_t *dblk,
34 int *rval); 34 int *rval);
35static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
36 xfs_da_state_blk_t *fblk);
37 35
38/* 36/*
39 * Check internal consistency of a leafn block. 37 * Check internal consistency of a leafn block.
@@ -1611,113 +1609,152 @@ xfs_dir2_leafn_unbalance(
1611} 1609}
1612 1610
1613/* 1611/*
1614 * Top-level node form directory addname routine. 1612 * Add a new data block to the directory at the free space index that the caller
1613 * has specified.
1615 */ 1614 */
1616int /* error */ 1615static int
1617xfs_dir2_node_addname( 1616xfs_dir2_node_add_datablk(
1618 xfs_da_args_t *args) /* operation arguments */ 1617 struct xfs_da_args *args,
1618 struct xfs_da_state_blk *fblk,
1619 xfs_dir2_db_t *dbno,
1620 struct xfs_buf **dbpp,
1621 struct xfs_buf **fbpp,
1622 int *findex)
1619{ 1623{
1620 xfs_da_state_blk_t *blk; /* leaf block for insert */ 1624 struct xfs_inode *dp = args->dp;
1621 int error; /* error return value */ 1625 struct xfs_trans *tp = args->trans;
1622 int rval; /* sub-return value */ 1626 struct xfs_mount *mp = dp->i_mount;
1623 xfs_da_state_t *state; /* btree cursor */ 1627 struct xfs_dir3_icfree_hdr freehdr;
1628 struct xfs_dir2_data_free *bf;
1629 struct xfs_dir2_data_hdr *hdr;
1630 struct xfs_dir2_free *free = NULL;
1631 xfs_dir2_db_t fbno;
1632 struct xfs_buf *fbp;
1633 struct xfs_buf *dbp;
1634 __be16 *bests = NULL;
1635 int error;
1624 1636
1625 trace_xfs_dir2_node_addname(args); 1637 /* Not allowed to allocate, return failure. */
1638 if (args->total == 0)
1639 return -ENOSPC;
1640
1641 /* Allocate and initialize the new data block. */
1642 error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno);
1643 if (error)
1644 return error;
1645 error = xfs_dir3_data_init(args, *dbno, &dbp);
1646 if (error)
1647 return error;
1626 1648
1627 /* 1649 /*
1628 * Allocate and initialize the state (btree cursor). 1650 * Get the freespace block corresponding to the data block
1629 */ 1651 * that was just allocated.
1630 state = xfs_da_state_alloc();
1631 state->args = args;
1632 state->mp = args->dp->i_mount;
1633 /*
1634 * Look up the name. We're not supposed to find it, but
1635 * this gives us the insertion point.
1636 */ 1652 */
1637 error = xfs_da3_node_lookup_int(state, &rval); 1653 fbno = dp->d_ops->db_to_fdb(args->geo, *dbno);
1654 error = xfs_dir2_free_try_read(tp, dp,
1655 xfs_dir2_db_to_da(args->geo, fbno), &fbp);
1638 if (error) 1656 if (error)
1639 rval = error; 1657 return error;
1640 if (rval != -ENOENT) { 1658
1641 goto done;
1642 }
1643 /* 1659 /*
1644 * Add the data entry to a data block. 1660 * If there wasn't a freespace block, the read will
1645 * Extravalid is set to a freeblock found by lookup. 1661 * return a NULL fbp. Allocate and initialize a new one.
1646 */ 1662 */
1647 rval = xfs_dir2_node_addname_int(args, 1663 if (!fbp) {
1648 state->extravalid ? &state->extrablk : NULL); 1664 error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno);
1649 if (rval) { 1665 if (error)
1650 goto done; 1666 return error;
1667
1668 if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) {
1669 xfs_alert(mp,
1670"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld",
1671 __func__, (unsigned long long)dp->i_ino,
1672 (long long)dp->d_ops->db_to_fdb(args->geo, *dbno),
1673 (long long)*dbno, (long long)fbno);
1674 if (fblk) {
1675 xfs_alert(mp,
1676 " fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
1677 fblk, (unsigned long long)fblk->blkno,
1678 fblk->index, fblk->magic);
1679 } else {
1680 xfs_alert(mp, " ... fblk is NULL");
1681 }
1682 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
1683 return -EFSCORRUPTED;
1684 }
1685
1686 /* Get a buffer for the new block. */
1687 error = xfs_dir3_free_get_buf(args, fbno, &fbp);
1688 if (error)
1689 return error;
1690 free = fbp->b_addr;
1691 bests = dp->d_ops->free_bests_p(free);
1692 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1693
1694 /* Remember the first slot as our empty slot. */
1695 freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo,
1696 XFS_DIR2_FREE_OFFSET)) *
1697 dp->d_ops->free_max_bests(args->geo);
1698 } else {
1699 free = fbp->b_addr;
1700 bests = dp->d_ops->free_bests_p(free);
1701 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1651 } 1702 }
1652 blk = &state->path.blk[state->path.active - 1]; 1703
1653 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); 1704 /* Set the freespace block index from the data block number. */
1705 *findex = dp->d_ops->db_to_fdindex(args->geo, *dbno);
1706
1707 /* Extend the freespace table if the new data block is off the end. */
1708 if (*findex >= freehdr.nvalid) {
1709 ASSERT(*findex < dp->d_ops->free_max_bests(args->geo));
1710 freehdr.nvalid = *findex + 1;
1711 bests[*findex] = cpu_to_be16(NULLDATAOFF);
1712 }
1713
1654 /* 1714 /*
1655 * Add the new leaf entry. 1715 * If this entry was for an empty data block (this should always be
1716 * true) then update the header.
1656 */ 1717 */
1657 rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); 1718 if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) {
1658 if (rval == 0) { 1719 freehdr.nused++;
1659 /* 1720 dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
1660 * It worked, fix the hash values up the btree. 1721 xfs_dir2_free_log_header(args, fbp);
1661 */
1662 if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
1663 xfs_da3_fixhashpath(state, &state->path);
1664 } else {
1665 /*
1666 * It didn't work, we need to split the leaf block.
1667 */
1668 if (args->total == 0) {
1669 ASSERT(rval == -ENOSPC);
1670 goto done;
1671 }
1672 /*
1673 * Split the leaf block and insert the new entry.
1674 */
1675 rval = xfs_da3_split(state);
1676 } 1722 }
1677done: 1723
1678 xfs_da_state_free(state); 1724 /* Update the freespace value for the new block in the table. */
1679 return rval; 1725 hdr = dbp->b_addr;
1726 bf = dp->d_ops->data_bestfree_p(hdr);
1727 bests[*findex] = bf[0].length;
1728
1729 *dbpp = dbp;
1730 *fbpp = fbp;
1731 return 0;
1680} 1732}
1681 1733
1682/* 1734static int
1683 * Add the data entry for a node-format directory name addition. 1735xfs_dir2_node_find_freeblk(
1684 * The leaf entry is added in xfs_dir2_leafn_add. 1736 struct xfs_da_args *args,
1685 * We may enter with a freespace block that the lookup found. 1737 struct xfs_da_state_blk *fblk,
1686 */ 1738 xfs_dir2_db_t *dbnop,
1687static int /* error */ 1739 struct xfs_buf **fbpp,
1688xfs_dir2_node_addname_int( 1740 int *findexp,
1689 xfs_da_args_t *args, /* operation arguments */ 1741 int length)
1690 xfs_da_state_blk_t *fblk) /* optional freespace block */
1691{ 1742{
1692 xfs_dir2_data_hdr_t *hdr; /* data block header */
1693 xfs_dir2_db_t dbno; /* data block number */
1694 struct xfs_buf *dbp; /* data block buffer */
1695 xfs_dir2_data_entry_t *dep; /* data entry pointer */
1696 xfs_inode_t *dp; /* incore directory inode */
1697 xfs_dir2_data_unused_t *dup; /* data unused entry pointer */
1698 int error; /* error return value */
1699 xfs_dir2_db_t fbno; /* freespace block number */
1700 struct xfs_buf *fbp; /* freespace buffer */
1701 int findex; /* freespace entry index */
1702 xfs_dir2_free_t *free=NULL; /* freespace block structure */
1703 xfs_dir2_db_t ifbno; /* initial freespace block no */
1704 xfs_dir2_db_t lastfbno=0; /* highest freespace block no */
1705 int length; /* length of the new entry */
1706 int logfree; /* need to log free entry */
1707 xfs_mount_t *mp; /* filesystem mount point */
1708 int needlog; /* need to log data header */
1709 int needscan; /* need to rescan data frees */
1710 __be16 *tagp; /* data entry tag pointer */
1711 xfs_trans_t *tp; /* transaction pointer */
1712 __be16 *bests;
1713 struct xfs_dir3_icfree_hdr freehdr; 1743 struct xfs_dir3_icfree_hdr freehdr;
1714 struct xfs_dir2_data_free *bf; 1744 struct xfs_dir2_free *free = NULL;
1715 xfs_dir2_data_aoff_t aoff; 1745 struct xfs_inode *dp = args->dp;
1746 struct xfs_trans *tp = args->trans;
1747 struct xfs_buf *fbp = NULL;
1748 xfs_dir2_db_t firstfbno;
1749 xfs_dir2_db_t lastfbno;
1750 xfs_dir2_db_t ifbno = -1;
1751 xfs_dir2_db_t dbno = -1;
1752 xfs_dir2_db_t fbno;
1753 xfs_fileoff_t fo;
1754 __be16 *bests = NULL;
1755 int findex = 0;
1756 int error;
1716 1757
1717 dp = args->dp;
1718 mp = dp->i_mount;
1719 tp = args->trans;
1720 length = dp->d_ops->data_entsize(args->namelen);
1721 /* 1758 /*
1722 * If we came in with a freespace block that means that lookup 1759 * If we came in with a freespace block that means that lookup
1723 * found an entry with our hash value. This is the freespace 1760 * found an entry with our hash value. This is the freespace
@@ -1725,288 +1762,157 @@ xfs_dir2_node_addname_int(
1725 */ 1762 */
1726 if (fblk) { 1763 if (fblk) {
1727 fbp = fblk->bp; 1764 fbp = fblk->bp;
1728 /*
1729 * Remember initial freespace block number.
1730 */
1731 ifbno = fblk->blkno;
1732 free = fbp->b_addr; 1765 free = fbp->b_addr;
1733 findex = fblk->index; 1766 findex = fblk->index;
1734 bests = dp->d_ops->free_bests_p(free);
1735 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1736
1737 /*
1738 * This means the free entry showed that the data block had
1739 * space for our entry, so we remembered it.
1740 * Use that data block.
1741 */
1742 if (findex >= 0) { 1767 if (findex >= 0) {
1768 /* caller already found the freespace for us. */
1769 bests = dp->d_ops->free_bests_p(free);
1770 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1771
1743 ASSERT(findex < freehdr.nvalid); 1772 ASSERT(findex < freehdr.nvalid);
1744 ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); 1773 ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
1745 ASSERT(be16_to_cpu(bests[findex]) >= length); 1774 ASSERT(be16_to_cpu(bests[findex]) >= length);
1746 dbno = freehdr.firstdb + findex; 1775 dbno = freehdr.firstdb + findex;
1747 } else { 1776 goto found_block;
1748 /*
1749 * The data block looked at didn't have enough room.
1750 * We'll start at the beginning of the freespace entries.
1751 */
1752 dbno = -1;
1753 findex = 0;
1754 } 1777 }
1755 } else { 1778
1756 /* 1779 /*
1757 * Didn't come in with a freespace block, so no data block. 1780 * The data block looked at didn't have enough room.
1781 * We'll start at the beginning of the freespace entries.
1758 */ 1782 */
1759 ifbno = dbno = -1; 1783 ifbno = fblk->blkno;
1784 xfs_trans_brelse(tp, fbp);
1760 fbp = NULL; 1785 fbp = NULL;
1761 findex = 0; 1786 fblk->bp = NULL;
1762 } 1787 }
1763 1788
1764 /* 1789 /*
1765 * If we don't have a data block yet, we're going to scan the 1790 * If we don't have a data block yet, we're going to scan the freespace
1766 * freespace blocks looking for one. Figure out what the 1791 * data for a data block with enough free space in it.
1767 * highest freespace block number is.
1768 */
1769 if (dbno == -1) {
1770 xfs_fileoff_t fo; /* freespace block number */
1771
1772 if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
1773 return error;
1774 lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
1775 fbno = ifbno;
1776 }
1777 /*
1778 * While we haven't identified a data block, search the freeblock
1779 * data for a good data block. If we find a null freeblock entry,
1780 * indicating a hole in the data blocks, remember that.
1781 */ 1792 */
1782 while (dbno == -1) { 1793 error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK);
1783 /* 1794 if (error)
1784 * If we don't have a freeblock in hand, get the next one. 1795 return error;
1785 */ 1796 lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
1786 if (fbp == NULL) { 1797 firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET);
1787 /*
1788 * Happens the first time through unless lookup gave
1789 * us a freespace block to start with.
1790 */
1791 if (++fbno == 0)
1792 fbno = xfs_dir2_byte_to_db(args->geo,
1793 XFS_DIR2_FREE_OFFSET);
1794 /*
1795 * If it's ifbno we already looked at it.
1796 */
1797 if (fbno == ifbno)
1798 fbno++;
1799 /*
1800 * If it's off the end we're done.
1801 */
1802 if (fbno >= lastfbno)
1803 break;
1804 /*
1805 * Read the block. There can be holes in the
1806 * freespace blocks, so this might not succeed.
1807 * This should be really rare, so there's no reason
1808 * to avoid it.
1809 */
1810 error = xfs_dir2_free_try_read(tp, dp,
1811 xfs_dir2_db_to_da(args->geo, fbno),
1812 &fbp);
1813 if (error)
1814 return error;
1815 if (!fbp)
1816 continue;
1817 free = fbp->b_addr;
1818 findex = 0;
1819 }
1820 /*
1821 * Look at the current free entry. Is it good enough?
1822 *
1823 * The bests initialisation should be where the bufer is read in
1824 * the above branch. But gcc is too stupid to realise that bests
1825 * and the freehdr are actually initialised if they are placed
1826 * there, so we have to do it here to avoid warnings. Blech.
1827 */
1828 bests = dp->d_ops->free_bests_p(free);
1829 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1830 if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
1831 be16_to_cpu(bests[findex]) >= length)
1832 dbno = freehdr.firstdb + findex;
1833 else {
1834 /*
1835 * Are we done with the freeblock?
1836 */
1837 if (++findex == freehdr.nvalid) {
1838 /*
1839 * Drop the block.
1840 */
1841 xfs_trans_brelse(tp, fbp);
1842 fbp = NULL;
1843 if (fblk && fblk->bp)
1844 fblk->bp = NULL;
1845 }
1846 }
1847 }
1848 /*
1849 * If we don't have a data block, we need to allocate one and make
1850 * the freespace entries refer to it.
1851 */
1852 if (unlikely(dbno == -1)) {
1853 /*
1854 * Not allowed to allocate, return failure.
1855 */
1856 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
1857 return -ENOSPC;
1858
1859 /*
1860 * Allocate and initialize the new data block.
1861 */
1862 if (unlikely((error = xfs_dir2_grow_inode(args,
1863 XFS_DIR2_DATA_SPACE,
1864 &dbno)) ||
1865 (error = xfs_dir3_data_init(args, dbno, &dbp))))
1866 return error;
1867 1798
1868 /* 1799 for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) {
1869 * If (somehow) we have a freespace block, get rid of it. 1800 /* If it's ifbno we already looked at it. */
1870 */ 1801 if (fbno == ifbno)
1871 if (fbp) 1802 continue;
1872 xfs_trans_brelse(tp, fbp);
1873 if (fblk && fblk->bp)
1874 fblk->bp = NULL;
1875 1803
1876 /* 1804 /*
1877 * Get the freespace block corresponding to the data block 1805 * Read the block. There can be holes in the freespace blocks,
1878 * that was just allocated. 1806 * so this might not succeed. This should be really rare, so
1807 * there's no reason to avoid it.
1879 */ 1808 */
1880 fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
1881 error = xfs_dir2_free_try_read(tp, dp, 1809 error = xfs_dir2_free_try_read(tp, dp,
1882 xfs_dir2_db_to_da(args->geo, fbno), 1810 xfs_dir2_db_to_da(args->geo, fbno),
1883 &fbp); 1811 &fbp);
1884 if (error) 1812 if (error)
1885 return error; 1813 return error;
1814 if (!fbp)
1815 continue;
1886 1816
1887 /* 1817 free = fbp->b_addr;
1888 * If there wasn't a freespace block, the read will 1818 bests = dp->d_ops->free_bests_p(free);
1889 * return a NULL fbp. Allocate and initialize a new one. 1819 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1890 */
1891 if (!fbp) {
1892 error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
1893 &fbno);
1894 if (error)
1895 return error;
1896 1820
1897 if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { 1821 /* Scan the free entry array for a large enough free space. */
1898 xfs_alert(mp, 1822 for (findex = freehdr.nvalid - 1; findex >= 0; findex--) {
1899"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d", 1823 if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
1900 __func__, (unsigned long long)dp->i_ino, 1824 be16_to_cpu(bests[findex]) >= length) {
1901 (long long)dp->d_ops->db_to_fdb( 1825 dbno = freehdr.firstdb + findex;
1902 args->geo, dbno), 1826 goto found_block;
1903 (long long)dbno, (long long)fbno,
1904 (unsigned long long)ifbno, lastfbno);
1905 if (fblk) {
1906 xfs_alert(mp,
1907 " fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
1908 fblk,
1909 (unsigned long long)fblk->blkno,
1910 fblk->index,
1911 fblk->magic);
1912 } else {
1913 xfs_alert(mp, " ... fblk is NULL");
1914 }
1915 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1916 XFS_ERRLEVEL_LOW, mp);
1917 return -EFSCORRUPTED;
1918 } 1827 }
1919
1920 /*
1921 * Get a buffer for the new block.
1922 */
1923 error = xfs_dir3_free_get_buf(args, fbno, &fbp);
1924 if (error)
1925 return error;
1926 free = fbp->b_addr;
1927 bests = dp->d_ops->free_bests_p(free);
1928 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1929
1930 /*
1931 * Remember the first slot as our empty slot.
1932 */
1933 freehdr.firstdb =
1934 (fbno - xfs_dir2_byte_to_db(args->geo,
1935 XFS_DIR2_FREE_OFFSET)) *
1936 dp->d_ops->free_max_bests(args->geo);
1937 } else {
1938 free = fbp->b_addr;
1939 bests = dp->d_ops->free_bests_p(free);
1940 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1941 } 1828 }
1942 1829
1943 /* 1830 /* Didn't find free space, go on to next free block */
1944 * Set the freespace block index from the data block number. 1831 xfs_trans_brelse(tp, fbp);
1945 */
1946 findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
1947 /*
1948 * If it's after the end of the current entries in the
1949 * freespace block, extend that table.
1950 */
1951 if (findex >= freehdr.nvalid) {
1952 ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
1953 freehdr.nvalid = findex + 1;
1954 /*
1955 * Tag new entry so nused will go up.
1956 */
1957 bests[findex] = cpu_to_be16(NULLDATAOFF);
1958 }
1959 /*
1960 * If this entry was for an empty data block
1961 * (this should always be true) then update the header.
1962 */
1963 if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
1964 freehdr.nused++;
1965 dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
1966 xfs_dir2_free_log_header(args, fbp);
1967 }
1968 /*
1969 * Update the real value in the table.
1970 * We haven't allocated the data entry yet so this will
1971 * change again.
1972 */
1973 hdr = dbp->b_addr;
1974 bf = dp->d_ops->data_bestfree_p(hdr);
1975 bests[findex] = bf[0].length;
1976 logfree = 1;
1977 } 1832 }
1833
1834found_block:
1835 *dbnop = dbno;
1836 *fbpp = fbp;
1837 *findexp = findex;
1838 return 0;
1839}
1840
1841
1842/*
1843 * Add the data entry for a node-format directory name addition.
1844 * The leaf entry is added in xfs_dir2_leafn_add.
1845 * We may enter with a freespace block that the lookup found.
1846 */
1847static int
1848xfs_dir2_node_addname_int(
1849 struct xfs_da_args *args, /* operation arguments */
1850 struct xfs_da_state_blk *fblk) /* optional freespace block */
1851{
1852 struct xfs_dir2_data_unused *dup; /* data unused entry pointer */
1853 struct xfs_dir2_data_entry *dep; /* data entry pointer */
1854 struct xfs_dir2_data_hdr *hdr; /* data block header */
1855 struct xfs_dir2_data_free *bf;
1856 struct xfs_dir2_free *free = NULL; /* freespace block structure */
1857 struct xfs_trans *tp = args->trans;
1858 struct xfs_inode *dp = args->dp;
1859 struct xfs_buf *dbp; /* data block buffer */
1860 struct xfs_buf *fbp; /* freespace buffer */
1861 xfs_dir2_data_aoff_t aoff;
1862 xfs_dir2_db_t dbno; /* data block number */
1863 int error; /* error return value */
1864 int findex; /* freespace entry index */
1865 int length; /* length of the new entry */
1866 int logfree = 0; /* need to log free entry */
1867 int needlog = 0; /* need to log data header */
1868 int needscan = 0; /* need to rescan data frees */
1869 __be16 *tagp; /* data entry tag pointer */
1870 __be16 *bests;
1871
1872 length = dp->d_ops->data_entsize(args->namelen);
1873 error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex,
1874 length);
1875 if (error)
1876 return error;
1877
1978 /* 1878 /*
1979 * We had a data block so we don't have to make a new one. 1879 * Now we know if we must allocate blocks, so if we are checking whether
1880 * we can insert without allocation then we can return now.
1980 */ 1881 */
1981 else { 1882 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
1982 /* 1883 if (dbno == -1)
1983 * If just checking, we succeeded. 1884 return -ENOSPC;
1984 */ 1885 return 0;
1985 if (args->op_flags & XFS_DA_OP_JUSTCHECK) 1886 }
1986 return 0;
1987 1887
1988 /* 1888 /*
1989 * Read the data block in. 1889 * If we don't have a data block, we need to allocate one and make
1990 */ 1890 * the freespace entries refer to it.
1891 */
1892 if (dbno == -1) {
1893 /* we're going to have to log the free block index later */
1894 logfree = 1;
1895 error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp,
1896 &findex);
1897 } else {
1898 /* Read the data block in. */
1991 error = xfs_dir3_data_read(tp, dp, 1899 error = xfs_dir3_data_read(tp, dp,
1992 xfs_dir2_db_to_da(args->geo, dbno), 1900 xfs_dir2_db_to_da(args->geo, dbno),
1993 -1, &dbp); 1901 -1, &dbp);
1994 if (error)
1995 return error;
1996 hdr = dbp->b_addr;
1997 bf = dp->d_ops->data_bestfree_p(hdr);
1998 logfree = 0;
1999 } 1902 }
1903 if (error)
1904 return error;
1905
1906 /* setup for data block up now */
1907 hdr = dbp->b_addr;
1908 bf = dp->d_ops->data_bestfree_p(hdr);
2000 ASSERT(be16_to_cpu(bf[0].length) >= length); 1909 ASSERT(be16_to_cpu(bf[0].length) >= length);
2001 /* 1910
2002 * Point to the existing unused space. 1911 /* Point to the existing unused space. */
2003 */
2004 dup = (xfs_dir2_data_unused_t *) 1912 dup = (xfs_dir2_data_unused_t *)
2005 ((char *)hdr + be16_to_cpu(bf[0].offset)); 1913 ((char *)hdr + be16_to_cpu(bf[0].offset));
2006 needscan = needlog = 0; 1914
2007 /* 1915 /* Mark the first part of the unused space, inuse for us. */
2008 * Mark the first part of the unused space, inuse for us.
2009 */
2010 aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); 1916 aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
2011 error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length, 1917 error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length,
2012 &needlog, &needscan); 1918 &needlog, &needscan);
@@ -2014,9 +1920,8 @@ xfs_dir2_node_addname_int(
2014 xfs_trans_brelse(tp, dbp); 1920 xfs_trans_brelse(tp, dbp);
2015 return error; 1921 return error;
2016 } 1922 }
2017 /* 1923
2018 * Fill in the new entry and log it. 1924 /* Fill in the new entry and log it. */
2019 */
2020 dep = (xfs_dir2_data_entry_t *)dup; 1925 dep = (xfs_dir2_data_entry_t *)dup;
2021 dep->inumber = cpu_to_be64(args->inumber); 1926 dep->inumber = cpu_to_be64(args->inumber);
2022 dep->namelen = args->namelen; 1927 dep->namelen = args->namelen;
@@ -2025,38 +1930,101 @@ xfs_dir2_node_addname_int(
2025 tagp = dp->d_ops->data_entry_tag_p(dep); 1930 tagp = dp->d_ops->data_entry_tag_p(dep);
2026 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 1931 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
2027 xfs_dir2_data_log_entry(args, dbp, dep); 1932 xfs_dir2_data_log_entry(args, dbp, dep);
2028 /* 1933
2029 * Rescan the block for bestfree if needed. 1934 /* Rescan the freespace and log the data block if needed. */
2030 */
2031 if (needscan) 1935 if (needscan)
2032 xfs_dir2_data_freescan(dp, hdr, &needlog); 1936 xfs_dir2_data_freescan(dp, hdr, &needlog);
2033 /*
2034 * Log the data block header if needed.
2035 */
2036 if (needlog) 1937 if (needlog)
2037 xfs_dir2_data_log_header(args, dbp); 1938 xfs_dir2_data_log_header(args, dbp);
2038 /* 1939
2039 * If the freespace entry is now wrong, update it. 1940 /* If the freespace block entry is now wrong, update it. */
2040 */ 1941 free = fbp->b_addr;
2041 bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */ 1942 bests = dp->d_ops->free_bests_p(free);
2042 if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { 1943 if (bests[findex] != bf[0].length) {
2043 bests[findex] = bf[0].length; 1944 bests[findex] = bf[0].length;
2044 logfree = 1; 1945 logfree = 1;
2045 } 1946 }
2046 /* 1947
2047 * Log the freespace entry if needed. 1948 /* Log the freespace entry if needed. */
2048 */
2049 if (logfree) 1949 if (logfree)
2050 xfs_dir2_free_log_bests(args, fbp, findex, findex); 1950 xfs_dir2_free_log_bests(args, fbp, findex, findex);
2051 /* 1951
2052 * Return the data block and offset in args, then drop the data block. 1952 /* Return the data block and offset in args. */
2053 */
2054 args->blkno = (xfs_dablk_t)dbno; 1953 args->blkno = (xfs_dablk_t)dbno;
2055 args->index = be16_to_cpu(*tagp); 1954 args->index = be16_to_cpu(*tagp);
2056 return 0; 1955 return 0;
2057} 1956}
2058 1957
2059/* 1958/*
1959 * Top-level node form directory addname routine.
1960 */
1961int /* error */
1962xfs_dir2_node_addname(
1963 xfs_da_args_t *args) /* operation arguments */
1964{
1965 xfs_da_state_blk_t *blk; /* leaf block for insert */
1966 int error; /* error return value */
1967 int rval; /* sub-return value */
1968 xfs_da_state_t *state; /* btree cursor */
1969
1970 trace_xfs_dir2_node_addname(args);
1971
1972 /*
1973 * Allocate and initialize the state (btree cursor).
1974 */
1975 state = xfs_da_state_alloc();
1976 state->args = args;
1977 state->mp = args->dp->i_mount;
1978 /*
1979 * Look up the name. We're not supposed to find it, but
1980 * this gives us the insertion point.
1981 */
1982 error = xfs_da3_node_lookup_int(state, &rval);
1983 if (error)
1984 rval = error;
1985 if (rval != -ENOENT) {
1986 goto done;
1987 }
1988 /*
1989 * Add the data entry to a data block.
1990 * Extravalid is set to a freeblock found by lookup.
1991 */
1992 rval = xfs_dir2_node_addname_int(args,
1993 state->extravalid ? &state->extrablk : NULL);
1994 if (rval) {
1995 goto done;
1996 }
1997 blk = &state->path.blk[state->path.active - 1];
1998 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
1999 /*
2000 * Add the new leaf entry.
2001 */
2002 rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
2003 if (rval == 0) {
2004 /*
2005 * It worked, fix the hash values up the btree.
2006 */
2007 if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
2008 xfs_da3_fixhashpath(state, &state->path);
2009 } else {
2010 /*
2011 * It didn't work, we need to split the leaf block.
2012 */
2013 if (args->total == 0) {
2014 ASSERT(rval == -ENOSPC);
2015 goto done;
2016 }
2017 /*
2018 * Split the leaf block and insert the new entry.
2019 */
2020 rval = xfs_da3_split(state);
2021 }
2022done:
2023 xfs_da_state_free(state);
2024 return rval;
2025}
2026
2027/*
2060 * Lookup an entry in a node-format directory. 2028 * Lookup an entry in a node-format directory.
2061 * All the real work happens in xfs_da3_node_lookup_int. 2029 * All the real work happens in xfs_da3_node_lookup_int.
2062 * The only real output is the inode number of the entry. 2030 * The only real output is the inode number of the entry.
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 033589257f54..85f14fc2a8da 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -164,7 +164,7 @@ xfs_dir2_block_to_sf(
164 * can free the block and copy the formatted data into the inode literal 164 * can free the block and copy the formatted data into the inode literal
165 * area. 165 * area.
166 */ 166 */
167 dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); 167 dst = kmem_alloc(mp->m_sb.sb_inodesize, 0);
168 hdr = bp->b_addr; 168 hdr = bp->b_addr;
169 169
170 /* 170 /*
@@ -436,7 +436,7 @@ xfs_dir2_sf_addname_hard(
436 436
437 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 437 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
438 old_isize = (int)dp->i_d.di_size; 438 old_isize = (int)dp->i_d.di_size;
439 buf = kmem_alloc(old_isize, KM_SLEEP); 439 buf = kmem_alloc(old_isize, 0);
440 oldsfp = (xfs_dir2_sf_hdr_t *)buf; 440 oldsfp = (xfs_dir2_sf_hdr_t *)buf;
441 memcpy(oldsfp, sfp, old_isize); 441 memcpy(oldsfp, sfp, old_isize);
442 /* 442 /*
@@ -1096,7 +1096,7 @@ xfs_dir2_sf_toino4(
1096 * Don't want xfs_idata_realloc copying the data here. 1096 * Don't want xfs_idata_realloc copying the data here.
1097 */ 1097 */
1098 oldsize = dp->i_df.if_bytes; 1098 oldsize = dp->i_df.if_bytes;
1099 buf = kmem_alloc(oldsize, KM_SLEEP); 1099 buf = kmem_alloc(oldsize, 0);
1100 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1100 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
1101 ASSERT(oldsfp->i8count == 1); 1101 ASSERT(oldsfp->i8count == 1);
1102 memcpy(buf, oldsfp, oldsize); 1102 memcpy(buf, oldsfp, oldsize);
@@ -1169,7 +1169,7 @@ xfs_dir2_sf_toino8(
1169 * Don't want xfs_idata_realloc copying the data here. 1169 * Don't want xfs_idata_realloc copying the data here.
1170 */ 1170 */
1171 oldsize = dp->i_df.if_bytes; 1171 oldsize = dp->i_df.if_bytes;
1172 buf = kmem_alloc(oldsize, KM_SLEEP); 1172 buf = kmem_alloc(oldsize, 0);
1173 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1173 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
1174 ASSERT(oldsfp->i8count == 0); 1174 ASSERT(oldsfp->i8count == 0);
1175 memcpy(buf, oldsfp, oldsize); 1175 memcpy(buf, oldsfp, oldsize);
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 52d03a3a02a4..39dd2b908106 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -287,7 +287,7 @@ struct xfs_ag_geometry {
287 uint32_t ag_ifree; /* o: inodes free */ 287 uint32_t ag_ifree; /* o: inodes free */
288 uint32_t ag_sick; /* o: sick things in ag */ 288 uint32_t ag_sick; /* o: sick things in ag */
289 uint32_t ag_checked; /* o: checked metadata in ag */ 289 uint32_t ag_checked; /* o: checked metadata in ag */
290 uint32_t ag_reserved32; /* o: zero */ 290 uint32_t ag_flags; /* i/o: flags for this ag */
291 uint64_t ag_reserved[12];/* o: zero */ 291 uint64_t ag_reserved[12];/* o: zero */
292}; 292};
293#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */ 293#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 04377ab75863..588d44613094 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2787,8 +2787,13 @@ xfs_ialloc_setup_geometry(
2787 igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr, 2787 igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
2788 inodes); 2788 inodes);
2789 2789
2790 /* Set the maximum inode count for this filesystem. */ 2790 /*
2791 if (sbp->sb_imax_pct) { 2791 * Set the maximum inode count for this filesystem, being careful not
2792 * to use obviously garbage sb_inopblog/sb_inopblock values. Regular
2793 * users should never get here due to failing sb verification, but
2794 * certain users (xfs_db) need to be usable even with corrupt metadata.
2795 */
2796 if (sbp->sb_imax_pct && igeo->ialloc_blks) {
2792 /* 2797 /*
2793 * Make sure the maximum inode count is a multiple 2798 * Make sure the maximum inode count is a multiple
2794 * of the units we allocate inodes in. 2799 * of the units we allocate inodes in.
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 27aa3f2bc4bc..7bc87408f1a0 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -616,7 +616,7 @@ xfs_iext_realloc_root(
616 * sequence counter is seen before the modifications to the extent tree itself 616 * sequence counter is seen before the modifications to the extent tree itself
617 * take effect. 617 * take effect.
618 */ 618 */
619static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) 619static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp)
620{ 620{
621 WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); 621 WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
622} 622}
@@ -633,7 +633,7 @@ xfs_iext_insert(
633 struct xfs_iext_leaf *new = NULL; 633 struct xfs_iext_leaf *new = NULL;
634 int nr_entries, i; 634 int nr_entries, i;
635 635
636 xfs_iext_inc_seq(ifp, state); 636 xfs_iext_inc_seq(ifp);
637 637
638 if (ifp->if_height == 0) 638 if (ifp->if_height == 0)
639 xfs_iext_alloc_root(ifp, cur); 639 xfs_iext_alloc_root(ifp, cur);
@@ -875,7 +875,7 @@ xfs_iext_remove(
875 ASSERT(ifp->if_u1.if_root != NULL); 875 ASSERT(ifp->if_u1.if_root != NULL);
876 ASSERT(xfs_iext_valid(ifp, cur)); 876 ASSERT(xfs_iext_valid(ifp, cur));
877 877
878 xfs_iext_inc_seq(ifp, state); 878 xfs_iext_inc_seq(ifp);
879 879
880 nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1; 880 nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
881 for (i = cur->pos; i < nr_entries; i++) 881 for (i = cur->pos; i < nr_entries; i++)
@@ -983,7 +983,7 @@ xfs_iext_update_extent(
983{ 983{
984 struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); 984 struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
985 985
986 xfs_iext_inc_seq(ifp, state); 986 xfs_iext_inc_seq(ifp);
987 987
988 if (cur->pos == 0) { 988 if (cur->pos == 0) {
989 struct xfs_bmbt_irec old; 989 struct xfs_bmbt_irec old;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index bf3e04018246..c643beeb5a24 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -94,7 +94,7 @@ xfs_iformat_fork(
94 return 0; 94 return 0;
95 95
96 ASSERT(ip->i_afp == NULL); 96 ASSERT(ip->i_afp == NULL);
97 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); 97 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS);
98 98
99 switch (dip->di_aformat) { 99 switch (dip->di_aformat) {
100 case XFS_DINODE_FMT_LOCAL: 100 case XFS_DINODE_FMT_LOCAL:
@@ -147,7 +147,7 @@ xfs_init_local_fork(
147 147
148 if (size) { 148 if (size) {
149 real_size = roundup(mem_size, 4); 149 real_size = roundup(mem_size, 4);
150 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); 150 ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS);
151 memcpy(ifp->if_u1.if_data, data, size); 151 memcpy(ifp->if_u1.if_data, data, size);
152 if (zero_terminate) 152 if (zero_terminate)
153 ifp->if_u1.if_data[size] = '\0'; 153 ifp->if_u1.if_data[size] = '\0';
@@ -302,7 +302,7 @@ xfs_iformat_btree(
302 } 302 }
303 303
304 ifp->if_broot_bytes = size; 304 ifp->if_broot_bytes = size;
305 ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); 305 ifp->if_broot = kmem_alloc(size, KM_NOFS);
306 ASSERT(ifp->if_broot != NULL); 306 ASSERT(ifp->if_broot != NULL);
307 /* 307 /*
308 * Copy and convert from the on-disk structure 308 * Copy and convert from the on-disk structure
@@ -367,7 +367,7 @@ xfs_iroot_realloc(
367 */ 367 */
368 if (ifp->if_broot_bytes == 0) { 368 if (ifp->if_broot_bytes == 0) {
369 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); 369 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
370 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 370 ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
371 ifp->if_broot_bytes = (int)new_size; 371 ifp->if_broot_bytes = (int)new_size;
372 return; 372 return;
373 } 373 }
@@ -382,7 +382,7 @@ xfs_iroot_realloc(
382 new_max = cur_max + rec_diff; 382 new_max = cur_max + rec_diff;
383 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); 383 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
384 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, 384 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
385 KM_SLEEP | KM_NOFS); 385 KM_NOFS);
386 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 386 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
387 ifp->if_broot_bytes); 387 ifp->if_broot_bytes);
388 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 388 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -408,7 +408,7 @@ xfs_iroot_realloc(
408 else 408 else
409 new_size = 0; 409 new_size = 0;
410 if (new_size > 0) { 410 if (new_size > 0) {
411 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 411 new_broot = kmem_alloc(new_size, KM_NOFS);
412 /* 412 /*
413 * First copy over the btree block header. 413 * First copy over the btree block header.
414 */ 414 */
@@ -492,7 +492,7 @@ xfs_idata_realloc(
492 * We enforce that here. 492 * We enforce that here.
493 */ 493 */
494 ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, 494 ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data,
495 roundup(new_size, 4), KM_SLEEP | KM_NOFS); 495 roundup(new_size, 4), KM_NOFS);
496 ifp->if_bytes = new_size; 496 ifp->if_bytes = new_size;
497} 497}
498 498
@@ -683,7 +683,7 @@ xfs_ifork_init_cow(
683 return; 683 return;
684 684
685 ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, 685 ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
686 KM_SLEEP | KM_NOFS); 686 KM_NOFS);
687 ip->i_cowfp->if_flags = XFS_IFEXTENTS; 687 ip->i_cowfp->if_flags = XFS_IFEXTENTS;
688 ip->i_cformat = XFS_DINODE_FMT_EXTENTS; 688 ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
689 ip->i_cnextents = 0; 689 ip->i_cnextents = 0;
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 51bb9bdb0e84..9a7fadb1361c 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1174,7 +1174,7 @@ out_cur:
1174/* 1174/*
1175 * Record a refcount intent for later processing. 1175 * Record a refcount intent for later processing.
1176 */ 1176 */
1177static int 1177static void
1178__xfs_refcount_add( 1178__xfs_refcount_add(
1179 struct xfs_trans *tp, 1179 struct xfs_trans *tp,
1180 enum xfs_refcount_intent_type type, 1180 enum xfs_refcount_intent_type type,
@@ -1189,44 +1189,43 @@ __xfs_refcount_add(
1189 blockcount); 1189 blockcount);
1190 1190
1191 ri = kmem_alloc(sizeof(struct xfs_refcount_intent), 1191 ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
1192 KM_SLEEP | KM_NOFS); 1192 KM_NOFS);
1193 INIT_LIST_HEAD(&ri->ri_list); 1193 INIT_LIST_HEAD(&ri->ri_list);
1194 ri->ri_type = type; 1194 ri->ri_type = type;
1195 ri->ri_startblock = startblock; 1195 ri->ri_startblock = startblock;
1196 ri->ri_blockcount = blockcount; 1196 ri->ri_blockcount = blockcount;
1197 1197
1198 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); 1198 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
1199 return 0;
1200} 1199}
1201 1200
1202/* 1201/*
1203 * Increase the reference count of the blocks backing a file's extent. 1202 * Increase the reference count of the blocks backing a file's extent.
1204 */ 1203 */
1205int 1204void
1206xfs_refcount_increase_extent( 1205xfs_refcount_increase_extent(
1207 struct xfs_trans *tp, 1206 struct xfs_trans *tp,
1208 struct xfs_bmbt_irec *PREV) 1207 struct xfs_bmbt_irec *PREV)
1209{ 1208{
1210 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) 1209 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
1211 return 0; 1210 return;
1212 1211
1213 return __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, 1212 __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
1214 PREV->br_startblock, PREV->br_blockcount); 1213 PREV->br_blockcount);
1215} 1214}
1216 1215
1217/* 1216/*
1218 * Decrease the reference count of the blocks backing a file's extent. 1217 * Decrease the reference count of the blocks backing a file's extent.
1219 */ 1218 */
1220int 1219void
1221xfs_refcount_decrease_extent( 1220xfs_refcount_decrease_extent(
1222 struct xfs_trans *tp, 1221 struct xfs_trans *tp,
1223 struct xfs_bmbt_irec *PREV) 1222 struct xfs_bmbt_irec *PREV)
1224{ 1223{
1225 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) 1224 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
1226 return 0; 1225 return;
1227 1226
1228 return __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, 1227 __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
1229 PREV->br_startblock, PREV->br_blockcount); 1228 PREV->br_blockcount);
1230} 1229}
1231 1230
1232/* 1231/*
@@ -1541,47 +1540,40 @@ __xfs_refcount_cow_free(
1541} 1540}
1542 1541
1543/* Record a CoW staging extent in the refcount btree. */ 1542/* Record a CoW staging extent in the refcount btree. */
1544int 1543void
1545xfs_refcount_alloc_cow_extent( 1544xfs_refcount_alloc_cow_extent(
1546 struct xfs_trans *tp, 1545 struct xfs_trans *tp,
1547 xfs_fsblock_t fsb, 1546 xfs_fsblock_t fsb,
1548 xfs_extlen_t len) 1547 xfs_extlen_t len)
1549{ 1548{
1550 struct xfs_mount *mp = tp->t_mountp; 1549 struct xfs_mount *mp = tp->t_mountp;
1551 int error;
1552 1550
1553 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 1551 if (!xfs_sb_version_hasreflink(&mp->m_sb))
1554 return 0; 1552 return;
1555 1553
1556 error = __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); 1554 __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
1557 if (error)
1558 return error;
1559 1555
1560 /* Add rmap entry */ 1556 /* Add rmap entry */
1561 return xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1557 xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
1562 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); 1558 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
1563} 1559}
1564 1560
1565/* Forget a CoW staging event in the refcount btree. */ 1561/* Forget a CoW staging event in the refcount btree. */
1566int 1562void
1567xfs_refcount_free_cow_extent( 1563xfs_refcount_free_cow_extent(
1568 struct xfs_trans *tp, 1564 struct xfs_trans *tp,
1569 xfs_fsblock_t fsb, 1565 xfs_fsblock_t fsb,
1570 xfs_extlen_t len) 1566 xfs_extlen_t len)
1571{ 1567{
1572 struct xfs_mount *mp = tp->t_mountp; 1568 struct xfs_mount *mp = tp->t_mountp;
1573 int error;
1574 1569
1575 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 1570 if (!xfs_sb_version_hasreflink(&mp->m_sb))
1576 return 0; 1571 return;
1577 1572
1578 /* Remove rmap entry */ 1573 /* Remove rmap entry */
1579 error = xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1574 xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
1580 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); 1575 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
1581 if (error) 1576 __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
1582 return error;
1583
1584 return __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
1585} 1577}
1586 1578
1587struct xfs_refcount_recovery { 1579struct xfs_refcount_recovery {
@@ -1602,7 +1594,7 @@ xfs_refcount_recover_extent(
1602 if (be32_to_cpu(rec->refc.rc_refcount) != 1) 1594 if (be32_to_cpu(rec->refc.rc_refcount) != 1)
1603 return -EFSCORRUPTED; 1595 return -EFSCORRUPTED;
1604 1596
1605 rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP); 1597 rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0);
1606 xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); 1598 xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
1607 list_add_tail(&rr->rr_list, debris); 1599 list_add_tail(&rr->rr_list, debris);
1608 1600
@@ -1679,10 +1671,8 @@ xfs_refcount_recover_cow_leftovers(
1679 /* Free the orphan record */ 1671 /* Free the orphan record */
1680 agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; 1672 agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
1681 fsb = XFS_AGB_TO_FSB(mp, agno, agbno); 1673 fsb = XFS_AGB_TO_FSB(mp, agno, agbno);
1682 error = xfs_refcount_free_cow_extent(tp, fsb, 1674 xfs_refcount_free_cow_extent(tp, fsb,
1683 rr->rr_rrec.rc_blockcount); 1675 rr->rr_rrec.rc_blockcount);
1684 if (error)
1685 goto out_trans;
1686 1676
1687 /* Free the block. */ 1677 /* Free the block. */
1688 xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL); 1678 xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 1d9c518575e7..209795539c8d 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -29,9 +29,9 @@ struct xfs_refcount_intent {
29 xfs_extlen_t ri_blockcount; 29 xfs_extlen_t ri_blockcount;
30}; 30};
31 31
32extern int xfs_refcount_increase_extent(struct xfs_trans *tp, 32void xfs_refcount_increase_extent(struct xfs_trans *tp,
33 struct xfs_bmbt_irec *irec); 33 struct xfs_bmbt_irec *irec);
34extern int xfs_refcount_decrease_extent(struct xfs_trans *tp, 34void xfs_refcount_decrease_extent(struct xfs_trans *tp,
35 struct xfs_bmbt_irec *irec); 35 struct xfs_bmbt_irec *irec);
36 36
37extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, 37extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
@@ -45,10 +45,10 @@ extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
45 xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, 45 xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
46 xfs_extlen_t *flen, bool find_end_of_shared); 46 xfs_extlen_t *flen, bool find_end_of_shared);
47 47
48extern int xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, 48void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
49 xfs_fsblock_t fsb, xfs_extlen_t len); 49 xfs_extlen_t len);
50extern int xfs_refcount_free_cow_extent(struct xfs_trans *tp, 50void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
51 xfs_fsblock_t fsb, xfs_extlen_t len); 51 xfs_extlen_t len);
52extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, 52extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
53 xfs_agnumber_t agno); 53 xfs_agnumber_t agno);
54 54
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index e6aeb390b2fb..38e9414878b3 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -168,7 +168,6 @@ xfs_rmap_btrec_to_irec(
168 union xfs_btree_rec *rec, 168 union xfs_btree_rec *rec,
169 struct xfs_rmap_irec *irec) 169 struct xfs_rmap_irec *irec)
170{ 170{
171 irec->rm_flags = 0;
172 irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); 171 irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
173 irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); 172 irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
174 irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner); 173 irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner);
@@ -254,15 +253,15 @@ xfs_rmap_find_left_neighbor_helper(
254 rec->rm_flags); 253 rec->rm_flags);
255 254
256 if (rec->rm_owner != info->high.rm_owner) 255 if (rec->rm_owner != info->high.rm_owner)
257 return XFS_BTREE_QUERY_RANGE_CONTINUE; 256 return 0;
258 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && 257 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
259 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && 258 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
260 rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset) 259 rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset)
261 return XFS_BTREE_QUERY_RANGE_CONTINUE; 260 return 0;
262 261
263 *info->irec = *rec; 262 *info->irec = *rec;
264 *info->stat = 1; 263 *info->stat = 1;
265 return XFS_BTREE_QUERY_RANGE_ABORT; 264 return -ECANCELED;
266} 265}
267 266
268/* 267/*
@@ -305,7 +304,7 @@ xfs_rmap_find_left_neighbor(
305 304
306 error = xfs_rmap_query_range(cur, &info.high, &info.high, 305 error = xfs_rmap_query_range(cur, &info.high, &info.high,
307 xfs_rmap_find_left_neighbor_helper, &info); 306 xfs_rmap_find_left_neighbor_helper, &info);
308 if (error == XFS_BTREE_QUERY_RANGE_ABORT) 307 if (error == -ECANCELED)
309 error = 0; 308 error = 0;
310 if (*stat) 309 if (*stat)
311 trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, 310 trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
@@ -330,16 +329,16 @@ xfs_rmap_lookup_le_range_helper(
330 rec->rm_flags); 329 rec->rm_flags);
331 330
332 if (rec->rm_owner != info->high.rm_owner) 331 if (rec->rm_owner != info->high.rm_owner)
333 return XFS_BTREE_QUERY_RANGE_CONTINUE; 332 return 0;
334 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && 333 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
335 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && 334 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
336 (rec->rm_offset > info->high.rm_offset || 335 (rec->rm_offset > info->high.rm_offset ||
337 rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset)) 336 rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset))
338 return XFS_BTREE_QUERY_RANGE_CONTINUE; 337 return 0;
339 338
340 *info->irec = *rec; 339 *info->irec = *rec;
341 *info->stat = 1; 340 *info->stat = 1;
342 return XFS_BTREE_QUERY_RANGE_ABORT; 341 return -ECANCELED;
343} 342}
344 343
345/* 344/*
@@ -377,7 +376,7 @@ xfs_rmap_lookup_le_range(
377 cur->bc_private.a.agno, bno, 0, owner, offset, flags); 376 cur->bc_private.a.agno, bno, 0, owner, offset, flags);
378 error = xfs_rmap_query_range(cur, &info.high, &info.high, 377 error = xfs_rmap_query_range(cur, &info.high, &info.high,
379 xfs_rmap_lookup_le_range_helper, &info); 378 xfs_rmap_lookup_le_range_helper, &info);
380 if (error == XFS_BTREE_QUERY_RANGE_ABORT) 379 if (error == -ECANCELED)
381 error = 0; 380 error = 0;
382 if (*stat) 381 if (*stat)
383 trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, 382 trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
@@ -2268,7 +2267,7 @@ xfs_rmap_update_is_needed(
2268 * Record a rmap intent; the list is kept sorted first by AG and then by 2267 * Record a rmap intent; the list is kept sorted first by AG and then by
2269 * increasing age. 2268 * increasing age.
2270 */ 2269 */
2271static int 2270static void
2272__xfs_rmap_add( 2271__xfs_rmap_add(
2273 struct xfs_trans *tp, 2272 struct xfs_trans *tp,
2274 enum xfs_rmap_intent_type type, 2273 enum xfs_rmap_intent_type type,
@@ -2287,7 +2286,7 @@ __xfs_rmap_add(
2287 bmap->br_blockcount, 2286 bmap->br_blockcount,
2288 bmap->br_state); 2287 bmap->br_state);
2289 2288
2290 ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS); 2289 ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS);
2291 INIT_LIST_HEAD(&ri->ri_list); 2290 INIT_LIST_HEAD(&ri->ri_list);
2292 ri->ri_type = type; 2291 ri->ri_type = type;
2293 ri->ri_owner = owner; 2292 ri->ri_owner = owner;
@@ -2295,11 +2294,10 @@ __xfs_rmap_add(
2295 ri->ri_bmap = *bmap; 2294 ri->ri_bmap = *bmap;
2296 2295
2297 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); 2296 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
2298 return 0;
2299} 2297}
2300 2298
2301/* Map an extent into a file. */ 2299/* Map an extent into a file. */
2302int 2300void
2303xfs_rmap_map_extent( 2301xfs_rmap_map_extent(
2304 struct xfs_trans *tp, 2302 struct xfs_trans *tp,
2305 struct xfs_inode *ip, 2303 struct xfs_inode *ip,
@@ -2307,15 +2305,15 @@ xfs_rmap_map_extent(
2307 struct xfs_bmbt_irec *PREV) 2305 struct xfs_bmbt_irec *PREV)
2308{ 2306{
2309 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) 2307 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
2310 return 0; 2308 return;
2311 2309
2312 return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2310 __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
2313 XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, 2311 XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
2314 whichfork, PREV); 2312 whichfork, PREV);
2315} 2313}
2316 2314
2317/* Unmap an extent out of a file. */ 2315/* Unmap an extent out of a file. */
2318int 2316void
2319xfs_rmap_unmap_extent( 2317xfs_rmap_unmap_extent(
2320 struct xfs_trans *tp, 2318 struct xfs_trans *tp,
2321 struct xfs_inode *ip, 2319 struct xfs_inode *ip,
@@ -2323,9 +2321,9 @@ xfs_rmap_unmap_extent(
2323 struct xfs_bmbt_irec *PREV) 2321 struct xfs_bmbt_irec *PREV)
2324{ 2322{
2325 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) 2323 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
2326 return 0; 2324 return;
2327 2325
2328 return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2326 __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
2329 XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, 2327 XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
2330 whichfork, PREV); 2328 whichfork, PREV);
2331} 2329}
@@ -2336,7 +2334,7 @@ xfs_rmap_unmap_extent(
2336 * Note that tp can be NULL here as no transaction is used for COW fork 2334 * Note that tp can be NULL here as no transaction is used for COW fork
2337 * unwritten conversion. 2335 * unwritten conversion.
2338 */ 2336 */
2339int 2337void
2340xfs_rmap_convert_extent( 2338xfs_rmap_convert_extent(
2341 struct xfs_mount *mp, 2339 struct xfs_mount *mp,
2342 struct xfs_trans *tp, 2340 struct xfs_trans *tp,
@@ -2345,15 +2343,15 @@ xfs_rmap_convert_extent(
2345 struct xfs_bmbt_irec *PREV) 2343 struct xfs_bmbt_irec *PREV)
2346{ 2344{
2347 if (!xfs_rmap_update_is_needed(mp, whichfork)) 2345 if (!xfs_rmap_update_is_needed(mp, whichfork))
2348 return 0; 2346 return;
2349 2347
2350 return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2348 __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
2351 XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, 2349 XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
2352 whichfork, PREV); 2350 whichfork, PREV);
2353} 2351}
2354 2352
2355/* Schedule the creation of an rmap for non-file data. */ 2353/* Schedule the creation of an rmap for non-file data. */
2356int 2354void
2357xfs_rmap_alloc_extent( 2355xfs_rmap_alloc_extent(
2358 struct xfs_trans *tp, 2356 struct xfs_trans *tp,
2359 xfs_agnumber_t agno, 2357 xfs_agnumber_t agno,
@@ -2364,18 +2362,18 @@ xfs_rmap_alloc_extent(
2364 struct xfs_bmbt_irec bmap; 2362 struct xfs_bmbt_irec bmap;
2365 2363
2366 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) 2364 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
2367 return 0; 2365 return;
2368 2366
2369 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); 2367 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
2370 bmap.br_blockcount = len; 2368 bmap.br_blockcount = len;
2371 bmap.br_startoff = 0; 2369 bmap.br_startoff = 0;
2372 bmap.br_state = XFS_EXT_NORM; 2370 bmap.br_state = XFS_EXT_NORM;
2373 2371
2374 return __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); 2372 __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
2375} 2373}
2376 2374
2377/* Schedule the deletion of an rmap for non-file data. */ 2375/* Schedule the deletion of an rmap for non-file data. */
2378int 2376void
2379xfs_rmap_free_extent( 2377xfs_rmap_free_extent(
2380 struct xfs_trans *tp, 2378 struct xfs_trans *tp,
2381 xfs_agnumber_t agno, 2379 xfs_agnumber_t agno,
@@ -2386,14 +2384,14 @@ xfs_rmap_free_extent(
2386 struct xfs_bmbt_irec bmap; 2384 struct xfs_bmbt_irec bmap;
2387 2385
2388 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) 2386 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
2389 return 0; 2387 return;
2390 2388
2391 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); 2389 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
2392 bmap.br_blockcount = len; 2390 bmap.br_blockcount = len;
2393 bmap.br_startoff = 0; 2391 bmap.br_startoff = 0;
2394 bmap.br_state = XFS_EXT_NORM; 2392 bmap.br_state = XFS_EXT_NORM;
2395 2393
2396 return __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); 2394 __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
2397} 2395}
2398 2396
2399/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ 2397/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */
@@ -2511,7 +2509,7 @@ xfs_rmap_has_other_keys_helper(
2511 ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) 2509 ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
2512 return 0; 2510 return 0;
2513 rks->has_rmap = true; 2511 rks->has_rmap = true;
2514 return XFS_BTREE_QUERY_RANGE_ABORT; 2512 return -ECANCELED;
2515} 2513}
2516 2514
2517/* 2515/*
@@ -2540,8 +2538,11 @@ xfs_rmap_has_other_keys(
2540 2538
2541 error = xfs_rmap_query_range(cur, &low, &high, 2539 error = xfs_rmap_query_range(cur, &low, &high,
2542 xfs_rmap_has_other_keys_helper, &rks); 2540 xfs_rmap_has_other_keys_helper, &rks);
2541 if (error < 0)
2542 return error;
2543
2543 *has_rmap = rks.has_rmap; 2544 *has_rmap = rks.has_rmap;
2544 return error; 2545 return 0;
2545} 2546}
2546 2547
2547const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { 2548const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index e21ed0294e5c..abe633403fd1 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -68,6 +68,7 @@ xfs_rmap_irec_offset_unpack(
68 if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS)) 68 if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
69 return -EFSCORRUPTED; 69 return -EFSCORRUPTED;
70 irec->rm_offset = XFS_RMAP_OFF(offset); 70 irec->rm_offset = XFS_RMAP_OFF(offset);
71 irec->rm_flags = 0;
71 if (offset & XFS_RMAP_OFF_ATTR_FORK) 72 if (offset & XFS_RMAP_OFF_ATTR_FORK)
72 irec->rm_flags |= XFS_RMAP_ATTR_FORK; 73 irec->rm_flags |= XFS_RMAP_ATTR_FORK;
73 if (offset & XFS_RMAP_OFF_BMBT_BLOCK) 74 if (offset & XFS_RMAP_OFF_BMBT_BLOCK)
@@ -161,16 +162,16 @@ struct xfs_rmap_intent {
161}; 162};
162 163
163/* functions for updating the rmapbt based on bmbt map/unmap operations */ 164/* functions for updating the rmapbt based on bmbt map/unmap operations */
164int xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, 165void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
165 int whichfork, struct xfs_bmbt_irec *imap); 166 int whichfork, struct xfs_bmbt_irec *imap);
166int xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, 167void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
167 int whichfork, struct xfs_bmbt_irec *imap); 168 int whichfork, struct xfs_bmbt_irec *imap);
168int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, 169void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
169 struct xfs_inode *ip, int whichfork, 170 struct xfs_inode *ip, int whichfork,
170 struct xfs_bmbt_irec *imap); 171 struct xfs_bmbt_irec *imap);
171int xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 172void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
172 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); 173 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
173int xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 174void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
174 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); 175 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
175 176
176void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, 177void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index e0641b7337b3..c45acbd3add9 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -177,10 +177,4 @@ struct xfs_ino_geometry {
177 unsigned int agino_log; /* #bits for agino in inum */ 177 unsigned int agino_log; /* #bits for agino in inum */
178}; 178};
179 179
180/* Keep iterating the data structure. */
181#define XFS_ITER_CONTINUE (0)
182
183/* Stop iterating the data structure. */
184#define XFS_ITER_ABORT (1)
185
186#endif /* __XFS_SHARED_H__ */ 180#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 802b34cd10fe..300b3e91ca3a 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -169,6 +169,14 @@ typedef struct xfs_bmbt_irec
169 xfs_exntst_t br_state; /* extent state */ 169 xfs_exntst_t br_state; /* extent state */
170} xfs_bmbt_irec_t; 170} xfs_bmbt_irec_t;
171 171
172/* per-AG block reservation types */
173enum xfs_ag_resv_type {
174 XFS_AG_RESV_NONE = 0,
175 XFS_AG_RESV_AGFL,
176 XFS_AG_RESV_METADATA,
177 XFS_AG_RESV_RMAPBT,
178};
179
172/* 180/*
173 * Type verifier functions 181 * Type verifier functions
174 */ 182 */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 16b09b941441..ba0f747c82e8 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -639,7 +639,7 @@ xchk_agfl_block(
639 xchk_agfl_block_xref(sc, agbno); 639 xchk_agfl_block_xref(sc, agbno);
640 640
641 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 641 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
642 return XFS_ITER_ABORT; 642 return -ECANCELED;
643 643
644 return 0; 644 return 0;
645} 645}
@@ -730,7 +730,7 @@ xchk_agfl(
730 /* Check the blocks in the AGFL. */ 730 /* Check the blocks in the AGFL. */
731 error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), 731 error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
732 sc->sa.agfl_bp, xchk_agfl_block, &sai); 732 sc->sa.agfl_bp, xchk_agfl_block, &sai);
733 if (error == XFS_ITER_ABORT) { 733 if (error == -ECANCELED) {
734 error = 0; 734 error = 0;
735 goto out_free; 735 goto out_free;
736 } 736 }
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 1afc58bf71dd..0edc7f8eb96e 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -80,7 +80,7 @@ xchk_setup_xattr(
80 * without the inode lock held, which means we can sleep. 80 * without the inode lock held, which means we can sleep.
81 */ 81 */
82 if (sc->flags & XCHK_TRY_HARDER) { 82 if (sc->flags & XCHK_TRY_HARDER) {
83 error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, KM_SLEEP); 83 error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0);
84 if (error) 84 if (error)
85 return error; 85 return error;
86 } 86 }
@@ -163,8 +163,6 @@ xchk_xattr_listent(
163 args.valuelen = valuelen; 163 args.valuelen = valuelen;
164 164
165 error = xfs_attr_get_ilocked(context->dp, &args); 165 error = xfs_attr_get_ilocked(context->dp, &args);
166 if (error == -EEXIST)
167 error = 0;
168 if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, 166 if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
169 &error)) 167 &error))
170 goto fail_xref; 168 goto fail_xref;
@@ -173,7 +171,7 @@ xchk_xattr_listent(
173 args.blkno); 171 args.blkno);
174fail_xref: 172fail_xref:
175 if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 173 if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
176 context->seen_enough = XFS_ITER_ABORT; 174 context->seen_enough = 1;
177 return; 175 return;
178} 176}
179 177
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 1bd29fdc2ab5..fa6ea6407992 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -75,6 +75,7 @@ struct xchk_bmap_info {
75 xfs_fileoff_t lastoff; 75 xfs_fileoff_t lastoff;
76 bool is_rt; 76 bool is_rt;
77 bool is_shared; 77 bool is_shared;
78 bool was_loaded;
78 int whichfork; 79 int whichfork;
79}; 80};
80 81
@@ -213,25 +214,20 @@ xchk_bmap_xref_rmap(
213 214
214/* Cross-reference a single rtdev extent record. */ 215/* Cross-reference a single rtdev extent record. */
215STATIC void 216STATIC void
216xchk_bmap_rt_extent_xref( 217xchk_bmap_rt_iextent_xref(
217 struct xchk_bmap_info *info,
218 struct xfs_inode *ip, 218 struct xfs_inode *ip,
219 struct xfs_btree_cur *cur, 219 struct xchk_bmap_info *info,
220 struct xfs_bmbt_irec *irec) 220 struct xfs_bmbt_irec *irec)
221{ 221{
222 if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
223 return;
224
225 xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, 222 xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
226 irec->br_blockcount); 223 irec->br_blockcount);
227} 224}
228 225
229/* Cross-reference a single datadev extent record. */ 226/* Cross-reference a single datadev extent record. */
230STATIC void 227STATIC void
231xchk_bmap_extent_xref( 228xchk_bmap_iextent_xref(
232 struct xchk_bmap_info *info,
233 struct xfs_inode *ip, 229 struct xfs_inode *ip,
234 struct xfs_btree_cur *cur, 230 struct xchk_bmap_info *info,
235 struct xfs_bmbt_irec *irec) 231 struct xfs_bmbt_irec *irec)
236{ 232{
237 struct xfs_mount *mp = info->sc->mp; 233 struct xfs_mount *mp = info->sc->mp;
@@ -240,9 +236,6 @@ xchk_bmap_extent_xref(
240 xfs_extlen_t len; 236 xfs_extlen_t len;
241 int error; 237 int error;
242 238
243 if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
244 return;
245
246 agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock); 239 agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock);
247 agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); 240 agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
248 len = irec->br_blockcount; 241 len = irec->br_blockcount;
@@ -300,20 +293,15 @@ xchk_bmap_dirattr_extent(
300 293
301/* Scrub a single extent record. */ 294/* Scrub a single extent record. */
302STATIC int 295STATIC int
303xchk_bmap_extent( 296xchk_bmap_iextent(
304 struct xfs_inode *ip, 297 struct xfs_inode *ip,
305 struct xfs_btree_cur *cur,
306 struct xchk_bmap_info *info, 298 struct xchk_bmap_info *info,
307 struct xfs_bmbt_irec *irec) 299 struct xfs_bmbt_irec *irec)
308{ 300{
309 struct xfs_mount *mp = info->sc->mp; 301 struct xfs_mount *mp = info->sc->mp;
310 struct xfs_buf *bp = NULL;
311 xfs_filblks_t end; 302 xfs_filblks_t end;
312 int error = 0; 303 int error = 0;
313 304
314 if (cur)
315 xfs_btree_get_block(cur, 0, &bp);
316
317 /* 305 /*
318 * Check for out-of-order extents. This record could have come 306 * Check for out-of-order extents. This record could have come
319 * from the incore list, for which there is no ordering check. 307 * from the incore list, for which there is no ordering check.
@@ -364,10 +352,13 @@ xchk_bmap_extent(
364 xchk_fblock_set_corrupt(info->sc, info->whichfork, 352 xchk_fblock_set_corrupt(info->sc, info->whichfork,
365 irec->br_startoff); 353 irec->br_startoff);
366 354
355 if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
356 return 0;
357
367 if (info->is_rt) 358 if (info->is_rt)
368 xchk_bmap_rt_extent_xref(info, ip, cur, irec); 359 xchk_bmap_rt_iextent_xref(ip, info, irec);
369 else 360 else
370 xchk_bmap_extent_xref(info, ip, cur, irec); 361 xchk_bmap_iextent_xref(ip, info, irec);
371 362
372 info->lastoff = irec->br_startoff + irec->br_blockcount; 363 info->lastoff = irec->br_startoff + irec->br_blockcount;
373 return error; 364 return error;
@@ -380,10 +371,13 @@ xchk_bmapbt_rec(
380 union xfs_btree_rec *rec) 371 union xfs_btree_rec *rec)
381{ 372{
382 struct xfs_bmbt_irec irec; 373 struct xfs_bmbt_irec irec;
374 struct xfs_bmbt_irec iext_irec;
375 struct xfs_iext_cursor icur;
383 struct xchk_bmap_info *info = bs->private; 376 struct xchk_bmap_info *info = bs->private;
384 struct xfs_inode *ip = bs->cur->bc_private.b.ip; 377 struct xfs_inode *ip = bs->cur->bc_private.b.ip;
385 struct xfs_buf *bp = NULL; 378 struct xfs_buf *bp = NULL;
386 struct xfs_btree_block *block; 379 struct xfs_btree_block *block;
380 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork);
387 uint64_t owner; 381 uint64_t owner;
388 int i; 382 int i;
389 383
@@ -402,9 +396,26 @@ xchk_bmapbt_rec(
402 } 396 }
403 } 397 }
404 398
405 /* Set up the in-core record and scrub it. */ 399 /*
400 * Check that the incore extent tree contains an extent that matches
401 * this one exactly. We validate those cached bmaps later, so we don't
402 * need to check them here. If the incore extent tree was just loaded
403 * from disk by the scrubber, we assume that its contents match what's
404 * on disk (we still hold the ILOCK) and skip the equivalence check.
405 */
406 if (!info->was_loaded)
407 return 0;
408
406 xfs_bmbt_disk_get_all(&rec->bmbt, &irec); 409 xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
407 return xchk_bmap_extent(ip, bs->cur, info, &irec); 410 if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur,
411 &iext_irec) ||
412 irec.br_startoff != iext_irec.br_startoff ||
413 irec.br_startblock != iext_irec.br_startblock ||
414 irec.br_blockcount != iext_irec.br_blockcount ||
415 irec.br_state != iext_irec.br_state)
416 xchk_fblock_set_corrupt(bs->sc, info->whichfork,
417 irec.br_startoff);
418 return 0;
408} 419}
409 420
410/* Scan the btree records. */ 421/* Scan the btree records. */
@@ -415,15 +426,26 @@ xchk_bmap_btree(
415 struct xchk_bmap_info *info) 426 struct xchk_bmap_info *info)
416{ 427{
417 struct xfs_owner_info oinfo; 428 struct xfs_owner_info oinfo;
429 struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork);
418 struct xfs_mount *mp = sc->mp; 430 struct xfs_mount *mp = sc->mp;
419 struct xfs_inode *ip = sc->ip; 431 struct xfs_inode *ip = sc->ip;
420 struct xfs_btree_cur *cur; 432 struct xfs_btree_cur *cur;
421 int error; 433 int error;
422 434
435 /* Load the incore bmap cache if it's not loaded. */
436 info->was_loaded = ifp->if_flags & XFS_IFEXTENTS;
437 if (!info->was_loaded) {
438 error = xfs_iread_extents(sc->tp, ip, whichfork);
439 if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
440 goto out;
441 }
442
443 /* Check the btree structure. */
423 cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork); 444 cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork);
424 xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); 445 xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
425 error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info); 446 error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info);
426 xfs_btree_del_cursor(cur, error); 447 xfs_btree_del_cursor(cur, error);
448out:
427 return error; 449 return error;
428} 450}
429 451
@@ -500,7 +522,7 @@ xchk_bmap_check_rmap(
500 522
501out: 523out:
502 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 524 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
503 return XFS_BTREE_QUERY_RANGE_ABORT; 525 return -ECANCELED;
504 return 0; 526 return 0;
505} 527}
506 528
@@ -529,7 +551,7 @@ xchk_bmap_check_ag_rmaps(
529 sbcri.sc = sc; 551 sbcri.sc = sc;
530 sbcri.whichfork = whichfork; 552 sbcri.whichfork = whichfork;
531 error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri); 553 error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri);
532 if (error == XFS_BTREE_QUERY_RANGE_ABORT) 554 if (error == -ECANCELED)
533 error = 0; 555 error = 0;
534 556
535 xfs_btree_del_cursor(cur, error); 557 xfs_btree_del_cursor(cur, error);
@@ -671,13 +693,6 @@ xchk_bmap(
671 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 693 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
672 goto out; 694 goto out;
673 695
674 /* Now try to scrub the in-memory extent list. */
675 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
676 error = xfs_iread_extents(sc->tp, ip, whichfork);
677 if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
678 goto out;
679 }
680
681 /* Find the offset of the last extent in the mapping. */ 696 /* Find the offset of the last extent in the mapping. */
682 error = xfs_bmap_last_offset(ip, &endoff, whichfork); 697 error = xfs_bmap_last_offset(ip, &endoff, whichfork);
683 if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) 698 if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
@@ -689,7 +704,7 @@ xchk_bmap(
689 for_each_xfs_iext(ifp, &icur, &irec) { 704 for_each_xfs_iext(ifp, &icur, &irec) {
690 if (xchk_should_terminate(sc, &error) || 705 if (xchk_should_terminate(sc, &error) ||
691 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) 706 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
692 break; 707 goto out;
693 if (isnullstartblock(irec.br_startblock)) 708 if (isnullstartblock(irec.br_startblock))
694 continue; 709 continue;
695 if (irec.br_startoff >= endoff) { 710 if (irec.br_startoff >= endoff) {
@@ -697,7 +712,7 @@ xchk_bmap(
697 irec.br_startoff); 712 irec.br_startoff);
698 goto out; 713 goto out;
699 } 714 }
700 error = xchk_bmap_extent(ip, NULL, &info, &irec); 715 error = xchk_bmap_iextent(ip, &info, &irec);
701 if (error) 716 if (error)
702 goto out; 717 goto out;
703 } 718 }
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index fc3f510c9034..98f82d7c8b40 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -125,7 +125,7 @@ xchk_setup_fscounters(
125 struct xchk_fscounters *fsc; 125 struct xchk_fscounters *fsc;
126 int error; 126 int error;
127 127
128 sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP); 128 sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
129 if (!sc->buf) 129 if (!sc->buf)
130 return -ENOMEM; 130 return -ENOMEM;
131 fsc = sc->buf; 131 fsc = sc->buf;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 4cfeec57fb05..b70a88bc975e 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -351,7 +351,7 @@ xrep_init_btblock(
351 xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); 351 xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
352 xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno); 352 xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno);
353 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); 353 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
354 xfs_trans_log_buf(tp, bp, 0, bp->b_length); 354 xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
355 bp->b_ops = ops; 355 bp->b_ops = ops;
356 *bpp = bp; 356 *bpp = bp;
357 357
@@ -664,7 +664,7 @@ xrep_findroot_agfl_walk(
664{ 664{
665 xfs_agblock_t *agbno = priv; 665 xfs_agblock_t *agbno = priv;
666 666
667 return (*agbno == bno) ? XFS_ITER_ABORT : 0; 667 return (*agbno == bno) ? -ECANCELED : 0;
668} 668}
669 669
670/* Does this block match the btree information passed in? */ 670/* Does this block match the btree information passed in? */
@@ -694,7 +694,7 @@ xrep_findroot_block(
694 if (owner == XFS_RMAP_OWN_AG) { 694 if (owner == XFS_RMAP_OWN_AG) {
695 error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp, 695 error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
696 xrep_findroot_agfl_walk, &agbno); 696 xrep_findroot_agfl_walk, &agbno);
697 if (error == XFS_ITER_ABORT) 697 if (error == -ECANCELED)
698 return 0; 698 return 0;
699 if (error) 699 if (error)
700 return error; 700 return error;
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 99c0b1234c3c..5641ae512c9e 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -22,7 +22,7 @@ xchk_setup_symlink(
22 struct xfs_inode *ip) 22 struct xfs_inode *ip)
23{ 23{
24 /* Allocate the buffer without the inode lock held. */ 24 /* Allocate the buffer without the inode lock held. */
25 sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP); 25 sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0);
26 if (!sc->buf) 26 if (!sc->buf)
27 return -ENOMEM; 27 return -ENOMEM;
28 28
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index cbda40d40326..96d7071cfa46 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -112,7 +112,7 @@ xfs_get_acl(struct inode *inode, int type)
112{ 112{
113 struct xfs_inode *ip = XFS_I(inode); 113 struct xfs_inode *ip = XFS_I(inode);
114 struct posix_acl *acl = NULL; 114 struct posix_acl *acl = NULL;
115 struct xfs_acl *xfs_acl; 115 struct xfs_acl *xfs_acl = NULL;
116 unsigned char *ea_name; 116 unsigned char *ea_name;
117 int error; 117 int error;
118 int len; 118 int len;
@@ -135,12 +135,8 @@ xfs_get_acl(struct inode *inode, int type)
135 * go out to the disk. 135 * go out to the disk.
136 */ 136 */
137 len = XFS_ACL_MAX_SIZE(ip->i_mount); 137 len = XFS_ACL_MAX_SIZE(ip->i_mount);
138 xfs_acl = kmem_zalloc_large(len, KM_SLEEP); 138 error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len,
139 if (!xfs_acl) 139 ATTR_ALLOC | ATTR_ROOT);
140 return ERR_PTR(-ENOMEM);
141
142 error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
143 &len, ATTR_ROOT);
144 if (error) { 140 if (error) {
145 /* 141 /*
146 * If the attribute doesn't exist make sure we have a negative 142 * If the attribute doesn't exist make sure we have a negative
@@ -151,8 +147,8 @@ xfs_get_acl(struct inode *inode, int type)
151 } else { 147 } else {
152 acl = xfs_acl_from_disk(xfs_acl, len, 148 acl = xfs_acl_from_disk(xfs_acl, len,
153 XFS_ACL_MAX_ENTRIES(ip->i_mount)); 149 XFS_ACL_MAX_ENTRIES(ip->i_mount));
150 kmem_free(xfs_acl);
154 } 151 }
155 kmem_free(xfs_acl);
156 return acl; 152 return acl;
157} 153}
158 154
@@ -180,7 +176,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
180 struct xfs_acl *xfs_acl; 176 struct xfs_acl *xfs_acl;
181 int len = XFS_ACL_MAX_SIZE(ip->i_mount); 177 int len = XFS_ACL_MAX_SIZE(ip->i_mount);
182 178
183 xfs_acl = kmem_zalloc_large(len, KM_SLEEP); 179 xfs_acl = kmem_zalloc_large(len, 0);
184 if (!xfs_acl) 180 if (!xfs_acl)
185 return -ENOMEM; 181 return -ENOMEM;
186 182
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index dc93c51c17de..a640a285cc52 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -147,7 +147,7 @@ xfs_attr3_leaf_inactive(
147 * Allocate storage for a list of all the "remote" value extents. 147 * Allocate storage for a list of all the "remote" value extents.
148 */ 148 */
149 size = count * sizeof(xfs_attr_inactive_list_t); 149 size = count * sizeof(xfs_attr_inactive_list_t);
150 list = kmem_alloc(size, KM_SLEEP); 150 list = kmem_alloc(size, 0);
151 151
152 /* 152 /*
153 * Identify each of the "remote" value extents. 153 * Identify each of the "remote" value extents.
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 58fc820a70c6..00758fdc2fec 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -109,7 +109,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
109 * It didn't all fit, so we have to sort everything on hashval. 109 * It didn't all fit, so we have to sort everything on hashval.
110 */ 110 */
111 sbsize = sf->hdr.count * sizeof(*sbuf); 111 sbsize = sf->hdr.count * sizeof(*sbuf);
112 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); 112 sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
113 113
114 /* 114 /*
115 * Scan the attribute list for the rest of the entries, storing 115 * Scan the attribute list for the rest of the entries, storing
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 9fa4a7ee8cfc..83d24e983d4c 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -141,7 +141,7 @@ xfs_bui_init(
141{ 141{
142 struct xfs_bui_log_item *buip; 142 struct xfs_bui_log_item *buip;
143 143
144 buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP); 144 buip = kmem_zone_zalloc(xfs_bui_zone, 0);
145 145
146 xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); 146 xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
147 buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; 147 buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -218,7 +218,7 @@ xfs_trans_get_bud(
218{ 218{
219 struct xfs_bud_log_item *budp; 219 struct xfs_bud_log_item *budp;
220 220
221 budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); 221 budp = kmem_zone_zalloc(xfs_bud_zone, 0);
222 xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, 222 xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
223 &xfs_bud_item_ops); 223 &xfs_bud_item_ops);
224 budp->bud_buip = buip; 224 budp->bud_buip = buip;
@@ -542,9 +542,7 @@ xfs_bui_recover(
542 irec.br_blockcount = count; 542 irec.br_blockcount = count;
543 irec.br_startoff = bmap->me_startoff; 543 irec.br_startoff = bmap->me_startoff;
544 irec.br_state = state; 544 irec.br_state = state;
545 error = xfs_bmap_unmap_extent(tp, ip, &irec); 545 xfs_bmap_unmap_extent(tp, ip, &irec);
546 if (error)
547 goto err_inode;
548 } 546 }
549 547
550 set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); 548 set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 98c6a7a71427..0910cb75b65d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -39,9 +39,9 @@
39xfs_daddr_t 39xfs_daddr_t
40xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) 40xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
41{ 41{
42 return (XFS_IS_REALTIME_INODE(ip) ? \ 42 if (XFS_IS_REALTIME_INODE(ip))
43 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ 43 return XFS_FSB_TO_BB(ip->i_mount, fsb);
44 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); 44 return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
45} 45}
46 46
47/* 47/*
@@ -1532,24 +1532,16 @@ xfs_swap_extent_rmap(
1532 trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); 1532 trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
1533 1533
1534 /* Remove the mapping from the donor file. */ 1534 /* Remove the mapping from the donor file. */
1535 error = xfs_bmap_unmap_extent(tp, tip, &uirec); 1535 xfs_bmap_unmap_extent(tp, tip, &uirec);
1536 if (error)
1537 goto out;
1538 1536
1539 /* Remove the mapping from the source file. */ 1537 /* Remove the mapping from the source file. */
1540 error = xfs_bmap_unmap_extent(tp, ip, &irec); 1538 xfs_bmap_unmap_extent(tp, ip, &irec);
1541 if (error)
1542 goto out;
1543 1539
1544 /* Map the donor file's blocks into the source file. */ 1540 /* Map the donor file's blocks into the source file. */
1545 error = xfs_bmap_map_extent(tp, ip, &uirec); 1541 xfs_bmap_map_extent(tp, ip, &uirec);
1546 if (error)
1547 goto out;
1548 1542
1549 /* Map the source file's blocks into the donor file. */ 1543 /* Map the source file's blocks into the donor file. */
1550 error = xfs_bmap_map_extent(tp, tip, &irec); 1544 xfs_bmap_map_extent(tp, tip, &irec);
1551 if (error)
1552 goto out;
1553 1545
1554 error = xfs_defer_finish(tpp); 1546 error = xfs_defer_finish(tpp);
1555 tp = *tpp; 1547 tp = *tpp;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index ca0849043f54..120ef99d09e8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -353,7 +353,8 @@ xfs_buf_allocate_memory(
353 */ 353 */
354 size = BBTOB(bp->b_length); 354 size = BBTOB(bp->b_length);
355 if (size < PAGE_SIZE) { 355 if (size < PAGE_SIZE) {
356 bp->b_addr = kmem_alloc(size, KM_NOFS); 356 int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
357 bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS);
357 if (!bp->b_addr) { 358 if (!bp->b_addr) {
358 /* low memory - use alloc_page loop instead */ 359 /* low memory - use alloc_page loop instead */
359 goto use_alloc_page; 360 goto use_alloc_page;
@@ -368,7 +369,7 @@ xfs_buf_allocate_memory(
368 } 369 }
369 bp->b_offset = offset_in_page(bp->b_addr); 370 bp->b_offset = offset_in_page(bp->b_addr);
370 bp->b_pages = bp->b_page_array; 371 bp->b_pages = bp->b_page_array;
371 bp->b_pages[0] = virt_to_page(bp->b_addr); 372 bp->b_pages[0] = kmem_to_page(bp->b_addr);
372 bp->b_page_count = 1; 373 bp->b_page_count = 1;
373 bp->b_flags |= _XBF_KMEM; 374 bp->b_flags |= _XBF_KMEM;
374 return 0; 375 return 0;
@@ -1741,7 +1742,7 @@ xfs_alloc_buftarg(
1741{ 1742{
1742 xfs_buftarg_t *btp; 1743 xfs_buftarg_t *btp;
1743 1744
1744 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); 1745 btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
1745 1746
1746 btp->bt_mount = mp; 1747 btp->bt_mount = mp;
1747 btp->bt_dev = bdev->bd_dev; 1748 btp->bt_dev = bdev->bd_dev;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c6e57a3f409e..f6ce17d8d848 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -350,6 +350,12 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
350#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) 350#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
351#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) 351#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
352 352
353static inline int
354xfs_buftarg_dma_alignment(struct xfs_buftarg *bt)
355{
356 return queue_dma_alignment(bt->bt_bdev->bd_disk->queue);
357}
358
353int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); 359int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
354bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); 360bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
355bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); 361bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 7dcaec54a20b..d74fbd1e9d3e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -702,7 +702,7 @@ xfs_buf_item_get_format(
702 } 702 }
703 703
704 bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), 704 bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
705 KM_SLEEP); 705 0);
706 if (!bip->bli_formats) 706 if (!bip->bli_formats)
707 return -ENOMEM; 707 return -ENOMEM;
708 return 0; 708 return 0;
@@ -747,7 +747,7 @@ xfs_buf_item_init(
747 return 0; 747 return 0;
748 } 748 }
749 749
750 bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); 750 bip = kmem_zone_zalloc(xfs_buf_item_zone, 0);
751 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); 751 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
752 bip->bli_buf = bp; 752 bip->bli_buf = bp;
753 753
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index fb1ad4483081..aeb95e7391c1 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -440,7 +440,7 @@ xfs_dquot_alloc(
440{ 440{
441 struct xfs_dquot *dqp; 441 struct xfs_dquot *dqp;
442 442
443 dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); 443 dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0);
444 444
445 dqp->dq_flags = type; 445 dqp->dq_flags = type;
446 dqp->q_core.d_id = cpu_to_be32(id); 446 dqp->q_core.d_id = cpu_to_be32(id);
@@ -1239,7 +1239,7 @@ xfs_qm_exit(void)
1239/* 1239/*
1240 * Iterate every dquot of a particular type. The caller must ensure that the 1240 * Iterate every dquot of a particular type. The caller must ensure that the
1241 * particular quota type is active. iter_fn can return negative error codes, 1241 * particular quota type is active. iter_fn can return negative error codes,
1242 * or XFS_ITER_ABORT to indicate that it wants to stop iterating. 1242 * or -ECANCELED to indicate that it wants to stop iterating.
1243 */ 1243 */
1244int 1244int
1245xfs_qm_dqiterate( 1245xfs_qm_dqiterate(
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 282ec5af293e..d60647d7197b 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -347,7 +347,7 @@ xfs_qm_qoff_logitem_init(
347{ 347{
348 struct xfs_qoff_logitem *qf; 348 struct xfs_qoff_logitem *qf;
349 349
350 qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); 350 qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0);
351 351
352 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? 352 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
353 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); 353 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 544c9482a0ef..849fd4476950 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -213,7 +213,7 @@ xfs_errortag_init(
213 struct xfs_mount *mp) 213 struct xfs_mount *mp)
214{ 214{
215 mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, 215 mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
216 KM_SLEEP | KM_MAYFAIL); 216 KM_MAYFAIL);
217 if (!mp->m_errortag) 217 if (!mp->m_errortag)
218 return -ENOMEM; 218 return -ENOMEM;
219 219
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 0ed68379e551..2183d87be4cf 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -33,7 +33,7 @@ xfs_extent_busy_insert(
33 struct rb_node **rbp; 33 struct rb_node **rbp;
34 struct rb_node *parent = NULL; 34 struct rb_node *parent = NULL;
35 35
36 new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); 36 new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
37 new->agno = agno; 37 new->agno = agno;
38 new->bno = bno; 38 new->bno = bno;
39 new->length = len; 39 new->length = len;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 86f6512d6864..e44efc41a041 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -163,9 +163,9 @@ xfs_efi_init(
163 if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { 163 if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
164 size = (uint)(sizeof(xfs_efi_log_item_t) + 164 size = (uint)(sizeof(xfs_efi_log_item_t) +
165 ((nextents - 1) * sizeof(xfs_extent_t))); 165 ((nextents - 1) * sizeof(xfs_extent_t)));
166 efip = kmem_zalloc(size, KM_SLEEP); 166 efip = kmem_zalloc(size, 0);
167 } else { 167 } else {
168 efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); 168 efip = kmem_zone_zalloc(xfs_efi_zone, 0);
169 } 169 }
170 170
171 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); 171 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
@@ -333,9 +333,9 @@ xfs_trans_get_efd(
333 if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { 333 if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
334 efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + 334 efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
335 (nextents - 1) * sizeof(struct xfs_extent), 335 (nextents - 1) * sizeof(struct xfs_extent),
336 KM_SLEEP); 336 0);
337 } else { 337 } else {
338 efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); 338 efdp = kmem_zone_zalloc(xfs_efd_zone, 0);
339 } 339 }
340 340
341 xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, 341 xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 28101bbc0b78..d952d5962e93 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -28,6 +28,7 @@
28#include <linux/falloc.h> 28#include <linux/falloc.h>
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/mman.h> 30#include <linux/mman.h>
31#include <linux/fadvise.h>
31 32
32static const struct vm_operations_struct xfs_file_vm_ops; 33static const struct vm_operations_struct xfs_file_vm_ops;
33 34
@@ -933,6 +934,30 @@ out_unlock:
933 return error; 934 return error;
934} 935}
935 936
937STATIC int
938xfs_file_fadvise(
939 struct file *file,
940 loff_t start,
941 loff_t end,
942 int advice)
943{
944 struct xfs_inode *ip = XFS_I(file_inode(file));
945 int ret;
946 int lockflags = 0;
947
948 /*
949 * Operations creating pages in page cache need protection from hole
950 * punching and similar ops
951 */
952 if (advice == POSIX_FADV_WILLNEED) {
953 lockflags = XFS_IOLOCK_SHARED;
954 xfs_ilock(ip, lockflags);
955 }
956 ret = generic_fadvise(file, start, end, advice);
957 if (lockflags)
958 xfs_iunlock(ip, lockflags);
959 return ret;
960}
936 961
937STATIC loff_t 962STATIC loff_t
938xfs_file_remap_range( 963xfs_file_remap_range(
@@ -1232,6 +1257,7 @@ const struct file_operations xfs_file_operations = {
1232 .fsync = xfs_file_fsync, 1257 .fsync = xfs_file_fsync,
1233 .get_unmapped_area = thp_get_unmapped_area, 1258 .get_unmapped_area = thp_get_unmapped_area,
1234 .fallocate = xfs_file_fallocate, 1259 .fallocate = xfs_file_fallocate,
1260 .fadvise = xfs_file_fadvise,
1235 .remap_file_range = xfs_file_remap_range, 1261 .remap_file_range = xfs_file_remap_range,
1236}; 1262};
1237 1263
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 5a8f9641562a..d082143feb5a 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -250,7 +250,7 @@ xfs_getfsmap_helper(
250 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 250 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
251 if (info->next_daddr < rec_daddr) 251 if (info->next_daddr < rec_daddr)
252 info->next_daddr = rec_daddr; 252 info->next_daddr = rec_daddr;
253 return XFS_BTREE_QUERY_RANGE_CONTINUE; 253 return 0;
254 } 254 }
255 255
256 /* Are we just counting mappings? */ 256 /* Are we just counting mappings? */
@@ -259,14 +259,14 @@ xfs_getfsmap_helper(
259 info->head->fmh_entries++; 259 info->head->fmh_entries++;
260 260
261 if (info->last) 261 if (info->last)
262 return XFS_BTREE_QUERY_RANGE_CONTINUE; 262 return 0;
263 263
264 info->head->fmh_entries++; 264 info->head->fmh_entries++;
265 265
266 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 266 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
267 if (info->next_daddr < rec_daddr) 267 if (info->next_daddr < rec_daddr)
268 info->next_daddr = rec_daddr; 268 info->next_daddr = rec_daddr;
269 return XFS_BTREE_QUERY_RANGE_CONTINUE; 269 return 0;
270 } 270 }
271 271
272 /* 272 /*
@@ -276,7 +276,7 @@ xfs_getfsmap_helper(
276 */ 276 */
277 if (rec_daddr > info->next_daddr) { 277 if (rec_daddr > info->next_daddr) {
278 if (info->head->fmh_entries >= info->head->fmh_count) 278 if (info->head->fmh_entries >= info->head->fmh_count)
279 return XFS_BTREE_QUERY_RANGE_ABORT; 279 return -ECANCELED;
280 280
281 fmr.fmr_device = info->dev; 281 fmr.fmr_device = info->dev;
282 fmr.fmr_physical = info->next_daddr; 282 fmr.fmr_physical = info->next_daddr;
@@ -295,7 +295,7 @@ xfs_getfsmap_helper(
295 295
296 /* Fill out the extent we found */ 296 /* Fill out the extent we found */
297 if (info->head->fmh_entries >= info->head->fmh_count) 297 if (info->head->fmh_entries >= info->head->fmh_count)
298 return XFS_BTREE_QUERY_RANGE_ABORT; 298 return -ECANCELED;
299 299
300 trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec); 300 trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec);
301 301
@@ -328,7 +328,7 @@ out:
328 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 328 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
329 if (info->next_daddr < rec_daddr) 329 if (info->next_daddr < rec_daddr)
330 info->next_daddr = rec_daddr; 330 info->next_daddr = rec_daddr;
331 return XFS_BTREE_QUERY_RANGE_CONTINUE; 331 return 0;
332} 332}
333 333
334/* Transform a rmapbt irec into a fsmap */ 334/* Transform a rmapbt irec into a fsmap */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0b0fd10a36d4..944add5ff8e0 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -40,7 +40,7 @@ xfs_inode_alloc(
40 * KM_MAYFAIL and return NULL here on ENOMEM. Set the 40 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
41 * code up to do this anyway. 41 * code up to do this anyway.
42 */ 42 */
43 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); 43 ip = kmem_zone_alloc(xfs_inode_zone, 0);
44 if (!ip) 44 if (!ip)
45 return NULL; 45 return NULL;
46 if (inode_init_always(mp->m_super, VFS_I(ip))) { 46 if (inode_init_always(mp->m_super, VFS_I(ip))) {
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index d99a0a3e5f40..3ebd1b7f49d8 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -89,7 +89,7 @@ xfs_icreate_log(
89{ 89{
90 struct xfs_icreate_item *icp; 90 struct xfs_icreate_item *icp;
91 91
92 icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); 92 icp = kmem_zone_zalloc(xfs_icreate_zone, 0);
93 93
94 xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, 94 xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
95 &xfs_icreate_item_ops); 95 &xfs_icreate_item_ops);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6467d5e1df2d..18f4b262e61c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2018,7 +2018,7 @@ xfs_iunlink_add_backref(
2018 if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) 2018 if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
2019 return 0; 2019 return 0;
2020 2020
2021 iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); 2021 iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
2022 iu->iu_agino = prev_agino; 2022 iu->iu_agino = prev_agino;
2023 iu->iu_next_unlinked = this_agino; 2023 iu->iu_next_unlinked = this_agino;
2024 2024
@@ -3282,7 +3282,8 @@ xfs_rename(
3282 spaceres); 3282 spaceres);
3283 3283
3284 /* 3284 /*
3285 * Set up the target. 3285 * Check for expected errors before we dirty the transaction
3286 * so we can return an error without a transaction abort.
3286 */ 3287 */
3287 if (target_ip == NULL) { 3288 if (target_ip == NULL) {
3288 /* 3289 /*
@@ -3294,6 +3295,46 @@ xfs_rename(
3294 if (error) 3295 if (error)
3295 goto out_trans_cancel; 3296 goto out_trans_cancel;
3296 } 3297 }
3298 } else {
3299 /*
3300 * If target exists and it's a directory, check that whether
3301 * it can be destroyed.
3302 */
3303 if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
3304 (!xfs_dir_isempty(target_ip) ||
3305 (VFS_I(target_ip)->i_nlink > 2))) {
3306 error = -EEXIST;
3307 goto out_trans_cancel;
3308 }
3309 }
3310
3311 /*
3312 * Directory entry creation below may acquire the AGF. Remove
3313 * the whiteout from the unlinked list first to preserve correct
3314 * AGI/AGF locking order. This dirties the transaction so failures
3315 * after this point will abort and log recovery will clean up the
3316 * mess.
3317 *
3318 * For whiteouts, we need to bump the link count on the whiteout
3319 * inode. After this point, we have a real link, clear the tmpfile
3320 * state flag from the inode so it doesn't accidentally get misused
3321 * in future.
3322 */
3323 if (wip) {
3324 ASSERT(VFS_I(wip)->i_nlink == 0);
3325 error = xfs_iunlink_remove(tp, wip);
3326 if (error)
3327 goto out_trans_cancel;
3328
3329 xfs_bumplink(tp, wip);
3330 xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3331 VFS_I(wip)->i_state &= ~I_LINKABLE;
3332 }
3333
3334 /*
3335 * Set up the target.
3336 */
3337 if (target_ip == NULL) {
3297 /* 3338 /*
3298 * If target does not exist and the rename crosses 3339 * If target does not exist and the rename crosses
3299 * directories, adjust the target directory link count 3340 * directories, adjust the target directory link count
@@ -3312,22 +3353,6 @@ xfs_rename(
3312 } 3353 }
3313 } else { /* target_ip != NULL */ 3354 } else { /* target_ip != NULL */
3314 /* 3355 /*
3315 * If target exists and it's a directory, check that both
3316 * target and source are directories and that target can be
3317 * destroyed, or that neither is a directory.
3318 */
3319 if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
3320 /*
3321 * Make sure target dir is empty.
3322 */
3323 if (!(xfs_dir_isempty(target_ip)) ||
3324 (VFS_I(target_ip)->i_nlink > 2)) {
3325 error = -EEXIST;
3326 goto out_trans_cancel;
3327 }
3328 }
3329
3330 /*
3331 * Link the source inode under the target name. 3356 * Link the source inode under the target name.
3332 * If the source inode is a directory and we are moving 3357 * If the source inode is a directory and we are moving
3333 * it across directories, its ".." entry will be 3358 * it across directories, its ".." entry will be
@@ -3417,30 +3442,6 @@ xfs_rename(
3417 if (error) 3442 if (error)
3418 goto out_trans_cancel; 3443 goto out_trans_cancel;
3419 3444
3420 /*
3421 * For whiteouts, we need to bump the link count on the whiteout inode.
3422 * This means that failures all the way up to this point leave the inode
3423 * on the unlinked list and so cleanup is a simple matter of dropping
3424 * the remaining reference to it. If we fail here after bumping the link
3425 * count, we're shutting down the filesystem so we'll never see the
3426 * intermediate state on disk.
3427 */
3428 if (wip) {
3429 ASSERT(VFS_I(wip)->i_nlink == 0);
3430 xfs_bumplink(tp, wip);
3431 error = xfs_iunlink_remove(tp, wip);
3432 if (error)
3433 goto out_trans_cancel;
3434 xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3435
3436 /*
3437 * Now we have a real link, clear the "I'm a tmpfile" state
3438 * flag from the inode so it doesn't accidentally get misused in
3439 * future.
3440 */
3441 VFS_I(wip)->i_state &= ~I_LINKABLE;
3442 }
3443
3444 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3445 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3445 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 3446 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3446 if (new_parent) 3447 if (new_parent)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c9a502eed204..bb8f076805b9 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -651,7 +651,7 @@ xfs_inode_item_init(
651 struct xfs_inode_log_item *iip; 651 struct xfs_inode_log_item *iip;
652 652
653 ASSERT(ip->i_itemp == NULL); 653 ASSERT(ip->i_itemp == NULL);
654 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 654 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0);
655 655
656 iip->ili_inode = ip; 656 iip->ili_inode = ip;
657 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, 657 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index affa557c2337..d58f0d6a699e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -396,7 +396,7 @@ xfs_attrlist_by_handle(
396 if (IS_ERR(dentry)) 396 if (IS_ERR(dentry))
397 return PTR_ERR(dentry); 397 return PTR_ERR(dentry);
398 398
399 kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); 399 kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
400 if (!kbuf) 400 if (!kbuf)
401 goto out_dput; 401 goto out_dput;
402 402
@@ -434,11 +434,11 @@ xfs_attrmulti_attr_get(
434 434
435 if (*len > XFS_XATTR_SIZE_MAX) 435 if (*len > XFS_XATTR_SIZE_MAX)
436 return -EINVAL; 436 return -EINVAL;
437 kbuf = kmem_zalloc_large(*len, KM_SLEEP); 437 kbuf = kmem_zalloc_large(*len, 0);
438 if (!kbuf) 438 if (!kbuf)
439 return -ENOMEM; 439 return -ENOMEM;
440 440
441 error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); 441 error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags);
442 if (error) 442 if (error)
443 goto out_kfree; 443 goto out_kfree;
444 444
@@ -831,7 +831,7 @@ xfs_bulkstat_fmt(
831/* 831/*
832 * Check the incoming bulk request @hdr from userspace and initialize the 832 * Check the incoming bulk request @hdr from userspace and initialize the
833 * internal @breq bulk request appropriately. Returns 0 if the bulk request 833 * internal @breq bulk request appropriately. Returns 0 if the bulk request
834 * should proceed; XFS_ITER_ABORT if there's nothing to do; or the usual 834 * should proceed; -ECANCELED if there's nothing to do; or the usual
835 * negative error code. 835 * negative error code.
836 */ 836 */
837static int 837static int
@@ -889,13 +889,13 @@ xfs_bulk_ireq_setup(
889 889
890 /* Asking for an inode past the end of the AG? We're done! */ 890 /* Asking for an inode past the end of the AG? We're done! */
891 if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno) 891 if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno)
892 return XFS_ITER_ABORT; 892 return -ECANCELED;
893 } else if (hdr->agno) 893 } else if (hdr->agno)
894 return -EINVAL; 894 return -EINVAL;
895 895
896 /* Asking for an inode past the end of the FS? We're done! */ 896 /* Asking for an inode past the end of the FS? We're done! */
897 if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) 897 if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
898 return XFS_ITER_ABORT; 898 return -ECANCELED;
899 899
900 return 0; 900 return 0;
901} 901}
@@ -936,7 +936,7 @@ xfs_ioc_bulkstat(
936 return -EFAULT; 936 return -EFAULT;
937 937
938 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat); 938 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat);
939 if (error == XFS_ITER_ABORT) 939 if (error == -ECANCELED)
940 goto out_teardown; 940 goto out_teardown;
941 if (error < 0) 941 if (error < 0)
942 return error; 942 return error;
@@ -986,7 +986,7 @@ xfs_ioc_inumbers(
986 return -EFAULT; 986 return -EFAULT;
987 987
988 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers); 988 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers);
989 if (error == XFS_ITER_ABORT) 989 if (error == -ECANCELED)
990 goto out_teardown; 990 goto out_teardown;
991 if (error < 0) 991 if (error < 0)
992 return error; 992 return error;
@@ -1038,6 +1038,10 @@ xfs_ioc_ag_geometry(
1038 1038
1039 if (copy_from_user(&ageo, arg, sizeof(ageo))) 1039 if (copy_from_user(&ageo, arg, sizeof(ageo)))
1040 return -EFAULT; 1040 return -EFAULT;
1041 if (ageo.ag_flags)
1042 return -EINVAL;
1043 if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved)))
1044 return -EINVAL;
1041 1045
1042 error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo); 1046 error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo);
1043 if (error) 1047 if (error)
@@ -1309,8 +1313,7 @@ xfs_ioctl_setattr_dax_invalidate(
1309 if (fa->fsx_xflags & FS_XFLAG_DAX) { 1313 if (fa->fsx_xflags & FS_XFLAG_DAX) {
1310 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) 1314 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
1311 return -EINVAL; 1315 return -EINVAL;
1312 if (S_ISREG(inode->i_mode) && 1316 if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
1313 !bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
1314 sb->s_blocksize)) 1317 sb->s_blocksize))
1315 return -EINVAL; 1318 return -EINVAL;
1316 } 1319 }
@@ -1881,7 +1884,7 @@ xfs_ioc_getfsmap(
1881 info.mp = ip->i_mount; 1884 info.mp = ip->i_mount;
1882 info.data = arg; 1885 info.data = arg;
1883 error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); 1886 error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
1884 if (error == XFS_BTREE_QUERY_RANGE_ABORT) { 1887 if (error == -ECANCELED) {
1885 error = 0; 1888 error = 0;
1886 aborted = true; 1889 aborted = true;
1887 } else if (error) 1890 } else if (error)
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 7bd7534f5051..1e08bf79b478 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -381,7 +381,7 @@ xfs_compat_attrlist_by_handle(
381 return PTR_ERR(dentry); 381 return PTR_ERR(dentry);
382 382
383 error = -ENOMEM; 383 error = -ENOMEM;
384 kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); 384 kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
385 if (!kbuf) 385 if (!kbuf)
386 goto out_dput; 386 goto out_dput;
387 387
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 3a4310d7cb59..f780e223b118 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -58,7 +58,7 @@ xfs_bmbt_to_iomap(
58{ 58{
59 struct xfs_mount *mp = ip->i_mount; 59 struct xfs_mount *mp = ip->i_mount;
60 60
61 if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) 61 if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
62 return xfs_alert_fsblock_zero(ip, imap); 62 return xfs_alert_fsblock_zero(ip, imap);
63 63
64 if (imap->br_startblock == HOLESTARTBLOCK) { 64 if (imap->br_startblock == HOLESTARTBLOCK) {
@@ -297,7 +297,7 @@ xfs_iomap_write_direct(
297 goto out_unlock; 297 goto out_unlock;
298 } 298 }
299 299
300 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) 300 if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
301 error = xfs_alert_fsblock_zero(ip, imap); 301 error = xfs_alert_fsblock_zero(ip, imap);
302 302
303out_unlock: 303out_unlock:
@@ -814,7 +814,7 @@ xfs_iomap_write_unwritten(
814 if (error) 814 if (error)
815 return error; 815 return error;
816 816
817 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) 817 if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
818 return xfs_alert_fsblock_zero(ip, &imap); 818 return xfs_alert_fsblock_zero(ip, &imap);
819 819
820 if ((numblks_fsb = imap.br_blockcount) == 0) { 820 if ((numblks_fsb = imap.br_blockcount) == 0) {
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f5c955d35be4..884950adbd16 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -137,7 +137,7 @@ xfs_bulkstat_one_int(
137 xfs_irele(ip); 137 xfs_irele(ip);
138 138
139 error = bc->formatter(bc->breq, buf); 139 error = bc->formatter(bc->breq, buf);
140 if (error == XFS_IBULK_ABORT) 140 if (error == -ECANCELED)
141 goto out_advance; 141 goto out_advance;
142 if (error) 142 if (error)
143 goto out; 143 goto out;
@@ -169,7 +169,7 @@ xfs_bulkstat_one(
169 ASSERT(breq->icount == 1); 169 ASSERT(breq->icount == 1);
170 170
171 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), 171 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
172 KM_SLEEP | KM_MAYFAIL); 172 KM_MAYFAIL);
173 if (!bc.buf) 173 if (!bc.buf)
174 return -ENOMEM; 174 return -ENOMEM;
175 175
@@ -181,7 +181,7 @@ xfs_bulkstat_one(
181 * If we reported one inode to userspace then we abort because we hit 181 * If we reported one inode to userspace then we abort because we hit
182 * the end of the buffer. Don't leak that back to userspace. 182 * the end of the buffer. Don't leak that back to userspace.
183 */ 183 */
184 if (error == XFS_IWALK_ABORT) 184 if (error == -ECANCELED)
185 error = 0; 185 error = 0;
186 186
187 return error; 187 return error;
@@ -243,7 +243,7 @@ xfs_bulkstat(
243 return 0; 243 return 0;
244 244
245 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), 245 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
246 KM_SLEEP | KM_MAYFAIL); 246 KM_MAYFAIL);
247 if (!bc.buf) 247 if (!bc.buf)
248 return -ENOMEM; 248 return -ENOMEM;
249 249
@@ -342,7 +342,7 @@ xfs_inumbers_walk(
342 int error; 342 int error;
343 343
344 error = ic->formatter(ic->breq, &inogrp); 344 error = ic->formatter(ic->breq, &inogrp);
345 if (error && error != XFS_IBULK_ABORT) 345 if (error && error != -ECANCELED)
346 return error; 346 return error;
347 347
348 ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) + 348 ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) +
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index e90c1fc5b981..96a1e2a9be3f 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -18,9 +18,6 @@ struct xfs_ibulk {
18/* Only iterate within the same AG as startino */ 18/* Only iterate within the same AG as startino */
19#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG) 19#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG)
20 20
21/* Return value that means we want to abort the walk. */
22#define XFS_IBULK_ABORT (XFS_IWALK_ABORT)
23
24/* 21/*
25 * Advance the user buffer pointer by one record of the given size. If the 22 * Advance the user buffer pointer by one record of the given size. If the
26 * buffer is now full, return the appropriate error code. 23 * buffer is now full, return the appropriate error code.
@@ -34,13 +31,21 @@ xfs_ibulk_advance(
34 31
35 breq->ubuffer = b + bytes; 32 breq->ubuffer = b + bytes;
36 breq->ocount++; 33 breq->ocount++;
37 return breq->ocount == breq->icount ? XFS_IBULK_ABORT : 0; 34 return breq->ocount == breq->icount ? -ECANCELED : 0;
38} 35}
39 36
40/* 37/*
41 * Return stat information in bulk (by-inode) for the filesystem. 38 * Return stat information in bulk (by-inode) for the filesystem.
42 */ 39 */
43 40
41/*
42 * Return codes for the formatter function are 0 to continue iterating, and
43 * non-zero to stop iterating. Any non-zero value will be passed up to the
44 * bulkstat/inumbers caller. The special value -ECANCELED can be used to stop
45 * iteration, as neither bulkstat nor inumbers will ever generate that error
46 * code on their own.
47 */
48
44typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq, 49typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq,
45 const struct xfs_bulkstat *bstat); 50 const struct xfs_bulkstat *bstat);
46 51
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 8c7d727149ea..aa375cf53021 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -31,7 +31,7 @@
31 * inode it finds, it calls a walk function with the relevant inode number and 31 * inode it finds, it calls a walk function with the relevant inode number and
32 * a pointer to caller-provided data. The walk function can return the usual 32 * a pointer to caller-provided data. The walk function can return the usual
33 * negative error code to stop the iteration; 0 to continue the iteration; or 33 * negative error code to stop the iteration; 0 to continue the iteration; or
34 * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the 34 * -ECANCELED to stop the iteration. This return value is returned to the
35 * caller. 35 * caller.
36 * 36 *
37 * Internally, we allow the walk function to do anything, which means that we 37 * Internally, we allow the walk function to do anything, which means that we
@@ -616,7 +616,7 @@ xfs_iwalk_threaded(
616 if (xfs_pwork_ctl_want_abort(&pctl)) 616 if (xfs_pwork_ctl_want_abort(&pctl))
617 break; 617 break;
618 618
619 iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); 619 iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
620 iwag->mp = mp; 620 iwag->mp = mp;
621 iwag->iwalk_fn = iwalk_fn; 621 iwag->iwalk_fn = iwalk_fn;
622 iwag->data = data; 622 iwag->data = data;
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
index 6c960e10ed4d..37a795f03267 100644
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h
@@ -6,12 +6,17 @@
6#ifndef __XFS_IWALK_H__ 6#ifndef __XFS_IWALK_H__
7#define __XFS_IWALK_H__ 7#define __XFS_IWALK_H__
8 8
9/*
10 * Return codes for the inode/inobt walk function are 0 to continue iterating,
11 * and non-zero to stop iterating. Any non-zero value will be passed up to the
12 * iwalk or inobt_walk caller. The special value -ECANCELED can be used to
13 * stop iteration, as neither iwalk nor inobt_walk will ever generate that
14 * error code on their own.
15 */
16
9/* Walk all inodes in the filesystem starting from @startino. */ 17/* Walk all inodes in the filesystem starting from @startino. */
10typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, 18typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
11 xfs_ino_t ino, void *data); 19 xfs_ino_t ino, void *data);
12/* Return values for xfs_iwalk_fn. */
13#define XFS_IWALK_CONTINUE (XFS_ITER_CONTINUE)
14#define XFS_IWALK_ABORT (XFS_ITER_ABORT)
15 20
16int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, 21int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
17 unsigned int flags, xfs_iwalk_fn iwalk_fn, 22 unsigned int flags, xfs_iwalk_fn iwalk_fn,
@@ -30,8 +35,6 @@ typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
30 xfs_agnumber_t agno, 35 xfs_agnumber_t agno,
31 const struct xfs_inobt_rec_incore *irec, 36 const struct xfs_inobt_rec_incore *irec,
32 void *data); 37 void *data);
33/* Return value (for xfs_inobt_walk_fn) that aborts the walk immediately. */
34#define XFS_INOBT_WALK_ABORT (XFS_IWALK_ABORT)
35 38
36int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp, 39int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp,
37 xfs_ino_t startino, unsigned int flags, 40 xfs_ino_t startino, unsigned int flags,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7fc3c1ad36bc..a2beee9f74da 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -214,15 +214,42 @@ xlog_grant_head_wake(
214{ 214{
215 struct xlog_ticket *tic; 215 struct xlog_ticket *tic;
216 int need_bytes; 216 int need_bytes;
217 bool woken_task = false;
217 218
218 list_for_each_entry(tic, &head->waiters, t_queue) { 219 list_for_each_entry(tic, &head->waiters, t_queue) {
220
221 /*
222 * There is a chance that the size of the CIL checkpoints in
223 * progress at the last AIL push target calculation resulted in
224 * limiting the target to the log head (l_last_sync_lsn) at the
225 * time. This may not reflect where the log head is now as the
226 * CIL checkpoints may have completed.
227 *
228 * Hence when we are woken here, it may be that the head of the
229 * log that has moved rather than the tail. As the tail didn't
230 * move, there still won't be space available for the
231 * reservation we require. However, if the AIL has already
232 * pushed to the target defined by the old log head location, we
233 * will hang here waiting for something else to update the AIL
234 * push target.
235 *
236 * Therefore, if there isn't space to wake the first waiter on
237 * the grant head, we need to push the AIL again to ensure the
238 * target reflects both the current log tail and log head
239 * position before we wait for the tail to move again.
240 */
241
219 need_bytes = xlog_ticket_reservation(log, head, tic); 242 need_bytes = xlog_ticket_reservation(log, head, tic);
220 if (*free_bytes < need_bytes) 243 if (*free_bytes < need_bytes) {
244 if (!woken_task)
245 xlog_grant_push_ail(log, need_bytes);
221 return false; 246 return false;
247 }
222 248
223 *free_bytes -= need_bytes; 249 *free_bytes -= need_bytes;
224 trace_xfs_log_grant_wake_up(log, tic); 250 trace_xfs_log_grant_wake_up(log, tic);
225 wake_up_process(tic->t_task); 251 wake_up_process(tic->t_task);
252 woken_task = true;
226 } 253 }
227 254
228 return true; 255 return true;
@@ -428,8 +455,7 @@ xfs_log_reserve(
428 XFS_STATS_INC(mp, xs_try_logspace); 455 XFS_STATS_INC(mp, xs_try_logspace);
429 456
430 ASSERT(*ticp == NULL); 457 ASSERT(*ticp == NULL);
431 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 458 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);
432 KM_SLEEP);
433 *ticp = tic; 459 *ticp = tic;
434 460
435 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt 461 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -1404,6 +1430,7 @@ xlog_alloc_log(
1404 */ 1430 */
1405 ASSERT(log->l_iclog_size >= 4096); 1431 ASSERT(log->l_iclog_size >= 4096);
1406 for (i = 0; i < log->l_iclog_bufs; i++) { 1432 for (i = 0; i < log->l_iclog_bufs; i++) {
1433 int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp);
1407 size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * 1434 size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
1408 sizeof(struct bio_vec); 1435 sizeof(struct bio_vec);
1409 1436
@@ -1415,8 +1442,8 @@ xlog_alloc_log(
1415 iclog->ic_prev = prev_iclog; 1442 iclog->ic_prev = prev_iclog;
1416 prev_iclog = iclog; 1443 prev_iclog = iclog;
1417 1444
1418 iclog->ic_data = kmem_alloc_large(log->l_iclog_size, 1445 iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
1419 KM_MAYFAIL); 1446 KM_MAYFAIL);
1420 if (!iclog->ic_data) 1447 if (!iclog->ic_data)
1421 goto out_free_iclog; 1448 goto out_free_iclog;
1422#ifdef DEBUG 1449#ifdef DEBUG
@@ -2496,21 +2523,35 @@ next_lv:
2496 ***************************************************************************** 2523 *****************************************************************************
2497 */ 2524 */
2498 2525
2499/* Clean iclogs starting from the head. This ordering must be 2526/*
2500 * maintained, so an iclog doesn't become ACTIVE beyond one that 2527 * An iclog has just finished IO completion processing, so we need to update
2501 * is SYNCING. This is also required to maintain the notion that we use 2528 * the iclog state and propagate that up into the overall log state. Hence we
2502 * a ordered wait queue to hold off would be writers to the log when every 2529 * prepare the iclog for cleaning, and then clean all the pending dirty iclogs
2503 * iclog is trying to sync to disk. 2530 * starting from the head, and then wake up any threads that are waiting for the
2531 * iclog to be marked clean.
2532 *
2533 * The ordering of marking iclogs ACTIVE must be maintained, so an iclog
2534 * doesn't become ACTIVE beyond one that is SYNCING. This is also required to
2535 * maintain the notion that we use a ordered wait queue to hold off would be
2536 * writers to the log when every iclog is trying to sync to disk.
2537 *
2538 * Caller must hold the icloglock before calling us.
2504 * 2539 *
2505 * State Change: DIRTY -> ACTIVE 2540 * State Change: !IOERROR -> DIRTY -> ACTIVE
2506 */ 2541 */
2507STATIC void 2542STATIC void
2508xlog_state_clean_log( 2543xlog_state_clean_iclog(
2509 struct xlog *log) 2544 struct xlog *log,
2545 struct xlog_in_core *dirty_iclog)
2510{ 2546{
2511 xlog_in_core_t *iclog; 2547 struct xlog_in_core *iclog;
2512 int changed = 0; 2548 int changed = 0;
2513 2549
2550 /* Prepare the completed iclog. */
2551 if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR))
2552 dirty_iclog->ic_state = XLOG_STATE_DIRTY;
2553
2554 /* Walk all the iclogs to update the ordered active state. */
2514 iclog = log->l_iclog; 2555 iclog = log->l_iclog;
2515 do { 2556 do {
2516 if (iclog->ic_state == XLOG_STATE_DIRTY) { 2557 if (iclog->ic_state == XLOG_STATE_DIRTY) {
@@ -2548,7 +2589,13 @@ xlog_state_clean_log(
2548 iclog = iclog->ic_next; 2589 iclog = iclog->ic_next;
2549 } while (iclog != log->l_iclog); 2590 } while (iclog != log->l_iclog);
2550 2591
2551 /* log is locked when we are called */ 2592
2593 /*
2594 * Wake up threads waiting in xfs_log_force() for the dirty iclog
2595 * to be cleaned.
2596 */
2597 wake_up_all(&dirty_iclog->ic_force_wait);
2598
2552 /* 2599 /*
2553 * Change state for the dummy log recording. 2600 * Change state for the dummy log recording.
2554 * We usually go to NEED. But we go to NEED2 if the changed indicates 2601 * We usually go to NEED. But we go to NEED2 if the changed indicates
@@ -2582,7 +2629,7 @@ xlog_state_clean_log(
2582 ASSERT(0); 2629 ASSERT(0);
2583 } 2630 }
2584 } 2631 }
2585} /* xlog_state_clean_log */ 2632}
2586 2633
2587STATIC xfs_lsn_t 2634STATIC xfs_lsn_t
2588xlog_get_lowest_lsn( 2635xlog_get_lowest_lsn(
@@ -2603,30 +2650,205 @@ xlog_get_lowest_lsn(
2603 return lowest_lsn; 2650 return lowest_lsn;
2604} 2651}
2605 2652
2653/*
2654 * Completion of a iclog IO does not imply that a transaction has completed, as
2655 * transactions can be large enough to span many iclogs. We cannot change the
2656 * tail of the log half way through a transaction as this may be the only
2657 * transaction in the log and moving the tail to point to the middle of it
2658 * will prevent recovery from finding the start of the transaction. Hence we
2659 * should only update the last_sync_lsn if this iclog contains transaction
2660 * completion callbacks on it.
2661 *
2662 * We have to do this before we drop the icloglock to ensure we are the only one
2663 * that can update it.
2664 *
2665 * If we are moving the last_sync_lsn forwards, we also need to ensure we kick
2666 * the reservation grant head pushing. This is due to the fact that the push
2667 * target is bound by the current last_sync_lsn value. Hence if we have a large
2668 * amount of log space bound up in this committing transaction then the
2669 * last_sync_lsn value may be the limiting factor preventing tail pushing from
2670 * freeing space in the log. Hence once we've updated the last_sync_lsn we
2671 * should push the AIL to ensure the push target (and hence the grant head) is
2672 * no longer bound by the old log head location and can move forwards and make
2673 * progress again.
2674 */
2675static void
2676xlog_state_set_callback(
2677 struct xlog *log,
2678 struct xlog_in_core *iclog,
2679 xfs_lsn_t header_lsn)
2680{
2681 iclog->ic_state = XLOG_STATE_CALLBACK;
2682
2683 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2684 header_lsn) <= 0);
2685
2686 if (list_empty_careful(&iclog->ic_callbacks))
2687 return;
2688
2689 atomic64_set(&log->l_last_sync_lsn, header_lsn);
2690 xlog_grant_push_ail(log, 0);
2691}
2692
2693/*
2694 * Return true if we need to stop processing, false to continue to the next
2695 * iclog. The caller will need to run callbacks if the iclog is returned in the
2696 * XLOG_STATE_CALLBACK state.
2697 */
2698static bool
2699xlog_state_iodone_process_iclog(
2700 struct xlog *log,
2701 struct xlog_in_core *iclog,
2702 struct xlog_in_core *completed_iclog,
2703 bool *ioerror)
2704{
2705 xfs_lsn_t lowest_lsn;
2706 xfs_lsn_t header_lsn;
2707
2708 /* Skip all iclogs in the ACTIVE & DIRTY states */
2709 if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
2710 return false;
2711
2712 /*
2713 * Between marking a filesystem SHUTDOWN and stopping the log, we do
2714 * flush all iclogs to disk (if there wasn't a log I/O error). So, we do
2715 * want things to go smoothly in case of just a SHUTDOWN w/o a
2716 * LOG_IO_ERROR.
2717 */
2718 if (iclog->ic_state & XLOG_STATE_IOERROR) {
2719 *ioerror = true;
2720 return false;
2721 }
2722
2723 /*
2724 * Can only perform callbacks in order. Since this iclog is not in the
2725 * DONE_SYNC/ DO_CALLBACK state, we skip the rest and just try to clean
2726 * up. If we set our iclog to DO_CALLBACK, we will not process it when
2727 * we retry since a previous iclog is in the CALLBACK and the state
2728 * cannot change since we are holding the l_icloglock.
2729 */
2730 if (!(iclog->ic_state &
2731 (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) {
2732 if (completed_iclog &&
2733 (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) {
2734 completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK;
2735 }
2736 return true;
2737 }
2738
2739 /*
2740 * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC
2741 * states. The other states (WANT_SYNC, SYNCING, or CALLBACK were caught
2742 * by the above if and are going to clean (i.e. we aren't doing their
2743 * callbacks) see the above if.
2744 *
2745 * We will do one more check here to see if we have chased our tail
2746 * around. If this is not the lowest lsn iclog, then we will leave it
2747 * for another completion to process.
2748 */
2749 header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
2750 lowest_lsn = xlog_get_lowest_lsn(log);
2751 if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
2752 return false;
2753
2754 xlog_state_set_callback(log, iclog, header_lsn);
2755 return false;
2756
2757}
2758
2759/*
2760 * Keep processing entries in the iclog callback list until we come around and
2761 * it is empty. We need to atomically see that the list is empty and change the
2762 * state to DIRTY so that we don't miss any more callbacks being added.
2763 *
2764 * This function is called with the icloglock held and returns with it held. We
2765 * drop it while running callbacks, however, as holding it over thousands of
2766 * callbacks is unnecessary and causes excessive contention if we do.
2767 */
2768static void
2769xlog_state_do_iclog_callbacks(
2770 struct xlog *log,
2771 struct xlog_in_core *iclog,
2772 bool aborted)
2773{
2774 spin_unlock(&log->l_icloglock);
2775 spin_lock(&iclog->ic_callback_lock);
2776 while (!list_empty(&iclog->ic_callbacks)) {
2777 LIST_HEAD(tmp);
2778
2779 list_splice_init(&iclog->ic_callbacks, &tmp);
2780
2781 spin_unlock(&iclog->ic_callback_lock);
2782 xlog_cil_process_committed(&tmp, aborted);
2783 spin_lock(&iclog->ic_callback_lock);
2784 }
2785
2786 /*
2787 * Pick up the icloglock while still holding the callback lock so we
2788 * serialise against anyone trying to add more callbacks to this iclog
2789 * now we've finished processing.
2790 */
2791 spin_lock(&log->l_icloglock);
2792 spin_unlock(&iclog->ic_callback_lock);
2793}
2794
2795#ifdef DEBUG
2796/*
2797 * Make one last gasp attempt to see if iclogs are being left in limbo. If the
2798 * above loop finds an iclog earlier than the current iclog and in one of the
2799 * syncing states, the current iclog is put into DO_CALLBACK and the callbacks
2800 * are deferred to the completion of the earlier iclog. Walk the iclogs in order
2801 * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in
2802 * one of the syncing states.
2803 *
2804 * Note that SYNCING|IOERROR is a valid state so we cannot just check for
2805 * ic_state == SYNCING.
2806 */
2807static void
2808xlog_state_callback_check_state(
2809 struct xlog *log)
2810{
2811 struct xlog_in_core *first_iclog = log->l_iclog;
2812 struct xlog_in_core *iclog = first_iclog;
2813
2814 do {
2815 ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
2816 /*
2817 * Terminate the loop if iclogs are found in states
2818 * which will cause other threads to clean up iclogs.
2819 *
2820 * SYNCING - i/o completion will go through logs
2821 * DONE_SYNC - interrupt thread should be waiting for
2822 * l_icloglock
2823 * IOERROR - give up hope all ye who enter here
2824 */
2825 if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
2826 iclog->ic_state & XLOG_STATE_SYNCING ||
2827 iclog->ic_state == XLOG_STATE_DONE_SYNC ||
2828 iclog->ic_state == XLOG_STATE_IOERROR )
2829 break;
2830 iclog = iclog->ic_next;
2831 } while (first_iclog != iclog);
2832}
2833#else
2834#define xlog_state_callback_check_state(l) ((void)0)
2835#endif
2836
2606STATIC void 2837STATIC void
2607xlog_state_do_callback( 2838xlog_state_do_callback(
2608 struct xlog *log, 2839 struct xlog *log,
2609 bool aborted, 2840 bool aborted,
2610 struct xlog_in_core *ciclog) 2841 struct xlog_in_core *ciclog)
2611{ 2842{
2612 xlog_in_core_t *iclog; 2843 struct xlog_in_core *iclog;
2613 xlog_in_core_t *first_iclog; /* used to know when we've 2844 struct xlog_in_core *first_iclog;
2614 * processed all iclogs once */ 2845 bool did_callbacks = false;
2615 int flushcnt = 0; 2846 bool cycled_icloglock;
2616 xfs_lsn_t lowest_lsn; 2847 bool ioerror;
2617 int ioerrors; /* counter: iclogs with errors */ 2848 int flushcnt = 0;
2618 int loopdidcallbacks; /* flag: inner loop did callbacks*/ 2849 int repeats = 0;
2619 int funcdidcallbacks; /* flag: function did callbacks */
2620 int repeats; /* for issuing console warnings if
2621 * looping too many times */
2622 int wake = 0;
2623 2850
2624 spin_lock(&log->l_icloglock); 2851 spin_lock(&log->l_icloglock);
2625 first_iclog = iclog = log->l_iclog;
2626 ioerrors = 0;
2627 funcdidcallbacks = 0;
2628 repeats = 0;
2629
2630 do { 2852 do {
2631 /* 2853 /*
2632 * Scan all iclogs starting with the one pointed to by the 2854 * Scan all iclogs starting with the one pointed to by the
@@ -2638,137 +2860,34 @@ xlog_state_do_callback(
2638 */ 2860 */
2639 first_iclog = log->l_iclog; 2861 first_iclog = log->l_iclog;
2640 iclog = log->l_iclog; 2862 iclog = log->l_iclog;
2641 loopdidcallbacks = 0; 2863 cycled_icloglock = false;
2864 ioerror = false;
2642 repeats++; 2865 repeats++;
2643 2866
2644 do { 2867 do {
2868 if (xlog_state_iodone_process_iclog(log, iclog,
2869 ciclog, &ioerror))
2870 break;
2645 2871
2646 /* skip all iclogs in the ACTIVE & DIRTY states */ 2872 if (!(iclog->ic_state &
2647 if (iclog->ic_state & 2873 (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) {
2648 (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
2649 iclog = iclog->ic_next; 2874 iclog = iclog->ic_next;
2650 continue; 2875 continue;
2651 } 2876 }
2652 2877
2653 /* 2878 /*
2654 * Between marking a filesystem SHUTDOWN and stopping 2879 * Running callbacks will drop the icloglock which means
2655 * the log, we do flush all iclogs to disk (if there 2880 * we'll have to run at least one more complete loop.
2656 * wasn't a log I/O error). So, we do want things to
2657 * go smoothly in case of just a SHUTDOWN w/o a
2658 * LOG_IO_ERROR.
2659 */
2660 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
2661 /*
2662 * Can only perform callbacks in order. Since
2663 * this iclog is not in the DONE_SYNC/
2664 * DO_CALLBACK state, we skip the rest and
2665 * just try to clean up. If we set our iclog
2666 * to DO_CALLBACK, we will not process it when
2667 * we retry since a previous iclog is in the
2668 * CALLBACK and the state cannot change since
2669 * we are holding the l_icloglock.
2670 */
2671 if (!(iclog->ic_state &
2672 (XLOG_STATE_DONE_SYNC |
2673 XLOG_STATE_DO_CALLBACK))) {
2674 if (ciclog && (ciclog->ic_state ==
2675 XLOG_STATE_DONE_SYNC)) {
2676 ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
2677 }
2678 break;
2679 }
2680 /*
2681 * We now have an iclog that is in either the
2682 * DO_CALLBACK or DONE_SYNC states. The other
2683 * states (WANT_SYNC, SYNCING, or CALLBACK were
2684 * caught by the above if and are going to
2685 * clean (i.e. we aren't doing their callbacks)
2686 * see the above if.
2687 */
2688
2689 /*
2690 * We will do one more check here to see if we
2691 * have chased our tail around.
2692 */
2693
2694 lowest_lsn = xlog_get_lowest_lsn(log);
2695 if (lowest_lsn &&
2696 XFS_LSN_CMP(lowest_lsn,
2697 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2698 iclog = iclog->ic_next;
2699 continue; /* Leave this iclog for
2700 * another thread */
2701 }
2702
2703 iclog->ic_state = XLOG_STATE_CALLBACK;
2704
2705
2706 /*
2707 * Completion of a iclog IO does not imply that
2708 * a transaction has completed, as transactions
2709 * can be large enough to span many iclogs. We
2710 * cannot change the tail of the log half way
2711 * through a transaction as this may be the only
2712 * transaction in the log and moving th etail to
2713 * point to the middle of it will prevent
2714 * recovery from finding the start of the
2715 * transaction. Hence we should only update the
2716 * last_sync_lsn if this iclog contains
2717 * transaction completion callbacks on it.
2718 *
2719 * We have to do this before we drop the
2720 * icloglock to ensure we are the only one that
2721 * can update it.
2722 */
2723 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2724 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2725 if (!list_empty_careful(&iclog->ic_callbacks))
2726 atomic64_set(&log->l_last_sync_lsn,
2727 be64_to_cpu(iclog->ic_header.h_lsn));
2728
2729 } else
2730 ioerrors++;
2731
2732 spin_unlock(&log->l_icloglock);
2733
2734 /*
2735 * Keep processing entries in the callback list until
2736 * we come around and it is empty. We need to
2737 * atomically see that the list is empty and change the
2738 * state to DIRTY so that we don't miss any more
2739 * callbacks being added.
2740 */
2741 spin_lock(&iclog->ic_callback_lock);
2742 while (!list_empty(&iclog->ic_callbacks)) {
2743 LIST_HEAD(tmp);
2744
2745 list_splice_init(&iclog->ic_callbacks, &tmp);
2746
2747 spin_unlock(&iclog->ic_callback_lock);
2748 xlog_cil_process_committed(&tmp, aborted);
2749 spin_lock(&iclog->ic_callback_lock);
2750 }
2751
2752 loopdidcallbacks++;
2753 funcdidcallbacks++;
2754
2755 spin_lock(&log->l_icloglock);
2756 spin_unlock(&iclog->ic_callback_lock);
2757 if (!(iclog->ic_state & XLOG_STATE_IOERROR))
2758 iclog->ic_state = XLOG_STATE_DIRTY;
2759
2760 /*
2761 * Transition from DIRTY to ACTIVE if applicable.
2762 * NOP if STATE_IOERROR.
2763 */ 2881 */
2764 xlog_state_clean_log(log); 2882 cycled_icloglock = true;
2765 2883 xlog_state_do_iclog_callbacks(log, iclog, aborted);
2766 /* wake up threads waiting in xfs_log_force() */
2767 wake_up_all(&iclog->ic_force_wait);
2768 2884
2885 xlog_state_clean_iclog(log, iclog);
2769 iclog = iclog->ic_next; 2886 iclog = iclog->ic_next;
2770 } while (first_iclog != iclog); 2887 } while (first_iclog != iclog);
2771 2888
2889 did_callbacks |= cycled_icloglock;
2890
2772 if (repeats > 5000) { 2891 if (repeats > 5000) {
2773 flushcnt += repeats; 2892 flushcnt += repeats;
2774 repeats = 0; 2893 repeats = 0;
@@ -2776,50 +2895,15 @@ xlog_state_do_callback(
2776 "%s: possible infinite loop (%d iterations)", 2895 "%s: possible infinite loop (%d iterations)",
2777 __func__, flushcnt); 2896 __func__, flushcnt);
2778 } 2897 }
2779 } while (!ioerrors && loopdidcallbacks); 2898 } while (!ioerror && cycled_icloglock);
2780 2899
2781#ifdef DEBUG 2900 if (did_callbacks)
2782 /* 2901 xlog_state_callback_check_state(log);
2783 * Make one last gasp attempt to see if iclogs are being left in limbo.
2784 * If the above loop finds an iclog earlier than the current iclog and
2785 * in one of the syncing states, the current iclog is put into
2786 * DO_CALLBACK and the callbacks are deferred to the completion of the
2787 * earlier iclog. Walk the iclogs in order and make sure that no iclog
2788 * is in DO_CALLBACK unless an earlier iclog is in one of the syncing
2789 * states.
2790 *
2791 * Note that SYNCING|IOABORT is a valid state so we cannot just check
2792 * for ic_state == SYNCING.
2793 */
2794 if (funcdidcallbacks) {
2795 first_iclog = iclog = log->l_iclog;
2796 do {
2797 ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
2798 /*
2799 * Terminate the loop if iclogs are found in states
2800 * which will cause other threads to clean up iclogs.
2801 *
2802 * SYNCING - i/o completion will go through logs
2803 * DONE_SYNC - interrupt thread should be waiting for
2804 * l_icloglock
2805 * IOERROR - give up hope all ye who enter here
2806 */
2807 if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
2808 iclog->ic_state & XLOG_STATE_SYNCING ||
2809 iclog->ic_state == XLOG_STATE_DONE_SYNC ||
2810 iclog->ic_state == XLOG_STATE_IOERROR )
2811 break;
2812 iclog = iclog->ic_next;
2813 } while (first_iclog != iclog);
2814 }
2815#endif
2816 2902
2817 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) 2903 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
2818 wake = 1;
2819 spin_unlock(&log->l_icloglock);
2820
2821 if (wake)
2822 wake_up_all(&log->l_flush_wait); 2904 wake_up_all(&log->l_flush_wait);
2905
2906 spin_unlock(&log->l_icloglock);
2823} 2907}
2824 2908
2825 2909
@@ -3919,7 +4003,9 @@ xfs_log_force_umount(
3919 * item committed callback functions will do this again under lock to 4003 * item committed callback functions will do this again under lock to
3920 * avoid races. 4004 * avoid races.
3921 */ 4005 */
4006 spin_lock(&log->l_cilp->xc_push_lock);
3922 wake_up_all(&log->l_cilp->xc_commit_wait); 4007 wake_up_all(&log->l_cilp->xc_commit_wait);
4008 spin_unlock(&log->l_cilp->xc_push_lock);
3923 xlog_state_do_callback(log, true, NULL); 4009 xlog_state_do_callback(log, true, NULL);
3924 4010
3925#ifdef XFSERRORDEBUG 4011#ifdef XFSERRORDEBUG
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index fa5602d0fd7f..ef652abd112c 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -38,7 +38,7 @@ xlog_cil_ticket_alloc(
38 struct xlog_ticket *tic; 38 struct xlog_ticket *tic;
39 39
40 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, 40 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
41 KM_SLEEP|KM_NOFS); 41 KM_NOFS);
42 42
43 /* 43 /*
44 * set the current reservation to zero so we know to steal the basic 44 * set the current reservation to zero so we know to steal the basic
@@ -186,7 +186,7 @@ xlog_cil_alloc_shadow_bufs(
186 */ 186 */
187 kmem_free(lip->li_lv_shadow); 187 kmem_free(lip->li_lv_shadow);
188 188
189 lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS); 189 lv = kmem_alloc_large(buf_size, KM_NOFS);
190 memset(lv, 0, xlog_cil_iovec_space(niovecs)); 190 memset(lv, 0, xlog_cil_iovec_space(niovecs));
191 191
192 lv->lv_item = lip; 192 lv->lv_item = lip;
@@ -660,7 +660,7 @@ xlog_cil_push(
660 if (!cil) 660 if (!cil)
661 return 0; 661 return 0;
662 662
663 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); 663 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
664 new_ctx->ticket = xlog_cil_ticket_alloc(log); 664 new_ctx->ticket = xlog_cil_ticket_alloc(log);
665 665
666 down_write(&cil->xc_ctx_lock); 666 down_write(&cil->xc_ctx_lock);
@@ -1179,11 +1179,11 @@ xlog_cil_init(
1179 struct xfs_cil *cil; 1179 struct xfs_cil *cil;
1180 struct xfs_cil_ctx *ctx; 1180 struct xfs_cil_ctx *ctx;
1181 1181
1182 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); 1182 cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
1183 if (!cil) 1183 if (!cil)
1184 return -ENOMEM; 1184 return -ENOMEM;
1185 1185
1186 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); 1186 ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL);
1187 if (!ctx) { 1187 if (!ctx) {
1188 kmem_free(cil); 1188 kmem_free(cil);
1189 return -ENOMEM; 1189 return -ENOMEM;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 13d1d3e95b88..508319039dce 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -97,6 +97,8 @@ xlog_alloc_buffer(
97 struct xlog *log, 97 struct xlog *log,
98 int nbblks) 98 int nbblks)
99{ 99{
100 int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
101
100 /* 102 /*
101 * Pass log block 0 since we don't have an addr yet, buffer will be 103 * Pass log block 0 since we don't have an addr yet, buffer will be
102 * verified on read. 104 * verified on read.
@@ -125,7 +127,7 @@ xlog_alloc_buffer(
125 if (nbblks > 1 && log->l_sectBBsize > 1) 127 if (nbblks > 1 && log->l_sectBBsize > 1)
126 nbblks += log->l_sectBBsize; 128 nbblks += log->l_sectBBsize;
127 nbblks = round_up(nbblks, log->l_sectBBsize); 129 nbblks = round_up(nbblks, log->l_sectBBsize);
128 return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL); 130 return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL);
129} 131}
130 132
131/* 133/*
@@ -1960,7 +1962,7 @@ xlog_recover_buffer_pass1(
1960 } 1962 }
1961 } 1963 }
1962 1964
1963 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); 1965 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
1964 bcp->bc_blkno = buf_f->blf_blkno; 1966 bcp->bc_blkno = buf_f->blf_blkno;
1965 bcp->bc_len = buf_f->blf_len; 1967 bcp->bc_len = buf_f->blf_len;
1966 bcp->bc_refcount = 1; 1968 bcp->bc_refcount = 1;
@@ -2930,7 +2932,7 @@ xlog_recover_inode_pass2(
2930 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { 2932 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
2931 in_f = item->ri_buf[0].i_addr; 2933 in_f = item->ri_buf[0].i_addr;
2932 } else { 2934 } else {
2933 in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP); 2935 in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
2934 need_free = 1; 2936 need_free = 1;
2935 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 2937 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2936 if (error) 2938 if (error)
@@ -4161,7 +4163,7 @@ xlog_recover_add_item(
4161{ 4163{
4162 xlog_recover_item_t *item; 4164 xlog_recover_item_t *item;
4163 4165
4164 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 4166 item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
4165 INIT_LIST_HEAD(&item->ri_list); 4167 INIT_LIST_HEAD(&item->ri_list);
4166 list_add_tail(&item->ri_list, head); 4168 list_add_tail(&item->ri_list, head);
4167} 4169}
@@ -4201,7 +4203,7 @@ xlog_recover_add_to_cont_trans(
4201 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 4203 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
4202 old_len = item->ri_buf[item->ri_cnt-1].i_len; 4204 old_len = item->ri_buf[item->ri_cnt-1].i_len;
4203 4205
4204 ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP); 4206 ptr = kmem_realloc(old_ptr, len + old_len, 0);
4205 memcpy(&ptr[old_len], dp, len); 4207 memcpy(&ptr[old_len], dp, len);
4206 item->ri_buf[item->ri_cnt-1].i_len += len; 4208 item->ri_buf[item->ri_cnt-1].i_len += len;
4207 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 4209 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4261,7 +4263,7 @@ xlog_recover_add_to_trans(
4261 return 0; 4263 return 0;
4262 } 4264 }
4263 4265
4264 ptr = kmem_alloc(len, KM_SLEEP); 4266 ptr = kmem_alloc(len, 0);
4265 memcpy(ptr, dp, len); 4267 memcpy(ptr, dp, len);
4266 in_f = (struct xfs_inode_log_format *)ptr; 4268 in_f = (struct xfs_inode_log_format *)ptr;
4267 4269
@@ -4289,7 +4291,7 @@ xlog_recover_add_to_trans(
4289 item->ri_total = in_f->ilf_size; 4291 item->ri_total = in_f->ilf_size;
4290 item->ri_buf = 4292 item->ri_buf =
4291 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), 4293 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
4292 KM_SLEEP); 4294 0);
4293 } 4295 }
4294 ASSERT(item->ri_total > item->ri_cnt); 4296 ASSERT(item->ri_total > item->ri_cnt);
4295 /* Description region is ri_buf[0] */ 4297 /* Description region is ri_buf[0] */
@@ -4423,7 +4425,7 @@ xlog_recover_ophdr_to_trans(
4423 * This is a new transaction so allocate a new recovery container to 4425 * This is a new transaction so allocate a new recovery container to
4424 * hold the recovery ops that will follow. 4426 * hold the recovery ops that will follow.
4425 */ 4427 */
4426 trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); 4428 trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
4427 trans->r_log_tid = tid; 4429 trans->r_log_tid = tid;
4428 trans->r_lsn = be64_to_cpu(rhead->h_lsn); 4430 trans->r_lsn = be64_to_cpu(rhead->h_lsn);
4429 INIT_LIST_HEAD(&trans->r_itemq); 4431 INIT_LIST_HEAD(&trans->r_itemq);
@@ -5022,16 +5024,27 @@ xlog_recover_process_one_iunlink(
5022} 5024}
5023 5025
5024/* 5026/*
5025 * xlog_iunlink_recover 5027 * Recover AGI unlinked lists
5028 *
5029 * This is called during recovery to process any inodes which we unlinked but
5030 * not freed when the system crashed. These inodes will be on the lists in the
5031 * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
5032 * any inodes found on the lists. Each inode is removed from the lists when it
5033 * has been fully truncated and is freed. The freeing of the inode and its
5034 * removal from the list must be atomic.
5035 *
5036 * If everything we touch in the agi processing loop is already in memory, this
5037 * loop can hold the cpu for a long time. It runs without lock contention,
5038 * memory allocation contention, the need wait for IO, etc, and so will run
5039 * until we either run out of inodes to process, run low on memory or we run out
5040 * of log space.
5026 * 5041 *
5027 * This is called during recovery to process any inodes which 5042 * This behaviour is bad for latency on single CPU and non-preemptible kernels,
5028 * we unlinked but not freed when the system crashed. These 5043 * and can prevent other filesystem work (such as CIL pushes) from running. This
5029 * inodes will be on the lists in the AGI blocks. What we do 5044 * can lead to deadlocks if the recovery process runs out of log reservation
5030 * here is scan all the AGIs and fully truncate and free any 5045 * space. Hence we need to yield the CPU when there is other kernel work
5031 * inodes found on the lists. Each inode is removed from the 5046 * scheduled on this CPU to ensure other scheduled work can run without undue
5032 * lists when it has been fully truncated and is freed. The 5047 * latency.
5033 * freeing of the inode and its removal from the list must be
5034 * atomic.
5035 */ 5048 */
5036STATIC void 5049STATIC void
5037xlog_recover_process_iunlinks( 5050xlog_recover_process_iunlinks(
@@ -5078,6 +5091,7 @@ xlog_recover_process_iunlinks(
5078 while (agino != NULLAGINO) { 5091 while (agino != NULLAGINO) {
5079 agino = xlog_recover_process_one_iunlink(mp, 5092 agino = xlog_recover_process_one_iunlink(mp,
5080 agno, agino, bucket); 5093 agno, agino, bucket);
5094 cond_resched();
5081 } 5095 }
5082 } 5096 }
5083 xfs_buf_rele(agibp); 5097 xfs_buf_rele(agibp);
@@ -5527,7 +5541,7 @@ xlog_do_log_recovery(
5527 */ 5541 */
5528 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * 5542 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
5529 sizeof(struct list_head), 5543 sizeof(struct list_head),
5530 KM_SLEEP); 5544 0);
5531 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 5545 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
5532 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); 5546 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
5533 5547
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 322da6909290..ba5b6f3b2b88 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -82,7 +82,7 @@ xfs_uuid_mount(
82 if (hole < 0) { 82 if (hole < 0) {
83 xfs_uuid_table = kmem_realloc(xfs_uuid_table, 83 xfs_uuid_table = kmem_realloc(xfs_uuid_table,
84 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), 84 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
85 KM_SLEEP); 85 0);
86 hole = xfs_uuid_table_size++; 86 hole = xfs_uuid_table_size++;
87 } 87 }
88 xfs_uuid_table[hole] = *uuid; 88 xfs_uuid_table[hole] = *uuid;
@@ -214,7 +214,7 @@ xfs_initialize_perag(
214 214
215 spin_lock(&mp->m_perag_lock); 215 spin_lock(&mp->m_perag_lock);
216 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { 216 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
217 BUG(); 217 WARN_ON_ONCE(1);
218 spin_unlock(&mp->m_perag_lock); 218 spin_unlock(&mp->m_perag_lock);
219 radix_tree_preload_end(); 219 radix_tree_preload_end();
220 error = -EEXIST; 220 error = -EEXIST;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4adb6837439a..fdb60e09a9c5 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -327,13 +327,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
327} 327}
328 328
329/* per-AG block reservation data structures*/ 329/* per-AG block reservation data structures*/
330enum xfs_ag_resv_type {
331 XFS_AG_RESV_NONE = 0,
332 XFS_AG_RESV_AGFL,
333 XFS_AG_RESV_METADATA,
334 XFS_AG_RESV_RMAPBT,
335};
336
337struct xfs_ag_resv { 330struct xfs_ag_resv {
338 /* number of blocks originally reserved here */ 331 /* number of blocks originally reserved here */
339 xfs_extlen_t ar_orig_reserved; 332 xfs_extlen_t ar_orig_reserved;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 74738813f60d..a06661dac5be 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -333,12 +333,12 @@ xfs_mru_cache_create(
333 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) 333 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
334 return -EINVAL; 334 return -EINVAL;
335 335
336 if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) 336 if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
337 return -ENOMEM; 337 return -ENOMEM;
338 338
339 /* An extra list is needed to avoid reaping up to a grp_time early. */ 339 /* An extra list is needed to avoid reaping up to a grp_time early. */
340 mru->grp_count = grp_count + 1; 340 mru->grp_count = grp_count + 1;
341 mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); 341 mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);
342 342
343 if (!mru->lists) { 343 if (!mru->lists) {
344 err = -ENOMEM; 344 err = -ENOMEM;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5e7a37f0cf84..ecd8ce152ab1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -642,7 +642,7 @@ xfs_qm_init_quotainfo(
642 642
643 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 643 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
644 644
645 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); 645 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0);
646 646
647 error = list_lru_init(&qinf->qi_lru); 647 error = list_lru_init(&qinf->qi_lru);
648 if (error) 648 if (error)
@@ -978,7 +978,7 @@ xfs_qm_reset_dqcounts_buf(
978 if (qip->i_d.di_nblocks == 0) 978 if (qip->i_d.di_nblocks == 0)
979 return 0; 979 return 0;
980 980
981 map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); 981 map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0);
982 982
983 lblkno = 0; 983 lblkno = 0;
984 maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); 984 maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index d8288aa0670a..2328268e6245 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -144,9 +144,9 @@ xfs_cui_init(
144 ASSERT(nextents > 0); 144 ASSERT(nextents > 0);
145 if (nextents > XFS_CUI_MAX_FAST_EXTENTS) 145 if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
146 cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), 146 cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
147 KM_SLEEP); 147 0);
148 else 148 else
149 cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP); 149 cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
150 150
151 xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); 151 xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
152 cuip->cui_format.cui_nextents = nextents; 152 cuip->cui_format.cui_nextents = nextents;
@@ -223,7 +223,7 @@ xfs_trans_get_cud(
223{ 223{
224 struct xfs_cud_log_item *cudp; 224 struct xfs_cud_log_item *cudp;
225 225
226 cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); 226 cudp = kmem_zone_zalloc(xfs_cud_zone, 0);
227 xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, 227 xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
228 &xfs_cud_item_ops); 228 &xfs_cud_item_ops);
229 cudp->cud_cuip = cuip; 229 cudp->cud_cuip = cuip;
@@ -555,26 +555,24 @@ xfs_cui_recover(
555 irec.br_blockcount = new_len; 555 irec.br_blockcount = new_len;
556 switch (type) { 556 switch (type) {
557 case XFS_REFCOUNT_INCREASE: 557 case XFS_REFCOUNT_INCREASE:
558 error = xfs_refcount_increase_extent(tp, &irec); 558 xfs_refcount_increase_extent(tp, &irec);
559 break; 559 break;
560 case XFS_REFCOUNT_DECREASE: 560 case XFS_REFCOUNT_DECREASE:
561 error = xfs_refcount_decrease_extent(tp, &irec); 561 xfs_refcount_decrease_extent(tp, &irec);
562 break; 562 break;
563 case XFS_REFCOUNT_ALLOC_COW: 563 case XFS_REFCOUNT_ALLOC_COW:
564 error = xfs_refcount_alloc_cow_extent(tp, 564 xfs_refcount_alloc_cow_extent(tp,
565 irec.br_startblock, 565 irec.br_startblock,
566 irec.br_blockcount); 566 irec.br_blockcount);
567 break; 567 break;
568 case XFS_REFCOUNT_FREE_COW: 568 case XFS_REFCOUNT_FREE_COW:
569 error = xfs_refcount_free_cow_extent(tp, 569 xfs_refcount_free_cow_extent(tp,
570 irec.br_startblock, 570 irec.br_startblock,
571 irec.br_blockcount); 571 irec.br_blockcount);
572 break; 572 break;
573 default: 573 default:
574 ASSERT(0); 574 ASSERT(0);
575 } 575 }
576 if (error)
577 goto abort_error;
578 requeue_only = true; 576 requeue_only = true;
579 } 577 }
580 } 578 }
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index edbe37b7f636..0f08153b4994 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -495,10 +495,8 @@ xfs_reflink_cancel_cow_blocks(
495 ASSERT((*tpp)->t_firstblock == NULLFSBLOCK); 495 ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);
496 496
497 /* Free the CoW orphan record. */ 497 /* Free the CoW orphan record. */
498 error = xfs_refcount_free_cow_extent(*tpp, 498 xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
499 del.br_startblock, del.br_blockcount); 499 del.br_blockcount);
500 if (error)
501 break;
502 500
503 xfs_bmap_add_free(*tpp, del.br_startblock, 501 xfs_bmap_add_free(*tpp, del.br_startblock,
504 del.br_blockcount, NULL); 502 del.br_blockcount, NULL);
@@ -675,15 +673,10 @@ xfs_reflink_end_cow_extent(
675 trace_xfs_reflink_cow_remap(ip, &del); 673 trace_xfs_reflink_cow_remap(ip, &del);
676 674
677 /* Free the CoW orphan record. */ 675 /* Free the CoW orphan record. */
678 error = xfs_refcount_free_cow_extent(tp, del.br_startblock, 676 xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
679 del.br_blockcount);
680 if (error)
681 goto out_cancel;
682 677
683 /* Map the new blocks into the data fork. */ 678 /* Map the new blocks into the data fork. */
684 error = xfs_bmap_map_extent(tp, ip, &del); 679 xfs_bmap_map_extent(tp, ip, &del);
685 if (error)
686 goto out_cancel;
687 680
688 /* Charge this new data fork mapping to the on-disk quota. */ 681 /* Charge this new data fork mapping to the on-disk quota. */
689 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, 682 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
@@ -1070,14 +1063,10 @@ xfs_reflink_remap_extent(
1070 uirec.br_blockcount, uirec.br_startblock); 1063 uirec.br_blockcount, uirec.br_startblock);
1071 1064
1072 /* Update the refcount tree */ 1065 /* Update the refcount tree */
1073 error = xfs_refcount_increase_extent(tp, &uirec); 1066 xfs_refcount_increase_extent(tp, &uirec);
1074 if (error)
1075 goto out_cancel;
1076 1067
1077 /* Map the new blocks into the data fork. */ 1068 /* Map the new blocks into the data fork. */
1078 error = xfs_bmap_map_extent(tp, ip, &uirec); 1069 xfs_bmap_map_extent(tp, ip, &uirec);
1079 if (error)
1080 goto out_cancel;
1081 1070
1082 /* Update quota accounting. */ 1071 /* Update quota accounting. */
1083 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1072 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 77ed557b6127..8939e0ea09cd 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -142,9 +142,9 @@ xfs_rui_init(
142 142
143 ASSERT(nextents > 0); 143 ASSERT(nextents > 0);
144 if (nextents > XFS_RUI_MAX_FAST_EXTENTS) 144 if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
145 ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP); 145 ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
146 else 146 else
147 ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); 147 ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
148 148
149 xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); 149 xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
150 ruip->rui_format.rui_nextents = nextents; 150 ruip->rui_format.rui_nextents = nextents;
@@ -244,7 +244,7 @@ xfs_trans_get_rud(
244{ 244{
245 struct xfs_rud_log_item *rudp; 245 struct xfs_rud_log_item *rudp;
246 246
247 rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP); 247 rudp = kmem_zone_zalloc(xfs_rud_zone, 0);
248 xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, 248 xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
249 &xfs_rud_item_ops); 249 &xfs_rud_item_ops);
250 rudp->rud_ruip = ruip; 250 rudp->rud_ruip = ruip;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 5fa4db3c3e32..4a48a8c75b4f 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -865,7 +865,7 @@ xfs_alloc_rsum_cache(
865 * lower bound on the minimum level with any free extents. We can 865 * lower bound on the minimum level with any free extents. We can
866 * continue without the cache if it couldn't be allocated. 866 * continue without the cache if it couldn't be allocated.
867 */ 867 */
868 mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP); 868 mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0);
869 if (!mp->m_rsum_cache) 869 if (!mp->m_rsum_cache)
870 xfs_warn(mp, "could not allocate realtime summary cache"); 870 xfs_warn(mp, "could not allocate realtime summary cache");
871} 871}
@@ -963,7 +963,7 @@ xfs_growfs_rt(
963 /* 963 /*
964 * Allocate a new (fake) mount/sb. 964 * Allocate a new (fake) mount/sb.
965 */ 965 */
966 nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); 966 nmp = kmem_alloc(sizeof(*nmp), 0);
967 /* 967 /*
968 * Loop over the bitmap blocks. 968 * Loop over the bitmap blocks.
969 * We will do everything one bitmap block at a time. 969 * We will do everything one bitmap block at a time.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f9450235533c..391b4748cae3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -818,7 +818,8 @@ xfs_init_mount_workqueues(
818 goto out_destroy_buf; 818 goto out_destroy_buf;
819 819
820 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", 820 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
821 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); 821 WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
822 0, mp->m_fsname);
822 if (!mp->m_cil_workqueue) 823 if (!mp->m_cil_workqueue)
823 goto out_destroy_unwritten; 824 goto out_destroy_unwritten;
824 825
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 8094b1920eef..eaae275ed430 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -23,6 +23,7 @@ struct xlog;
23struct xlog_ticket; 23struct xlog_ticket;
24struct xlog_recover; 24struct xlog_recover;
25struct xlog_recover_item; 25struct xlog_recover_item;
26struct xlog_rec_header;
26struct xfs_buf_log_format; 27struct xfs_buf_log_format;
27struct xfs_inode_log_format; 28struct xfs_inode_log_format;
28struct xfs_bmbt_irec; 29struct xfs_bmbt_irec;
@@ -30,6 +31,10 @@ struct xfs_btree_cur;
30struct xfs_refcount_irec; 31struct xfs_refcount_irec;
31struct xfs_fsmap; 32struct xfs_fsmap;
32struct xfs_rmap_irec; 33struct xfs_rmap_irec;
34struct xfs_icreate_log;
35struct xfs_owner_info;
36struct xfs_trans_res;
37struct xfs_inobt_rec_incore;
33 38
34DECLARE_EVENT_CLASS(xfs_attr_list_class, 39DECLARE_EVENT_CLASS(xfs_attr_list_class,
35 TP_PROTO(struct xfs_attr_list_context *ctx), 40 TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -3575,6 +3580,35 @@ TRACE_EVENT(xfs_pwork_init,
3575 __entry->nr_threads, __entry->pid) 3580 __entry->nr_threads, __entry->pid)
3576) 3581)
3577 3582
3583DECLARE_EVENT_CLASS(xfs_kmem_class,
3584 TP_PROTO(ssize_t size, int flags, unsigned long caller_ip),
3585 TP_ARGS(size, flags, caller_ip),
3586 TP_STRUCT__entry(
3587 __field(ssize_t, size)
3588 __field(int, flags)
3589 __field(unsigned long, caller_ip)
3590 ),
3591 TP_fast_assign(
3592 __entry->size = size;
3593 __entry->flags = flags;
3594 __entry->caller_ip = caller_ip;
3595 ),
3596 TP_printk("size %zd flags 0x%x caller %pS",
3597 __entry->size,
3598 __entry->flags,
3599 (char *)__entry->caller_ip)
3600)
3601
3602#define DEFINE_KMEM_EVENT(name) \
3603DEFINE_EVENT(xfs_kmem_class, name, \
3604 TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
3605 TP_ARGS(size, flags, caller_ip))
3606DEFINE_KMEM_EVENT(kmem_alloc);
3607DEFINE_KMEM_EVENT(kmem_alloc_io);
3608DEFINE_KMEM_EVENT(kmem_alloc_large);
3609DEFINE_KMEM_EVENT(kmem_realloc);
3610DEFINE_KMEM_EVENT(kmem_zone_alloc);
3611
3578#endif /* _TRACE_XFS_H */ 3612#endif /* _TRACE_XFS_H */
3579 3613
3580#undef TRACE_INCLUDE_PATH 3614#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d42a68d8313b..f4795fdb7389 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -90,7 +90,7 @@ xfs_trans_dup(
90 90
91 trace_xfs_trans_dup(tp, _RET_IP_); 91 trace_xfs_trans_dup(tp, _RET_IP_);
92 92
93 ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); 93 ntp = kmem_zone_zalloc(xfs_trans_zone, 0);
94 94
95 /* 95 /*
96 * Initialize the new transaction structure. 96 * Initialize the new transaction structure.
@@ -263,7 +263,7 @@ xfs_trans_alloc(
263 * GFP_NOFS allocation context so that we avoid lockdep false positives 263 * GFP_NOFS allocation context so that we avoid lockdep false positives
264 * by doing GFP_KERNEL allocations inside sb_start_intwrite(). 264 * by doing GFP_KERNEL allocations inside sb_start_intwrite().
265 */ 265 */
266 tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); 266 tp = kmem_zone_zalloc(xfs_trans_zone, 0);
267 if (!(flags & XFS_TRANS_NO_WRITECOUNT)) 267 if (!(flags & XFS_TRANS_NO_WRITECOUNT))
268 sb_start_intwrite(mp->m_super); 268 sb_start_intwrite(mp->m_super);
269 269
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 1027c9ca6eb8..16457465833b 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -863,7 +863,7 @@ STATIC void
863xfs_trans_alloc_dqinfo( 863xfs_trans_alloc_dqinfo(
864 xfs_trans_t *tp) 864 xfs_trans_t *tp)
865{ 865{
866 tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP); 866 tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
867} 867}
868 868
869void 869void
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 3123b5aaad2a..cb895b1df5e4 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -30,7 +30,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
30 value = NULL; 30 value = NULL;
31 } 31 }
32 32
33 error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); 33 error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags);
34 if (error) 34 if (error)
35 return error; 35 return error;
36 return asize; 36 return asize;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ae6648145d18..ffe35d97afcb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3543,6 +3543,8 @@ extern void inode_nohighmem(struct inode *inode);
3543/* mm/fadvise.c */ 3543/* mm/fadvise.c */
3544extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, 3544extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
3545 int advice); 3545 int advice);
3546extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
3547 int advice);
3546 3548
3547#if defined(CONFIG_IO_URING) 3549#if defined(CONFIG_IO_URING)
3548extern struct sock *io_uring_get_socket(struct file *file); 3550extern struct sock *io_uring_get_socket(struct file *file);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 467bcd032037..4f17c83db575 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -27,8 +27,7 @@
27 * deactivate the pages and clear PG_Referenced. 27 * deactivate the pages and clear PG_Referenced.
28 */ 28 */
29 29
30static int generic_fadvise(struct file *file, loff_t offset, loff_t len, 30int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
31 int advice)
32{ 31{
33 struct inode *inode; 32 struct inode *inode;
34 struct address_space *mapping; 33 struct address_space *mapping;
@@ -178,6 +177,7 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
178 } 177 }
179 return 0; 178 return 0;
180} 179}
180EXPORT_SYMBOL(generic_fadvise);
181 181
182int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) 182int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
183{ 183{
diff --git a/mm/madvise.c b/mm/madvise.c
index 968df3aa069f..bac973b9f2cc 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -14,6 +14,7 @@
14#include <linux/userfaultfd_k.h> 14#include <linux/userfaultfd_k.h>
15#include <linux/hugetlb.h> 15#include <linux/hugetlb.h>
16#include <linux/falloc.h> 16#include <linux/falloc.h>
17#include <linux/fadvise.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
18#include <linux/ksm.h> 19#include <linux/ksm.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
@@ -275,6 +276,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
275 unsigned long start, unsigned long end) 276 unsigned long start, unsigned long end)
276{ 277{
277 struct file *file = vma->vm_file; 278 struct file *file = vma->vm_file;
279 loff_t offset;
278 280
279 *prev = vma; 281 *prev = vma;
280#ifdef CONFIG_SWAP 282#ifdef CONFIG_SWAP
@@ -298,12 +300,20 @@ static long madvise_willneed(struct vm_area_struct *vma,
298 return 0; 300 return 0;
299 } 301 }
300 302
301 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 303 /*
302 if (end > vma->vm_end) 304 * Filesystem's fadvise may need to take various locks. We need to
303 end = vma->vm_end; 305 * explicitly grab a reference because the vma (and hence the
304 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 306 * vma's reference to the file) can go away as soon as we drop
305 307 * mmap_sem.
306 force_page_cache_readahead(file->f_mapping, file, start, end - start); 308 */
309 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
310 get_file(file);
311 up_read(&current->mm->mmap_sem);
312 offset = (loff_t)(start - vma->vm_start)
313 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
314 vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
315 fput(file);
316 down_read(&current->mm->mmap_sem);
307 return 0; 317 return 0;
308} 318}
309 319