-rw-r--r--  fs/xfs/kmem.c | 79
-rw-r--r--  fs/xfs/kmem.h | 15
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 7
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 79
-rw-r--r--  fs/xfs/libxfs/xfs_attr.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 130
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 85
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 11
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 16
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 14
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h | 10
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_defer.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c | 14
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_block.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_node.c | 678
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_sf.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 9
-rw-r--r--  fs/xfs/libxfs/xfs_iext_tree.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.c | 16
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.c | 50
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.h | 12
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.c | 59
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.h | 11
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_types.h | 8
-rw-r--r--  fs/xfs/scrub/agheader.c | 4
-rw-r--r--  fs/xfs/scrub/attr.c | 6
-rw-r--r--  fs/xfs/scrub/bmap.c | 81
-rw-r--r--  fs/xfs/scrub/fscounters.c | 2
-rw-r--r--  fs/xfs/scrub/repair.c | 6
-rw-r--r--  fs/xfs/scrub/symlink.c | 2
-rw-r--r--  fs/xfs/xfs_acl.c | 14
-rw-r--r--  fs/xfs/xfs_attr_inactive.c | 2
-rw-r--r--  fs/xfs/xfs_attr_list.c | 2
-rw-r--r--  fs/xfs/xfs_bmap_item.c | 8
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 22
-rw-r--r--  fs/xfs/xfs_buf.c | 7
-rw-r--r--  fs/xfs/xfs_buf.h | 6
-rw-r--r--  fs/xfs/xfs_buf_item.c | 4
-rw-r--r--  fs/xfs/xfs_dquot.c | 4
-rw-r--r--  fs/xfs/xfs_dquot_item.c | 2
-rw-r--r--  fs/xfs/xfs_error.c | 2
-rw-r--r--  fs/xfs/xfs_extent_busy.c | 2
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 8
-rw-r--r--  fs/xfs/xfs_file.c | 26
-rw-r--r--  fs/xfs/xfs_fsmap.c | 12
-rw-r--r--  fs/xfs/xfs_icache.c | 2
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 2
-rw-r--r--  fs/xfs/xfs_inode.c | 85
-rw-r--r--  fs/xfs/xfs_inode_item.c | 2
-rw-r--r--  fs/xfs/xfs_ioctl.c | 25
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 2
-rw-r--r--  fs/xfs/xfs_iomap.c | 6
-rw-r--r--  fs/xfs/xfs_itable.c | 10
-rw-r--r--  fs/xfs/xfs_itable.h | 13
-rw-r--r--  fs/xfs/xfs_iwalk.c | 4
-rw-r--r--  fs/xfs/xfs_iwalk.h | 13
-rw-r--r--  fs/xfs/xfs_log.c | 466
-rw-r--r--  fs/xfs/xfs_log_cil.c | 10
-rw-r--r--  fs/xfs/xfs_log_recover.c | 50
-rw-r--r--  fs/xfs/xfs_mount.c | 4
-rw-r--r--  fs/xfs/xfs_mount.h | 7
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 4
-rw-r--r--  fs/xfs/xfs_qm.c | 4
-rw-r--r--  fs/xfs/xfs_refcount_item.c | 16
-rw-r--r--  fs/xfs/xfs_reflink.c | 23
-rw-r--r--  fs/xfs/xfs_rmap_item.c | 6
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 4
-rw-r--r--  fs/xfs/xfs_super.c | 3
-rw-r--r--  fs/xfs/xfs_trace.h | 34
-rw-r--r--  fs/xfs/xfs_trans.c | 4
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 2
-rw-r--r--  fs/xfs/xfs_xattr.c | 2
-rw-r--r--  include/linux/fs.h | 2
-rw-r--r--  mm/fadvise.c | 4
-rw-r--r--  mm/madvise.c | 22
81 files changed, 1315 insertions, 1089 deletions
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 16bb9a328678..da031b93e182 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -3,10 +3,10 @@
  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  * All Rights Reserved.
  */
-#include <linux/sched/mm.h>
+#include "xfs.h"
 #include <linux/backing-dev.h>
-#include "kmem.h"
 #include "xfs_message.h"
+#include "xfs_trace.h"
 
 void *
 kmem_alloc(size_t size, xfs_km_flags_t flags)
@@ -15,9 +15,11 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
+	trace_kmem_alloc(size, flags, _RET_IP_);
+
 	do {
 		ptr = kmalloc(size, lflags);
-		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+		if (ptr || (flags & KM_MAYFAIL))
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
@@ -28,28 +30,24 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 	} while (1);
 }
 
-void *
-kmem_alloc_large(size_t size, xfs_km_flags_t flags)
+
+/*
+ * __vmalloc() will allocate data pages and auxiliary structures (e.g.
+ * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence
+ * we need to tell memory reclaim that we are in such a context via
+ * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here
+ * and potentially deadlocking.
+ */
+static void *
+__kmem_vmalloc(size_t size, xfs_km_flags_t flags)
 {
 	unsigned nofs_flag = 0;
 	void	*ptr;
-	gfp_t	lflags;
-
-	ptr = kmem_alloc(size, flags | KM_MAYFAIL);
-	if (ptr)
-		return ptr;
+	gfp_t	lflags = kmem_flags_convert(flags);
 
-	/*
-	 * __vmalloc() will allocate data pages and auxiliary structures (e.g.
-	 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
-	 * here. Hence we need to tell memory reclaim that we are in such a
-	 * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
-	 * the filesystem here and potentially deadlocking.
-	 */
 	if (flags & KM_NOFS)
 		nofs_flag = memalloc_nofs_save();
 
-	lflags = kmem_flags_convert(flags);
 	ptr = __vmalloc(size, lflags, PAGE_KERNEL);
 
 	if (flags & KM_NOFS)
@@ -58,6 +56,44 @@ kmem_alloc_large(size_t size, xfs_km_flags_t flags)
 	return ptr;
 }
 
+/*
+ * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned
+ * to the @align_mask. We only guarantee alignment up to page size, we'll clamp
+ * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE
+ * aligned region.
+ */
+void *
+kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags)
+{
+	void	*ptr;
+
+	trace_kmem_alloc_io(size, flags, _RET_IP_);
+
+	if (WARN_ON_ONCE(align_mask >= PAGE_SIZE))
+		align_mask = PAGE_SIZE - 1;
+
+	ptr = kmem_alloc(size, flags | KM_MAYFAIL);
+	if (ptr) {
+		if (!((uintptr_t)ptr & align_mask))
+			return ptr;
+		kfree(ptr);
+	}
+	return __kmem_vmalloc(size, flags);
+}
+
+void *
+kmem_alloc_large(size_t size, xfs_km_flags_t flags)
+{
+	void	*ptr;
+
+	trace_kmem_alloc_large(size, flags, _RET_IP_);
+
+	ptr = kmem_alloc(size, flags | KM_MAYFAIL);
+	if (ptr)
+		return ptr;
+	return __kmem_vmalloc(size, flags);
+}
+
 void *
 kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
 {
@@ -65,9 +101,11 @@ kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
+	trace_kmem_realloc(newsize, flags, _RET_IP_);
+
 	do {
 		ptr = krealloc(old, newsize, lflags);
-		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+		if (ptr || (flags & KM_MAYFAIL))
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
@@ -85,9 +123,10 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
+	trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_);
 	do {
 		ptr = kmem_cache_alloc(zone, lflags);
-		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+		if (ptr || (flags & KM_MAYFAIL))
 			return ptr;
 		if (!(++retries % 100))
 			xfs_err(NULL,
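
The alignment contract of the new kmem_alloc_io() is simple to check in isolation: an alignment of N bytes is passed as align_mask = N - 1, and an address conforms when all mask bits are clear. A standalone user-space sketch of that test (illustrative only, not part of the patch; malloc stands in for kmalloc):

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* true when p satisfies the alignment encoded by align_mask */
	static int is_io_aligned(const void *p, int align_mask)
	{
		return ((uintptr_t)p & align_mask) == 0;
	}

	int main(void)
	{
		void *p = malloc(4096);

		/* 512-byte sector alignment, as a block driver might demand */
		printf("aligned: %d\n", is_io_aligned(p, 511));
		free(p);
		return 0;
	}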
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 267655acd426..8170d95cf930 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -16,8 +16,6 @@
  */
 
 typedef unsigned __bitwise xfs_km_flags_t;
-#define KM_SLEEP	((__force xfs_km_flags_t)0x0001u)
-#define KM_NOSLEEP	((__force xfs_km_flags_t)0x0002u)
 #define KM_NOFS		((__force xfs_km_flags_t)0x0004u)
 #define KM_MAYFAIL	((__force xfs_km_flags_t)0x0008u)
 #define KM_ZERO		((__force xfs_km_flags_t)0x0010u)
@@ -32,15 +30,11 @@ kmem_flags_convert(xfs_km_flags_t flags)
 {
 	gfp_t	lflags;
 
-	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO));
+	BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO));
 
-	if (flags & KM_NOSLEEP) {
-		lflags = GFP_ATOMIC | __GFP_NOWARN;
-	} else {
-		lflags = GFP_KERNEL | __GFP_NOWARN;
-		if (flags & KM_NOFS)
-			lflags &= ~__GFP_FS;
-	}
+	lflags = GFP_KERNEL | __GFP_NOWARN;
+	if (flags & KM_NOFS)
+		lflags &= ~__GFP_FS;
 
 	/*
 	 * Default page/slab allocator behavior is to retry for ever
@@ -59,6 +53,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
 }
 
 extern void *kmem_alloc(size_t, xfs_km_flags_t);
+extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags);
 extern void *kmem_alloc_large(size_t size, xfs_km_flags_t);
 extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
 static inline void kmem_free(const void *ptr)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 372ad55631fc..533b04aaf6f6 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2205,7 +2205,7 @@ xfs_defer_agfl_block(
 	ASSERT(xfs_bmap_free_item_zone != NULL);
 	ASSERT(oinfo != NULL);
 
-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
 	new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
 	new->xefi_blockcount = 1;
 	new->xefi_oinfo = *oinfo;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d6ed5d2c07c2..58fa85cec325 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -81,10 +81,9 @@ typedef struct xfs_alloc_arg {
 /*
  * Defines for datatype
  */
-#define XFS_ALLOC_USERDATA		(1 << 0)/* allocation is for user data*/
-#define XFS_ALLOC_INITIAL_USER_DATA	(1 << 1)/* special case start of file */
-#define XFS_ALLOC_USERDATA_ZERO		(1 << 2)/* zero extent on allocation */
-#define XFS_ALLOC_NOBUSY		(1 << 3)/* Busy extents not allowed */
+#define XFS_ALLOC_INITIAL_USER_DATA	(1 << 0)/* special case start of file */
+#define XFS_ALLOC_USERDATA_ZERO	(1 << 1)/* zero extent on allocation */
+#define XFS_ALLOC_NOBUSY		(1 << 2)/* Busy extents not allowed */
 
 static inline bool
 xfs_alloc_is_userdata(int datatype)
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index d48fcf11cc35..510ca6974604 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -97,7 +97,10 @@ xfs_inode_hasattr(
  * Overall external interface routines.
  *========================================================================*/
 
-/* Retrieve an extended attribute and its value.  Must have ilock. */
+/*
+ * Retrieve an extended attribute and its value.  Must have ilock.
+ * Returns 0 on successful retrieval, otherwise an error.
+ */
 int
 xfs_attr_get_ilocked(
 	struct xfs_inode	*ip,
@@ -115,12 +118,28 @@ xfs_attr_get_ilocked(
 		return xfs_attr_node_get(args);
 }
 
-/* Retrieve an extended attribute by name, and its value. */
+/*
+ * Retrieve an extended attribute by name, and its value if requested.
+ *
+ * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value,
+ * just an indication whether the attribute exists and the size of the value if
+ * it exists. The size is returned in @valuelenp.
+ *
+ * If the attribute is found, but exceeds the size limit set by the caller in
+ * @valuelenp, return -ERANGE with the size of the attribute that was found in
+ * @valuelenp.
+ *
+ * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after
+ * existence of the attribute has been determined. On success, return that
+ * buffer to the caller and leave them to free it. On failure, free any
+ * allocated buffer and ensure the buffer pointer returned to the caller is
+ * null.
+ */
 int
 xfs_attr_get(
 	struct xfs_inode	*ip,
 	const unsigned char	*name,
-	unsigned char		*value,
+	unsigned char		**value,
 	int			*valuelenp,
 	int			flags)
 {
@@ -128,6 +147,8 @@ xfs_attr_get(
 	uint			lock_mode;
 	int			error;
 
+	ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value);
+
 	XFS_STATS_INC(ip->i_mount, xs_attr_get);
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -137,17 +158,29 @@ xfs_attr_get(
 	if (error)
 		return error;
 
-	args.value = value;
-	args.valuelen = *valuelenp;
 	/* Entirely possible to look up a name which doesn't exist */
 	args.op_flags = XFS_DA_OP_OKNOENT;
+	if (flags & ATTR_ALLOC)
+		args.op_flags |= XFS_DA_OP_ALLOCVAL;
+	else
+		args.value = *value;
+	args.valuelen = *valuelenp;
 
 	lock_mode = xfs_ilock_attr_map_shared(ip);
 	error = xfs_attr_get_ilocked(ip, &args);
 	xfs_iunlock(ip, lock_mode);
-
 	*valuelenp = args.valuelen;
-	return error == -EEXIST ? 0 : error;
+
+	/* on error, we have to clean up allocated value buffers */
+	if (error) {
+		if (flags & ATTR_ALLOC) {
+			kmem_free(args.value);
+			*value = NULL;
+		}
+		return error;
+	}
+	*value = args.value;
+	return 0;
 }
 
 /*
@@ -768,6 +801,8 @@ xfs_attr_leaf_removename(
  *
  * This leaf block cannot have a "remote" value, we only call this routine
  * if bmap_one_block() says there is only one block (ie: no remote blks).
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
  */
 STATIC int
 xfs_attr_leaf_get(xfs_da_args_t *args)
@@ -789,9 +824,6 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
 	}
 	error = xfs_attr3_leaf_getvalue(bp, args);
 	xfs_trans_brelse(args->trans, bp);
-	if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
-		error = xfs_attr_rmtval_get(args);
-	}
 	return error;
 }
 
@@ -1268,11 +1300,13 @@ xfs_attr_refillstate(xfs_da_state_t *state)
 }
 
 /*
- * Look up a filename in a node attribute list.
+ * Retrieve the attribute data from a node attribute list.
  *
  * This routine gets called for any attribute fork that has more than one
  * block, ie: both true Btree attr lists and for single-leaf-blocks with
  * "remote" values taking up more blocks.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
  */
 STATIC int
 xfs_attr_node_get(xfs_da_args_t *args)
@@ -1294,24 +1328,21 @@ xfs_attr_node_get(xfs_da_args_t *args)
 	error = xfs_da3_node_lookup_int(state, &retval);
 	if (error) {
 		retval = error;
-	} else if (retval == -EEXIST) {
-		blk = &state->path.blk[ state->path.active-1 ];
-		ASSERT(blk->bp != NULL);
-		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-
-		/*
-		 * Get the value, local or "remote"
-		 */
-		retval = xfs_attr3_leaf_getvalue(blk->bp, args);
-		if (!retval && (args->rmtblkno > 0)
-		    && !(args->flags & ATTR_KERNOVAL)) {
-			retval = xfs_attr_rmtval_get(args);
-		}
+		goto out_release;
 	}
+	if (retval != -EEXIST)
+		goto out_release;
+
+	/*
+	 * Get the value, local or "remote"
+	 */
+	blk = &state->path.blk[state->path.active - 1];
+	retval = xfs_attr3_leaf_getvalue(blk->bp, args);
 
 	/*
 	 * If not in a transaction, we have to release all the buffers.
 	 */
+out_release:
 	for (i = 0; i < state->path.active; i++) {
 		xfs_trans_brelse(args->trans, state->path.blk[i].bp);
 		state->path.blk[i].bp = NULL;
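
The new calling convention is easiest to see from a consumer. A hypothetical caller sketch (assumed usage, not code from this patch) of the ATTR_ALLOC path, where the kernel sizes and allocates the value buffer and the caller owns freeing it:

	unsigned char	*value = NULL;
	int		valuelen = XFS_XATTR_SIZE_MAX;
	int		error;

	error = xfs_attr_get(ip, name, &value, &valuelen, ATTR_ALLOC);
	if (error)
		return error;	/* on failure, value is guaranteed NULL */
	/* ... use value[0 .. valuelen) ... */
	kmem_free(value);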
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index ff28ebf3b635..94badfa1743e 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -37,6 +37,7 @@ struct xfs_attr_list_context;
 #define ATTR_KERNOVAL	0x2000	/* [kernel] get attr size only, not value */
 
 #define ATTR_INCOMPLETE	0x4000	/* [kernel] return INCOMPLETE attr keys */
+#define ATTR_ALLOC	0x8000	/* allocate xattr buffer on demand */
 
 #define XFS_ATTR_FLAGS \
 	{ ATTR_DONTFOLLOW,	"DONTFOLLOW" }, \
@@ -47,7 +48,8 @@ struct xfs_attr_list_context;
 	{ ATTR_REPLACE,		"REPLACE" }, \
 	{ ATTR_KERNOTIME,	"KERNOTIME" }, \
 	{ ATTR_KERNOVAL,	"KERNOVAL" }, \
-	{ ATTR_INCOMPLETE,	"INCOMPLETE" }
+	{ ATTR_INCOMPLETE,	"INCOMPLETE" }, \
+	{ ATTR_ALLOC,		"ALLOC" }
 
 /*
  * The maximum size (into the kernel or returned from the kernel) of an
@@ -143,7 +145,7 @@ int xfs_attr_list_int(struct xfs_attr_list_context *);
 int xfs_inode_hasattr(struct xfs_inode *ip);
 int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
 int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
-		 unsigned char *value, int *valuelenp, int flags);
+		 unsigned char **value, int *valuelenp, int flags);
 int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
 		 unsigned char *value, int valuelen, int flags);
 int xfs_attr_set_args(struct xfs_da_args *args);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 70eb941d02e4..b9f019603d0b 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -393,6 +393,50 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
 	return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
 }
 
+static int
+xfs_attr_copy_value(
+	struct xfs_da_args	*args,
+	unsigned char		*value,
+	int			valuelen)
+{
+	/*
+	 * No copy if all we have to do is get the length
+	 */
+	if (args->flags & ATTR_KERNOVAL) {
+		args->valuelen = valuelen;
+		return 0;
+	}
+
+	/*
+	 * No copy if the length of the existing buffer is too small
+	 */
+	if (args->valuelen < valuelen) {
+		args->valuelen = valuelen;
+		return -ERANGE;
+	}
+
+	if (args->op_flags & XFS_DA_OP_ALLOCVAL) {
+		args->value = kmem_alloc_large(valuelen, 0);
+		if (!args->value)
+			return -ENOMEM;
+	}
+	args->valuelen = valuelen;
+
+	/* remote block xattr requires IO for copy-in */
+	if (args->rmtblkno)
+		return xfs_attr_rmtval_get(args);
+
+	/*
+	 * This is to prevent a GCC warning because the remote xattr case
+	 * doesn't have a value to pass in. In that case, we never reach here,
+	 * but GCC can't work that out and so throws a "passing NULL to
+	 * memcpy" warning.
+	 */
+	if (!value)
+		return -EINVAL;
+	memcpy(args->value, value, valuelen);
+	return 0;
+}
 
 /*========================================================================
  * External routines when attribute fork size < XFS_LITINO(mp).
@@ -720,15 +764,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
 }
 
 /*
- * Look up a name in a shortform attribute list structure.
+ * Retrieve the attribute value and length.
+ *
+ * If ATTR_KERNOVAL is specified, only the length needs to be returned.
+ * Unlike a lookup, we only return an error if the attribute does not
+ * exist or we can't retrieve the value.
  */
-/*ARGSUSED*/
 int
-xfs_attr_shortform_getvalue(xfs_da_args_t *args)
+xfs_attr_shortform_getvalue(
+	struct xfs_da_args	*args)
 {
-	xfs_attr_shortform_t *sf;
-	xfs_attr_sf_entry_t *sfe;
+	struct xfs_attr_shortform *sf;
+	struct xfs_attr_sf_entry *sfe;
 	int i;
 
 	ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
 	sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
@@ -741,18 +789,8 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
 			continue;
 		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
 			continue;
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = sfe->valuelen;
-			return -EEXIST;
-		}
-		if (args->valuelen < sfe->valuelen) {
-			args->valuelen = sfe->valuelen;
-			return -ERANGE;
-		}
-		args->valuelen = sfe->valuelen;
-		memcpy(args->value, &sfe->nameval[args->namelen],
-			args->valuelen);
-		return -EEXIST;
+		return xfs_attr_copy_value(args, &sfe->nameval[args->namelen],
+					   sfe->valuelen);
 	}
 	return -ENOATTR;
 }
@@ -782,7 +820,7 @@ xfs_attr_shortform_to_leaf(
 	ifp = dp->i_afp;
 	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
 	size = be16_to_cpu(sf->hdr.totsize);
-	tmpbuffer = kmem_alloc(size, KM_SLEEP);
+	tmpbuffer = kmem_alloc(size, 0);
 	ASSERT(tmpbuffer != NULL);
 	memcpy(tmpbuffer, ifp->if_u1.if_data, size);
 	sf = (xfs_attr_shortform_t *)tmpbuffer;
@@ -985,7 +1023,7 @@ xfs_attr3_leaf_to_shortform(
 
 	trace_xfs_attr_leaf_to_sf(args);
 
-	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+	tmpbuffer = kmem_alloc(args->geo->blksize, 0);
 	if (!tmpbuffer)
 		return -ENOMEM;
 
@@ -1448,7 +1486,7 @@ xfs_attr3_leaf_compact(
 
 	trace_xfs_attr_leaf_compact(args);
 
-	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+	tmpbuffer = kmem_alloc(args->geo->blksize, 0);
 	memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
 	memset(bp->b_addr, 0, args->geo->blksize);
 	leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -2167,7 +2205,7 @@ xfs_attr3_leaf_unbalance(
 	struct xfs_attr_leafblock *tmp_leaf;
 	struct xfs_attr3_icleaf_hdr tmphdr;
 
-	tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
+	tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0);
 
 	/*
 	 * Copy the header into the temp leaf so that all the stuff
@@ -2350,6 +2388,10 @@ xfs_attr3_leaf_lookup_int(
 /*
  * Get the value associated with an attribute name from a leaf attribute
  * list structure.
+ *
+ * If ATTR_KERNOVAL is specified, only the length needs to be returned.
+ * Unlike a lookup, we only return an error if the attribute does not
+ * exist or we can't retrieve the value.
  */
 int
 xfs_attr3_leaf_getvalue(
@@ -2361,7 +2403,6 @@ xfs_attr3_leaf_getvalue(
 	struct xfs_attr_leaf_entry *entry;
 	struct xfs_attr_leaf_name_local *name_loc;
 	struct xfs_attr_leaf_name_remote *name_rmt;
-	int valuelen;
 
 	leaf = bp->b_addr;
 	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
@@ -2373,36 +2414,19 @@ xfs_attr3_leaf_getvalue(
 		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
 		ASSERT(name_loc->namelen == args->namelen);
 		ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
-		valuelen = be16_to_cpu(name_loc->valuelen);
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = valuelen;
-			return 0;
-		}
-		if (args->valuelen < valuelen) {
-			args->valuelen = valuelen;
-			return -ERANGE;
-		}
-		args->valuelen = valuelen;
-		memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
-	} else {
-		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
-		ASSERT(name_rmt->namelen == args->namelen);
-		ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
-		args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
-		args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
-		args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
-						       args->rmtvaluelen);
-		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = args->rmtvaluelen;
-			return 0;
-		}
-		if (args->valuelen < args->rmtvaluelen) {
-			args->valuelen = args->rmtvaluelen;
-			return -ERANGE;
-		}
-		args->valuelen = args->rmtvaluelen;
-	}
-	return 0;
+		return xfs_attr_copy_value(args,
+					&name_loc->nameval[args->namelen],
+					be16_to_cpu(name_loc->valuelen));
+	}
+
+	name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+	ASSERT(name_rmt->namelen == args->namelen);
+	ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
+	args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+	args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+	args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+					       args->rmtvaluelen);
+	return xfs_attr_copy_value(args, NULL, args->rmtvaluelen);
 }
 
 /*========================================================================
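
For callers that manage their own buffers, the consolidated xfs_attr_copy_value() keeps the classic two-step probe working. An illustrative sketch (assumed usage, not from this patch):

	unsigned char	*buf = NULL;
	int		len = 0;
	int		error;

	/* Step 1: ATTR_KERNOVAL returns only the size, no copy is made. */
	error = xfs_attr_get(ip, name, &buf, &len, ATTR_KERNOVAL);
	if (error)
		return error;

	/*
	 * Step 2: fetch into a buffer of that size. If the attribute grew
	 * in the meantime, this returns -ERANGE with the new size in len.
	 */
	buf = kmem_alloc_large(len, 0);
	if (!buf)
		return -ENOMEM;
	error = xfs_attr_get(ip, name, &buf, &len, 0);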
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 4eb30d357045..3e39b7d40f25 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -358,6 +358,8 @@ xfs_attr_rmtval_copyin(
 /*
  * Read the value associated with an attribute from the out-of-line buffer
  * that we stored it in.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
  */
 int
 xfs_attr_rmtval_get(
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 07aad70f3931..054b4ce30033 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -553,7 +553,7 @@ __xfs_bmap_add_free(
 #endif
 	ASSERT(xfs_bmap_free_item_zone != NULL);
 
-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
 	new->xefi_startblock = bno;
 	new->xefi_blockcount = (xfs_extlen_t)len;
 	if (oinfo)
@@ -1099,7 +1099,7 @@ xfs_bmap_add_attrfork(
 	if (error)
 		goto trans_cancel;
 	ASSERT(ip->i_afp == NULL);
-	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0);
 	ip->i_afp->if_flags = XFS_IFEXTENTS;
 	logflags = 0;
 	switch (ip->i_d.di_format) {
@@ -1985,11 +1985,8 @@ xfs_bmap_add_extent_delay_real(
 	}
 
 	/* add reverse mapping unless caller opted out */
-	if (!(bma->flags & XFS_BMAPI_NORMAP)) {
-		error = xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
-		if (error)
-			goto done;
-	}
+	if (!(bma->flags & XFS_BMAPI_NORMAP))
+		xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
 
 	/* convert to a btree if necessary */
 	if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -2471,9 +2468,7 @@ xfs_bmap_add_extent_unwritten_real(
 	}
 
 	/* update reverse mappings */
-	error = xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
-	if (error)
-		goto done;
+	xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
 
 	/* convert to a btree if necessary */
 	if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -2832,11 +2827,8 @@ xfs_bmap_add_extent_hole_real(
 	}
 
 	/* add reverse mapping unless caller opted out */
-	if (!(flags & XFS_BMAPI_NORMAP)) {
-		error = xfs_rmap_map_extent(tp, ip, whichfork, new);
-		if (error)
-			goto done;
-	}
+	if (!(flags & XFS_BMAPI_NORMAP))
+		xfs_rmap_map_extent(tp, ip, whichfork, new);
 
 	/* convert to a btree if necessary */
 	if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -4050,12 +4042,8 @@ xfs_bmapi_allocate(
 	 */
 	if (!(bma->flags & XFS_BMAPI_METADATA)) {
 		bma->datatype = XFS_ALLOC_NOBUSY;
-		if (whichfork == XFS_DATA_FORK) {
-			if (bma->offset == 0)
-				bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
-			else
-				bma->datatype |= XFS_ALLOC_USERDATA;
-		}
+		if (whichfork == XFS_DATA_FORK && bma->offset == 0)
+			bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
 		if (bma->flags & XFS_BMAPI_ZERO)
 			bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
 	}
@@ -4401,12 +4389,9 @@ xfs_bmapi_write(
 			 * If this is a CoW allocation, record the data in
 			 * the refcount btree for orphan recovery.
 			 */
-			if (whichfork == XFS_COW_FORK) {
-				error = xfs_refcount_alloc_cow_extent(tp,
-						bma.blkno, bma.length);
-				if (error)
-					goto error0;
-			}
+			if (whichfork == XFS_COW_FORK)
+				xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+						bma.length);
 		}
 
 		/* Deal with the allocated space we found. */
@@ -4530,7 +4515,7 @@ xfs_bmapi_convert_delalloc(
 	if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
 		goto out_finish;
 	error = -EFSCORRUPTED;
-	if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+	if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
 		goto out_finish;
 
 	XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
@@ -4540,12 +4525,8 @@ xfs_bmapi_convert_delalloc(
 	*imap = bma.got;
 	*seq = READ_ONCE(ifp->if_seq);
 
-	if (whichfork == XFS_COW_FORK) {
-		error = xfs_refcount_alloc_cow_extent(tp, bma.blkno,
-				bma.length);
-		if (error)
-			goto out_finish;
-	}
+	if (whichfork == XFS_COW_FORK)
+		xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
 
 	error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
 			whichfork);
@@ -5149,18 +5130,14 @@ xfs_bmap_del_extent_real(
 	}
 
 	/* remove reverse mapping */
-	error = xfs_rmap_unmap_extent(tp, ip, whichfork, del);
-	if (error)
-		goto done;
+	xfs_rmap_unmap_extent(tp, ip, whichfork, del);
 
 	/*
 	 * If we need to, add to list of extents to delete.
 	 */
 	if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
 		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
-			error = xfs_refcount_decrease_extent(tp, del);
-			if (error)
-				goto done;
+			xfs_refcount_decrease_extent(tp, del);
 		} else {
 			__xfs_bmap_add_free(tp, del->br_startblock,
 					del->br_blockcount, NULL,
@@ -5651,12 +5628,11 @@ done:
 			&new);
 
 	/* update reverse mapping. rmap functions merge the rmaps for us */
-	error = xfs_rmap_unmap_extent(tp, ip, whichfork, got);
-	if (error)
-		return error;
+	xfs_rmap_unmap_extent(tp, ip, whichfork, got);
 	memcpy(&new, got, sizeof(new));
 	new.br_startoff = left->br_startoff + left->br_blockcount;
-	return xfs_rmap_map_extent(tp, ip, whichfork, &new);
+	xfs_rmap_map_extent(tp, ip, whichfork, &new);
+	return 0;
 }
 
 static int
@@ -5695,10 +5671,9 @@ xfs_bmap_shift_update_extent(
 			got);
 
 	/* update reverse mapping */
-	error = xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
-	if (error)
-		return error;
-	return xfs_rmap_map_extent(tp, ip, whichfork, got);
+	xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
+	xfs_rmap_map_extent(tp, ip, whichfork, got);
+	return 0;
 }
 
 int
@@ -6094,7 +6069,7 @@ __xfs_bmap_add(
 			bmap->br_blockcount,
 			bmap->br_state);
 
-	bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS);
+	bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS);
 	INIT_LIST_HEAD(&bi->bi_list);
 	bi->bi_type = type;
 	bi->bi_owner = ip;
@@ -6106,29 +6081,29 @@ __xfs_bmap_add(
 }
 
 /* Map an extent into a file. */
-int
+void
 xfs_bmap_map_extent(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*PREV)
 {
 	if (!xfs_bmap_is_update_needed(PREV))
-		return 0;
+		return;
 
-	return __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
+	__xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
 }
 
 /* Unmap an extent out of a file. */
-int
+void
 xfs_bmap_unmap_extent(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
 	struct xfs_bmbt_irec	*PREV)
 {
 	if (!xfs_bmap_is_update_needed(PREV))
-		return 0;
+		return;
 
-	return __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
+	__xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 8f597f9abdbe..5bb446d80542 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -171,6 +171,13 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
 		!isnullstartblock(irec->br_startblock);
 }
 
+/*
+ * Check the mapping for obviously garbage allocations that could trash the
+ * filesystem immediately.
+ */
+#define xfs_valid_startblock(ip, startblock) \
+	((startblock) != 0 || XFS_IS_REALTIME_INODE(ip))
+
 void	xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
 		xfs_filblks_t len);
 int	xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
@@ -254,9 +261,9 @@ int	xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip,
 		enum xfs_bmap_intent_type type, int whichfork,
 		xfs_fileoff_t startoff, xfs_fsblock_t startblock,
 		xfs_filblks_t *blockcount, xfs_exntst_t state);
-int	xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void	xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap);
-int	xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void	xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap);
 
 static inline int xfs_bmap_fork_to_state(int whichfork)
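
What the new macro accepts and rejects, illustratively: block zero of the data device always holds a superblock, so it can never be a valid mapping target there, while realtime inodes carry no such reservation:

	xfs_valid_startblock(ip, 123);	/* true for any inode */
	xfs_valid_startblock(ip, 0);	/* true only if XFS_IS_REALTIME_INODE(ip) */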
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index fbb18ba5d905..ffe608d2a2d9 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -400,8 +400,20 @@ xfs_bmbt_diff_two_keys(
 	union xfs_btree_key	*k1,
 	union xfs_btree_key	*k2)
 {
-	return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) -
-			  be64_to_cpu(k2->bmbt.br_startoff);
+	uint64_t		a = be64_to_cpu(k1->bmbt.br_startoff);
+	uint64_t		b = be64_to_cpu(k2->bmbt.br_startoff);
+
+	/*
+	 * Note: This routine previously casted a and b to int64 and subtracted
+	 * them to generate a result. This led to problems if b was the
+	 * "maximum" key value (all ones) being signed incorrectly, hence this
+	 * somewhat less efficient version.
+	 */
+	if (a > b)
+		return 1;
+	if (b > a)
+		return -1;
+	return 0;
 }
 
 static xfs_failaddr_t
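
The failure mode the new comparison avoids reproduces easily in isolation. A standalone sketch (not part of the patch):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t a = 0;
		uint64_t b = UINT64_MAX;	/* the all-ones "maximum" key */

		/* old style: (int64_t)b is -1, so 0 - (-1) = 1, claiming a > b */
		printf("%lld\n", (long long)((int64_t)a - (int64_t)b));

		/* new style: explicit comparison yields -1, i.e. a < b */
		printf("%d\n", a > b ? 1 : (b > a ? -1 : 0));
		return 0;
	}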
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index f1048efa4268..71de937f9e64 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4466,8 +4466,6 @@ xfs_btree_lblock_verify(
  * btree block
  *
  * @bp: buffer containing the btree block
- * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
- * @pag_max_level: pointer to the per-ag max level field
  */
 xfs_failaddr_t
 xfs_btree_sblock_v5hdr_verify(
@@ -4600,7 +4598,7 @@ xfs_btree_simple_query_range(
 
 		/* Callback */
 		error = fn(cur, recp, priv);
-		if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
+		if (error)
 			break;
 
 advloop:
@@ -4702,8 +4700,7 @@ pop_up:
 		 */
 		if (ldiff >= 0 && hdiff >= 0) {
 			error = fn(cur, recp, priv);
-			if (error < 0 ||
-			    error == XFS_BTREE_QUERY_RANGE_ABORT)
+			if (error)
 				break;
 		} else if (hdiff < 0) {
 			/* Record is larger than high key; pop. */
@@ -4774,8 +4771,7 @@ out:
  * Query a btree for all records overlapping a given interval of keys.  The
  * supplied function will be called with each record found; return one of the
  * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
- * code.  This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
- * negative error code.
+ * code.  This function returns -ECANCELED, zero, or a negative error code.
  */
 int
 xfs_btree_query_range(
@@ -4891,7 +4887,7 @@ xfs_btree_has_record_helper(
 	union xfs_btree_rec		*rec,
 	void				*priv)
 {
-	return XFS_BTREE_QUERY_RANGE_ABORT;
+	return -ECANCELED;
 }
 
 /* Is there a record covering a given range of keys? */
@@ -4906,7 +4902,7 @@ xfs_btree_has_record(
 
 	error = xfs_btree_query_range(cur, low, high,
 			&xfs_btree_has_record_helper, NULL);
-	if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+	if (error == -ECANCELED) {
 		*exists = true;
 		return 0;
 	}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index fa3cd8ab9aba..ced1e65d1483 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -464,9 +464,13 @@ xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
 uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
 unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
 
-/* return codes */
-#define XFS_BTREE_QUERY_RANGE_CONTINUE	(XFS_ITER_CONTINUE) /* keep iterating */
-#define XFS_BTREE_QUERY_RANGE_ABORT	(XFS_ITER_ABORT)    /* stop iterating */
+/*
+ * Return codes for the query range iterator function are 0 to continue
+ * iterating, and non-zero to stop iterating.  Any non-zero value will be
+ * passed up to the _query_range caller.  The special value -ECANCELED can be
+ * used to stop iteration, because _query_range never generates that error
+ * code on its own.
+ */
 typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
 		union xfs_btree_rec *rec, void *priv);
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 0bf56e94bfe9..4fd1223c1bd5 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2098,7 +2098,7 @@ xfs_da_grow_inode_int(
 	 * If we didn't get it and the block might work if fragmented,
 	 * try without the CONTIG flag.  Loop until we get it all.
 	 */
-	mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+	mapp = kmem_alloc(sizeof(*mapp) * count, 0);
 	for (b = *bno, mapi = 0; b < *bno + count; ) {
 		nmap = min(XFS_BMAP_MAX_NMAP, count);
 		c = (int)(*bno + count - b);
@@ -2480,7 +2480,7 @@ xfs_buf_map_from_irec(
 
 	if (nirecs > 1) {
 		map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
-				  KM_SLEEP | KM_NOFS);
+				  KM_NOFS);
 		if (!map)
 			return -ENOMEM;
 		*mapp = map;
@@ -2539,7 +2539,7 @@ xfs_dabuf_map(
 	 */
 	if (nfsb != 1)
 		irecs = kmem_zalloc(sizeof(irec) * nfsb,
-				    KM_SLEEP | KM_NOFS);
+				    KM_NOFS);
 
 	nirecs = nfsb;
 	error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 84dd865b6c3d..ae0bbd20d9ca 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -81,13 +81,15 @@ typedef struct xfs_da_args {
 #define XFS_DA_OP_ADDNAME	0x0004	/* this is an add operation */
 #define XFS_DA_OP_OKNOENT	0x0008	/* lookup/add op, ENOENT ok, else die */
 #define XFS_DA_OP_CILOOKUP	0x0010	/* lookup to return CI name if found */
+#define XFS_DA_OP_ALLOCVAL	0x0020	/* lookup to alloc buffer if found */
 
 #define XFS_DA_OP_FLAGS \
 	{ XFS_DA_OP_JUSTCHECK,	"JUSTCHECK" }, \
 	{ XFS_DA_OP_RENAME,	"RENAME" }, \
 	{ XFS_DA_OP_ADDNAME,	"ADDNAME" }, \
 	{ XFS_DA_OP_OKNOENT,	"OKNOENT" }, \
-	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }
+	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }, \
+	{ XFS_DA_OP_ALLOCVAL,	"ALLOCVAL" }
 
 /*
  * Storage for holding state during Btree searches and split/join ops.
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index eb2be2a6a25a..22557527cfdb 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -517,7 +517,7 @@ xfs_defer_add(
 	}
 	if (!dfp) {
 		dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
-				KM_SLEEP | KM_NOFS);
+				KM_NOFS);
 		dfp->dfp_type = type;
 		dfp->dfp_intent = NULL;
 		dfp->dfp_done = NULL;
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 67840723edbb..867c5dee0751 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -110,9 +110,9 @@ xfs_da_mount(
 
 	nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
 	mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
-				    KM_SLEEP | KM_MAYFAIL);
+				    KM_MAYFAIL);
 	mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
-				     KM_SLEEP | KM_MAYFAIL);
+				     KM_MAYFAIL);
 	if (!mp->m_dir_geo || !mp->m_attr_geo) {
 		kmem_free(mp->m_dir_geo);
 		kmem_free(mp->m_attr_geo);
@@ -217,7 +217,7 @@ xfs_dir_init(
 	if (error)
 		return error;
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
@@ -254,7 +254,7 @@ xfs_dir_createname(
 		XFS_STATS_INC(dp->i_mount, xs_dir_create);
 	}
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
@@ -353,7 +353,7 @@ xfs_dir_lookup(
 	 * lockdep.  Doing this avoids having to add a bunch of lockdep class
 	 * annotations into the reclaim path for the ilock.
 	 */
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	args->geo = dp->i_mount->m_dir_geo;
 	args->name = name->name;
 	args->namelen = name->len;
@@ -422,7 +422,7 @@ xfs_dir_removename(
 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
 	XFS_STATS_INC(dp->i_mount, xs_dir_remove);
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
@@ -483,7 +483,7 @@ xfs_dir_replace(
 	if (rval)
 		return rval;
 
-	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+	args = kmem_zalloc(sizeof(*args), KM_NOFS);
 	if (!args)
 		return -ENOMEM;
 
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index a6fb0cc2085e..9595ced393dc 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -1092,7 +1092,7 @@ xfs_dir2_sf_to_block(
 	 * Copy the directory into a temporary buffer.
 	 * Then pitch the incore inode data so we can make extents.
 	 */
-	sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+	sfp = kmem_alloc(ifp->if_bytes, 0);
 	memcpy(sfp, oldsfp, ifp->if_bytes);
 
 	xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 1fc44efc344d..705c4f562758 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -32,8 +32,6 @@ static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
32static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, 32static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
33 int index, xfs_da_state_blk_t *dblk, 33 int index, xfs_da_state_blk_t *dblk,
34 int *rval); 34 int *rval);
35static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
36 xfs_da_state_blk_t *fblk);
37 35
38/* 36/*
39 * Check internal consistency of a leafn block. 37 * Check internal consistency of a leafn block.
@@ -1611,113 +1609,152 @@ xfs_dir2_leafn_unbalance(
1611} 1609}
1612 1610
1613/* 1611/*
1614 * Top-level node form directory addname routine. 1612 * Add a new data block to the directory at the free space index that the caller
1613 * has specified.
1615 */ 1614 */
1616int /* error */ 1615static int
1617xfs_dir2_node_addname( 1616xfs_dir2_node_add_datablk(
1618 xfs_da_args_t *args) /* operation arguments */ 1617 struct xfs_da_args *args,
1618 struct xfs_da_state_blk *fblk,
1619 xfs_dir2_db_t *dbno,
1620 struct xfs_buf **dbpp,
1621 struct xfs_buf **fbpp,
1622 int *findex)
1619{ 1623{
1620 xfs_da_state_blk_t *blk; /* leaf block for insert */ 1624 struct xfs_inode *dp = args->dp;
1621 int error; /* error return value */ 1625 struct xfs_trans *tp = args->trans;
1622 int rval; /* sub-return value */ 1626 struct xfs_mount *mp = dp->i_mount;
1623 xfs_da_state_t *state; /* btree cursor */ 1627 struct xfs_dir3_icfree_hdr freehdr;
1628 struct xfs_dir2_data_free *bf;
1629 struct xfs_dir2_data_hdr *hdr;
1630 struct xfs_dir2_free *free = NULL;
1631 xfs_dir2_db_t fbno;
1632 struct xfs_buf *fbp;
1633 struct xfs_buf *dbp;
1634 __be16 *bests = NULL;
1635 int error;
1624 1636
1625 trace_xfs_dir2_node_addname(args); 1637 /* Not allowed to allocate, return failure. */
1638 if (args->total == 0)
1639 return -ENOSPC;
1640
1641 /* Allocate and initialize the new data block. */
1642 error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno);
1643 if (error)
1644 return error;
1645 error = xfs_dir3_data_init(args, *dbno, &dbp);
1646 if (error)
1647 return error;
1626 1648
1627 /* 1649 /*
1628 * Allocate and initialize the state (btree cursor). 1650 * Get the freespace block corresponding to the data block
1629 */ 1651 * that was just allocated.
1630 state = xfs_da_state_alloc();
1631 state->args = args;
1632 state->mp = args->dp->i_mount;
1633 /*
1634 * Look up the name. We're not supposed to find it, but
1635 * this gives us the insertion point.
1636 */ 1652 */
1637 error = xfs_da3_node_lookup_int(state, &rval); 1653 fbno = dp->d_ops->db_to_fdb(args->geo, *dbno);
1654 error = xfs_dir2_free_try_read(tp, dp,
1655 xfs_dir2_db_to_da(args->geo, fbno), &fbp);
1638 if (error) 1656 if (error)
1639 rval = error; 1657 return error;
1640 if (rval != -ENOENT) { 1658
1641 goto done;
1642 }
1643 /* 1659 /*
1644 * Add the data entry to a data block. 1660 * If there wasn't a freespace block, the read will
1645 * Extravalid is set to a freeblock found by lookup. 1661 * return a NULL fbp. Allocate and initialize a new one.
1646 */ 1662 */
1647 rval = xfs_dir2_node_addname_int(args, 1663 if (!fbp) {
1648 state->extravalid ? &state->extrablk : NULL); 1664 error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno);
1649 if (rval) { 1665 if (error)
1650 goto done; 1666 return error;
1667
1668 if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) {
1669 xfs_alert(mp,
1670"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld",
1671 __func__, (unsigned long long)dp->i_ino,
1672 (long long)dp->d_ops->db_to_fdb(args->geo, *dbno),
1673 (long long)*dbno, (long long)fbno);
1674 if (fblk) {
1675 xfs_alert(mp,
1676 " fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
1677 fblk, (unsigned long long)fblk->blkno,
1678 fblk->index, fblk->magic);
1679 } else {
1680 xfs_alert(mp, " ... fblk is NULL");
1681 }
1682 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
1683 return -EFSCORRUPTED;
1684 }
1685
1686 /* Get a buffer for the new block. */
1687 error = xfs_dir3_free_get_buf(args, fbno, &fbp);
1688 if (error)
1689 return error;
1690 free = fbp->b_addr;
1691 bests = dp->d_ops->free_bests_p(free);
1692 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1693
1694 /* Remember the first slot as our empty slot. */
1695 freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo,
1696 XFS_DIR2_FREE_OFFSET)) *
1697 dp->d_ops->free_max_bests(args->geo);
1698 } else {
1699 free = fbp->b_addr;
1700 bests = dp->d_ops->free_bests_p(free);
1701 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1651 } 1702 }
1652 blk = &state->path.blk[state->path.active - 1]; 1703
1653 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); 1704 /* Set the freespace block index from the data block number. */
1705 *findex = dp->d_ops->db_to_fdindex(args->geo, *dbno);
1706
1707 /* Extend the freespace table if the new data block is off the end. */
1708 if (*findex >= freehdr.nvalid) {
1709 ASSERT(*findex < dp->d_ops->free_max_bests(args->geo));
1710 freehdr.nvalid = *findex + 1;
1711 bests[*findex] = cpu_to_be16(NULLDATAOFF);
1712 }
1713
1654 /* 1714 /*
1655 * Add the new leaf entry. 1715 * If this entry was for an empty data block (this should always be
1716 * true) then update the header.
1656 */ 1717 */
1657 rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); 1718 if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) {
1658 if (rval == 0) { 1719 freehdr.nused++;
1659 /* 1720 dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
1660 * It worked, fix the hash values up the btree. 1721 xfs_dir2_free_log_header(args, fbp);
1661 */
1662 if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
1663 xfs_da3_fixhashpath(state, &state->path);
1664 } else {
1665 /*
1666 * It didn't work, we need to split the leaf block.
1667 */
1668 if (args->total == 0) {
1669 ASSERT(rval == -ENOSPC);
1670 goto done;
1671 }
1672 /*
1673 * Split the leaf block and insert the new entry.
1674 */
1675 rval = xfs_da3_split(state);
1676 } 1722 }
1677done: 1723
1678 xfs_da_state_free(state); 1724 /* Update the freespace value for the new block in the table. */
1679 return rval; 1725 hdr = dbp->b_addr;
1726 bf = dp->d_ops->data_bestfree_p(hdr);
1727 bests[*findex] = bf[0].length;
1728
1729 *dbpp = dbp;
1730 *fbpp = fbp;
1731 return 0;
1680} 1732}
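The two d_ops hooks used above, db_to_fdb() and db_to_fdindex(), map a directory data block number to the freespace block that tracks it and to its slot in that block's bests[] array. A minimal sketch of that arithmetic, with max_bests standing in for dp->d_ops->free_max_bests(args->geo) and free_first_db for xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET):

/* Sketch only: each freespace block covers max_bests data blocks. */
static xfs_dir2_db_t
sketch_db_to_fdb(xfs_dir2_db_t dbno, xfs_dir2_db_t free_first_db, int max_bests)
{
        return free_first_db + dbno / max_bests;  /* owning freespace block */
}

static int
sketch_db_to_fdindex(xfs_dir2_db_t dbno, int max_bests)
{
        return dbno % max_bests;                  /* slot in bests[] */
}

The same arithmetic explains why freehdr.firstdb for a freshly allocated freespace block is computed above as (fbno - free_first_db) * max_bests.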
1681 1733
1682/* 1734static int
1683 * Add the data entry for a node-format directory name addition. 1735xfs_dir2_node_find_freeblk(
1684 * The leaf entry is added in xfs_dir2_leafn_add. 1736 struct xfs_da_args *args,
1685 * We may enter with a freespace block that the lookup found. 1737 struct xfs_da_state_blk *fblk,
1686 */ 1738 xfs_dir2_db_t *dbnop,
1687static int /* error */ 1739 struct xfs_buf **fbpp,
1688xfs_dir2_node_addname_int( 1740 int *findexp,
1689 xfs_da_args_t *args, /* operation arguments */ 1741 int length)
1690 xfs_da_state_blk_t *fblk) /* optional freespace block */
1691{ 1742{
1692 xfs_dir2_data_hdr_t *hdr; /* data block header */
1693 xfs_dir2_db_t dbno; /* data block number */
1694 struct xfs_buf *dbp; /* data block buffer */
1695 xfs_dir2_data_entry_t *dep; /* data entry pointer */
1696 xfs_inode_t *dp; /* incore directory inode */
1697 xfs_dir2_data_unused_t *dup; /* data unused entry pointer */
1698 int error; /* error return value */
1699 xfs_dir2_db_t fbno; /* freespace block number */
1700 struct xfs_buf *fbp; /* freespace buffer */
1701 int findex; /* freespace entry index */
1702 xfs_dir2_free_t *free=NULL; /* freespace block structure */
1703 xfs_dir2_db_t ifbno; /* initial freespace block no */
1704 xfs_dir2_db_t lastfbno=0; /* highest freespace block no */
1705 int length; /* length of the new entry */
1706 int logfree; /* need to log free entry */
1707 xfs_mount_t *mp; /* filesystem mount point */
1708 int needlog; /* need to log data header */
1709 int needscan; /* need to rescan data frees */
1710 __be16 *tagp; /* data entry tag pointer */
1711 xfs_trans_t *tp; /* transaction pointer */
1712 __be16 *bests;
1713 struct xfs_dir3_icfree_hdr freehdr; 1743 struct xfs_dir3_icfree_hdr freehdr;
1714 struct xfs_dir2_data_free *bf; 1744 struct xfs_dir2_free *free = NULL;
1715 xfs_dir2_data_aoff_t aoff; 1745 struct xfs_inode *dp = args->dp;
1746 struct xfs_trans *tp = args->trans;
1747 struct xfs_buf *fbp = NULL;
1748 xfs_dir2_db_t firstfbno;
1749 xfs_dir2_db_t lastfbno;
1750 xfs_dir2_db_t ifbno = -1;
1751 xfs_dir2_db_t dbno = -1;
1752 xfs_dir2_db_t fbno;
1753 xfs_fileoff_t fo;
1754 __be16 *bests = NULL;
1755 int findex = 0;
1756 int error;
1716 1757
1717 dp = args->dp;
1718 mp = dp->i_mount;
1719 tp = args->trans;
1720 length = dp->d_ops->data_entsize(args->namelen);
1721 /* 1758 /*
1722 * If we came in with a freespace block that means that lookup 1759 * If we came in with a freespace block that means that lookup
1723 * found an entry with our hash value. This is the freespace 1760 * found an entry with our hash value. This is the freespace
@@ -1725,288 +1762,157 @@ xfs_dir2_node_addname_int(
1725 */ 1762 */
1726 if (fblk) { 1763 if (fblk) {
1727 fbp = fblk->bp; 1764 fbp = fblk->bp;
1728 /*
1729 * Remember initial freespace block number.
1730 */
1731 ifbno = fblk->blkno;
1732 free = fbp->b_addr; 1765 free = fbp->b_addr;
1733 findex = fblk->index; 1766 findex = fblk->index;
1734 bests = dp->d_ops->free_bests_p(free);
1735 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1736
1737 /*
1738 * This means the free entry showed that the data block had
1739 * space for our entry, so we remembered it.
1740 * Use that data block.
1741 */
1742 if (findex >= 0) { 1767 if (findex >= 0) {
1768 /* caller already found the freespace for us. */
1769 bests = dp->d_ops->free_bests_p(free);
1770 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1771
1743 ASSERT(findex < freehdr.nvalid); 1772 ASSERT(findex < freehdr.nvalid);
1744 ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); 1773 ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
1745 ASSERT(be16_to_cpu(bests[findex]) >= length); 1774 ASSERT(be16_to_cpu(bests[findex]) >= length);
1746 dbno = freehdr.firstdb + findex; 1775 dbno = freehdr.firstdb + findex;
1747 } else { 1776 goto found_block;
1748 /*
1749 * The data block looked at didn't have enough room.
1750 * We'll start at the beginning of the freespace entries.
1751 */
1752 dbno = -1;
1753 findex = 0;
1754 } 1777 }
1755 } else { 1778
1756 /* 1779 /*
1757 * Didn't come in with a freespace block, so no data block. 1780 * The data block looked at didn't have enough room.
1781 * We'll start at the beginning of the freespace entries.
1758 */ 1782 */
1759 ifbno = dbno = -1; 1783 ifbno = fblk->blkno;
1784 xfs_trans_brelse(tp, fbp);
1760 fbp = NULL; 1785 fbp = NULL;
1761 findex = 0; 1786 fblk->bp = NULL;
1762 } 1787 }
1763 1788
1764 /* 1789 /*
1765 * If we don't have a data block yet, we're going to scan the 1790 * If we don't have a data block yet, we're going to scan the freespace
1766 * freespace blocks looking for one. Figure out what the 1791 * data for a data block with enough free space in it.
1767 * highest freespace block number is.
1768 */
1769 if (dbno == -1) {
1770 xfs_fileoff_t fo; /* freespace block number */
1771
1772 if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
1773 return error;
1774 lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
1775 fbno = ifbno;
1776 }
1777 /*
1778 * While we haven't identified a data block, search the freeblock
1779 * data for a good data block. If we find a null freeblock entry,
1780 * indicating a hole in the data blocks, remember that.
1781 */ 1792 */
1782 while (dbno == -1) { 1793 error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK);
1783 /* 1794 if (error)
1784 * If we don't have a freeblock in hand, get the next one. 1795 return error;
1785 */ 1796 lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
1786 if (fbp == NULL) { 1797 firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET);
1787 /*
1788 * Happens the first time through unless lookup gave
1789 * us a freespace block to start with.
1790 */
1791 if (++fbno == 0)
1792 fbno = xfs_dir2_byte_to_db(args->geo,
1793 XFS_DIR2_FREE_OFFSET);
1794 /*
1795 * If it's ifbno we already looked at it.
1796 */
1797 if (fbno == ifbno)
1798 fbno++;
1799 /*
1800 * If it's off the end we're done.
1801 */
1802 if (fbno >= lastfbno)
1803 break;
1804 /*
1805 * Read the block. There can be holes in the
1806 * freespace blocks, so this might not succeed.
1807 * This should be really rare, so there's no reason
1808 * to avoid it.
1809 */
1810 error = xfs_dir2_free_try_read(tp, dp,
1811 xfs_dir2_db_to_da(args->geo, fbno),
1812 &fbp);
1813 if (error)
1814 return error;
1815 if (!fbp)
1816 continue;
1817 free = fbp->b_addr;
1818 findex = 0;
1819 }
1820 /*
1821 * Look at the current free entry. Is it good enough?
1822 *
 1823 * The bests initialisation should be where the buffer is read in
1824 * the above branch. But gcc is too stupid to realise that bests
1825 * and the freehdr are actually initialised if they are placed
1826 * there, so we have to do it here to avoid warnings. Blech.
1827 */
1828 bests = dp->d_ops->free_bests_p(free);
1829 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1830 if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
1831 be16_to_cpu(bests[findex]) >= length)
1832 dbno = freehdr.firstdb + findex;
1833 else {
1834 /*
1835 * Are we done with the freeblock?
1836 */
1837 if (++findex == freehdr.nvalid) {
1838 /*
1839 * Drop the block.
1840 */
1841 xfs_trans_brelse(tp, fbp);
1842 fbp = NULL;
1843 if (fblk && fblk->bp)
1844 fblk->bp = NULL;
1845 }
1846 }
1847 }
1848 /*
1849 * If we don't have a data block, we need to allocate one and make
1850 * the freespace entries refer to it.
1851 */
1852 if (unlikely(dbno == -1)) {
1853 /*
1854 * Not allowed to allocate, return failure.
1855 */
1856 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
1857 return -ENOSPC;
1858
1859 /*
1860 * Allocate and initialize the new data block.
1861 */
1862 if (unlikely((error = xfs_dir2_grow_inode(args,
1863 XFS_DIR2_DATA_SPACE,
1864 &dbno)) ||
1865 (error = xfs_dir3_data_init(args, dbno, &dbp))))
1866 return error;
1867 1798
1868 /* 1799 for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) {
1869 * If (somehow) we have a freespace block, get rid of it. 1800 /* If it's ifbno we already looked at it. */
1870 */ 1801 if (fbno == ifbno)
1871 if (fbp) 1802 continue;
1872 xfs_trans_brelse(tp, fbp);
1873 if (fblk && fblk->bp)
1874 fblk->bp = NULL;
1875 1803
1876 /* 1804 /*
1877 * Get the freespace block corresponding to the data block 1805 * Read the block. There can be holes in the freespace blocks,
1878 * that was just allocated. 1806 * so this might not succeed. This should be really rare, so
1807 * there's no reason to avoid it.
1879 */ 1808 */
1880 fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
1881 error = xfs_dir2_free_try_read(tp, dp, 1809 error = xfs_dir2_free_try_read(tp, dp,
1882 xfs_dir2_db_to_da(args->geo, fbno), 1810 xfs_dir2_db_to_da(args->geo, fbno),
1883 &fbp); 1811 &fbp);
1884 if (error) 1812 if (error)
1885 return error; 1813 return error;
1814 if (!fbp)
1815 continue;
1886 1816
1887 /* 1817 free = fbp->b_addr;
1888 * If there wasn't a freespace block, the read will 1818 bests = dp->d_ops->free_bests_p(free);
1889 * return a NULL fbp. Allocate and initialize a new one. 1819 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1890 */
1891 if (!fbp) {
1892 error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
1893 &fbno);
1894 if (error)
1895 return error;
1896 1820
1897 if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { 1821 /* Scan the free entry array for a large enough free space. */
1898 xfs_alert(mp, 1822 for (findex = freehdr.nvalid - 1; findex >= 0; findex--) {
1899"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d", 1823 if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
1900 __func__, (unsigned long long)dp->i_ino, 1824 be16_to_cpu(bests[findex]) >= length) {
1901 (long long)dp->d_ops->db_to_fdb( 1825 dbno = freehdr.firstdb + findex;
1902 args->geo, dbno), 1826 goto found_block;
1903 (long long)dbno, (long long)fbno,
1904 (unsigned long long)ifbno, lastfbno);
1905 if (fblk) {
1906 xfs_alert(mp,
1907 " fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
1908 fblk,
1909 (unsigned long long)fblk->blkno,
1910 fblk->index,
1911 fblk->magic);
1912 } else {
1913 xfs_alert(mp, " ... fblk is NULL");
1914 }
1915 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1916 XFS_ERRLEVEL_LOW, mp);
1917 return -EFSCORRUPTED;
1918 } 1827 }
1919
1920 /*
1921 * Get a buffer for the new block.
1922 */
1923 error = xfs_dir3_free_get_buf(args, fbno, &fbp);
1924 if (error)
1925 return error;
1926 free = fbp->b_addr;
1927 bests = dp->d_ops->free_bests_p(free);
1928 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1929
1930 /*
1931 * Remember the first slot as our empty slot.
1932 */
1933 freehdr.firstdb =
1934 (fbno - xfs_dir2_byte_to_db(args->geo,
1935 XFS_DIR2_FREE_OFFSET)) *
1936 dp->d_ops->free_max_bests(args->geo);
1937 } else {
1938 free = fbp->b_addr;
1939 bests = dp->d_ops->free_bests_p(free);
1940 dp->d_ops->free_hdr_from_disk(&freehdr, free);
1941 } 1828 }
1942 1829
1943 /* 1830 /* Didn't find free space, go on to next free block */
1944 * Set the freespace block index from the data block number. 1831 xfs_trans_brelse(tp, fbp);
1945 */
1946 findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
1947 /*
1948 * If it's after the end of the current entries in the
1949 * freespace block, extend that table.
1950 */
1951 if (findex >= freehdr.nvalid) {
1952 ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
1953 freehdr.nvalid = findex + 1;
1954 /*
1955 * Tag new entry so nused will go up.
1956 */
1957 bests[findex] = cpu_to_be16(NULLDATAOFF);
1958 }
1959 /*
1960 * If this entry was for an empty data block
1961 * (this should always be true) then update the header.
1962 */
1963 if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
1964 freehdr.nused++;
1965 dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
1966 xfs_dir2_free_log_header(args, fbp);
1967 }
1968 /*
1969 * Update the real value in the table.
1970 * We haven't allocated the data entry yet so this will
1971 * change again.
1972 */
1973 hdr = dbp->b_addr;
1974 bf = dp->d_ops->data_bestfree_p(hdr);
1975 bests[findex] = bf[0].length;
1976 logfree = 1;
1977 } 1832 }
1833
1834found_block:
1835 *dbnop = dbno;
1836 *fbpp = fbp;
1837 *findexp = findex;
1838 return 0;
1839}
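Note the scan order in xfs_dir2_node_find_freeblk(): it walks the freespace blocks from lastfbno - 1 down to firstfbno, and within each block scans the bests[] entries from the top down, taking the first one with at least length bytes free. Condensed to a skeleton (a sketch, not the committed code; read_free_block() stands in for xfs_dir2_free_try_read() plus the header unpacking, and endian conversions are elided):

for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) {
        if (fbno == ifbno)
                continue;               /* caller already rejected this one */
        error = read_free_block(fbno, &fbp);
        if (error)
                return error;
        if (!fbp)
                continue;               /* holes in the freespace map are normal */
        for (findex = freehdr.nvalid - 1; findex >= 0; findex--) {
                if (bests[findex] != NULLDATAOFF && bests[findex] >= length)
                        goto found_block;       /* dbno = firstdb + findex */
        }
        xfs_trans_brelse(tp, fbp);      /* no room here, try the next block */
}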
1840
1841
1842/*
1843 * Add the data entry for a node-format directory name addition.
1844 * The leaf entry is added in xfs_dir2_leafn_add.
1845 * We may enter with a freespace block that the lookup found.
1846 */
1847static int
1848xfs_dir2_node_addname_int(
1849 struct xfs_da_args *args, /* operation arguments */
1850 struct xfs_da_state_blk *fblk) /* optional freespace block */
1851{
1852 struct xfs_dir2_data_unused *dup; /* data unused entry pointer */
1853 struct xfs_dir2_data_entry *dep; /* data entry pointer */
1854 struct xfs_dir2_data_hdr *hdr; /* data block header */
1855 struct xfs_dir2_data_free *bf;
1856 struct xfs_dir2_free *free = NULL; /* freespace block structure */
1857 struct xfs_trans *tp = args->trans;
1858 struct xfs_inode *dp = args->dp;
1859 struct xfs_buf *dbp; /* data block buffer */
1860 struct xfs_buf *fbp; /* freespace buffer */
1861 xfs_dir2_data_aoff_t aoff;
1862 xfs_dir2_db_t dbno; /* data block number */
1863 int error; /* error return value */
1864 int findex; /* freespace entry index */
1865 int length; /* length of the new entry */
1866 int logfree = 0; /* need to log free entry */
1867 int needlog = 0; /* need to log data header */
1868 int needscan = 0; /* need to rescan data frees */
1869 __be16 *tagp; /* data entry tag pointer */
1870 __be16 *bests;
1871
1872 length = dp->d_ops->data_entsize(args->namelen);
1873 error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex,
1874 length);
1875 if (error)
1876 return error;
1877
1978 /* 1878 /*
1979 * We had a data block so we don't have to make a new one. 1879 * Now we know if we must allocate blocks, so if we are checking whether
1880 * we can insert without allocation then we can return now.
1980 */ 1881 */
1981 else { 1882 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
1982 /* 1883 if (dbno == -1)
1983 * If just checking, we succeeded. 1884 return -ENOSPC;
1984 */ 1885 return 0;
1985 if (args->op_flags & XFS_DA_OP_JUSTCHECK) 1886 }
1986 return 0;
1987 1887
1988 /* 1888 /*
1989 * Read the data block in. 1889 * If we don't have a data block, we need to allocate one and make
1990 */ 1890 * the freespace entries refer to it.
1891 */
1892 if (dbno == -1) {
1893 /* we're going to have to log the free block index later */
1894 logfree = 1;
1895 error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp,
1896 &findex);
1897 } else {
1898 /* Read the data block in. */
1991 error = xfs_dir3_data_read(tp, dp, 1899 error = xfs_dir3_data_read(tp, dp,
1992 xfs_dir2_db_to_da(args->geo, dbno), 1900 xfs_dir2_db_to_da(args->geo, dbno),
1993 -1, &dbp); 1901 -1, &dbp);
1994 if (error)
1995 return error;
1996 hdr = dbp->b_addr;
1997 bf = dp->d_ops->data_bestfree_p(hdr);
1998 logfree = 0;
1999 } 1902 }
1903 if (error)
1904 return error;
1905
 1906 /* Set up the data block now. */
1907 hdr = dbp->b_addr;
1908 bf = dp->d_ops->data_bestfree_p(hdr);
2000 ASSERT(be16_to_cpu(bf[0].length) >= length); 1909 ASSERT(be16_to_cpu(bf[0].length) >= length);
2001 /* 1910
2002 * Point to the existing unused space. 1911 /* Point to the existing unused space. */
2003 */
2004 dup = (xfs_dir2_data_unused_t *) 1912 dup = (xfs_dir2_data_unused_t *)
2005 ((char *)hdr + be16_to_cpu(bf[0].offset)); 1913 ((char *)hdr + be16_to_cpu(bf[0].offset));
2006 needscan = needlog = 0; 1914
2007 /* 1915 /* Mark the first part of the unused space, inuse for us. */
2008 * Mark the first part of the unused space, inuse for us.
2009 */
2010 aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); 1916 aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
2011 error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length, 1917 error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length,
2012 &needlog, &needscan); 1918 &needlog, &needscan);
@@ -2014,9 +1920,8 @@ xfs_dir2_node_addname_int(
2014 xfs_trans_brelse(tp, dbp); 1920 xfs_trans_brelse(tp, dbp);
2015 return error; 1921 return error;
2016 } 1922 }
2017 /* 1923
2018 * Fill in the new entry and log it. 1924 /* Fill in the new entry and log it. */
2019 */
2020 dep = (xfs_dir2_data_entry_t *)dup; 1925 dep = (xfs_dir2_data_entry_t *)dup;
2021 dep->inumber = cpu_to_be64(args->inumber); 1926 dep->inumber = cpu_to_be64(args->inumber);
2022 dep->namelen = args->namelen; 1927 dep->namelen = args->namelen;
@@ -2025,38 +1930,101 @@ xfs_dir2_node_addname_int(
2025 tagp = dp->d_ops->data_entry_tag_p(dep); 1930 tagp = dp->d_ops->data_entry_tag_p(dep);
2026 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 1931 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
2027 xfs_dir2_data_log_entry(args, dbp, dep); 1932 xfs_dir2_data_log_entry(args, dbp, dep);
2028 /* 1933
2029 * Rescan the block for bestfree if needed. 1934 /* Rescan the freespace and log the data block if needed. */
2030 */
2031 if (needscan) 1935 if (needscan)
2032 xfs_dir2_data_freescan(dp, hdr, &needlog); 1936 xfs_dir2_data_freescan(dp, hdr, &needlog);
2033 /*
2034 * Log the data block header if needed.
2035 */
2036 if (needlog) 1937 if (needlog)
2037 xfs_dir2_data_log_header(args, dbp); 1938 xfs_dir2_data_log_header(args, dbp);
2038 /* 1939
2039 * If the freespace entry is now wrong, update it. 1940 /* If the freespace block entry is now wrong, update it. */
2040 */ 1941 free = fbp->b_addr;
2041 bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */ 1942 bests = dp->d_ops->free_bests_p(free);
2042 if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { 1943 if (bests[findex] != bf[0].length) {
2043 bests[findex] = bf[0].length; 1944 bests[findex] = bf[0].length;
2044 logfree = 1; 1945 logfree = 1;
2045 } 1946 }
2046 /* 1947
2047 * Log the freespace entry if needed. 1948 /* Log the freespace entry if needed. */
2048 */
2049 if (logfree) 1949 if (logfree)
2050 xfs_dir2_free_log_bests(args, fbp, findex, findex); 1950 xfs_dir2_free_log_bests(args, fbp, findex, findex);
2051 /* 1951
2052 * Return the data block and offset in args, then drop the data block. 1952 /* Return the data block and offset in args. */
2053 */
2054 args->blkno = (xfs_dablk_t)dbno; 1953 args->blkno = (xfs_dablk_t)dbno;
2055 args->index = be16_to_cpu(*tagp); 1954 args->index = be16_to_cpu(*tagp);
2056 return 0; 1955 return 0;
2057} 1956}
2058 1957
2059/* 1958/*
1959 * Top-level node form directory addname routine.
1960 */
1961int /* error */
1962xfs_dir2_node_addname(
1963 xfs_da_args_t *args) /* operation arguments */
1964{
1965 xfs_da_state_blk_t *blk; /* leaf block for insert */
1966 int error; /* error return value */
1967 int rval; /* sub-return value */
1968 xfs_da_state_t *state; /* btree cursor */
1969
1970 trace_xfs_dir2_node_addname(args);
1971
1972 /*
1973 * Allocate and initialize the state (btree cursor).
1974 */
1975 state = xfs_da_state_alloc();
1976 state->args = args;
1977 state->mp = args->dp->i_mount;
1978 /*
1979 * Look up the name. We're not supposed to find it, but
1980 * this gives us the insertion point.
1981 */
1982 error = xfs_da3_node_lookup_int(state, &rval);
1983 if (error)
1984 rval = error;
1985 if (rval != -ENOENT) {
1986 goto done;
1987 }
1988 /*
1989 * Add the data entry to a data block.
1990 * Extravalid is set to a freeblock found by lookup.
1991 */
1992 rval = xfs_dir2_node_addname_int(args,
1993 state->extravalid ? &state->extrablk : NULL);
1994 if (rval) {
1995 goto done;
1996 }
1997 blk = &state->path.blk[state->path.active - 1];
1998 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
1999 /*
2000 * Add the new leaf entry.
2001 */
2002 rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
2003 if (rval == 0) {
2004 /*
2005 * It worked, fix the hash values up the btree.
2006 */
2007 if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
2008 xfs_da3_fixhashpath(state, &state->path);
2009 } else {
2010 /*
2011 * It didn't work, we need to split the leaf block.
2012 */
2013 if (args->total == 0) {
2014 ASSERT(rval == -ENOSPC);
2015 goto done;
2016 }
2017 /*
2018 * Split the leaf block and insert the new entry.
2019 */
2020 rval = xfs_da3_split(state);
2021 }
2022done:
2023 xfs_da_state_free(state);
2024 return rval;
2025}
2026
2027/*
2060 * Lookup an entry in a node-format directory. 2028 * Lookup an entry in a node-format directory.
2061 * All the real work happens in xfs_da3_node_lookup_int. 2029 * All the real work happens in xfs_da3_node_lookup_int.
2062 * The only real output is the inode number of the entry. 2030 * The only real output is the inode number of the entry.
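Taken together, the xfs_dir2_node.c changes replace one monolithic addname path with focused helpers. A sketch of the resulting call structure (error handling elided):

/*
 * xfs_dir2_node_addname()
 *   xfs_da3_node_lookup_int()          find the insertion point
 *   xfs_dir2_node_addname_int()
 *     xfs_dir2_node_find_freeblk()     pick a data block with enough room
 *     xfs_dir2_node_add_datablk()      or allocate one when dbno == -1
 *     ... fill in the entry, log the data and freespace blocks ...
 *   xfs_dir2_leafn_add()               then add the matching leaf entry
 */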
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 033589257f54..85f14fc2a8da 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -164,7 +164,7 @@ xfs_dir2_block_to_sf(
164 * can free the block and copy the formatted data into the inode literal 164 * can free the block and copy the formatted data into the inode literal
165 * area. 165 * area.
166 */ 166 */
167 dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); 167 dst = kmem_alloc(mp->m_sb.sb_inodesize, 0);
168 hdr = bp->b_addr; 168 hdr = bp->b_addr;
169 169
170 /* 170 /*
@@ -436,7 +436,7 @@ xfs_dir2_sf_addname_hard(
436 436
437 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 437 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
438 old_isize = (int)dp->i_d.di_size; 438 old_isize = (int)dp->i_d.di_size;
439 buf = kmem_alloc(old_isize, KM_SLEEP); 439 buf = kmem_alloc(old_isize, 0);
440 oldsfp = (xfs_dir2_sf_hdr_t *)buf; 440 oldsfp = (xfs_dir2_sf_hdr_t *)buf;
441 memcpy(oldsfp, sfp, old_isize); 441 memcpy(oldsfp, sfp, old_isize);
442 /* 442 /*
@@ -1096,7 +1096,7 @@ xfs_dir2_sf_toino4(
1096 * Don't want xfs_idata_realloc copying the data here. 1096 * Don't want xfs_idata_realloc copying the data here.
1097 */ 1097 */
1098 oldsize = dp->i_df.if_bytes; 1098 oldsize = dp->i_df.if_bytes;
1099 buf = kmem_alloc(oldsize, KM_SLEEP); 1099 buf = kmem_alloc(oldsize, 0);
1100 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1100 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
1101 ASSERT(oldsfp->i8count == 1); 1101 ASSERT(oldsfp->i8count == 1);
1102 memcpy(buf, oldsfp, oldsize); 1102 memcpy(buf, oldsfp, oldsize);
@@ -1169,7 +1169,7 @@ xfs_dir2_sf_toino8(
1169 * Don't want xfs_idata_realloc copying the data here. 1169 * Don't want xfs_idata_realloc copying the data here.
1170 */ 1170 */
1171 oldsize = dp->i_df.if_bytes; 1171 oldsize = dp->i_df.if_bytes;
1172 buf = kmem_alloc(oldsize, KM_SLEEP); 1172 buf = kmem_alloc(oldsize, 0);
1173 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1173 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
1174 ASSERT(oldsfp->i8count == 0); 1174 ASSERT(oldsfp->i8count == 0);
1175 memcpy(buf, oldsfp, oldsize); 1175 memcpy(buf, oldsfp, oldsize);
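Each of the four hunks above is the same mechanical conversion running through this series: KM_SLEEP disappears as a distinct flag because blocking allocation is now the default, so callers pass 0 unless they need a real modifier. Roughly (the KM_NOFS case appears in the xfs_inode_fork.c hunks below):

buf = kmem_alloc(size, 0);              /* may sleep; the old KM_SLEEP behaviour */
afp = kmem_zone_zalloc(zone, KM_NOFS);  /* still sleeps, but avoids recursing
                                         * into filesystem reclaim */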
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 52d03a3a02a4..39dd2b908106 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -287,7 +287,7 @@ struct xfs_ag_geometry {
287 uint32_t ag_ifree; /* o: inodes free */ 287 uint32_t ag_ifree; /* o: inodes free */
288 uint32_t ag_sick; /* o: sick things in ag */ 288 uint32_t ag_sick; /* o: sick things in ag */
289 uint32_t ag_checked; /* o: checked metadata in ag */ 289 uint32_t ag_checked; /* o: checked metadata in ag */
290 uint32_t ag_reserved32; /* o: zero */ 290 uint32_t ag_flags; /* i/o: flags for this ag */
291 uint64_t ag_reserved[12];/* o: zero */ 291 uint64_t ag_reserved[12];/* o: zero */
292}; 292};
293#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */ 293#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 04377ab75863..588d44613094 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2787,8 +2787,13 @@ xfs_ialloc_setup_geometry(
2787 igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr, 2787 igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
2788 inodes); 2788 inodes);
2789 2789
2790 /* Set the maximum inode count for this filesystem. */ 2790 /*
2791 if (sbp->sb_imax_pct) { 2791 * Set the maximum inode count for this filesystem, being careful not
2792 * to use obviously garbage sb_inopblog/sb_inopblock values. Regular
2793 * users should never get here due to failing sb verification, but
2794 * certain users (xfs_db) need to be usable even with corrupt metadata.
2795 */
2796 if (sbp->sb_imax_pct && igeo->ialloc_blks) {
2792 /* 2797 /*
2793 * Make sure the maximum inode count is a multiple 2798 * Make sure the maximum inode count is a multiple
2794 * of the units we allocate inodes in. 2799 * of the units we allocate inodes in.
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 27aa3f2bc4bc..7bc87408f1a0 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -616,7 +616,7 @@ xfs_iext_realloc_root(
616 * sequence counter is seen before the modifications to the extent tree itself 616 * sequence counter is seen before the modifications to the extent tree itself
617 * take effect. 617 * take effect.
618 */ 618 */
619static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) 619static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp)
620{ 620{
621 WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); 621 WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
622} 622}
@@ -633,7 +633,7 @@ xfs_iext_insert(
633 struct xfs_iext_leaf *new = NULL; 633 struct xfs_iext_leaf *new = NULL;
634 int nr_entries, i; 634 int nr_entries, i;
635 635
636 xfs_iext_inc_seq(ifp, state); 636 xfs_iext_inc_seq(ifp);
637 637
638 if (ifp->if_height == 0) 638 if (ifp->if_height == 0)
639 xfs_iext_alloc_root(ifp, cur); 639 xfs_iext_alloc_root(ifp, cur);
@@ -875,7 +875,7 @@ xfs_iext_remove(
875 ASSERT(ifp->if_u1.if_root != NULL); 875 ASSERT(ifp->if_u1.if_root != NULL);
876 ASSERT(xfs_iext_valid(ifp, cur)); 876 ASSERT(xfs_iext_valid(ifp, cur));
877 877
878 xfs_iext_inc_seq(ifp, state); 878 xfs_iext_inc_seq(ifp);
879 879
880 nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1; 880 nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
881 for (i = cur->pos; i < nr_entries; i++) 881 for (i = cur->pos; i < nr_entries; i++)
@@ -983,7 +983,7 @@ xfs_iext_update_extent(
983{ 983{
984 struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); 984 struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
985 985
986 xfs_iext_inc_seq(ifp, state); 986 xfs_iext_inc_seq(ifp);
987 987
988 if (cur->pos == 0) { 988 if (cur->pos == 0) {
989 struct xfs_bmbt_irec old; 989 struct xfs_bmbt_irec old;
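With the unused state argument dropped, xfs_iext_inc_seq() is nothing more than a lockless bump of the fork's sequence counter before each tree modification. A hypothetical lockless reader pairing with it might look roughly like this (a sketch only; the real consumers sample if_seq to revalidate cached mappings):

unsigned int seq = READ_ONCE(ifp->if_seq);      /* sample before the walk */

/* ... read extents without holding the lock across the whole walk ... */

if (READ_ONCE(ifp->if_seq) != seq) {
        /* the tree changed underneath us: discard and revalidate */
}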
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index bf3e04018246..c643beeb5a24 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -94,7 +94,7 @@ xfs_iformat_fork(
94 return 0; 94 return 0;
95 95
96 ASSERT(ip->i_afp == NULL); 96 ASSERT(ip->i_afp == NULL);
97 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); 97 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS);
98 98
99 switch (dip->di_aformat) { 99 switch (dip->di_aformat) {
100 case XFS_DINODE_FMT_LOCAL: 100 case XFS_DINODE_FMT_LOCAL:
@@ -147,7 +147,7 @@ xfs_init_local_fork(
147 147
148 if (size) { 148 if (size) {
149 real_size = roundup(mem_size, 4); 149 real_size = roundup(mem_size, 4);
150 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); 150 ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS);
151 memcpy(ifp->if_u1.if_data, data, size); 151 memcpy(ifp->if_u1.if_data, data, size);
152 if (zero_terminate) 152 if (zero_terminate)
153 ifp->if_u1.if_data[size] = '\0'; 153 ifp->if_u1.if_data[size] = '\0';
@@ -302,7 +302,7 @@ xfs_iformat_btree(
302 } 302 }
303 303
304 ifp->if_broot_bytes = size; 304 ifp->if_broot_bytes = size;
305 ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); 305 ifp->if_broot = kmem_alloc(size, KM_NOFS);
306 ASSERT(ifp->if_broot != NULL); 306 ASSERT(ifp->if_broot != NULL);
307 /* 307 /*
308 * Copy and convert from the on-disk structure 308 * Copy and convert from the on-disk structure
@@ -367,7 +367,7 @@ xfs_iroot_realloc(
367 */ 367 */
368 if (ifp->if_broot_bytes == 0) { 368 if (ifp->if_broot_bytes == 0) {
369 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); 369 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
370 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 370 ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
371 ifp->if_broot_bytes = (int)new_size; 371 ifp->if_broot_bytes = (int)new_size;
372 return; 372 return;
373 } 373 }
@@ -382,7 +382,7 @@ xfs_iroot_realloc(
382 new_max = cur_max + rec_diff; 382 new_max = cur_max + rec_diff;
383 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); 383 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
384 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, 384 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
385 KM_SLEEP | KM_NOFS); 385 KM_NOFS);
386 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 386 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
387 ifp->if_broot_bytes); 387 ifp->if_broot_bytes);
388 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 388 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -408,7 +408,7 @@ xfs_iroot_realloc(
408 else 408 else
409 new_size = 0; 409 new_size = 0;
410 if (new_size > 0) { 410 if (new_size > 0) {
411 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 411 new_broot = kmem_alloc(new_size, KM_NOFS);
412 /* 412 /*
413 * First copy over the btree block header. 413 * First copy over the btree block header.
414 */ 414 */
@@ -492,7 +492,7 @@ xfs_idata_realloc(
492 * We enforce that here. 492 * We enforce that here.
493 */ 493 */
494 ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, 494 ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data,
495 roundup(new_size, 4), KM_SLEEP | KM_NOFS); 495 roundup(new_size, 4), KM_NOFS);
496 ifp->if_bytes = new_size; 496 ifp->if_bytes = new_size;
497} 497}
498 498
@@ -683,7 +683,7 @@ xfs_ifork_init_cow(
683 return; 683 return;
684 684
685 ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, 685 ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
686 KM_SLEEP | KM_NOFS); 686 KM_NOFS);
687 ip->i_cowfp->if_flags = XFS_IFEXTENTS; 687 ip->i_cowfp->if_flags = XFS_IFEXTENTS;
688 ip->i_cformat = XFS_DINODE_FMT_EXTENTS; 688 ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
689 ip->i_cnextents = 0; 689 ip->i_cnextents = 0;
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 51bb9bdb0e84..9a7fadb1361c 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1174,7 +1174,7 @@ out_cur:
1174/* 1174/*
1175 * Record a refcount intent for later processing. 1175 * Record a refcount intent for later processing.
1176 */ 1176 */
1177static int 1177static void
1178__xfs_refcount_add( 1178__xfs_refcount_add(
1179 struct xfs_trans *tp, 1179 struct xfs_trans *tp,
1180 enum xfs_refcount_intent_type type, 1180 enum xfs_refcount_intent_type type,
@@ -1189,44 +1189,43 @@ __xfs_refcount_add(
1189 blockcount); 1189 blockcount);
1190 1190
1191 ri = kmem_alloc(sizeof(struct xfs_refcount_intent), 1191 ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
1192 KM_SLEEP | KM_NOFS); 1192 KM_NOFS);
1193 INIT_LIST_HEAD(&ri->ri_list); 1193 INIT_LIST_HEAD(&ri->ri_list);
1194 ri->ri_type = type; 1194 ri->ri_type = type;
1195 ri->ri_startblock = startblock; 1195 ri->ri_startblock = startblock;
1196 ri->ri_blockcount = blockcount; 1196 ri->ri_blockcount = blockcount;
1197 1197
1198 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); 1198 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
1199 return 0;
1200} 1199}
1201 1200
1202/* 1201/*
1203 * Increase the reference count of the blocks backing a file's extent. 1202 * Increase the reference count of the blocks backing a file's extent.
1204 */ 1203 */
1205int 1204void
1206xfs_refcount_increase_extent( 1205xfs_refcount_increase_extent(
1207 struct xfs_trans *tp, 1206 struct xfs_trans *tp,
1208 struct xfs_bmbt_irec *PREV) 1207 struct xfs_bmbt_irec *PREV)
1209{ 1208{
1210 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) 1209 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
1211 return 0; 1210 return;
1212 1211
1213 return __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, 1212 __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
1214 PREV->br_startblock, PREV->br_blockcount); 1213 PREV->br_blockcount);
1215} 1214}
1216 1215
1217/* 1216/*
1218 * Decrease the reference count of the blocks backing a file's extent. 1217 * Decrease the reference count of the blocks backing a file's extent.
1219 */ 1218 */
1220int 1219void
1221xfs_refcount_decrease_extent( 1220xfs_refcount_decrease_extent(
1222 struct xfs_trans *tp, 1221 struct xfs_trans *tp,
1223 struct xfs_bmbt_irec *PREV) 1222 struct xfs_bmbt_irec *PREV)
1224{ 1223{
1225 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) 1224 if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
1226 return 0; 1225 return;
1227 1226
1228 return __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, 1227 __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
1229 PREV->br_startblock, PREV->br_blockcount); 1228 PREV->br_blockcount);
1230} 1229}
1231 1230
1232/* 1231/*
@@ -1541,47 +1540,40 @@ __xfs_refcount_cow_free(
1541} 1540}
1542 1541
1543/* Record a CoW staging extent in the refcount btree. */ 1542/* Record a CoW staging extent in the refcount btree. */
1544int 1543void
1545xfs_refcount_alloc_cow_extent( 1544xfs_refcount_alloc_cow_extent(
1546 struct xfs_trans *tp, 1545 struct xfs_trans *tp,
1547 xfs_fsblock_t fsb, 1546 xfs_fsblock_t fsb,
1548 xfs_extlen_t len) 1547 xfs_extlen_t len)
1549{ 1548{
1550 struct xfs_mount *mp = tp->t_mountp; 1549 struct xfs_mount *mp = tp->t_mountp;
1551 int error;
1552 1550
1553 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 1551 if (!xfs_sb_version_hasreflink(&mp->m_sb))
1554 return 0; 1552 return;
1555 1553
1556 error = __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); 1554 __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
1557 if (error)
1558 return error;
1559 1555
1560 /* Add rmap entry */ 1556 /* Add rmap entry */
1561 return xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1557 xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
1562 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); 1558 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
1563} 1559}
1564 1560
1565/* Forget a CoW staging event in the refcount btree. */ 1561/* Forget a CoW staging event in the refcount btree. */
1566int 1562void
1567xfs_refcount_free_cow_extent( 1563xfs_refcount_free_cow_extent(
1568 struct xfs_trans *tp, 1564 struct xfs_trans *tp,
1569 xfs_fsblock_t fsb, 1565 xfs_fsblock_t fsb,
1570 xfs_extlen_t len) 1566 xfs_extlen_t len)
1571{ 1567{
1572 struct xfs_mount *mp = tp->t_mountp; 1568 struct xfs_mount *mp = tp->t_mountp;
1573 int error;
1574 1569
1575 if (!xfs_sb_version_hasreflink(&mp->m_sb)) 1570 if (!xfs_sb_version_hasreflink(&mp->m_sb))
1576 return 0; 1571 return;
1577 1572
1578 /* Remove rmap entry */ 1573 /* Remove rmap entry */
1579 error = xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1574 xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
1580 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); 1575 XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
1581 if (error) 1576 __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
1582 return error;
1583
1584 return __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
1585} 1577}
1586 1578
1587struct xfs_refcount_recovery { 1579struct xfs_refcount_recovery {
@@ -1602,7 +1594,7 @@ xfs_refcount_recover_extent(
1602 if (be32_to_cpu(rec->refc.rc_refcount) != 1) 1594 if (be32_to_cpu(rec->refc.rc_refcount) != 1)
1603 return -EFSCORRUPTED; 1595 return -EFSCORRUPTED;
1604 1596
1605 rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP); 1597 rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0);
1606 xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); 1598 xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
1607 list_add_tail(&rr->rr_list, debris); 1599 list_add_tail(&rr->rr_list, debris);
1608 1600
@@ -1679,10 +1671,8 @@ xfs_refcount_recover_cow_leftovers(
1679 /* Free the orphan record */ 1671 /* Free the orphan record */
1680 agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; 1672 agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
1681 fsb = XFS_AGB_TO_FSB(mp, agno, agbno); 1673 fsb = XFS_AGB_TO_FSB(mp, agno, agbno);
1682 error = xfs_refcount_free_cow_extent(tp, fsb, 1674 xfs_refcount_free_cow_extent(tp, fsb,
1683 rr->rr_rrec.rc_blockcount); 1675 rr->rr_rrec.rc_blockcount);
1684 if (error)
1685 goto out_trans;
1686 1676
1687 /* Free the block. */ 1677 /* Free the block. */
1688 xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL); 1678 xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 1d9c518575e7..209795539c8d 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -29,9 +29,9 @@ struct xfs_refcount_intent {
29 xfs_extlen_t ri_blockcount; 29 xfs_extlen_t ri_blockcount;
30}; 30};
31 31
32extern int xfs_refcount_increase_extent(struct xfs_trans *tp, 32void xfs_refcount_increase_extent(struct xfs_trans *tp,
33 struct xfs_bmbt_irec *irec); 33 struct xfs_bmbt_irec *irec);
34extern int xfs_refcount_decrease_extent(struct xfs_trans *tp, 34void xfs_refcount_decrease_extent(struct xfs_trans *tp,
35 struct xfs_bmbt_irec *irec); 35 struct xfs_bmbt_irec *irec);
36 36
37extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, 37extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
@@ -45,10 +45,10 @@ extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
45 xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, 45 xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
46 xfs_extlen_t *flen, bool find_end_of_shared); 46 xfs_extlen_t *flen, bool find_end_of_shared);
47 47
48extern int xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, 48void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
49 xfs_fsblock_t fsb, xfs_extlen_t len); 49 xfs_extlen_t len);
50extern int xfs_refcount_free_cow_extent(struct xfs_trans *tp, 50void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
51 xfs_fsblock_t fsb, xfs_extlen_t len); 51 xfs_extlen_t len);
52extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, 52extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
53 xfs_agnumber_t agno); 53 xfs_agnumber_t agno);
54 54
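The int-to-void conversion is possible because __xfs_refcount_add() now just performs a blocking allocation and queues the intent, leaving no failure case to propagate. For callers the change looks like this (a sketch):

/* before: the return value had to be checked */
error = xfs_refcount_increase_extent(tp, &irec);
if (error)
        goto out_cancel;

/* after: queueing a refcount intent cannot fail */
xfs_refcount_increase_extent(tp, &irec);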
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index e6aeb390b2fb..38e9414878b3 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -168,7 +168,6 @@ xfs_rmap_btrec_to_irec(
168 union xfs_btree_rec *rec, 168 union xfs_btree_rec *rec,
169 struct xfs_rmap_irec *irec) 169 struct xfs_rmap_irec *irec)
170{ 170{
171 irec->rm_flags = 0;
172 irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); 171 irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
173 irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); 172 irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
174 irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner); 173 irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner);
@@ -254,15 +253,15 @@ xfs_rmap_find_left_neighbor_helper(
254 rec->rm_flags); 253 rec->rm_flags);
255 254
256 if (rec->rm_owner != info->high.rm_owner) 255 if (rec->rm_owner != info->high.rm_owner)
257 return XFS_BTREE_QUERY_RANGE_CONTINUE; 256 return 0;
258 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && 257 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
259 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && 258 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
260 rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset) 259 rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset)
261 return XFS_BTREE_QUERY_RANGE_CONTINUE; 260 return 0;
262 261
263 *info->irec = *rec; 262 *info->irec = *rec;
264 *info->stat = 1; 263 *info->stat = 1;
265 return XFS_BTREE_QUERY_RANGE_ABORT; 264 return -ECANCELED;
266} 265}
267 266
268/* 267/*
@@ -305,7 +304,7 @@ xfs_rmap_find_left_neighbor(
305 304
306 error = xfs_rmap_query_range(cur, &info.high, &info.high, 305 error = xfs_rmap_query_range(cur, &info.high, &info.high,
307 xfs_rmap_find_left_neighbor_helper, &info); 306 xfs_rmap_find_left_neighbor_helper, &info);
308 if (error == XFS_BTREE_QUERY_RANGE_ABORT) 307 if (error == -ECANCELED)
309 error = 0; 308 error = 0;
310 if (*stat) 309 if (*stat)
311 trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, 310 trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
@@ -330,16 +329,16 @@ xfs_rmap_lookup_le_range_helper(
330 rec->rm_flags); 329 rec->rm_flags);
331 330
332 if (rec->rm_owner != info->high.rm_owner) 331 if (rec->rm_owner != info->high.rm_owner)
333 return XFS_BTREE_QUERY_RANGE_CONTINUE; 332 return 0;
334 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && 333 if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
335 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && 334 !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
336 (rec->rm_offset > info->high.rm_offset || 335 (rec->rm_offset > info->high.rm_offset ||
337 rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset)) 336 rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset))
338 return XFS_BTREE_QUERY_RANGE_CONTINUE; 337 return 0;
339 338
340 *info->irec = *rec; 339 *info->irec = *rec;
341 *info->stat = 1; 340 *info->stat = 1;
342 return XFS_BTREE_QUERY_RANGE_ABORT; 341 return -ECANCELED;
343} 342}
344 343
345/* 344/*
@@ -377,7 +376,7 @@ xfs_rmap_lookup_le_range(
377 cur->bc_private.a.agno, bno, 0, owner, offset, flags); 376 cur->bc_private.a.agno, bno, 0, owner, offset, flags);
378 error = xfs_rmap_query_range(cur, &info.high, &info.high, 377 error = xfs_rmap_query_range(cur, &info.high, &info.high,
379 xfs_rmap_lookup_le_range_helper, &info); 378 xfs_rmap_lookup_le_range_helper, &info);
380 if (error == XFS_BTREE_QUERY_RANGE_ABORT) 379 if (error == -ECANCELED)
381 error = 0; 380 error = 0;
382 if (*stat) 381 if (*stat)
383 trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, 382 trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
@@ -2268,7 +2267,7 @@ xfs_rmap_update_is_needed(
2268 * Record a rmap intent; the list is kept sorted first by AG and then by 2267 * Record a rmap intent; the list is kept sorted first by AG and then by
2269 * increasing age. 2268 * increasing age.
2270 */ 2269 */
2271static int 2270static void
2272__xfs_rmap_add( 2271__xfs_rmap_add(
2273 struct xfs_trans *tp, 2272 struct xfs_trans *tp,
2274 enum xfs_rmap_intent_type type, 2273 enum xfs_rmap_intent_type type,
@@ -2287,7 +2286,7 @@ __xfs_rmap_add(
2287 bmap->br_blockcount, 2286 bmap->br_blockcount,
2288 bmap->br_state); 2287 bmap->br_state);
2289 2288
2290 ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS); 2289 ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS);
2291 INIT_LIST_HEAD(&ri->ri_list); 2290 INIT_LIST_HEAD(&ri->ri_list);
2292 ri->ri_type = type; 2291 ri->ri_type = type;
2293 ri->ri_owner = owner; 2292 ri->ri_owner = owner;
@@ -2295,11 +2294,10 @@ __xfs_rmap_add(
2295 ri->ri_bmap = *bmap; 2294 ri->ri_bmap = *bmap;
2296 2295
2297 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); 2296 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
2298 return 0;
2299} 2297}
2300 2298
2301/* Map an extent into a file. */ 2299/* Map an extent into a file. */
2302int 2300void
2303xfs_rmap_map_extent( 2301xfs_rmap_map_extent(
2304 struct xfs_trans *tp, 2302 struct xfs_trans *tp,
2305 struct xfs_inode *ip, 2303 struct xfs_inode *ip,
@@ -2307,15 +2305,15 @@ xfs_rmap_map_extent(
2307 struct xfs_bmbt_irec *PREV) 2305 struct xfs_bmbt_irec *PREV)
2308{ 2306{
2309 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) 2307 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
2310 return 0; 2308 return;
2311 2309
2312 return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2310 __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
2313 XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, 2311 XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
2314 whichfork, PREV); 2312 whichfork, PREV);
2315} 2313}
2316 2314
2317/* Unmap an extent out of a file. */ 2315/* Unmap an extent out of a file. */
2318int 2316void
2319xfs_rmap_unmap_extent( 2317xfs_rmap_unmap_extent(
2320 struct xfs_trans *tp, 2318 struct xfs_trans *tp,
2321 struct xfs_inode *ip, 2319 struct xfs_inode *ip,
@@ -2323,9 +2321,9 @@ xfs_rmap_unmap_extent(
2323 struct xfs_bmbt_irec *PREV) 2321 struct xfs_bmbt_irec *PREV)
2324{ 2322{
2325 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) 2323 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
2326 return 0; 2324 return;
2327 2325
2328 return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2326 __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
2329 XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, 2327 XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
2330 whichfork, PREV); 2328 whichfork, PREV);
2331} 2329}
@@ -2336,7 +2334,7 @@ xfs_rmap_unmap_extent(
2336 * Note that tp can be NULL here as no transaction is used for COW fork 2334 * Note that tp can be NULL here as no transaction is used for COW fork
2337 * unwritten conversion. 2335 * unwritten conversion.
2338 */ 2336 */
2339int 2337void
2340xfs_rmap_convert_extent( 2338xfs_rmap_convert_extent(
2341 struct xfs_mount *mp, 2339 struct xfs_mount *mp,
2342 struct xfs_trans *tp, 2340 struct xfs_trans *tp,
@@ -2345,15 +2343,15 @@ xfs_rmap_convert_extent(
2345 struct xfs_bmbt_irec *PREV) 2343 struct xfs_bmbt_irec *PREV)
2346{ 2344{
2347 if (!xfs_rmap_update_is_needed(mp, whichfork)) 2345 if (!xfs_rmap_update_is_needed(mp, whichfork))
2348 return 0; 2346 return;
2349 2347
2350 return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 2348 __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
2351 XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, 2349 XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
2352 whichfork, PREV); 2350 whichfork, PREV);
2353} 2351}
2354 2352
2355/* Schedule the creation of an rmap for non-file data. */ 2353/* Schedule the creation of an rmap for non-file data. */
2356int 2354void
2357xfs_rmap_alloc_extent( 2355xfs_rmap_alloc_extent(
2358 struct xfs_trans *tp, 2356 struct xfs_trans *tp,
2359 xfs_agnumber_t agno, 2357 xfs_agnumber_t agno,
@@ -2364,18 +2362,18 @@ xfs_rmap_alloc_extent(
2364 struct xfs_bmbt_irec bmap; 2362 struct xfs_bmbt_irec bmap;
2365 2363
2366 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) 2364 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
2367 return 0; 2365 return;
2368 2366
2369 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); 2367 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
2370 bmap.br_blockcount = len; 2368 bmap.br_blockcount = len;
2371 bmap.br_startoff = 0; 2369 bmap.br_startoff = 0;
2372 bmap.br_state = XFS_EXT_NORM; 2370 bmap.br_state = XFS_EXT_NORM;
2373 2371
2374 return __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); 2372 __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
2375} 2373}
2376 2374
2377/* Schedule the deletion of an rmap for non-file data. */ 2375/* Schedule the deletion of an rmap for non-file data. */
2378int 2376void
2379xfs_rmap_free_extent( 2377xfs_rmap_free_extent(
2380 struct xfs_trans *tp, 2378 struct xfs_trans *tp,
2381 xfs_agnumber_t agno, 2379 xfs_agnumber_t agno,
@@ -2386,14 +2384,14 @@ xfs_rmap_free_extent(
2386 struct xfs_bmbt_irec bmap; 2384 struct xfs_bmbt_irec bmap;
2387 2385
2388 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) 2386 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
2389 return 0; 2387 return;
2390 2388
2391 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); 2389 bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
2392 bmap.br_blockcount = len; 2390 bmap.br_blockcount = len;
2393 bmap.br_startoff = 0; 2391 bmap.br_startoff = 0;
2394 bmap.br_state = XFS_EXT_NORM; 2392 bmap.br_state = XFS_EXT_NORM;
2395 2393
2396 return __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); 2394 __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
2397} 2395}
2398 2396
2399/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ 2397/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */
@@ -2511,7 +2509,7 @@ xfs_rmap_has_other_keys_helper(
2511 ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) 2509 ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
2512 return 0; 2510 return 0;
2513 rks->has_rmap = true; 2511 rks->has_rmap = true;
2514 return XFS_BTREE_QUERY_RANGE_ABORT; 2512 return -ECANCELED;
2515} 2513}
2516 2514
2517/* 2515/*
@@ -2540,8 +2538,11 @@ xfs_rmap_has_other_keys(
2540 2538
2541 error = xfs_rmap_query_range(cur, &low, &high, 2539 error = xfs_rmap_query_range(cur, &low, &high,
2542 xfs_rmap_has_other_keys_helper, &rks); 2540 xfs_rmap_has_other_keys_helper, &rks);
2541 if (error < 0)
2542 return error;
2543
2543 *has_rmap = rks.has_rmap; 2544 *has_rmap = rks.has_rmap;
2544 return error; 2545 return 0;
2545} 2546}
2546 2547
2547const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { 2548const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
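The XFS_BTREE_QUERY_RANGE_CONTINUE/ABORT magic values give way to the generic kernel convention: an iterator helper returns 0 to keep going and a negative errno, -ECANCELED by convention, to stop early, which the caller then squashes back to success. A minimal helper written against the new convention might look like this (is_interesting() is a hypothetical predicate):

STATIC int
sketch_rmap_helper(
        struct xfs_btree_cur    *cur,
        struct xfs_rmap_irec    *rec,
        void                    *priv)
{
        if (!is_interesting(rec))
                return 0;               /* keep iterating */
        *(struct xfs_rmap_irec *)priv = *rec;
        return -ECANCELED;              /* stop the range query early */
}

error = xfs_rmap_query_range(cur, &low, &high, sketch_rmap_helper, &irec);
if (error == -ECANCELED)
        error = 0;                      /* an early exit is not an error */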
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index e21ed0294e5c..abe633403fd1 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -68,6 +68,7 @@ xfs_rmap_irec_offset_unpack(
68 if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS)) 68 if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
69 return -EFSCORRUPTED; 69 return -EFSCORRUPTED;
70 irec->rm_offset = XFS_RMAP_OFF(offset); 70 irec->rm_offset = XFS_RMAP_OFF(offset);
71 irec->rm_flags = 0;
71 if (offset & XFS_RMAP_OFF_ATTR_FORK) 72 if (offset & XFS_RMAP_OFF_ATTR_FORK)
72 irec->rm_flags |= XFS_RMAP_ATTR_FORK; 73 irec->rm_flags |= XFS_RMAP_ATTR_FORK;
73 if (offset & XFS_RMAP_OFF_BMBT_BLOCK) 74 if (offset & XFS_RMAP_OFF_BMBT_BLOCK)
@@ -161,16 +162,16 @@ struct xfs_rmap_intent {
161}; 162};
162 163
163/* functions for updating the rmapbt based on bmbt map/unmap operations */ 164/* functions for updating the rmapbt based on bmbt map/unmap operations */
164int xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, 165void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
165 int whichfork, struct xfs_bmbt_irec *imap); 166 int whichfork, struct xfs_bmbt_irec *imap);
166int xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, 167void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
167 int whichfork, struct xfs_bmbt_irec *imap); 168 int whichfork, struct xfs_bmbt_irec *imap);
168int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, 169void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
169 struct xfs_inode *ip, int whichfork, 170 struct xfs_inode *ip, int whichfork,
170 struct xfs_bmbt_irec *imap); 171 struct xfs_bmbt_irec *imap);
171int xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 172void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
172 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); 173 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
173int xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 174void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
174 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); 175 xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
175 176
176void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, 177void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index e0641b7337b3..c45acbd3add9 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -177,10 +177,4 @@ struct xfs_ino_geometry {
177 unsigned int agino_log; /* #bits for agino in inum */ 177 unsigned int agino_log; /* #bits for agino in inum */
178}; 178};
179 179
180/* Keep iterating the data structure. */
181#define XFS_ITER_CONTINUE (0)
182
183/* Stop iterating the data structure. */
184#define XFS_ITER_ABORT (1)
185
186#endif /* __XFS_SHARED_H__ */ 180#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 802b34cd10fe..300b3e91ca3a 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -169,6 +169,14 @@ typedef struct xfs_bmbt_irec
169 xfs_exntst_t br_state; /* extent state */ 169 xfs_exntst_t br_state; /* extent state */
170} xfs_bmbt_irec_t; 170} xfs_bmbt_irec_t;
171 171
172/* per-AG block reservation types */
173enum xfs_ag_resv_type {
174 XFS_AG_RESV_NONE = 0,
175 XFS_AG_RESV_AGFL,
176 XFS_AG_RESV_METADATA,
177 XFS_AG_RESV_RMAPBT,
178};
179
172/* 180/*
173 * Type verifier functions 181 * Type verifier functions
174 */ 182 */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 16b09b941441..ba0f747c82e8 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -639,7 +639,7 @@ xchk_agfl_block(
639 xchk_agfl_block_xref(sc, agbno); 639 xchk_agfl_block_xref(sc, agbno);
640 640
641 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 641 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
642 return XFS_ITER_ABORT; 642 return -ECANCELED;
643 643
644 return 0; 644 return 0;
645} 645}
@@ -730,7 +730,7 @@ xchk_agfl(
730 /* Check the blocks in the AGFL. */ 730 /* Check the blocks in the AGFL. */
731 error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), 731 error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
732 sc->sa.agfl_bp, xchk_agfl_block, &sai); 732 sc->sa.agfl_bp, xchk_agfl_block, &sai);
733 if (error == XFS_ITER_ABORT) { 733 if (error == -ECANCELED) {
734 error = 0; 734 error = 0;
735 goto out_free; 735 goto out_free;
736 } 736 }
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 1afc58bf71dd..0edc7f8eb96e 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -80,7 +80,7 @@ xchk_setup_xattr(
80 * without the inode lock held, which means we can sleep. 80 * without the inode lock held, which means we can sleep.
81 */ 81 */
82 if (sc->flags & XCHK_TRY_HARDER) { 82 if (sc->flags & XCHK_TRY_HARDER) {
83 error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, KM_SLEEP); 83 error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0);
84 if (error) 84 if (error)
85 return error; 85 return error;
86 } 86 }
@@ -163,8 +163,6 @@ xchk_xattr_listent(
163 args.valuelen = valuelen; 163 args.valuelen = valuelen;
164 164
165 error = xfs_attr_get_ilocked(context->dp, &args); 165 error = xfs_attr_get_ilocked(context->dp, &args);
166 if (error == -EEXIST)
167 error = 0;
168 if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, 166 if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
169 &error)) 167 &error))
170 goto fail_xref; 168 goto fail_xref;
@@ -173,7 +171,7 @@ xchk_xattr_listent(
173 args.blkno); 171 args.blkno);
174fail_xref: 172fail_xref:
175 if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 173 if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
176 context->seen_enough = XFS_ITER_ABORT; 174 context->seen_enough = 1;
177 return; 175 return;
178} 176}
179 177
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 1bd29fdc2ab5..fa6ea6407992 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -75,6 +75,7 @@ struct xchk_bmap_info {
75 xfs_fileoff_t lastoff; 75 xfs_fileoff_t lastoff;
76 bool is_rt; 76 bool is_rt;
77 bool is_shared; 77 bool is_shared;
78 bool was_loaded;
78 int whichfork; 79 int whichfork;
79}; 80};
80 81
@@ -213,25 +214,20 @@ xchk_bmap_xref_rmap(
213 214
214/* Cross-reference a single rtdev extent record. */ 215/* Cross-reference a single rtdev extent record. */
215STATIC void 216STATIC void
216xchk_bmap_rt_extent_xref( 217xchk_bmap_rt_iextent_xref(
217 struct xchk_bmap_info *info,
218 struct xfs_inode *ip, 218 struct xfs_inode *ip,
219 struct xfs_btree_cur *cur, 219 struct xchk_bmap_info *info,
220 struct xfs_bmbt_irec *irec) 220 struct xfs_bmbt_irec *irec)
221{ 221{
222 if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
223 return;
224
225 xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, 222 xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
226 irec->br_blockcount); 223 irec->br_blockcount);
227} 224}
228 225
229/* Cross-reference a single datadev extent record. */ 226/* Cross-reference a single datadev extent record. */
230STATIC void 227STATIC void
231xchk_bmap_extent_xref( 228xchk_bmap_iextent_xref(
232 struct xchk_bmap_info *info,
233 struct xfs_inode *ip, 229 struct xfs_inode *ip,
234 struct xfs_btree_cur *cur, 230 struct xchk_bmap_info *info,
235 struct xfs_bmbt_irec *irec) 231 struct xfs_bmbt_irec *irec)
236{ 232{
237 struct xfs_mount *mp = info->sc->mp; 233 struct xfs_mount *mp = info->sc->mp;
@@ -240,9 +236,6 @@ xchk_bmap_extent_xref(
240 xfs_extlen_t len; 236 xfs_extlen_t len;
241 int error; 237 int error;
242 238
243 if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
244 return;
245
246 agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock); 239 agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock);
247 agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); 240 agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
248 len = irec->br_blockcount; 241 len = irec->br_blockcount;
@@ -300,20 +293,15 @@ xchk_bmap_dirattr_extent(
300 293
301/* Scrub a single extent record. */ 294/* Scrub a single extent record. */
302STATIC int 295STATIC int
303xchk_bmap_extent( 296xchk_bmap_iextent(
304 struct xfs_inode *ip, 297 struct xfs_inode *ip,
305 struct xfs_btree_cur *cur,
306 struct xchk_bmap_info *info, 298 struct xchk_bmap_info *info,
307 struct xfs_bmbt_irec *irec) 299 struct xfs_bmbt_irec *irec)
308{ 300{
309 struct xfs_mount *mp = info->sc->mp; 301 struct xfs_mount *mp = info->sc->mp;
310 struct xfs_buf *bp = NULL;
311 xfs_filblks_t end; 302 xfs_filblks_t end;
312 int error = 0; 303 int error = 0;
313 304
314 if (cur)
315 xfs_btree_get_block(cur, 0, &bp);
316
317 /* 305 /*
318 * Check for out-of-order extents. This record could have come 306 * Check for out-of-order extents. This record could have come
319 * from the incore list, for which there is no ordering check. 307 * from the incore list, for which there is no ordering check.
@@ -364,10 +352,13 @@ xchk_bmap_extent(
364 xchk_fblock_set_corrupt(info->sc, info->whichfork, 352 xchk_fblock_set_corrupt(info->sc, info->whichfork,
365 irec->br_startoff); 353 irec->br_startoff);
366 354
355 if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
356 return 0;
357
367 if (info->is_rt) 358 if (info->is_rt)
368 xchk_bmap_rt_extent_xref(info, ip, cur, irec); 359 xchk_bmap_rt_iextent_xref(ip, info, irec);
369 else 360 else
370 xchk_bmap_extent_xref(info, ip, cur, irec); 361 xchk_bmap_iextent_xref(ip, info, irec);
371 362
372 info->lastoff = irec->br_startoff + irec->br_blockcount; 363 info->lastoff = irec->br_startoff + irec->br_blockcount;
373 return error; 364 return error;
@@ -380,10 +371,13 @@ xchk_bmapbt_rec(
380 union xfs_btree_rec *rec) 371 union xfs_btree_rec *rec)
381{ 372{
382 struct xfs_bmbt_irec irec; 373 struct xfs_bmbt_irec irec;
374 struct xfs_bmbt_irec iext_irec;
375 struct xfs_iext_cursor icur;
383 struct xchk_bmap_info *info = bs->private; 376 struct xchk_bmap_info *info = bs->private;
384 struct xfs_inode *ip = bs->cur->bc_private.b.ip; 377 struct xfs_inode *ip = bs->cur->bc_private.b.ip;
385 struct xfs_buf *bp = NULL; 378 struct xfs_buf *bp = NULL;
386 struct xfs_btree_block *block; 379 struct xfs_btree_block *block;
380 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork);
387 uint64_t owner; 381 uint64_t owner;
388 int i; 382 int i;
389 383
@@ -402,9 +396,26 @@ xchk_bmapbt_rec(
402 } 396 }
403 } 397 }
404 398
405 /* Set up the in-core record and scrub it. */ 399 /*
400 * Check that the incore extent tree contains an extent that matches
401 * this one exactly. We validate those cached bmaps later, so we don't
402 * need to check them here. If the incore extent tree was just loaded
403 * from disk by the scrubber, we assume that its contents match what's
404 * on disk (we still hold the ILOCK) and skip the equivalence check.
405 */
406 if (!info->was_loaded)
407 return 0;
408
406 xfs_bmbt_disk_get_all(&rec->bmbt, &irec); 409 xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
407 return xchk_bmap_extent(ip, bs->cur, info, &irec); 410 if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur,
411 &iext_irec) ||
412 irec.br_startoff != iext_irec.br_startoff ||
413 irec.br_startblock != iext_irec.br_startblock ||
414 irec.br_blockcount != iext_irec.br_blockcount ||
415 irec.br_state != iext_irec.br_state)
416 xchk_fblock_set_corrupt(bs->sc, info->whichfork,
417 irec.br_startoff);
418 return 0;
408} 419}
409 420
410/* Scan the btree records. */ 421/* Scan the btree records. */
@@ -415,15 +426,26 @@ xchk_bmap_btree(
415 struct xchk_bmap_info *info) 426 struct xchk_bmap_info *info)
416{ 427{
417 struct xfs_owner_info oinfo; 428 struct xfs_owner_info oinfo;
429 struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork);
418 struct xfs_mount *mp = sc->mp; 430 struct xfs_mount *mp = sc->mp;
419 struct xfs_inode *ip = sc->ip; 431 struct xfs_inode *ip = sc->ip;
420 struct xfs_btree_cur *cur; 432 struct xfs_btree_cur *cur;
421 int error; 433 int error;
422 434
435 /* Load the incore bmap cache if it's not loaded. */
436 info->was_loaded = ifp->if_flags & XFS_IFEXTENTS;
437 if (!info->was_loaded) {
438 error = xfs_iread_extents(sc->tp, ip, whichfork);
439 if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
440 goto out;
441 }
442
443 /* Check the btree structure. */
423 cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork); 444 cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork);
424 xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); 445 xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
425 error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info); 446 error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info);
426 xfs_btree_del_cursor(cur, error); 447 xfs_btree_del_cursor(cur, error);
448out:
427 return error; 449 return error;
428} 450}
429 451
@@ -500,7 +522,7 @@ xchk_bmap_check_rmap(
500 522
501out: 523out:
502 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 524 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
503 return XFS_BTREE_QUERY_RANGE_ABORT; 525 return -ECANCELED;
504 return 0; 526 return 0;
505} 527}
506 528
@@ -529,7 +551,7 @@ xchk_bmap_check_ag_rmaps(
529 sbcri.sc = sc; 551 sbcri.sc = sc;
530 sbcri.whichfork = whichfork; 552 sbcri.whichfork = whichfork;
531 error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri); 553 error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri);
532 if (error == XFS_BTREE_QUERY_RANGE_ABORT) 554 if (error == -ECANCELED)
533 error = 0; 555 error = 0;
534 556
535 xfs_btree_del_cursor(cur, error); 557 xfs_btree_del_cursor(cur, error);
@@ -671,13 +693,6 @@ xchk_bmap(
671 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 693 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
672 goto out; 694 goto out;
673 695
674 /* Now try to scrub the in-memory extent list. */
675 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
676 error = xfs_iread_extents(sc->tp, ip, whichfork);
677 if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
678 goto out;
679 }
680
681 /* Find the offset of the last extent in the mapping. */ 696 /* Find the offset of the last extent in the mapping. */
682 error = xfs_bmap_last_offset(ip, &endoff, whichfork); 697 error = xfs_bmap_last_offset(ip, &endoff, whichfork);
683 if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) 698 if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
@@ -689,7 +704,7 @@ xchk_bmap(
689 for_each_xfs_iext(ifp, &icur, &irec) { 704 for_each_xfs_iext(ifp, &icur, &irec) {
690 if (xchk_should_terminate(sc, &error) || 705 if (xchk_should_terminate(sc, &error) ||
691 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) 706 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
692 break; 707 goto out;
693 if (isnullstartblock(irec.br_startblock)) 708 if (isnullstartblock(irec.br_startblock))
694 continue; 709 continue;
695 if (irec.br_startoff >= endoff) { 710 if (irec.br_startoff >= endoff) {
@@ -697,7 +712,7 @@ xchk_bmap(
697 irec.br_startoff); 712 irec.br_startoff);
698 goto out; 713 goto out;
699 } 714 }
700 error = xchk_bmap_extent(ip, NULL, &info, &irec); 715 error = xchk_bmap_iextent(ip, &info, &irec);
701 if (error) 716 if (error)
702 goto out; 717 goto out;
703 } 718 }
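
The new xchk_bmapbt_rec() check reduces to an exact four-field comparison between the on-disk btree record and the cached incore extent. A standalone sketch of that equivalence predicate, with the field types simplified:

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-in for struct xfs_bmbt_irec. */
    struct bmbt_irec {
            unsigned long long br_startoff;
            unsigned long long br_startblock;
            unsigned long long br_blockcount;
            int                br_state;
    };

    /* All four fields must match exactly, else the fork is corrupt. */
    static bool irec_matches(const struct bmbt_irec *disk,
                             const struct bmbt_irec *cached)
    {
            return disk->br_startoff   == cached->br_startoff &&
                   disk->br_startblock == cached->br_startblock &&
                   disk->br_blockcount == cached->br_blockcount &&
                   disk->br_state      == cached->br_state;
    }

    int main(void)
    {
            struct bmbt_irec ondisk = { 0, 100, 8, 0 };
            struct bmbt_irec cached = { 0, 100, 8, 0 };

            printf("match: %d\n", irec_matches(&ondisk, &cached));
            return 0;
    }
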
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index fc3f510c9034..98f82d7c8b40 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -125,7 +125,7 @@ xchk_setup_fscounters(
125 struct xchk_fscounters *fsc; 125 struct xchk_fscounters *fsc;
126 int error; 126 int error;
127 127
128 sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP); 128 sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
129 if (!sc->buf) 129 if (!sc->buf)
130 return -ENOMEM; 130 return -ENOMEM;
131 fsc = sc->buf; 131 fsc = sc->buf;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 4cfeec57fb05..b70a88bc975e 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -351,7 +351,7 @@ xrep_init_btblock(
351 xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); 351 xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
352 xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno); 352 xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno);
353 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); 353 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
354 xfs_trans_log_buf(tp, bp, 0, bp->b_length); 354 xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
355 bp->b_ops = ops; 355 bp->b_ops = ops;
356 *bpp = bp; 356 *bpp = bp;
357 357
@@ -664,7 +664,7 @@ xrep_findroot_agfl_walk(
664{ 664{
665 xfs_agblock_t *agbno = priv; 665 xfs_agblock_t *agbno = priv;
666 666
667 return (*agbno == bno) ? XFS_ITER_ABORT : 0; 667 return (*agbno == bno) ? -ECANCELED : 0;
668} 668}
669 669
670/* Does this block match the btree information passed in? */ 670/* Does this block match the btree information passed in? */
@@ -694,7 +694,7 @@ xrep_findroot_block(
694 if (owner == XFS_RMAP_OWN_AG) { 694 if (owner == XFS_RMAP_OWN_AG) {
695 error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp, 695 error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
696 xrep_findroot_agfl_walk, &agbno); 696 xrep_findroot_agfl_walk, &agbno);
697 if (error == XFS_ITER_ABORT) 697 if (error == -ECANCELED)
698 return 0; 698 return 0;
699 if (error) 699 if (error)
700 return error; 700 return error;
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 99c0b1234c3c..5641ae512c9e 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -22,7 +22,7 @@ xchk_setup_symlink(
22 struct xfs_inode *ip) 22 struct xfs_inode *ip)
23{ 23{
24 /* Allocate the buffer without the inode lock held. */ 24 /* Allocate the buffer without the inode lock held. */
25 sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP); 25 sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0);
26 if (!sc->buf) 26 if (!sc->buf)
27 return -ENOMEM; 27 return -ENOMEM;
28 28
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index cbda40d40326..96d7071cfa46 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -112,7 +112,7 @@ xfs_get_acl(struct inode *inode, int type)
112{ 112{
113 struct xfs_inode *ip = XFS_I(inode); 113 struct xfs_inode *ip = XFS_I(inode);
114 struct posix_acl *acl = NULL; 114 struct posix_acl *acl = NULL;
115 struct xfs_acl *xfs_acl; 115 struct xfs_acl *xfs_acl = NULL;
116 unsigned char *ea_name; 116 unsigned char *ea_name;
117 int error; 117 int error;
118 int len; 118 int len;
@@ -135,12 +135,8 @@ xfs_get_acl(struct inode *inode, int type)
135 * go out to the disk. 135 * go out to the disk.
136 */ 136 */
137 len = XFS_ACL_MAX_SIZE(ip->i_mount); 137 len = XFS_ACL_MAX_SIZE(ip->i_mount);
138 xfs_acl = kmem_zalloc_large(len, KM_SLEEP); 138 error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len,
139 if (!xfs_acl) 139 ATTR_ALLOC | ATTR_ROOT);
140 return ERR_PTR(-ENOMEM);
141
142 error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
143 &len, ATTR_ROOT);
144 if (error) { 140 if (error) {
145 /* 141 /*
146 * If the attribute doesn't exist make sure we have a negative 142 * If the attribute doesn't exist make sure we have a negative
@@ -151,8 +147,8 @@ xfs_get_acl(struct inode *inode, int type)
151 } else { 147 } else {
152 acl = xfs_acl_from_disk(xfs_acl, len, 148 acl = xfs_acl_from_disk(xfs_acl, len,
153 XFS_ACL_MAX_ENTRIES(ip->i_mount)); 149 XFS_ACL_MAX_ENTRIES(ip->i_mount));
150 kmem_free(xfs_acl);
154 } 151 }
155 kmem_free(xfs_acl);
156 return acl; 152 return acl;
157} 153}
158 154
@@ -180,7 +176,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
180 struct xfs_acl *xfs_acl; 176 struct xfs_acl *xfs_acl;
181 int len = XFS_ACL_MAX_SIZE(ip->i_mount); 177 int len = XFS_ACL_MAX_SIZE(ip->i_mount);
182 178
183 xfs_acl = kmem_zalloc_large(len, KM_SLEEP); 179 xfs_acl = kmem_zalloc_large(len, 0);
184 if (!xfs_acl) 180 if (!xfs_acl)
185 return -ENOMEM; 181 return -ENOMEM;
186 182
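
The ACL read path above moves from a caller-allocated buffer to having xfs_attr_get() allocate one when ATTR_ALLOC is passed, with the buffer handed to the caller only on success. A userspace model of that allocating-getter contract (get_value() and its errnos are illustrative, not the kernel API):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /*
     * Allocating getter: on success *bufp is a buffer the caller must
     * free; on error nothing is handed back. This mirrors the ATTR_ALLOC
     * ownership rule but is not the kernel interface.
     */
    static int get_value(const char *name, unsigned char **bufp, int *len)
    {
            const char *value = "demo-acl-bytes";

            if (strcmp(name, "acl") != 0)
                    return -ENODATA;

            *len = (int)strlen(value);
            *bufp = malloc((size_t)*len);
            if (!*bufp)
                    return -ENOMEM;
            memcpy(*bufp, value, (size_t)*len);
            return 0;
    }

    int main(void)
    {
            unsigned char *buf = NULL;
            int len = 0;
            int error = get_value("acl", &buf, &len);

            if (!error) {
                    printf("got %d bytes\n", len);
                    free(buf);              /* caller owns the buffer */
            }
            return error ? 1 : 0;
    }
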
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index dc93c51c17de..a640a285cc52 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -147,7 +147,7 @@ xfs_attr3_leaf_inactive(
147 * Allocate storage for a list of all the "remote" value extents. 147 * Allocate storage for a list of all the "remote" value extents.
148 */ 148 */
149 size = count * sizeof(xfs_attr_inactive_list_t); 149 size = count * sizeof(xfs_attr_inactive_list_t);
150 list = kmem_alloc(size, KM_SLEEP); 150 list = kmem_alloc(size, 0);
151 151
152 /* 152 /*
153 * Identify each of the "remote" value extents. 153 * Identify each of the "remote" value extents.
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 58fc820a70c6..00758fdc2fec 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -109,7 +109,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
109 * It didn't all fit, so we have to sort everything on hashval. 109 * It didn't all fit, so we have to sort everything on hashval.
110 */ 110 */
111 sbsize = sf->hdr.count * sizeof(*sbuf); 111 sbsize = sf->hdr.count * sizeof(*sbuf);
112 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); 112 sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
113 113
114 /* 114 /*
115 * Scan the attribute list for the rest of the entries, storing 115 * Scan the attribute list for the rest of the entries, storing
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 9fa4a7ee8cfc..83d24e983d4c 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -141,7 +141,7 @@ xfs_bui_init(
141{ 141{
142 struct xfs_bui_log_item *buip; 142 struct xfs_bui_log_item *buip;
143 143
144 buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP); 144 buip = kmem_zone_zalloc(xfs_bui_zone, 0);
145 145
146 xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); 146 xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
147 buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; 147 buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -218,7 +218,7 @@ xfs_trans_get_bud(
218{ 218{
219 struct xfs_bud_log_item *budp; 219 struct xfs_bud_log_item *budp;
220 220
221 budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); 221 budp = kmem_zone_zalloc(xfs_bud_zone, 0);
222 xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, 222 xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
223 &xfs_bud_item_ops); 223 &xfs_bud_item_ops);
224 budp->bud_buip = buip; 224 budp->bud_buip = buip;
@@ -542,9 +542,7 @@ xfs_bui_recover(
542 irec.br_blockcount = count; 542 irec.br_blockcount = count;
543 irec.br_startoff = bmap->me_startoff; 543 irec.br_startoff = bmap->me_startoff;
544 irec.br_state = state; 544 irec.br_state = state;
545 error = xfs_bmap_unmap_extent(tp, ip, &irec); 545 xfs_bmap_unmap_extent(tp, ip, &irec);
546 if (error)
547 goto err_inode;
548 } 546 }
549 547
550 set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); 548 set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 98c6a7a71427..0910cb75b65d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -39,9 +39,9 @@
39xfs_daddr_t 39xfs_daddr_t
40xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) 40xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
41{ 41{
42 return (XFS_IS_REALTIME_INODE(ip) ? \ 42 if (XFS_IS_REALTIME_INODE(ip))
43 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ 43 return XFS_FSB_TO_BB(ip->i_mount, fsb);
44 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); 44 return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
45} 45}
46 46
47/* 47/*
@@ -1532,24 +1532,16 @@ xfs_swap_extent_rmap(
1532 trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); 1532 trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
1533 1533
1534 /* Remove the mapping from the donor file. */ 1534 /* Remove the mapping from the donor file. */
1535 error = xfs_bmap_unmap_extent(tp, tip, &uirec); 1535 xfs_bmap_unmap_extent(tp, tip, &uirec);
1536 if (error)
1537 goto out;
1538 1536
1539 /* Remove the mapping from the source file. */ 1537 /* Remove the mapping from the source file. */
1540 error = xfs_bmap_unmap_extent(tp, ip, &irec); 1538 xfs_bmap_unmap_extent(tp, ip, &irec);
1541 if (error)
1542 goto out;
1543 1539
1544 /* Map the donor file's blocks into the source file. */ 1540 /* Map the donor file's blocks into the source file. */
1545 error = xfs_bmap_map_extent(tp, ip, &uirec); 1541 xfs_bmap_map_extent(tp, ip, &uirec);
1546 if (error)
1547 goto out;
1548 1542
1549 /* Map the source file's blocks into the donor file. */ 1543 /* Map the source file's blocks into the donor file. */
1550 error = xfs_bmap_map_extent(tp, tip, &irec); 1544 xfs_bmap_map_extent(tp, tip, &irec);
1551 if (error)
1552 goto out;
1553 1545
1554 error = xfs_defer_finish(tpp); 1546 error = xfs_defer_finish(tpp);
1555 tp = *tpp; 1547 tp = *tpp;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index ca0849043f54..120ef99d09e8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -353,7 +353,8 @@ xfs_buf_allocate_memory(
353 */ 353 */
354 size = BBTOB(bp->b_length); 354 size = BBTOB(bp->b_length);
355 if (size < PAGE_SIZE) { 355 if (size < PAGE_SIZE) {
356 bp->b_addr = kmem_alloc(size, KM_NOFS); 356 int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
357 bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS);
357 if (!bp->b_addr) { 358 if (!bp->b_addr) {
358 /* low memory - use alloc_page loop instead */ 359 /* low memory - use alloc_page loop instead */
359 goto use_alloc_page; 360 goto use_alloc_page;
@@ -368,7 +369,7 @@ xfs_buf_allocate_memory(
368 } 369 }
369 bp->b_offset = offset_in_page(bp->b_addr); 370 bp->b_offset = offset_in_page(bp->b_addr);
370 bp->b_pages = bp->b_page_array; 371 bp->b_pages = bp->b_page_array;
371 bp->b_pages[0] = virt_to_page(bp->b_addr); 372 bp->b_pages[0] = kmem_to_page(bp->b_addr);
372 bp->b_page_count = 1; 373 bp->b_page_count = 1;
373 bp->b_flags |= _XBF_KMEM; 374 bp->b_flags |= _XBF_KMEM;
374 return 0; 375 return 0;
@@ -1741,7 +1742,7 @@ xfs_alloc_buftarg(
1741{ 1742{
1742 xfs_buftarg_t *btp; 1743 xfs_buftarg_t *btp;
1743 1744
1744 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); 1745 btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
1745 1746
1746 btp->bt_mount = mp; 1747 btp->bt_mount = mp;
1747 btp->bt_dev = bdev->bd_dev; 1748 btp->bt_dev = bdev->bd_dev;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c6e57a3f409e..f6ce17d8d848 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -350,6 +350,12 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
350#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) 350#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
351#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) 351#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
352 352
353static inline int
354xfs_buftarg_dma_alignment(struct xfs_buftarg *bt)
355{
356 return queue_dma_alignment(bt->bt_bdev->bd_disk->queue);
357}
358
353int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); 359int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
354bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); 360bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
355bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); 361bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
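
xfs_buftarg_dma_alignment() exposes the request queue's DMA alignment mask so kmem_alloc_io() can return memory that satisfies it. A rough userspace analogue using aligned_alloc(), where the 511-byte mask (512-byte alignment) is only an assumed typical value:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Assumed DMA alignment mask: 511 means 512-byte alignment. */
    #define DMA_ALIGN_MASK 511U

    static void *alloc_io(size_t size, unsigned int align_mask)
    {
            size_t align = (size_t)align_mask + 1;  /* mask is 2^n - 1 */

            /* aligned_alloc() wants size to be a multiple of align. */
            return aligned_alloc(align, (size + align - 1) & ~(align - 1));
    }

    int main(void)
    {
            void *p = alloc_io(1024, DMA_ALIGN_MASK);

            if (!p)
                    return 1;
            printf("addr & mask = %lu\n",
                   (unsigned long)((uintptr_t)p & DMA_ALIGN_MASK));
            free(p);
            return 0;
    }
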
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 7dcaec54a20b..d74fbd1e9d3e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -702,7 +702,7 @@ xfs_buf_item_get_format(
702 } 702 }
703 703
704 bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), 704 bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
705 KM_SLEEP); 705 0);
706 if (!bip->bli_formats) 706 if (!bip->bli_formats)
707 return -ENOMEM; 707 return -ENOMEM;
708 return 0; 708 return 0;
@@ -747,7 +747,7 @@ xfs_buf_item_init(
747 return 0; 747 return 0;
748 } 748 }
749 749
750 bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); 750 bip = kmem_zone_zalloc(xfs_buf_item_zone, 0);
751 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); 751 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
752 bip->bli_buf = bp; 752 bip->bli_buf = bp;
753 753
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index fb1ad4483081..aeb95e7391c1 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -440,7 +440,7 @@ xfs_dquot_alloc(
440{ 440{
441 struct xfs_dquot *dqp; 441 struct xfs_dquot *dqp;
442 442
443 dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); 443 dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0);
444 444
445 dqp->dq_flags = type; 445 dqp->dq_flags = type;
446 dqp->q_core.d_id = cpu_to_be32(id); 446 dqp->q_core.d_id = cpu_to_be32(id);
@@ -1239,7 +1239,7 @@ xfs_qm_exit(void)
1239/* 1239/*
1240 * Iterate every dquot of a particular type. The caller must ensure that the 1240 * Iterate every dquot of a particular type. The caller must ensure that the
1241 * particular quota type is active. iter_fn can return negative error codes, 1241 * particular quota type is active. iter_fn can return negative error codes,
1242 * or XFS_ITER_ABORT to indicate that it wants to stop iterating. 1242 * or -ECANCELED to indicate that it wants to stop iterating.
1243 */ 1243 */
1244int 1244int
1245xfs_qm_dqiterate( 1245xfs_qm_dqiterate(
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 282ec5af293e..d60647d7197b 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -347,7 +347,7 @@ xfs_qm_qoff_logitem_init(
347{ 347{
348 struct xfs_qoff_logitem *qf; 348 struct xfs_qoff_logitem *qf;
349 349
350 qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); 350 qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0);
351 351
352 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? 352 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
353 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); 353 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 544c9482a0ef..849fd4476950 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -213,7 +213,7 @@ xfs_errortag_init(
213 struct xfs_mount *mp) 213 struct xfs_mount *mp)
214{ 214{
215 mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, 215 mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
216 KM_SLEEP | KM_MAYFAIL); 216 KM_MAYFAIL);
217 if (!mp->m_errortag) 217 if (!mp->m_errortag)
218 return -ENOMEM; 218 return -ENOMEM;
219 219
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 0ed68379e551..2183d87be4cf 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -33,7 +33,7 @@ xfs_extent_busy_insert(
33 struct rb_node **rbp; 33 struct rb_node **rbp;
34 struct rb_node *parent = NULL; 34 struct rb_node *parent = NULL;
35 35
36 new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); 36 new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
37 new->agno = agno; 37 new->agno = agno;
38 new->bno = bno; 38 new->bno = bno;
39 new->length = len; 39 new->length = len;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 86f6512d6864..e44efc41a041 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -163,9 +163,9 @@ xfs_efi_init(
163 if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { 163 if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
164 size = (uint)(sizeof(xfs_efi_log_item_t) + 164 size = (uint)(sizeof(xfs_efi_log_item_t) +
165 ((nextents - 1) * sizeof(xfs_extent_t))); 165 ((nextents - 1) * sizeof(xfs_extent_t)));
166 efip = kmem_zalloc(size, KM_SLEEP); 166 efip = kmem_zalloc(size, 0);
167 } else { 167 } else {
168 efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); 168 efip = kmem_zone_zalloc(xfs_efi_zone, 0);
169 } 169 }
170 170
171 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); 171 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
@@ -333,9 +333,9 @@ xfs_trans_get_efd(
333 if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { 333 if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
334 efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + 334 efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
335 (nextents - 1) * sizeof(struct xfs_extent), 335 (nextents - 1) * sizeof(struct xfs_extent),
336 KM_SLEEP); 336 0);
337 } else { 337 } else {
338 efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); 338 efdp = kmem_zone_zalloc(xfs_efd_zone, 0);
339 } 339 }
340 340
341 xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, 341 xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 28101bbc0b78..d952d5962e93 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -28,6 +28,7 @@
28#include <linux/falloc.h> 28#include <linux/falloc.h>
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/mman.h> 30#include <linux/mman.h>
31#include <linux/fadvise.h>
31 32
32static const struct vm_operations_struct xfs_file_vm_ops; 33static const struct vm_operations_struct xfs_file_vm_ops;
33 34
@@ -933,6 +934,30 @@ out_unlock:
933 return error; 934 return error;
934} 935}
935 936
937STATIC int
938xfs_file_fadvise(
939 struct file *file,
940 loff_t start,
941 loff_t end,
942 int advice)
943{
944 struct xfs_inode *ip = XFS_I(file_inode(file));
945 int ret;
946 int lockflags = 0;
947
948 /*
949 * Operations creating pages in page cache need protection from hole
 950 * punching and similar ops.
951 */
952 if (advice == POSIX_FADV_WILLNEED) {
953 lockflags = XFS_IOLOCK_SHARED;
954 xfs_ilock(ip, lockflags);
955 }
956 ret = generic_fadvise(file, start, end, advice);
957 if (lockflags)
958 xfs_iunlock(ip, lockflags);
959 return ret;
960}
936 961
937STATIC loff_t 962STATIC loff_t
938xfs_file_remap_range( 963xfs_file_remap_range(
@@ -1232,6 +1257,7 @@ const struct file_operations xfs_file_operations = {
1232 .fsync = xfs_file_fsync, 1257 .fsync = xfs_file_fsync,
1233 .get_unmapped_area = thp_get_unmapped_area, 1258 .get_unmapped_area = thp_get_unmapped_area,
1234 .fallocate = xfs_file_fallocate, 1259 .fallocate = xfs_file_fallocate,
1260 .fadvise = xfs_file_fadvise,
1235 .remap_file_range = xfs_file_remap_range, 1261 .remap_file_range = xfs_file_remap_range,
1236}; 1262};
1237 1263
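
The new ->fadvise hook means a POSIX_FADV_WILLNEED request now takes the inode IOLOCK in shared mode, so readahead cannot instantiate pages while a hole punch is in progress. The userspace call is unchanged; for example:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/etc/hostname", O_RDONLY);       /* any readable file */
            if (fd < 0)
                    return 1;

            /* Kicks off readahead; on XFS this path now holds IOLOCK_SHARED. */
            int ret = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
            if (ret)
                    fprintf(stderr, "fadvise: %d\n", ret);

            close(fd);
            return ret ? 1 : 0;
    }
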
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 5a8f9641562a..d082143feb5a 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -250,7 +250,7 @@ xfs_getfsmap_helper(
250 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 250 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
251 if (info->next_daddr < rec_daddr) 251 if (info->next_daddr < rec_daddr)
252 info->next_daddr = rec_daddr; 252 info->next_daddr = rec_daddr;
253 return XFS_BTREE_QUERY_RANGE_CONTINUE; 253 return 0;
254 } 254 }
255 255
256 /* Are we just counting mappings? */ 256 /* Are we just counting mappings? */
@@ -259,14 +259,14 @@ xfs_getfsmap_helper(
259 info->head->fmh_entries++; 259 info->head->fmh_entries++;
260 260
261 if (info->last) 261 if (info->last)
262 return XFS_BTREE_QUERY_RANGE_CONTINUE; 262 return 0;
263 263
264 info->head->fmh_entries++; 264 info->head->fmh_entries++;
265 265
266 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 266 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
267 if (info->next_daddr < rec_daddr) 267 if (info->next_daddr < rec_daddr)
268 info->next_daddr = rec_daddr; 268 info->next_daddr = rec_daddr;
269 return XFS_BTREE_QUERY_RANGE_CONTINUE; 269 return 0;
270 } 270 }
271 271
272 /* 272 /*
@@ -276,7 +276,7 @@ xfs_getfsmap_helper(
276 */ 276 */
277 if (rec_daddr > info->next_daddr) { 277 if (rec_daddr > info->next_daddr) {
278 if (info->head->fmh_entries >= info->head->fmh_count) 278 if (info->head->fmh_entries >= info->head->fmh_count)
279 return XFS_BTREE_QUERY_RANGE_ABORT; 279 return -ECANCELED;
280 280
281 fmr.fmr_device = info->dev; 281 fmr.fmr_device = info->dev;
282 fmr.fmr_physical = info->next_daddr; 282 fmr.fmr_physical = info->next_daddr;
@@ -295,7 +295,7 @@ xfs_getfsmap_helper(
295 295
296 /* Fill out the extent we found */ 296 /* Fill out the extent we found */
297 if (info->head->fmh_entries >= info->head->fmh_count) 297 if (info->head->fmh_entries >= info->head->fmh_count)
298 return XFS_BTREE_QUERY_RANGE_ABORT; 298 return -ECANCELED;
299 299
300 trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec); 300 trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec);
301 301
@@ -328,7 +328,7 @@ out:
328 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); 328 rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
329 if (info->next_daddr < rec_daddr) 329 if (info->next_daddr < rec_daddr)
330 info->next_daddr = rec_daddr; 330 info->next_daddr = rec_daddr;
331 return XFS_BTREE_QUERY_RANGE_CONTINUE; 331 return 0;
332} 332}
333 333
334/* Transform a rmapbt irec into a fsmap */ 334/* Transform a rmapbt irec into a fsmap */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0b0fd10a36d4..944add5ff8e0 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -40,7 +40,7 @@ xfs_inode_alloc(
40 * KM_MAYFAIL and return NULL here on ENOMEM. Set the 40 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
41 * code up to do this anyway. 41 * code up to do this anyway.
42 */ 42 */
43 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); 43 ip = kmem_zone_alloc(xfs_inode_zone, 0);
44 if (!ip) 44 if (!ip)
45 return NULL; 45 return NULL;
46 if (inode_init_always(mp->m_super, VFS_I(ip))) { 46 if (inode_init_always(mp->m_super, VFS_I(ip))) {
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index d99a0a3e5f40..3ebd1b7f49d8 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -89,7 +89,7 @@ xfs_icreate_log(
89{ 89{
90 struct xfs_icreate_item *icp; 90 struct xfs_icreate_item *icp;
91 91
92 icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); 92 icp = kmem_zone_zalloc(xfs_icreate_zone, 0);
93 93
94 xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, 94 xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
95 &xfs_icreate_item_ops); 95 &xfs_icreate_item_ops);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6467d5e1df2d..18f4b262e61c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2018,7 +2018,7 @@ xfs_iunlink_add_backref(
2018 if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) 2018 if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
2019 return 0; 2019 return 0;
2020 2020
2021 iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); 2021 iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
2022 iu->iu_agino = prev_agino; 2022 iu->iu_agino = prev_agino;
2023 iu->iu_next_unlinked = this_agino; 2023 iu->iu_next_unlinked = this_agino;
2024 2024
@@ -3282,7 +3282,8 @@ xfs_rename(
3282 spaceres); 3282 spaceres);
3283 3283
3284 /* 3284 /*
3285 * Set up the target. 3285 * Check for expected errors before we dirty the transaction
3286 * so we can return an error without a transaction abort.
3286 */ 3287 */
3287 if (target_ip == NULL) { 3288 if (target_ip == NULL) {
3288 /* 3289 /*
@@ -3294,6 +3295,46 @@ xfs_rename(
3294 if (error) 3295 if (error)
3295 goto out_trans_cancel; 3296 goto out_trans_cancel;
3296 } 3297 }
3298 } else {
3299 /*
 3300 * If target exists and it's a directory, check whether
3301 * it can be destroyed.
3302 */
3303 if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
3304 (!xfs_dir_isempty(target_ip) ||
3305 (VFS_I(target_ip)->i_nlink > 2))) {
3306 error = -EEXIST;
3307 goto out_trans_cancel;
3308 }
3309 }
3310
3311 /*
3312 * Directory entry creation below may acquire the AGF. Remove
3313 * the whiteout from the unlinked list first to preserve correct
3314 * AGI/AGF locking order. This dirties the transaction so failures
3315 * after this point will abort and log recovery will clean up the
3316 * mess.
3317 *
3318 * For whiteouts, we need to bump the link count on the whiteout
3319 * inode. After this point, we have a real link, clear the tmpfile
3320 * state flag from the inode so it doesn't accidentally get misused
3321 * in future.
3322 */
3323 if (wip) {
3324 ASSERT(VFS_I(wip)->i_nlink == 0);
3325 error = xfs_iunlink_remove(tp, wip);
3326 if (error)
3327 goto out_trans_cancel;
3328
3329 xfs_bumplink(tp, wip);
3330 xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3331 VFS_I(wip)->i_state &= ~I_LINKABLE;
3332 }
3333
3334 /*
3335 * Set up the target.
3336 */
3337 if (target_ip == NULL) {
3297 /* 3338 /*
3298 * If target does not exist and the rename crosses 3339 * If target does not exist and the rename crosses
3299 * directories, adjust the target directory link count 3340 * directories, adjust the target directory link count
@@ -3312,22 +3353,6 @@ xfs_rename(
3312 } 3353 }
3313 } else { /* target_ip != NULL */ 3354 } else { /* target_ip != NULL */
3314 /* 3355 /*
3315 * If target exists and it's a directory, check that both
3316 * target and source are directories and that target can be
3317 * destroyed, or that neither is a directory.
3318 */
3319 if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
3320 /*
3321 * Make sure target dir is empty.
3322 */
3323 if (!(xfs_dir_isempty(target_ip)) ||
3324 (VFS_I(target_ip)->i_nlink > 2)) {
3325 error = -EEXIST;
3326 goto out_trans_cancel;
3327 }
3328 }
3329
3330 /*
3331 * Link the source inode under the target name. 3356 * Link the source inode under the target name.
3332 * If the source inode is a directory and we are moving 3357 * If the source inode is a directory and we are moving
3333 * it across directories, its ".." entry will be 3358 * it across directories, its ".." entry will be
@@ -3417,30 +3442,6 @@ xfs_rename(
3417 if (error) 3442 if (error)
3418 goto out_trans_cancel; 3443 goto out_trans_cancel;
3419 3444
3420 /*
3421 * For whiteouts, we need to bump the link count on the whiteout inode.
3422 * This means that failures all the way up to this point leave the inode
3423 * on the unlinked list and so cleanup is a simple matter of dropping
3424 * the remaining reference to it. If we fail here after bumping the link
3425 * count, we're shutting down the filesystem so we'll never see the
3426 * intermediate state on disk.
3427 */
3428 if (wip) {
3429 ASSERT(VFS_I(wip)->i_nlink == 0);
3430 xfs_bumplink(tp, wip);
3431 error = xfs_iunlink_remove(tp, wip);
3432 if (error)
3433 goto out_trans_cancel;
3434 xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3435
3436 /*
3437 * Now we have a real link, clear the "I'm a tmpfile" state
3438 * flag from the inode so it doesn't accidentally get misused in
3439 * future.
3440 */
3441 VFS_I(wip)->i_state &= ~I_LINKABLE;
3442 }
3443
3444 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3445 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3445 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 3446 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3446 if (new_parent) 3447 if (new_parent)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c9a502eed204..bb8f076805b9 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -651,7 +651,7 @@ xfs_inode_item_init(
651 struct xfs_inode_log_item *iip; 651 struct xfs_inode_log_item *iip;
652 652
653 ASSERT(ip->i_itemp == NULL); 653 ASSERT(ip->i_itemp == NULL);
654 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 654 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0);
655 655
656 iip->ili_inode = ip; 656 iip->ili_inode = ip;
657 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, 657 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index affa557c2337..d58f0d6a699e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -396,7 +396,7 @@ xfs_attrlist_by_handle(
396 if (IS_ERR(dentry)) 396 if (IS_ERR(dentry))
397 return PTR_ERR(dentry); 397 return PTR_ERR(dentry);
398 398
399 kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); 399 kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
400 if (!kbuf) 400 if (!kbuf)
401 goto out_dput; 401 goto out_dput;
402 402
@@ -434,11 +434,11 @@ xfs_attrmulti_attr_get(
434 434
435 if (*len > XFS_XATTR_SIZE_MAX) 435 if (*len > XFS_XATTR_SIZE_MAX)
436 return -EINVAL; 436 return -EINVAL;
437 kbuf = kmem_zalloc_large(*len, KM_SLEEP); 437 kbuf = kmem_zalloc_large(*len, 0);
438 if (!kbuf) 438 if (!kbuf)
439 return -ENOMEM; 439 return -ENOMEM;
440 440
441 error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); 441 error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags);
442 if (error) 442 if (error)
443 goto out_kfree; 443 goto out_kfree;
444 444
@@ -831,7 +831,7 @@ xfs_bulkstat_fmt(
831/* 831/*
832 * Check the incoming bulk request @hdr from userspace and initialize the 832 * Check the incoming bulk request @hdr from userspace and initialize the
833 * internal @breq bulk request appropriately. Returns 0 if the bulk request 833 * internal @breq bulk request appropriately. Returns 0 if the bulk request
834 * should proceed; XFS_ITER_ABORT if there's nothing to do; or the usual 834 * should proceed; -ECANCELED if there's nothing to do; or the usual
835 * negative error code. 835 * negative error code.
836 */ 836 */
837static int 837static int
@@ -889,13 +889,13 @@ xfs_bulk_ireq_setup(
889 889
890 /* Asking for an inode past the end of the AG? We're done! */ 890 /* Asking for an inode past the end of the AG? We're done! */
891 if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno) 891 if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno)
892 return XFS_ITER_ABORT; 892 return -ECANCELED;
893 } else if (hdr->agno) 893 } else if (hdr->agno)
894 return -EINVAL; 894 return -EINVAL;
895 895
896 /* Asking for an inode past the end of the FS? We're done! */ 896 /* Asking for an inode past the end of the FS? We're done! */
897 if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) 897 if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
898 return XFS_ITER_ABORT; 898 return -ECANCELED;
899 899
900 return 0; 900 return 0;
901} 901}
@@ -936,7 +936,7 @@ xfs_ioc_bulkstat(
936 return -EFAULT; 936 return -EFAULT;
937 937
938 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat); 938 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat);
939 if (error == XFS_ITER_ABORT) 939 if (error == -ECANCELED)
940 goto out_teardown; 940 goto out_teardown;
941 if (error < 0) 941 if (error < 0)
942 return error; 942 return error;
@@ -986,7 +986,7 @@ xfs_ioc_inumbers(
986 return -EFAULT; 986 return -EFAULT;
987 987
988 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers); 988 error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers);
989 if (error == XFS_ITER_ABORT) 989 if (error == -ECANCELED)
990 goto out_teardown; 990 goto out_teardown;
991 if (error < 0) 991 if (error < 0)
992 return error; 992 return error;
@@ -1038,6 +1038,10 @@ xfs_ioc_ag_geometry(
1038 1038
1039 if (copy_from_user(&ageo, arg, sizeof(ageo))) 1039 if (copy_from_user(&ageo, arg, sizeof(ageo)))
1040 return -EFAULT; 1040 return -EFAULT;
1041 if (ageo.ag_flags)
1042 return -EINVAL;
1043 if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved)))
1044 return -EINVAL;
1041 1045
1042 error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo); 1046 error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo);
1043 if (error) 1047 if (error)
@@ -1309,8 +1313,7 @@ xfs_ioctl_setattr_dax_invalidate(
1309 if (fa->fsx_xflags & FS_XFLAG_DAX) { 1313 if (fa->fsx_xflags & FS_XFLAG_DAX) {
1310 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) 1314 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
1311 return -EINVAL; 1315 return -EINVAL;
1312 if (S_ISREG(inode->i_mode) && 1316 if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
1313 !bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
1314 sb->s_blocksize)) 1317 sb->s_blocksize))
1315 return -EINVAL; 1318 return -EINVAL;
1316 } 1319 }
@@ -1881,7 +1884,7 @@ xfs_ioc_getfsmap(
1881 info.mp = ip->i_mount; 1884 info.mp = ip->i_mount;
1882 info.data = arg; 1885 info.data = arg;
1883 error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); 1886 error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
1884 if (error == XFS_BTREE_QUERY_RANGE_ABORT) { 1887 if (error == -ECANCELED) {
1885 error = 0; 1888 error = 0;
1886 aborted = true; 1889 aborted = true;
1887 } else if (error) 1890 } else if (error)
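
The added xfs_ioc_ag_geometry() checks reject nonzero flags and reserved bytes so that padding can later be assigned meaning unambiguously. memchr_inv() has no userspace counterpart, but the equivalent check is a simple byte scan (the struct layout below is assumed for illustration):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Assumed layout for illustration only. */
    struct ag_geometry {
            uint32_t ag_number;
            uint32_t ag_flags;
            uint64_t ag_reserved[14];
    };

    /* Return 1 if any byte is nonzero -- what memchr_inv() detects. */
    static int any_nonzero(const void *buf, size_t len)
    {
            const unsigned char *p = buf;
            size_t i;

            for (i = 0; i < len; i++)
                    if (p[i])
                            return 1;
            return 0;
    }

    static int validate(const struct ag_geometry *g)
    {
            if (g->ag_flags)
                    return -1;      /* -EINVAL in the kernel */
            if (any_nonzero(g->ag_reserved, sizeof(g->ag_reserved)))
                    return -1;
            return 0;
    }

    int main(void)
    {
            struct ag_geometry g = { .ag_number = 3 };

            printf("valid: %s\n", validate(&g) ? "no" : "yes");
            return 0;
    }
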
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 7bd7534f5051..1e08bf79b478 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -381,7 +381,7 @@ xfs_compat_attrlist_by_handle(
381 return PTR_ERR(dentry); 381 return PTR_ERR(dentry);
382 382
383 error = -ENOMEM; 383 error = -ENOMEM;
384 kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); 384 kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
385 if (!kbuf) 385 if (!kbuf)
386 goto out_dput; 386 goto out_dput;
387 387
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 3a4310d7cb59..f780e223b118 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -58,7 +58,7 @@ xfs_bmbt_to_iomap(
58{ 58{
59 struct xfs_mount *mp = ip->i_mount; 59 struct xfs_mount *mp = ip->i_mount;
60 60
61 if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) 61 if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
62 return xfs_alert_fsblock_zero(ip, imap); 62 return xfs_alert_fsblock_zero(ip, imap);
63 63
64 if (imap->br_startblock == HOLESTARTBLOCK) { 64 if (imap->br_startblock == HOLESTARTBLOCK) {
@@ -297,7 +297,7 @@ xfs_iomap_write_direct(
297 goto out_unlock; 297 goto out_unlock;
298 } 298 }
299 299
300 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) 300 if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
301 error = xfs_alert_fsblock_zero(ip, imap); 301 error = xfs_alert_fsblock_zero(ip, imap);
302 302
303out_unlock: 303out_unlock:
@@ -814,7 +814,7 @@ xfs_iomap_write_unwritten(
814 if (error) 814 if (error)
815 return error; 815 return error;
816 816
817 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) 817 if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
818 return xfs_alert_fsblock_zero(ip, &imap); 818 return xfs_alert_fsblock_zero(ip, &imap);
819 819
820 if ((numblks_fsb = imap.br_blockcount) == 0) { 820 if ((numblks_fsb = imap.br_blockcount) == 0) {
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f5c955d35be4..884950adbd16 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -137,7 +137,7 @@ xfs_bulkstat_one_int(
137 xfs_irele(ip); 137 xfs_irele(ip);
138 138
139 error = bc->formatter(bc->breq, buf); 139 error = bc->formatter(bc->breq, buf);
140 if (error == XFS_IBULK_ABORT) 140 if (error == -ECANCELED)
141 goto out_advance; 141 goto out_advance;
142 if (error) 142 if (error)
143 goto out; 143 goto out;
@@ -169,7 +169,7 @@ xfs_bulkstat_one(
169 ASSERT(breq->icount == 1); 169 ASSERT(breq->icount == 1);
170 170
171 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), 171 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
172 KM_SLEEP | KM_MAYFAIL); 172 KM_MAYFAIL);
173 if (!bc.buf) 173 if (!bc.buf)
174 return -ENOMEM; 174 return -ENOMEM;
175 175
@@ -181,7 +181,7 @@ xfs_bulkstat_one(
181 * If we reported one inode to userspace then we abort because we hit 181 * If we reported one inode to userspace then we abort because we hit
182 * the end of the buffer. Don't leak that back to userspace. 182 * the end of the buffer. Don't leak that back to userspace.
183 */ 183 */
184 if (error == XFS_IWALK_ABORT) 184 if (error == -ECANCELED)
185 error = 0; 185 error = 0;
186 186
187 return error; 187 return error;
@@ -243,7 +243,7 @@ xfs_bulkstat(
243 return 0; 243 return 0;
244 244
245 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), 245 bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
246 KM_SLEEP | KM_MAYFAIL); 246 KM_MAYFAIL);
247 if (!bc.buf) 247 if (!bc.buf)
248 return -ENOMEM; 248 return -ENOMEM;
249 249
@@ -342,7 +342,7 @@ xfs_inumbers_walk(
342 int error; 342 int error;
343 343
344 error = ic->formatter(ic->breq, &inogrp); 344 error = ic->formatter(ic->breq, &inogrp);
345 if (error && error != XFS_IBULK_ABORT) 345 if (error && error != -ECANCELED)
346 return error; 346 return error;
347 347
348 ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) + 348 ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) +
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index e90c1fc5b981..96a1e2a9be3f 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -18,9 +18,6 @@ struct xfs_ibulk {
18/* Only iterate within the same AG as startino */ 18/* Only iterate within the same AG as startino */
19#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG) 19#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG)
20 20
21/* Return value that means we want to abort the walk. */
22#define XFS_IBULK_ABORT (XFS_IWALK_ABORT)
23
24/* 21/*
25 * Advance the user buffer pointer by one record of the given size. If the 22 * Advance the user buffer pointer by one record of the given size. If the
26 * buffer is now full, return the appropriate error code. 23 * buffer is now full, return the appropriate error code.
@@ -34,13 +31,21 @@ xfs_ibulk_advance(
34 31
35 breq->ubuffer = b + bytes; 32 breq->ubuffer = b + bytes;
36 breq->ocount++; 33 breq->ocount++;
37 return breq->ocount == breq->icount ? XFS_IBULK_ABORT : 0; 34 return breq->ocount == breq->icount ? -ECANCELED : 0;
38} 35}
39 36
40/* 37/*
41 * Return stat information in bulk (by-inode) for the filesystem. 38 * Return stat information in bulk (by-inode) for the filesystem.
42 */ 39 */
43 40
41/*
42 * Return codes for the formatter function are 0 to continue iterating, and
43 * non-zero to stop iterating. Any non-zero value will be passed up to the
44 * bulkstat/inumbers caller. The special value -ECANCELED can be used to stop
45 * iteration, as neither bulkstat nor inumbers will ever generate that error
46 * code on their own.
47 */
48
44typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq, 49typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq,
45 const struct xfs_bulkstat *bstat); 50 const struct xfs_bulkstat *bstat);
46 51
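
xfs_ibulk_advance() now signals a full user buffer with -ECANCELED, per the formatter contract documented above. A self-contained model of that advance-and-check pattern (struct bulkreq here is invented, not the kernel's xfs_ibulk):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    struct bulkreq {
            char *ubuffer;  /* next free slot in the user buffer */
            int   ocount;   /* records copied so far */
            int   icount;   /* buffer capacity in records */
    };

    /*
     * Copy one fixed-size record out and advance the buffer pointer;
     * signal a full buffer with -ECANCELED, as xfs_ibulk_advance() does.
     */
    static int advance(struct bulkreq *breq, const void *rec, size_t bytes)
    {
            memcpy(breq->ubuffer, rec, bytes);
            breq->ubuffer += bytes;
            breq->ocount++;
            return breq->ocount == breq->icount ? -ECANCELED : 0;
    }

    int main(void)
    {
            char buf[2 * sizeof(int)];
            struct bulkreq breq = { buf, 0, 2 };
            int error = 0;
            int rec;

            for (rec = 0; rec < 5 && !error; rec++)
                    error = advance(&breq, &rec, sizeof(rec));

            if (error == -ECANCELED)        /* full buffer is a clean stop */
                    error = 0;
            printf("copied %d records\n", breq.ocount);
            return error;
    }
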
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 8c7d727149ea..aa375cf53021 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -31,7 +31,7 @@
31 * inode it finds, it calls a walk function with the relevant inode number and 31 * inode it finds, it calls a walk function with the relevant inode number and
32 * a pointer to caller-provided data. The walk function can return the usual 32 * a pointer to caller-provided data. The walk function can return the usual
33 * negative error code to stop the iteration; 0 to continue the iteration; or 33 * negative error code to stop the iteration; 0 to continue the iteration; or
34 * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the 34 * -ECANCELED to stop the iteration. This return value is returned to the
35 * caller. 35 * caller.
36 * 36 *
37 * Internally, we allow the walk function to do anything, which means that we 37 * Internally, we allow the walk function to do anything, which means that we
@@ -616,7 +616,7 @@ xfs_iwalk_threaded(
616 if (xfs_pwork_ctl_want_abort(&pctl)) 616 if (xfs_pwork_ctl_want_abort(&pctl))
617 break; 617 break;
618 618
619 iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); 619 iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
620 iwag->mp = mp; 620 iwag->mp = mp;
621 iwag->iwalk_fn = iwalk_fn; 621 iwag->iwalk_fn = iwalk_fn;
622 iwag->data = data; 622 iwag->data = data;
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
index 6c960e10ed4d..37a795f03267 100644
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h
@@ -6,12 +6,17 @@
6#ifndef __XFS_IWALK_H__ 6#ifndef __XFS_IWALK_H__
7#define __XFS_IWALK_H__ 7#define __XFS_IWALK_H__
8 8
9/*
10 * Return codes for the inode/inobt walk function are 0 to continue iterating,
11 * and non-zero to stop iterating. Any non-zero value will be passed up to the
12 * iwalk or inobt_walk caller. The special value -ECANCELED can be used to
13 * stop iteration, as neither iwalk nor inobt_walk will ever generate that
14 * error code on their own.
15 */
16
9/* Walk all inodes in the filesystem starting from @startino. */ 17/* Walk all inodes in the filesystem starting from @startino. */
10typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, 18typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
11 xfs_ino_t ino, void *data); 19 xfs_ino_t ino, void *data);
12/* Return values for xfs_iwalk_fn. */
13#define XFS_IWALK_CONTINUE (XFS_ITER_CONTINUE)
14#define XFS_IWALK_ABORT (XFS_ITER_ABORT)
15 20
16int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, 21int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
17 unsigned int flags, xfs_iwalk_fn iwalk_fn, 22 unsigned int flags, xfs_iwalk_fn iwalk_fn,
@@ -30,8 +35,6 @@ typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
30 xfs_agnumber_t agno, 35 xfs_agnumber_t agno,
31 const struct xfs_inobt_rec_incore *irec, 36 const struct xfs_inobt_rec_incore *irec,
32 void *data); 37 void *data);
33/* Return value (for xfs_inobt_walk_fn) that aborts the walk immediately. */
34#define XFS_INOBT_WALK_ABORT (XFS_IWALK_ABORT)
35 38
36int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp, 39int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp,
37 xfs_ino_t startino, unsigned int flags, 40 xfs_ino_t startino, unsigned int flags,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7fc3c1ad36bc..a2beee9f74da 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -214,15 +214,42 @@ xlog_grant_head_wake(
214{ 214{
215 struct xlog_ticket *tic; 215 struct xlog_ticket *tic;
216 int need_bytes; 216 int need_bytes;
217 bool woken_task = false;
217 218
218 list_for_each_entry(tic, &head->waiters, t_queue) { 219 list_for_each_entry(tic, &head->waiters, t_queue) {
220
221 /*
222 * There is a chance that the size of the CIL checkpoints in
223 * progress at the last AIL push target calculation resulted in
224 * limiting the target to the log head (l_last_sync_lsn) at the
225 * time. This may not reflect where the log head is now as the
226 * CIL checkpoints may have completed.
227 *
228 * Hence when we are woken here, it may be that the head of the
 229 * log has moved rather than the tail. As the tail didn't
230 * move, there still won't be space available for the
231 * reservation we require. However, if the AIL has already
232 * pushed to the target defined by the old log head location, we
233 * will hang here waiting for something else to update the AIL
234 * push target.
235 *
236 * Therefore, if there isn't space to wake the first waiter on
237 * the grant head, we need to push the AIL again to ensure the
238 * target reflects both the current log tail and log head
239 * position before we wait for the tail to move again.
240 */
241
219 need_bytes = xlog_ticket_reservation(log, head, tic); 242 need_bytes = xlog_ticket_reservation(log, head, tic);
220 if (*free_bytes < need_bytes) 243 if (*free_bytes < need_bytes) {
244 if (!woken_task)
245 xlog_grant_push_ail(log, need_bytes);
221 return false; 246 return false;
247 }
222 248
223 *free_bytes -= need_bytes; 249 *free_bytes -= need_bytes;
224 trace_xfs_log_grant_wake_up(log, tic); 250 trace_xfs_log_grant_wake_up(log, tic);
225 wake_up_process(tic->t_task); 251 wake_up_process(tic->t_task);
252 woken_task = true;
226 } 253 }
227 254
228 return true; 255 return true;
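
The woken_task logic above wakes queued tickets in order while free space lasts and, on the first shortfall, pushes the AIL exactly once so the push target reflects the current log head; if a task was already woken, the wake itself will drive further progress. A compact model of that loop, with push_ail() as a stand-in:

    #include <stdbool.h>
    #include <stdio.h>

    static void push_ail(int need_bytes)
    {
            printf("push AIL for %d bytes\n", need_bytes);
    }

    /*
     * Wake queued waiters in order while space lasts; if a waiter cannot
     * be satisfied and nobody was woken yet, push the AIL so the push
     * target is recomputed from the current log head.
     */
    static bool grant_head_wake(const int *need, int nr, int *free_bytes)
    {
            bool woken_task = false;
            int i;

            for (i = 0; i < nr; i++) {
                    if (*free_bytes < need[i]) {
                            if (!woken_task)
                                    push_ail(need[i]);
                            return false;
                    }
                    *free_bytes -= need[i];
                    printf("wake waiter %d (%d bytes)\n", i, need[i]);
                    woken_task = true;
            }
            return true;
    }

    int main(void)
    {
            int need[] = { 100, 200, 400 };
            int free_bytes = 50;

            /* first waiter cannot be satisfied: push the AIL once */
            grant_head_wake(need, 3, &free_bytes);

            /* one waiter wakes, then we stop without another push */
            free_bytes = 250;
            grant_head_wake(need, 3, &free_bytes);
            return 0;
    }
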
@@ -428,8 +455,7 @@ xfs_log_reserve(
428 XFS_STATS_INC(mp, xs_try_logspace); 455 XFS_STATS_INC(mp, xs_try_logspace);
429 456
430 ASSERT(*ticp == NULL); 457 ASSERT(*ticp == NULL);
431 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 458 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);
432 KM_SLEEP);
433 *ticp = tic; 459 *ticp = tic;
434 460
435 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt 461 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -1404,6 +1430,7 @@ xlog_alloc_log(
1404 */ 1430 */
1405 ASSERT(log->l_iclog_size >= 4096); 1431 ASSERT(log->l_iclog_size >= 4096);
1406 for (i = 0; i < log->l_iclog_bufs; i++) { 1432 for (i = 0; i < log->l_iclog_bufs; i++) {
1433 int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp);
1407 size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * 1434 size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
1408 sizeof(struct bio_vec); 1435 sizeof(struct bio_vec);
1409 1436
@@ -1415,8 +1442,8 @@ xlog_alloc_log(
1415 iclog->ic_prev = prev_iclog; 1442 iclog->ic_prev = prev_iclog;
1416 prev_iclog = iclog; 1443 prev_iclog = iclog;
1417 1444
1418 iclog->ic_data = kmem_alloc_large(log->l_iclog_size, 1445 iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
1419 KM_MAYFAIL); 1446 KM_MAYFAIL);
1420 if (!iclog->ic_data) 1447 if (!iclog->ic_data)
1421 goto out_free_iclog; 1448 goto out_free_iclog;
1422#ifdef DEBUG 1449#ifdef DEBUG
@@ -2496,21 +2523,35 @@ next_lv:
2496 ***************************************************************************** 2523 *****************************************************************************
2497 */ 2524 */
2498 2525
2499/* Clean iclogs starting from the head. This ordering must be 2526/*
2500 * maintained, so an iclog doesn't become ACTIVE beyond one that 2527 * An iclog has just finished IO completion processing, so we need to update
2501 * is SYNCING. This is also required to maintain the notion that we use 2528 * the iclog state and propagate that up into the overall log state. Hence we
2502 * a ordered wait queue to hold off would be writers to the log when every 2529 * prepare the iclog for cleaning, and then clean all the pending dirty iclogs
2503 * iclog is trying to sync to disk. 2530 * starting from the head, and then wake up any threads that are waiting for the
2531 * iclog to be marked clean.
2532 *
2533 * The ordering of marking iclogs ACTIVE must be maintained, so an iclog
2534 * doesn't become ACTIVE beyond one that is SYNCING. This is also required to
 2535 * maintain the notion that we use an ordered wait queue to hold off would-be
2536 * writers to the log when every iclog is trying to sync to disk.
2537 *
2538 * Caller must hold the icloglock before calling us.
2504 * 2539 *
2505 * State Change: DIRTY -> ACTIVE 2540 * State Change: !IOERROR -> DIRTY -> ACTIVE
2506 */ 2541 */
2507STATIC void 2542STATIC void
2508xlog_state_clean_log( 2543xlog_state_clean_iclog(
2509 struct xlog *log) 2544 struct xlog *log,
2545 struct xlog_in_core *dirty_iclog)
2510{ 2546{
2511 xlog_in_core_t *iclog; 2547 struct xlog_in_core *iclog;
2512 int changed = 0; 2548 int changed = 0;
2513 2549
2550 /* Prepare the completed iclog. */
2551 if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR))
2552 dirty_iclog->ic_state = XLOG_STATE_DIRTY;
2553
2554 /* Walk all the iclogs to update the ordered active state. */
2514 iclog = log->l_iclog; 2555 iclog = log->l_iclog;
2515 do { 2556 do {
2516 if (iclog->ic_state == XLOG_STATE_DIRTY) { 2557 if (iclog->ic_state == XLOG_STATE_DIRTY) {
@@ -2548,7 +2589,13 @@ xlog_state_clean_log(
2548 iclog = iclog->ic_next; 2589 iclog = iclog->ic_next;
2549 } while (iclog != log->l_iclog); 2590 } while (iclog != log->l_iclog);
2550 2591
2551 /* log is locked when we are called */ 2592
2593 /*
2594 * Wake up threads waiting in xfs_log_force() for the dirty iclog
2595 * to be cleaned.
2596 */
2597 wake_up_all(&dirty_iclog->ic_force_wait);
2598
2552 /* 2599 /*
2553 * Change state for the dummy log recording. 2600 * Change state for the dummy log recording.
2554 * We usually go to NEED. But we go to NEED2 if the changed indicates 2601 * We usually go to NEED. But we go to NEED2 if the changed indicates
@@ -2582,7 +2629,7 @@ xlog_state_clean_log(
2582 ASSERT(0); 2629 ASSERT(0);
2583 } 2630 }
2584 } 2631 }
2585} /* xlog_state_clean_log */ 2632}
2586 2633
2587STATIC xfs_lsn_t 2634STATIC xfs_lsn_t
2588xlog_get_lowest_lsn( 2635xlog_get_lowest_lsn(
@@ -2603,30 +2650,205 @@ xlog_get_lowest_lsn(
2603 return lowest_lsn; 2650 return lowest_lsn;
2604} 2651}
2605 2652
2653/*
 2654 * Completion of an iclog IO does not imply that a transaction has completed, as
2655 * transactions can be large enough to span many iclogs. We cannot change the
2656 * tail of the log half way through a transaction as this may be the only
2657 * transaction in the log and moving the tail to point to the middle of it
2658 * will prevent recovery from finding the start of the transaction. Hence we
2659 * should only update the last_sync_lsn if this iclog contains transaction
2660 * completion callbacks on it.
2661 *
2662 * We have to do this before we drop the icloglock to ensure we are the only one
2663 * that can update it.
2664 *
2665 * If we are moving the last_sync_lsn forwards, we also need to ensure we kick
 2666 * the reservation grant head pushing. This is because the push
2667 * target is bound by the current last_sync_lsn value. Hence if we have a large
2668 * amount of log space bound up in this committing transaction then the
2669 * last_sync_lsn value may be the limiting factor preventing tail pushing from
2670 * freeing space in the log. Hence once we've updated the last_sync_lsn we
2671 * should push the AIL to ensure the push target (and hence the grant head) is
2672 * no longer bound by the old log head location and can move forwards and make
2673 * progress again.
2674 */
2675static void
2676xlog_state_set_callback(
2677 struct xlog *log,
2678 struct xlog_in_core *iclog,
2679 xfs_lsn_t header_lsn)
2680{
2681 iclog->ic_state = XLOG_STATE_CALLBACK;
2682
2683 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2684 header_lsn) <= 0);
2685
2686 if (list_empty_careful(&iclog->ic_callbacks))
2687 return;
2688
2689 atomic64_set(&log->l_last_sync_lsn, header_lsn);
2690 xlog_grant_push_ail(log, 0);
2691}
2692
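The ASSERT above relies on XFS_LSN_CMP(), which orders log sequence numbers as (cycle, block) pairs: an xfs_lsn_t keeps the cycle number in the high 32 bits and the block number in the low 32 bits. A small worked example of that comparison logic; lsn_cmp() is an illustrative reimplementation, not the kernel macro.

    #include <stdint.h>

    typedef int64_t xfs_lsn_t;      /* cycle in bits 63:32, block in 31:0 */

    static int lsn_cmp(xfs_lsn_t a, xfs_lsn_t b)
    {
        uint32_t cycle_a = a >> 32, cycle_b = b >> 32;

        if (cycle_a != cycle_b)
            return cycle_a < cycle_b ? -1 : 1;
        return (uint32_t)a < (uint32_t)b ? -1 :
               (uint32_t)a > (uint32_t)b ?  1 : 0;
    }
    /* e.g. (cycle 7, block 900) sorts before (cycle 8, block 0). */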
2693/*
2694 * Return true if we need to stop processing, false to continue to the next
2695 * iclog. The caller will need to run callbacks if the iclog is returned in the
2696 * XLOG_STATE_CALLBACK state.
2697 */
2698static bool
2699xlog_state_iodone_process_iclog(
2700 struct xlog *log,
2701 struct xlog_in_core *iclog,
2702 struct xlog_in_core *completed_iclog,
2703 bool *ioerror)
2704{
2705 xfs_lsn_t lowest_lsn;
2706 xfs_lsn_t header_lsn;
2707
2708 /* Skip all iclogs in the ACTIVE & DIRTY states */
2709 if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
2710 return false;
2711
2712 /*
2713 * Between marking a filesystem SHUTDOWN and stopping the log, we do
2714 * flush all iclogs to disk (if there wasn't a log I/O error). So, we do
2715 * want things to go smoothly in case of just a SHUTDOWN w/o a
2716 * LOG_IO_ERROR.
2717 */
2718 if (iclog->ic_state & XLOG_STATE_IOERROR) {
2719 *ioerror = true;
2720 return false;
2721 }
2722
2723 /*
2724 * Can only perform callbacks in order. Since this iclog is not in the
 2725 * DONE_SYNC/DO_CALLBACK state, we skip the rest and just try to clean
2726 * up. If we set our iclog to DO_CALLBACK, we will not process it when
2727 * we retry since a previous iclog is in the CALLBACK and the state
2728 * cannot change since we are holding the l_icloglock.
2729 */
2730 if (!(iclog->ic_state &
2731 (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) {
2732 if (completed_iclog &&
2733 (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) {
2734 completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK;
2735 }
2736 return true;
2737 }
2738
2739 /*
2740 * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC
 2741 * states. The other states (WANT_SYNC, SYNCING, or CALLBACK) were caught
 2742 * by the check above and are going to be cleaned up (i.e. we aren't doing
 2743 * their callbacks).
2744 *
2745 * We will do one more check here to see if we have chased our tail
2746 * around. If this is not the lowest lsn iclog, then we will leave it
2747 * for another completion to process.
2748 */
2749 header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
2750 lowest_lsn = xlog_get_lowest_lsn(log);
2751 if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
2752 return false;
2753
2754 xlog_state_set_callback(log, iclog, header_lsn);
2755 return false;
2756
2757}
2758
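The deferral dance above (DONE_SYNC becomes DO_CALLBACK when an earlier iclog is still syncing, plus the lowest-LSN check) exists because IO completions can arrive out of order while callbacks must run in LSN order. Stripped of the state machine, the underlying idea is a sequence gate: each completion records itself, and only the contiguous in-order prefix is processed. A single-threaded sketch, locking omitted, assuming at most NSLOTS IOs in flight:

    #include <stdbool.h>

    #define NSLOTS 8                /* assumption: max IOs in flight */

    static bool done[NSLOTS];
    static unsigned int next_seq;   /* lowest sequence not yet processed */

    /* Called on each IO completion; seq values may arrive out of order. */
    static void io_completed(unsigned int seq, void (*process)(unsigned int))
    {
        done[seq % NSLOTS] = true;
        /* Drain only the contiguous in-order prefix. */
        while (done[next_seq % NSLOTS]) {
            done[next_seq % NSLOTS] = false;
            process(next_seq);
            next_seq++;
        }
    }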
2759/*
2760 * Keep processing entries in the iclog callback list until we come around and
2761 * it is empty. We need to atomically see that the list is empty and change the
2762 * state to DIRTY so that we don't miss any more callbacks being added.
2763 *
2764 * This function is called with the icloglock held and returns with it held. We
2765 * drop it while running callbacks, however, as holding it over thousands of
 2766 * callbacks is unnecessary and causes excessive contention.
2767 */
2768static void
2769xlog_state_do_iclog_callbacks(
2770 struct xlog *log,
2771 struct xlog_in_core *iclog,
2772 bool aborted)
2773{
2774 spin_unlock(&log->l_icloglock);
2775 spin_lock(&iclog->ic_callback_lock);
2776 while (!list_empty(&iclog->ic_callbacks)) {
2777 LIST_HEAD(tmp);
2778
2779 list_splice_init(&iclog->ic_callbacks, &tmp);
2780
2781 spin_unlock(&iclog->ic_callback_lock);
2782 xlog_cil_process_committed(&tmp, aborted);
2783 spin_lock(&iclog->ic_callback_lock);
2784 }
2785
2786 /*
2787 * Pick up the icloglock while still holding the callback lock so we
2788 * serialise against anyone trying to add more callbacks to this iclog
2789 * now we've finished processing.
2790 */
2791 spin_lock(&log->l_icloglock);
2792 spin_unlock(&iclog->ic_callback_lock);
2793}
2794
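The function above is a textbook splice-and-drop pattern: take the whole pending list while holding the lock, run the callbacks without it, and loop because new entries may have been added in the meantime. A self-contained pthread rendering of the same shape, with hypothetical cb/pending names:

    #include <pthread.h>
    #include <stddef.h>

    struct cb {
        struct cb *next;
        void (*fn)(struct cb *);
    };

    static pthread_mutex_t cb_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct cb *pending;      /* singly linked list of callbacks */

    static void run_callbacks(void)
    {
        pthread_mutex_lock(&cb_lock);
        while (pending) {
            struct cb *batch = pending;     /* splice the whole list out */
            pending = NULL;

            pthread_mutex_unlock(&cb_lock);
            while (batch) {                 /* run without the lock held */
                struct cb *next = batch->next;
                batch->fn(batch);
                batch = next;
            }
            pthread_mutex_lock(&cb_lock);   /* recheck for new arrivals */
        }
        pthread_mutex_unlock(&cb_lock);
    }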
2795#ifdef DEBUG
2796/*
2797 * Make one last gasp attempt to see if iclogs are being left in limbo. If the
2798 * above loop finds an iclog earlier than the current iclog and in one of the
2799 * syncing states, the current iclog is put into DO_CALLBACK and the callbacks
2800 * are deferred to the completion of the earlier iclog. Walk the iclogs in order
2801 * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in
2802 * one of the syncing states.
2803 *
2804 * Note that SYNCING|IOERROR is a valid state so we cannot just check for
2805 * ic_state == SYNCING.
2806 */
2807static void
2808xlog_state_callback_check_state(
2809 struct xlog *log)
2810{
2811 struct xlog_in_core *first_iclog = log->l_iclog;
2812 struct xlog_in_core *iclog = first_iclog;
2813
2814 do {
2815 ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
2816 /*
2817 * Terminate the loop if iclogs are found in states
2818 * which will cause other threads to clean up iclogs.
2819 *
2820 * SYNCING - i/o completion will go through logs
2821 * DONE_SYNC - interrupt thread should be waiting for
2822 * l_icloglock
2823 * IOERROR - give up hope all ye who enter here
2824 */
2825 if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
2826 iclog->ic_state & XLOG_STATE_SYNCING ||
2827 iclog->ic_state == XLOG_STATE_DONE_SYNC ||
2828 iclog->ic_state == XLOG_STATE_IOERROR )
2829 break;
2830 iclog = iclog->ic_next;
2831 } while (first_iclog != iclog);
2832}
2833#else
2834#define xlog_state_callback_check_state(l) ((void)0)
2835#endif
2836
2606STATIC void 2837STATIC void
2607xlog_state_do_callback( 2838xlog_state_do_callback(
2608 struct xlog *log, 2839 struct xlog *log,
2609 bool aborted, 2840 bool aborted,
2610 struct xlog_in_core *ciclog) 2841 struct xlog_in_core *ciclog)
2611{ 2842{
2612 xlog_in_core_t *iclog; 2843 struct xlog_in_core *iclog;
2613 xlog_in_core_t *first_iclog; /* used to know when we've 2844 struct xlog_in_core *first_iclog;
2614 * processed all iclogs once */ 2845 bool did_callbacks = false;
2615 int flushcnt = 0; 2846 bool cycled_icloglock;
2616 xfs_lsn_t lowest_lsn; 2847 bool ioerror;
2617 int ioerrors; /* counter: iclogs with errors */ 2848 int flushcnt = 0;
2618 int loopdidcallbacks; /* flag: inner loop did callbacks*/ 2849 int repeats = 0;
2619 int funcdidcallbacks; /* flag: function did callbacks */
2620 int repeats; /* for issuing console warnings if
2621 * looping too many times */
2622 int wake = 0;
2623 2850
2624 spin_lock(&log->l_icloglock); 2851 spin_lock(&log->l_icloglock);
2625 first_iclog = iclog = log->l_iclog;
2626 ioerrors = 0;
2627 funcdidcallbacks = 0;
2628 repeats = 0;
2629
2630 do { 2852 do {
2631 /* 2853 /*
2632 * Scan all iclogs starting with the one pointed to by the 2854 * Scan all iclogs starting with the one pointed to by the
@@ -2638,137 +2860,34 @@ xlog_state_do_callback(
2638 */ 2860 */
2639 first_iclog = log->l_iclog; 2861 first_iclog = log->l_iclog;
2640 iclog = log->l_iclog; 2862 iclog = log->l_iclog;
2641 loopdidcallbacks = 0; 2863 cycled_icloglock = false;
2864 ioerror = false;
2642 repeats++; 2865 repeats++;
2643 2866
2644 do { 2867 do {
2868 if (xlog_state_iodone_process_iclog(log, iclog,
2869 ciclog, &ioerror))
2870 break;
2645 2871
2646 /* skip all iclogs in the ACTIVE & DIRTY states */ 2872 if (!(iclog->ic_state &
2647 if (iclog->ic_state & 2873 (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) {
2648 (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
2649 iclog = iclog->ic_next; 2874 iclog = iclog->ic_next;
2650 continue; 2875 continue;
2651 } 2876 }
2652 2877
2653 /* 2878 /*
2654 * Between marking a filesystem SHUTDOWN and stopping 2879 * Running callbacks will drop the icloglock which means
2655 * the log, we do flush all iclogs to disk (if there 2880 * we'll have to run at least one more complete loop.
2656 * wasn't a log I/O error). So, we do want things to
2657 * go smoothly in case of just a SHUTDOWN w/o a
2658 * LOG_IO_ERROR.
2659 */
2660 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
2661 /*
2662 * Can only perform callbacks in order. Since
2663 * this iclog is not in the DONE_SYNC/
2664 * DO_CALLBACK state, we skip the rest and
2665 * just try to clean up. If we set our iclog
2666 * to DO_CALLBACK, we will not process it when
2667 * we retry since a previous iclog is in the
2668 * CALLBACK and the state cannot change since
2669 * we are holding the l_icloglock.
2670 */
2671 if (!(iclog->ic_state &
2672 (XLOG_STATE_DONE_SYNC |
2673 XLOG_STATE_DO_CALLBACK))) {
2674 if (ciclog && (ciclog->ic_state ==
2675 XLOG_STATE_DONE_SYNC)) {
2676 ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
2677 }
2678 break;
2679 }
2680 /*
2681 * We now have an iclog that is in either the
2682 * DO_CALLBACK or DONE_SYNC states. The other
2683 * states (WANT_SYNC, SYNCING, or CALLBACK were
2684 * caught by the above if and are going to
2685 * clean (i.e. we aren't doing their callbacks)
2686 * see the above if.
2687 */
2688
2689 /*
2690 * We will do one more check here to see if we
2691 * have chased our tail around.
2692 */
2693
2694 lowest_lsn = xlog_get_lowest_lsn(log);
2695 if (lowest_lsn &&
2696 XFS_LSN_CMP(lowest_lsn,
2697 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2698 iclog = iclog->ic_next;
2699 continue; /* Leave this iclog for
2700 * another thread */
2701 }
2702
2703 iclog->ic_state = XLOG_STATE_CALLBACK;
2704
2705
2706 /*
2707 * Completion of a iclog IO does not imply that
2708 * a transaction has completed, as transactions
2709 * can be large enough to span many iclogs. We
2710 * cannot change the tail of the log half way
2711 * through a transaction as this may be the only
2712 * transaction in the log and moving th etail to
2713 * point to the middle of it will prevent
2714 * recovery from finding the start of the
2715 * transaction. Hence we should only update the
2716 * last_sync_lsn if this iclog contains
2717 * transaction completion callbacks on it.
2718 *
2719 * We have to do this before we drop the
2720 * icloglock to ensure we are the only one that
2721 * can update it.
2722 */
2723 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2724 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2725 if (!list_empty_careful(&iclog->ic_callbacks))
2726 atomic64_set(&log->l_last_sync_lsn,
2727 be64_to_cpu(iclog->ic_header.h_lsn));
2728
2729 } else
2730 ioerrors++;
2731
2732 spin_unlock(&log->l_icloglock);
2733
2734 /*
2735 * Keep processing entries in the callback list until
2736 * we come around and it is empty. We need to
2737 * atomically see that the list is empty and change the
2738 * state to DIRTY so that we don't miss any more
2739 * callbacks being added.
2740 */
2741 spin_lock(&iclog->ic_callback_lock);
2742 while (!list_empty(&iclog->ic_callbacks)) {
2743 LIST_HEAD(tmp);
2744
2745 list_splice_init(&iclog->ic_callbacks, &tmp);
2746
2747 spin_unlock(&iclog->ic_callback_lock);
2748 xlog_cil_process_committed(&tmp, aborted);
2749 spin_lock(&iclog->ic_callback_lock);
2750 }
2751
2752 loopdidcallbacks++;
2753 funcdidcallbacks++;
2754
2755 spin_lock(&log->l_icloglock);
2756 spin_unlock(&iclog->ic_callback_lock);
2757 if (!(iclog->ic_state & XLOG_STATE_IOERROR))
2758 iclog->ic_state = XLOG_STATE_DIRTY;
2759
2760 /*
2761 * Transition from DIRTY to ACTIVE if applicable.
2762 * NOP if STATE_IOERROR.
2763 */ 2881 */
2764 xlog_state_clean_log(log); 2882 cycled_icloglock = true;
2765 2883 xlog_state_do_iclog_callbacks(log, iclog, aborted);
2766 /* wake up threads waiting in xfs_log_force() */
2767 wake_up_all(&iclog->ic_force_wait);
2768 2884
2885 xlog_state_clean_iclog(log, iclog);
2769 iclog = iclog->ic_next; 2886 iclog = iclog->ic_next;
2770 } while (first_iclog != iclog); 2887 } while (first_iclog != iclog);
2771 2888
2889 did_callbacks |= cycled_icloglock;
2890
2772 if (repeats > 5000) { 2891 if (repeats > 5000) {
2773 flushcnt += repeats; 2892 flushcnt += repeats;
2774 repeats = 0; 2893 repeats = 0;
@@ -2776,50 +2895,15 @@ xlog_state_do_callback(
2776 "%s: possible infinite loop (%d iterations)", 2895 "%s: possible infinite loop (%d iterations)",
2777 __func__, flushcnt); 2896 __func__, flushcnt);
2778 } 2897 }
2779 } while (!ioerrors && loopdidcallbacks); 2898 } while (!ioerror && cycled_icloglock);
2780 2899
2781#ifdef DEBUG 2900 if (did_callbacks)
2782 /* 2901 xlog_state_callback_check_state(log);
2783 * Make one last gasp attempt to see if iclogs are being left in limbo.
2784 * If the above loop finds an iclog earlier than the current iclog and
2785 * in one of the syncing states, the current iclog is put into
2786 * DO_CALLBACK and the callbacks are deferred to the completion of the
2787 * earlier iclog. Walk the iclogs in order and make sure that no iclog
2788 * is in DO_CALLBACK unless an earlier iclog is in one of the syncing
2789 * states.
2790 *
2791 * Note that SYNCING|IOABORT is a valid state so we cannot just check
2792 * for ic_state == SYNCING.
2793 */
2794 if (funcdidcallbacks) {
2795 first_iclog = iclog = log->l_iclog;
2796 do {
2797 ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
2798 /*
2799 * Terminate the loop if iclogs are found in states
2800 * which will cause other threads to clean up iclogs.
2801 *
2802 * SYNCING - i/o completion will go through logs
2803 * DONE_SYNC - interrupt thread should be waiting for
2804 * l_icloglock
2805 * IOERROR - give up hope all ye who enter here
2806 */
2807 if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
2808 iclog->ic_state & XLOG_STATE_SYNCING ||
2809 iclog->ic_state == XLOG_STATE_DONE_SYNC ||
2810 iclog->ic_state == XLOG_STATE_IOERROR )
2811 break;
2812 iclog = iclog->ic_next;
2813 } while (first_iclog != iclog);
2814 }
2815#endif
2816 2902
2817 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) 2903 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
2818 wake = 1;
2819 spin_unlock(&log->l_icloglock);
2820
2821 if (wake)
2822 wake_up_all(&log->l_flush_wait); 2904 wake_up_all(&log->l_flush_wait);
2905
2906 spin_unlock(&log->l_icloglock);
2823} 2907}
2824 2908
2825 2909
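The restructured loop above keeps one invariant from the old code: any pass that dropped the icloglock to run callbacks must be followed by another complete pass, because the iclog states may have changed while the lock was released, and the function only exits after a clean pass. The same rescan-until-stable shape in miniature, with a pthread mutex standing in for the spinlock:

    #include <pthread.h>
    #include <stdbool.h>

    #define N 4

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static bool needs_work[N];

    static void do_work(int i) { (void)i; /* may sleep, take other locks */ }

    static void process_all(void)
    {
        bool dropped;

        pthread_mutex_lock(&lock);
        do {
            dropped = false;
            for (int i = 0; i < N; i++) {
                if (!needs_work[i])
                    continue;
                needs_work[i] = false;      /* claim it under the lock */
                pthread_mutex_unlock(&lock);
                do_work(i);
                pthread_mutex_lock(&lock);
                dropped = true;             /* state may have changed */
            }
        } while (dropped);                  /* exit only after a clean pass */
        pthread_mutex_unlock(&lock);
    }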
@@ -3919,7 +4003,9 @@ xfs_log_force_umount(
3919 * item committed callback functions will do this again under lock to 4003 * item committed callback functions will do this again under lock to
3920 * avoid races. 4004 * avoid races.
3921 */ 4005 */
4006 spin_lock(&log->l_cilp->xc_push_lock);
3922 wake_up_all(&log->l_cilp->xc_commit_wait); 4007 wake_up_all(&log->l_cilp->xc_commit_wait);
4008 spin_unlock(&log->l_cilp->xc_push_lock);
3923 xlog_state_do_callback(log, true, NULL); 4009 xlog_state_do_callback(log, true, NULL);
3924 4010
3925#ifdef XFSERRORDEBUG 4011#ifdef XFSERRORDEBUG
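The two added lines around wake_up_all() close a lost-wakeup window: a committer checks its wait condition under xc_push_lock and then sleeps, so a wakeup issued without that lock can slip between the check and the sleep and be missed forever. The classic shape of the race and its fix, in condition-variable terms:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
    static bool committed;

    static void waiter(void)
    {
        pthread_mutex_lock(&m);
        while (!committed)          /* check and sleep under one lock */
            pthread_cond_wait(&cv, &m);
        pthread_mutex_unlock(&m);
    }

    static void waker(void)
    {
        /*
         * Without taking the lock here, the broadcast could land between
         * the waiter's check and its sleep, and be lost forever.
         */
        pthread_mutex_lock(&m);
        committed = true;
        pthread_cond_broadcast(&cv);
        pthread_mutex_unlock(&m);
    }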
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index fa5602d0fd7f..ef652abd112c 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -38,7 +38,7 @@ xlog_cil_ticket_alloc(
38 struct xlog_ticket *tic; 38 struct xlog_ticket *tic;
39 39
40 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, 40 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
41 KM_SLEEP|KM_NOFS); 41 KM_NOFS);
42 42
43 /* 43 /*
44 * set the current reservation to zero so we know to steal the basic 44 * set the current reservation to zero so we know to steal the basic
@@ -186,7 +186,7 @@ xlog_cil_alloc_shadow_bufs(
186 */ 186 */
187 kmem_free(lip->li_lv_shadow); 187 kmem_free(lip->li_lv_shadow);
188 188
189 lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS); 189 lv = kmem_alloc_large(buf_size, KM_NOFS);
190 memset(lv, 0, xlog_cil_iovec_space(niovecs)); 190 memset(lv, 0, xlog_cil_iovec_space(niovecs));
191 191
192 lv->lv_item = lip; 192 lv->lv_item = lip;
@@ -660,7 +660,7 @@ xlog_cil_push(
660 if (!cil) 660 if (!cil)
661 return 0; 661 return 0;
662 662
663 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); 663 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
664 new_ctx->ticket = xlog_cil_ticket_alloc(log); 664 new_ctx->ticket = xlog_cil_ticket_alloc(log);
665 665
666 down_write(&cil->xc_ctx_lock); 666 down_write(&cil->xc_ctx_lock);
@@ -1179,11 +1179,11 @@ xlog_cil_init(
1179 struct xfs_cil *cil; 1179 struct xfs_cil *cil;
1180 struct xfs_cil_ctx *ctx; 1180 struct xfs_cil_ctx *ctx;
1181 1181
1182 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); 1182 cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
1183 if (!cil) 1183 if (!cil)
1184 return -ENOMEM; 1184 return -ENOMEM;
1185 1185
1186 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); 1186 ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL);
1187 if (!ctx) { 1187 if (!ctx) {
1188 kmem_free(cil); 1188 kmem_free(cil);
1189 return -ENOMEM; 1189 return -ENOMEM;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 13d1d3e95b88..508319039dce 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -97,6 +97,8 @@ xlog_alloc_buffer(
97 struct xlog *log, 97 struct xlog *log,
98 int nbblks) 98 int nbblks)
99{ 99{
100 int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
101
100 /* 102 /*
101 * Pass log block 0 since we don't have an addr yet, buffer will be 103 * Pass log block 0 since we don't have an addr yet, buffer will be
102 * verified on read. 104 * verified on read.
@@ -125,7 +127,7 @@ xlog_alloc_buffer(
125 if (nbblks > 1 && log->l_sectBBsize > 1) 127 if (nbblks > 1 && log->l_sectBBsize > 1)
126 nbblks += log->l_sectBBsize; 128 nbblks += log->l_sectBBsize;
127 nbblks = round_up(nbblks, log->l_sectBBsize); 129 nbblks = round_up(nbblks, log->l_sectBBsize);
128 return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL); 130 return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL);
129} 131}
130 132
131/* 133/*
@@ -1960,7 +1962,7 @@ xlog_recover_buffer_pass1(
1960 } 1962 }
1961 } 1963 }
1962 1964
1963 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); 1965 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
1964 bcp->bc_blkno = buf_f->blf_blkno; 1966 bcp->bc_blkno = buf_f->blf_blkno;
1965 bcp->bc_len = buf_f->blf_len; 1967 bcp->bc_len = buf_f->blf_len;
1966 bcp->bc_refcount = 1; 1968 bcp->bc_refcount = 1;
@@ -2930,7 +2932,7 @@ xlog_recover_inode_pass2(
2930 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { 2932 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
2931 in_f = item->ri_buf[0].i_addr; 2933 in_f = item->ri_buf[0].i_addr;
2932 } else { 2934 } else {
2933 in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP); 2935 in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
2934 need_free = 1; 2936 need_free = 1;
2935 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 2937 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2936 if (error) 2938 if (error)
@@ -4161,7 +4163,7 @@ xlog_recover_add_item(
4161{ 4163{
4162 xlog_recover_item_t *item; 4164 xlog_recover_item_t *item;
4163 4165
4164 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 4166 item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
4165 INIT_LIST_HEAD(&item->ri_list); 4167 INIT_LIST_HEAD(&item->ri_list);
4166 list_add_tail(&item->ri_list, head); 4168 list_add_tail(&item->ri_list, head);
4167} 4169}
@@ -4201,7 +4203,7 @@ xlog_recover_add_to_cont_trans(
4201 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 4203 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
4202 old_len = item->ri_buf[item->ri_cnt-1].i_len; 4204 old_len = item->ri_buf[item->ri_cnt-1].i_len;
4203 4205
4204 ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP); 4206 ptr = kmem_realloc(old_ptr, len + old_len, 0);
4205 memcpy(&ptr[old_len], dp, len); 4207 memcpy(&ptr[old_len], dp, len);
4206 item->ri_buf[item->ri_cnt-1].i_len += len; 4208 item->ri_buf[item->ri_cnt-1].i_len += len;
4207 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 4209 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4261,7 +4263,7 @@ xlog_recover_add_to_trans(
4261 return 0; 4263 return 0;
4262 } 4264 }
4263 4265
4264 ptr = kmem_alloc(len, KM_SLEEP); 4266 ptr = kmem_alloc(len, 0);
4265 memcpy(ptr, dp, len); 4267 memcpy(ptr, dp, len);
4266 in_f = (struct xfs_inode_log_format *)ptr; 4268 in_f = (struct xfs_inode_log_format *)ptr;
4267 4269
@@ -4289,7 +4291,7 @@ xlog_recover_add_to_trans(
4289 item->ri_total = in_f->ilf_size; 4291 item->ri_total = in_f->ilf_size;
4290 item->ri_buf = 4292 item->ri_buf =
4291 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), 4293 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
4292 KM_SLEEP); 4294 0);
4293 } 4295 }
4294 ASSERT(item->ri_total > item->ri_cnt); 4296 ASSERT(item->ri_total > item->ri_cnt);
4295 /* Description region is ri_buf[0] */ 4297 /* Description region is ri_buf[0] */
@@ -4423,7 +4425,7 @@ xlog_recover_ophdr_to_trans(
4423 * This is a new transaction so allocate a new recovery container to 4425 * This is a new transaction so allocate a new recovery container to
4424 * hold the recovery ops that will follow. 4426 * hold the recovery ops that will follow.
4425 */ 4427 */
4426 trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); 4428 trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
4427 trans->r_log_tid = tid; 4429 trans->r_log_tid = tid;
4428 trans->r_lsn = be64_to_cpu(rhead->h_lsn); 4430 trans->r_lsn = be64_to_cpu(rhead->h_lsn);
4429 INIT_LIST_HEAD(&trans->r_itemq); 4431 INIT_LIST_HEAD(&trans->r_itemq);
@@ -5022,16 +5024,27 @@ xlog_recover_process_one_iunlink(
5022} 5024}
5023 5025
5024/* 5026/*
5025 * xlog_iunlink_recover 5027 * Recover AGI unlinked lists
5028 *
5029 * This is called during recovery to process any inodes which we unlinked but
5030 * not freed when the system crashed. These inodes will be on the lists in the
5031 * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
5032 * any inodes found on the lists. Each inode is removed from the lists when it
5033 * has been fully truncated and is freed. The freeing of the inode and its
5034 * removal from the list must be atomic.
5035 *
5036 * If everything we touch in the agi processing loop is already in memory, this
 5037 * loop can hold the CPU for a long time. It runs without lock contention,
 5038 * memory allocation contention, the need to wait for IO, etc., and so will
 5039 * run until we either run out of inodes to process, run low on memory, or
 5040 * run out of log space.
5026 * 5041 *
5027 * This is called during recovery to process any inodes which 5042 * This behaviour is bad for latency on single CPU and non-preemptible kernels,
 5028 * we unlinked but not freed when the system crashed. These 5043 * and can prevent other filesystem work (such as CIL pushes) from running. This
5029 * inodes will be on the lists in the AGI blocks. What we do 5044 * can lead to deadlocks if the recovery process runs out of log reservation
5030 * here is scan all the AGIs and fully truncate and free any 5045 * space. Hence we need to yield the CPU when there is other kernel work
5031 * inodes found on the lists. Each inode is removed from the 5046 * scheduled on this CPU to ensure other scheduled work can run without undue
5032 * lists when it has been fully truncated and is freed. The 5047 * latency.
5033 * freeing of the inode and its removal from the list must be
5034 * atomic.
5035 */ 5048 */
5036STATIC void 5049STATIC void
5037xlog_recover_process_iunlinks( 5050xlog_recover_process_iunlinks(
@@ -5078,6 +5091,7 @@ xlog_recover_process_iunlinks(
5078 while (agino != NULLAGINO) { 5091 while (agino != NULLAGINO) {
5079 agino = xlog_recover_process_one_iunlink(mp, 5092 agino = xlog_recover_process_one_iunlink(mp,
5080 agno, agino, bucket); 5093 agno, agino, bucket);
5094 cond_resched();
5081 } 5095 }
5082 } 5096 }
5083 xfs_buf_rele(agibp); 5097 xfs_buf_rele(agibp);
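The single added cond_resched() is the entire latency fix described in the rewritten comment: the iunlink walk stays tight, but every iteration now offers the scheduler a preemption point. A rough user-space analogue of the same idea, with sched_yield() standing in for the kernel primitive and the batching factor chosen arbitrarily:

    #include <sched.h>

    /* Process a long chain of fully cached items without hogging the CPU. */
    static void process_chain(long nitems)
    {
        for (long i = 0; i < nitems; i++) {
            /* ... no IO, no lock contention, no allocation stalls ... */
            if ((i & 1023) == 0)
                sched_yield();      /* let other runnable work in */
        }
    }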
@@ -5527,7 +5541,7 @@ xlog_do_log_recovery(
5527 */ 5541 */
5528 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * 5542 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
5529 sizeof(struct list_head), 5543 sizeof(struct list_head),
5530 KM_SLEEP); 5544 0);
5531 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 5545 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
5532 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); 5546 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
5533 5547
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 322da6909290..ba5b6f3b2b88 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -82,7 +82,7 @@ xfs_uuid_mount(
82 if (hole < 0) { 82 if (hole < 0) {
83 xfs_uuid_table = kmem_realloc(xfs_uuid_table, 83 xfs_uuid_table = kmem_realloc(xfs_uuid_table,
84 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), 84 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
85 KM_SLEEP); 85 0);
86 hole = xfs_uuid_table_size++; 86 hole = xfs_uuid_table_size++;
87 } 87 }
88 xfs_uuid_table[hole] = *uuid; 88 xfs_uuid_table[hole] = *uuid;
@@ -214,7 +214,7 @@ xfs_initialize_perag(
214 214
215 spin_lock(&mp->m_perag_lock); 215 spin_lock(&mp->m_perag_lock);
216 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { 216 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
217 BUG(); 217 WARN_ON_ONCE(1);
218 spin_unlock(&mp->m_perag_lock); 218 spin_unlock(&mp->m_perag_lock);
219 radix_tree_preload_end(); 219 radix_tree_preload_end();
220 error = -EEXIST; 220 error = -EEXIST;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4adb6837439a..fdb60e09a9c5 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -327,13 +327,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
327} 327}
328 328
329/* per-AG block reservation data structures*/ 329/* per-AG block reservation data structures*/
330enum xfs_ag_resv_type {
331 XFS_AG_RESV_NONE = 0,
332 XFS_AG_RESV_AGFL,
333 XFS_AG_RESV_METADATA,
334 XFS_AG_RESV_RMAPBT,
335};
336
337struct xfs_ag_resv { 330struct xfs_ag_resv {
338 /* number of blocks originally reserved here */ 331 /* number of blocks originally reserved here */
339 xfs_extlen_t ar_orig_reserved; 332 xfs_extlen_t ar_orig_reserved;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 74738813f60d..a06661dac5be 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -333,12 +333,12 @@ xfs_mru_cache_create(
333 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) 333 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
334 return -EINVAL; 334 return -EINVAL;
335 335
336 if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) 336 if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
337 return -ENOMEM; 337 return -ENOMEM;
338 338
339 /* An extra list is needed to avoid reaping up to a grp_time early. */ 339 /* An extra list is needed to avoid reaping up to a grp_time early. */
340 mru->grp_count = grp_count + 1; 340 mru->grp_count = grp_count + 1;
341 mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); 341 mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);
342 342
343 if (!mru->lists) { 343 if (!mru->lists) {
344 err = -ENOMEM; 344 err = -ENOMEM;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5e7a37f0cf84..ecd8ce152ab1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -642,7 +642,7 @@ xfs_qm_init_quotainfo(
642 642
643 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 643 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
644 644
645 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); 645 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0);
646 646
647 error = list_lru_init(&qinf->qi_lru); 647 error = list_lru_init(&qinf->qi_lru);
648 if (error) 648 if (error)
@@ -978,7 +978,7 @@ xfs_qm_reset_dqcounts_buf(
978 if (qip->i_d.di_nblocks == 0) 978 if (qip->i_d.di_nblocks == 0)
979 return 0; 979 return 0;
980 980
981 map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); 981 map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0);
982 982
983 lblkno = 0; 983 lblkno = 0;
984 maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); 984 maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index d8288aa0670a..2328268e6245 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -144,9 +144,9 @@ xfs_cui_init(
144 ASSERT(nextents > 0); 144 ASSERT(nextents > 0);
145 if (nextents > XFS_CUI_MAX_FAST_EXTENTS) 145 if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
146 cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), 146 cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
147 KM_SLEEP); 147 0);
148 else 148 else
149 cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP); 149 cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
150 150
151 xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); 151 xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
152 cuip->cui_format.cui_nextents = nextents; 152 cuip->cui_format.cui_nextents = nextents;
@@ -223,7 +223,7 @@ xfs_trans_get_cud(
223{ 223{
224 struct xfs_cud_log_item *cudp; 224 struct xfs_cud_log_item *cudp;
225 225
226 cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); 226 cudp = kmem_zone_zalloc(xfs_cud_zone, 0);
227 xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, 227 xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
228 &xfs_cud_item_ops); 228 &xfs_cud_item_ops);
229 cudp->cud_cuip = cuip; 229 cudp->cud_cuip = cuip;
@@ -555,26 +555,24 @@ xfs_cui_recover(
555 irec.br_blockcount = new_len; 555 irec.br_blockcount = new_len;
556 switch (type) { 556 switch (type) {
557 case XFS_REFCOUNT_INCREASE: 557 case XFS_REFCOUNT_INCREASE:
558 error = xfs_refcount_increase_extent(tp, &irec); 558 xfs_refcount_increase_extent(tp, &irec);
559 break; 559 break;
560 case XFS_REFCOUNT_DECREASE: 560 case XFS_REFCOUNT_DECREASE:
561 error = xfs_refcount_decrease_extent(tp, &irec); 561 xfs_refcount_decrease_extent(tp, &irec);
562 break; 562 break;
563 case XFS_REFCOUNT_ALLOC_COW: 563 case XFS_REFCOUNT_ALLOC_COW:
564 error = xfs_refcount_alloc_cow_extent(tp, 564 xfs_refcount_alloc_cow_extent(tp,
565 irec.br_startblock, 565 irec.br_startblock,
566 irec.br_blockcount); 566 irec.br_blockcount);
567 break; 567 break;
568 case XFS_REFCOUNT_FREE_COW: 568 case XFS_REFCOUNT_FREE_COW:
569 error = xfs_refcount_free_cow_extent(tp, 569 xfs_refcount_free_cow_extent(tp,
570 irec.br_startblock, 570 irec.br_startblock,
571 irec.br_blockcount); 571 irec.br_blockcount);
572 break; 572 break;
573 default: 573 default:
574 ASSERT(0); 574 ASSERT(0);
575 } 575 }
576 if (error)
577 goto abort_error;
578 requeue_only = true; 576 requeue_only = true;
579 } 577 }
580 } 578 }
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index edbe37b7f636..0f08153b4994 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -495,10 +495,8 @@ xfs_reflink_cancel_cow_blocks(
495 ASSERT((*tpp)->t_firstblock == NULLFSBLOCK); 495 ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);
496 496
497 /* Free the CoW orphan record. */ 497 /* Free the CoW orphan record. */
498 error = xfs_refcount_free_cow_extent(*tpp, 498 xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
499 del.br_startblock, del.br_blockcount); 499 del.br_blockcount);
500 if (error)
501 break;
502 500
503 xfs_bmap_add_free(*tpp, del.br_startblock, 501 xfs_bmap_add_free(*tpp, del.br_startblock,
504 del.br_blockcount, NULL); 502 del.br_blockcount, NULL);
@@ -675,15 +673,10 @@ xfs_reflink_end_cow_extent(
675 trace_xfs_reflink_cow_remap(ip, &del); 673 trace_xfs_reflink_cow_remap(ip, &del);
676 674
677 /* Free the CoW orphan record. */ 675 /* Free the CoW orphan record. */
678 error = xfs_refcount_free_cow_extent(tp, del.br_startblock, 676 xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
679 del.br_blockcount);
680 if (error)
681 goto out_cancel;
682 677
683 /* Map the new blocks into the data fork. */ 678 /* Map the new blocks into the data fork. */
684 error = xfs_bmap_map_extent(tp, ip, &del); 679 xfs_bmap_map_extent(tp, ip, &del);
685 if (error)
686 goto out_cancel;
687 680
688 /* Charge this new data fork mapping to the on-disk quota. */ 681 /* Charge this new data fork mapping to the on-disk quota. */
689 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, 682 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
@@ -1070,14 +1063,10 @@ xfs_reflink_remap_extent(
1070 uirec.br_blockcount, uirec.br_startblock); 1063 uirec.br_blockcount, uirec.br_startblock);
1071 1064
1072 /* Update the refcount tree */ 1065 /* Update the refcount tree */
1073 error = xfs_refcount_increase_extent(tp, &uirec); 1066 xfs_refcount_increase_extent(tp, &uirec);
1074 if (error)
1075 goto out_cancel;
1076 1067
1077 /* Map the new blocks into the data fork. */ 1068 /* Map the new blocks into the data fork. */
1078 error = xfs_bmap_map_extent(tp, ip, &uirec); 1069 xfs_bmap_map_extent(tp, ip, &uirec);
1079 if (error)
1080 goto out_cancel;
1081 1070
1082 /* Update quota accounting. */ 1071 /* Update quota accounting. */
1083 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1072 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 77ed557b6127..8939e0ea09cd 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -142,9 +142,9 @@ xfs_rui_init(
142 142
143 ASSERT(nextents > 0); 143 ASSERT(nextents > 0);
144 if (nextents > XFS_RUI_MAX_FAST_EXTENTS) 144 if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
145 ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP); 145 ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
146 else 146 else
147 ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); 147 ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
148 148
149 xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); 149 xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
150 ruip->rui_format.rui_nextents = nextents; 150 ruip->rui_format.rui_nextents = nextents;
@@ -244,7 +244,7 @@ xfs_trans_get_rud(
244{ 244{
245 struct xfs_rud_log_item *rudp; 245 struct xfs_rud_log_item *rudp;
246 246
247 rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP); 247 rudp = kmem_zone_zalloc(xfs_rud_zone, 0);
248 xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, 248 xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
249 &xfs_rud_item_ops); 249 &xfs_rud_item_ops);
250 rudp->rud_ruip = ruip; 250 rudp->rud_ruip = ruip;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 5fa4db3c3e32..4a48a8c75b4f 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -865,7 +865,7 @@ xfs_alloc_rsum_cache(
865 * lower bound on the minimum level with any free extents. We can 865 * lower bound on the minimum level with any free extents. We can
866 * continue without the cache if it couldn't be allocated. 866 * continue without the cache if it couldn't be allocated.
867 */ 867 */
868 mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP); 868 mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0);
869 if (!mp->m_rsum_cache) 869 if (!mp->m_rsum_cache)
870 xfs_warn(mp, "could not allocate realtime summary cache"); 870 xfs_warn(mp, "could not allocate realtime summary cache");
871} 871}
@@ -963,7 +963,7 @@ xfs_growfs_rt(
963 /* 963 /*
964 * Allocate a new (fake) mount/sb. 964 * Allocate a new (fake) mount/sb.
965 */ 965 */
966 nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); 966 nmp = kmem_alloc(sizeof(*nmp), 0);
967 /* 967 /*
968 * Loop over the bitmap blocks. 968 * Loop over the bitmap blocks.
969 * We will do everything one bitmap block at a time. 969 * We will do everything one bitmap block at a time.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f9450235533c..391b4748cae3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -818,7 +818,8 @@ xfs_init_mount_workqueues(
818 goto out_destroy_buf; 818 goto out_destroy_buf;
819 819
820 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", 820 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
821 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); 821 WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
822 0, mp->m_fsname);
822 if (!mp->m_cil_workqueue) 823 if (!mp->m_cil_workqueue)
823 goto out_destroy_unwritten; 824 goto out_destroy_unwritten;
824 825
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 8094b1920eef..eaae275ed430 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -23,6 +23,7 @@ struct xlog;
23struct xlog_ticket; 23struct xlog_ticket;
24struct xlog_recover; 24struct xlog_recover;
25struct xlog_recover_item; 25struct xlog_recover_item;
26struct xlog_rec_header;
26struct xfs_buf_log_format; 27struct xfs_buf_log_format;
27struct xfs_inode_log_format; 28struct xfs_inode_log_format;
28struct xfs_bmbt_irec; 29struct xfs_bmbt_irec;
@@ -30,6 +31,10 @@ struct xfs_btree_cur;
30struct xfs_refcount_irec; 31struct xfs_refcount_irec;
31struct xfs_fsmap; 32struct xfs_fsmap;
32struct xfs_rmap_irec; 33struct xfs_rmap_irec;
34struct xfs_icreate_log;
35struct xfs_owner_info;
36struct xfs_trans_res;
37struct xfs_inobt_rec_incore;
33 38
34DECLARE_EVENT_CLASS(xfs_attr_list_class, 39DECLARE_EVENT_CLASS(xfs_attr_list_class,
35 TP_PROTO(struct xfs_attr_list_context *ctx), 40 TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -3575,6 +3580,35 @@ TRACE_EVENT(xfs_pwork_init,
3575 __entry->nr_threads, __entry->pid) 3580 __entry->nr_threads, __entry->pid)
3576) 3581)
3577 3582
3583DECLARE_EVENT_CLASS(xfs_kmem_class,
3584 TP_PROTO(ssize_t size, int flags, unsigned long caller_ip),
3585 TP_ARGS(size, flags, caller_ip),
3586 TP_STRUCT__entry(
3587 __field(ssize_t, size)
3588 __field(int, flags)
3589 __field(unsigned long, caller_ip)
3590 ),
3591 TP_fast_assign(
3592 __entry->size = size;
3593 __entry->flags = flags;
3594 __entry->caller_ip = caller_ip;
3595 ),
3596 TP_printk("size %zd flags 0x%x caller %pS",
3597 __entry->size,
3598 __entry->flags,
3599 (char *)__entry->caller_ip)
3600)
3601
3602#define DEFINE_KMEM_EVENT(name) \
3603DEFINE_EVENT(xfs_kmem_class, name, \
3604 TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
3605 TP_ARGS(size, flags, caller_ip))
3606DEFINE_KMEM_EVENT(kmem_alloc);
3607DEFINE_KMEM_EVENT(kmem_alloc_io);
3608DEFINE_KMEM_EVENT(kmem_alloc_large);
3609DEFINE_KMEM_EVENT(kmem_realloc);
3610DEFINE_KMEM_EVENT(kmem_zone_alloc);
3611
3578#endif /* _TRACE_XFS_H */ 3612#endif /* _TRACE_XFS_H */
3579 3613
3580#undef TRACE_INCLUDE_PATH 3614#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d42a68d8313b..f4795fdb7389 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -90,7 +90,7 @@ xfs_trans_dup(
90 90
91 trace_xfs_trans_dup(tp, _RET_IP_); 91 trace_xfs_trans_dup(tp, _RET_IP_);
92 92
93 ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); 93 ntp = kmem_zone_zalloc(xfs_trans_zone, 0);
94 94
95 /* 95 /*
96 * Initialize the new transaction structure. 96 * Initialize the new transaction structure.
@@ -263,7 +263,7 @@ xfs_trans_alloc(
263 * GFP_NOFS allocation context so that we avoid lockdep false positives 263 * GFP_NOFS allocation context so that we avoid lockdep false positives
264 * by doing GFP_KERNEL allocations inside sb_start_intwrite(). 264 * by doing GFP_KERNEL allocations inside sb_start_intwrite().
265 */ 265 */
266 tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); 266 tp = kmem_zone_zalloc(xfs_trans_zone, 0);
267 if (!(flags & XFS_TRANS_NO_WRITECOUNT)) 267 if (!(flags & XFS_TRANS_NO_WRITECOUNT))
268 sb_start_intwrite(mp->m_super); 268 sb_start_intwrite(mp->m_super);
269 269
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 1027c9ca6eb8..16457465833b 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -863,7 +863,7 @@ STATIC void
863xfs_trans_alloc_dqinfo( 863xfs_trans_alloc_dqinfo(
864 xfs_trans_t *tp) 864 xfs_trans_t *tp)
865{ 865{
866 tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP); 866 tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
867} 867}
868 868
869void 869void
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 3123b5aaad2a..cb895b1df5e4 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -30,7 +30,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
30 value = NULL; 30 value = NULL;
31 } 31 }
32 32
33 error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); 33 error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags);
34 if (error) 34 if (error)
35 return error; 35 return error;
36 return asize; 36 return asize;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ae6648145d18..ffe35d97afcb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3543,6 +3543,8 @@ extern void inode_nohighmem(struct inode *inode);
3543/* mm/fadvise.c */ 3543/* mm/fadvise.c */
3544extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, 3544extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
3545 int advice); 3545 int advice);
3546extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
3547 int advice);
3546 3548
3547#if defined(CONFIG_IO_URING) 3549#if defined(CONFIG_IO_URING)
3548extern struct sock *io_uring_get_socket(struct file *file); 3550extern struct sock *io_uring_get_socket(struct file *file);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 467bcd032037..4f17c83db575 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -27,8 +27,7 @@
27 * deactivate the pages and clear PG_Referenced. 27 * deactivate the pages and clear PG_Referenced.
28 */ 28 */
29 29
30static int generic_fadvise(struct file *file, loff_t offset, loff_t len, 30int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
31 int advice)
32{ 31{
33 struct inode *inode; 32 struct inode *inode;
34 struct address_space *mapping; 33 struct address_space *mapping;
@@ -178,6 +177,7 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
178 } 177 }
179 return 0; 178 return 0;
180} 179}
180EXPORT_SYMBOL(generic_fadvise);
181 181
182int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) 182int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
183{ 183{
diff --git a/mm/madvise.c b/mm/madvise.c
index 968df3aa069f..bac973b9f2cc 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -14,6 +14,7 @@
14#include <linux/userfaultfd_k.h> 14#include <linux/userfaultfd_k.h>
15#include <linux/hugetlb.h> 15#include <linux/hugetlb.h>
16#include <linux/falloc.h> 16#include <linux/falloc.h>
17#include <linux/fadvise.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
18#include <linux/ksm.h> 19#include <linux/ksm.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
@@ -275,6 +276,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
275 unsigned long start, unsigned long end) 276 unsigned long start, unsigned long end)
276{ 277{
277 struct file *file = vma->vm_file; 278 struct file *file = vma->vm_file;
279 loff_t offset;
278 280
279 *prev = vma; 281 *prev = vma;
280#ifdef CONFIG_SWAP 282#ifdef CONFIG_SWAP
@@ -298,12 +300,20 @@ static long madvise_willneed(struct vm_area_struct *vma,
298 return 0; 300 return 0;
299 } 301 }
300 302
301 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 303 /*
302 if (end > vma->vm_end) 304 * Filesystem's fadvise may need to take various locks. We need to
303 end = vma->vm_end; 305 * explicitly grab a reference because the vma (and hence the
304 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 306 * vma's reference to the file) can go away as soon as we drop
305 307 * mmap_sem.
306 force_page_cache_readahead(file->f_mapping, file, start, end - start); 308 */
309 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
310 get_file(file);
311 up_read(&current->mm->mmap_sem);
312 offset = (loff_t)(start - vma->vm_start)
313 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
314 vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
315 fput(file);
316 down_read(&current->mm->mmap_sem);
307 return 0; 317 return 0;
308} 318}
309 319
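The replacement code computes the byte offset handed to vfs_fadvise() from two pieces: the distance of start into the mapping (start - vma->vm_start) and the file offset at which the mapping begins (vm_pgoff << PAGE_SHIFT). A small worked example of that arithmetic with made-up numbers:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12           /* 4 KiB pages */

    int main(void)
    {
        uint64_t vm_start = 0x7f0000400000ULL;  /* mapping base address */
        uint64_t vm_pgoff = 3;                  /* mapped from file page 3 */
        uint64_t start    = 0x7f0000402000ULL;  /* madvise() range start */

        /* 0x2000 into the mapping + 0x3000 mapping base = offset 0x5000 */
        uint64_t offset = (start - vm_start) + (vm_pgoff << PAGE_SHIFT);

        printf("file offset = 0x%llx\n", (unsigned long long)offset);
        return 0;
    }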