-rw-r--r--  Documentation/filesystems/xfs.txt    29
-rw-r--r--  fs/open.c                             8
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c           104
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c       150
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.h         6
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c            554
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h             13
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c            24
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c          8
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h        14
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_data.c        39
-rw-r--r--  fs/xfs/libxfs/xfs_format.h           62
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c           48
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c               20
-rw-r--r--  fs/xfs/xfs_aops.c                   270
-rw-r--r--  fs/xfs/xfs_attr_inactive.c            3
-rw-r--r--  fs/xfs/xfs_attr_list.c                9
-rw-r--r--  fs/xfs/xfs_bmap_util.c              164
-rw-r--r--  fs/xfs/xfs_bmap_util.h                2
-rw-r--r--  fs/xfs/xfs_buf_item.c                 4
-rw-r--r--  fs/xfs/xfs_discard.c                  2
-rw-r--r--  fs/xfs/xfs_error.c                    2
-rw-r--r--  fs/xfs/xfs_error.h                    8
-rw-r--r--  fs/xfs/xfs_file.c                   161
-rw-r--r--  fs/xfs/xfs_filestream.c               2
-rw-r--r--  fs/xfs/xfs_fsops.c                   20
-rw-r--r--  fs/xfs/xfs_icache.c                   4
-rw-r--r--  fs/xfs/xfs_inode.c                  558
-rw-r--r--  fs/xfs/xfs_inode.h                   49
-rw-r--r--  fs/xfs/xfs_ioctl.c                    7
-rw-r--r--  fs/xfs/xfs_iomap.c                    3
-rw-r--r--  fs/xfs/xfs_iops.c                    91
-rw-r--r--  fs/xfs/xfs_iops.h                     2
-rw-r--r--  fs/xfs/xfs_itable.c                   2
-rw-r--r--  fs/xfs/xfs_linux.h                    9
-rw-r--r--  fs/xfs/xfs_log_recover.c              4
-rw-r--r--  fs/xfs/xfs_mount.c                  918
-rw-r--r--  fs/xfs/xfs_mount.h                   95
-rw-r--r--  fs/xfs/xfs_mru_cache.c                2
-rw-r--r--  fs/xfs/xfs_pnfs.c                     7
-rw-r--r--  fs/xfs/xfs_pnfs.h                     5
-rw-r--r--  fs/xfs/xfs_qm.c                      13
-rw-r--r--  fs/xfs/xfs_super.c                  132
-rw-r--r--  fs/xfs/xfs_super.h                    2
-rw-r--r--  fs/xfs/xfs_symlink.c                 58
-rw-r--r--  fs/xfs/xfs_trace.h                   29
-rw-r--r--  fs/xfs/xfs_trans.c                  234
-rw-r--r--  include/linux/falloc.h                6
-rw-r--r--  include/uapi/linux/falloc.h          17
49 files changed, 2024 insertions(+), 1949 deletions(-)
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 0bfafe108357..5a5a05582b58 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -228,30 +228,19 @@ default behaviour.
 Deprecated Mount Options
 ========================
 
-  delaylog/nodelaylog
-	Delayed logging is the only logging method that XFS supports
-	now, so these mount options are now ignored.
-
-	Due for removal in 3.12.
-
-  ihashsize=value
-	In memory inode hashes have been removed, so this option has
-	no function as of August 2007. Option is deprecated.
-
-	Due for removal in 3.12.
+None at present.
 
-  irixsgid
-	This behaviour is now controlled by a sysctl, so the mount
-	option is ignored.
 
-	Due for removal in 3.12.
+Removed Mount Options
+=====================
 
-  osyncisdsync
-  osyncisosync
-	O_SYNC and O_DSYNC are fully supported, so there is no need
-	for these options any more.
+  Name				Removed
+  ----				-------
+  delaylog/nodelaylog		v3.20
+  ihashsize			v3.20
+  irixsgid			v3.20
+  osyncisdsync/osyncisosync	v3.20
 
-	Due for removal in 3.12.
 
 sysctls
 =======
diff --git a/fs/open.c b/fs/open.c
index 6796f04d6032..98e5a52dc68c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -231,8 +231,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		return -EINVAL;
 
 	/* Return error if mode is not supported */
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+	if (mode & ~FALLOC_FL_SUPPORTED_MASK)
 		return -EOPNOTSUPP;
 
 	/* Punch hole and zero range are mutually exclusive */
@@ -250,6 +249,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	    (mode & ~FALLOC_FL_COLLAPSE_RANGE))
 		return -EINVAL;
 
+	/* Insert range should only be used exclusively. */
+	if ((mode & FALLOC_FL_INSERT_RANGE) &&
+	    (mode & ~FALLOC_FL_INSERT_RANGE))
+		return -EINVAL;
+
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
 
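For context, the new FALLOC_FL_INSERT_RANGE mode is driven from userspace
through fallocate(2): it shifts existing file data from the given offset
rightwards by len bytes, the inverse of collapse range. A minimal caller
sketch follows (not part of the patch; the file name and offsets are made
up, and offset/len must be aligned to the filesystem block size or the
filesystem returns -EINVAL):

/* hypothetical userspace illustration, not kernel code */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);	/* example file */

	if (fd < 0)
		return 1;
	/* shift everything from offset 64k onwards out by another 64k */
	if (fallocate(fd, FALLOC_FL_INSERT_RANGE, 65536, 65536) < 0)
		perror("fallocate");
	return 0;
}

Note that the mode must be used on its own: the validation hunk above
rejects any combination of FALLOC_FL_INSERT_RANGE with other flags.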
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index a6fbf4472017..516162be1398 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -260,6 +260,7 @@ xfs_alloc_fix_len(
 			rlen = rlen - (k - args->mod);
 		else
 			rlen = rlen - args->prod + (args->mod - k);
+	/* casts to (int) catch length underflows */
 	if ((int)rlen < (int)args->minlen)
 		return;
 	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
@@ -286,7 +287,8 @@ xfs_alloc_fix_minleft(
 	if (diff >= 0)
 		return 1;
 	args->len += diff;		/* shrink the allocated space */
-	if (args->len >= args->minlen)
+	/* casts to (int) catch length underflows */
+	if ((int)args->len >= (int)args->minlen)
 		return 1;
 	args->agbno = NULLAGBLOCK;
 	return 0;
@@ -315,6 +317,9 @@ xfs_alloc_fixup_trees(
 	xfs_agblock_t	nfbno2;		/* second new free startblock */
 	xfs_extlen_t	nflen1=0;	/* first new free length */
 	xfs_extlen_t	nflen2=0;	/* second new free length */
+	struct xfs_mount *mp;
+
+	mp = cnt_cur->bc_mp;
 
 	/*
 	 * Look up the record in the by-size tree if necessary.
@@ -323,13 +328,13 @@ xfs_alloc_fixup_trees(
 #ifdef DEBUG
 		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
 			return error;
-		XFS_WANT_CORRUPTED_RETURN(
+		XFS_WANT_CORRUPTED_RETURN(mp,
 			i == 1 && nfbno1 == fbno && nflen1 == flen);
 #endif
 	} else {
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
 			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
 	}
 	/*
 	 * Look up the record in the by-block tree if necessary.
@@ -338,13 +343,13 @@ xfs_alloc_fixup_trees(
 #ifdef DEBUG
 		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
 			return error;
-		XFS_WANT_CORRUPTED_RETURN(
+		XFS_WANT_CORRUPTED_RETURN(mp,
 			i == 1 && nfbno1 == fbno && nflen1 == flen);
 #endif
 	} else {
 		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
 			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
 	}
 
 #ifdef DEBUG
@@ -355,7 +360,7 @@ xfs_alloc_fixup_trees(
 		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
 		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
 
-		XFS_WANT_CORRUPTED_RETURN(
+		XFS_WANT_CORRUPTED_RETURN(mp,
 			bnoblock->bb_numrecs == cntblock->bb_numrecs);
 	}
 #endif
@@ -386,25 +391,25 @@ xfs_alloc_fixup_trees(
 	 */
 	if ((error = xfs_btree_delete(cnt_cur, &i)))
 		return error;
-	XFS_WANT_CORRUPTED_RETURN(i == 1);
+	XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
 	/*
 	 * Add new by-size btree entry(s).
 	 */
 	if (nfbno1 != NULLAGBLOCK) {
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
 			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 0);
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
 		if ((error = xfs_btree_insert(cnt_cur, &i)))
 			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
 	}
 	if (nfbno2 != NULLAGBLOCK) {
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
 			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 0);
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
 		if ((error = xfs_btree_insert(cnt_cur, &i)))
 			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
 	}
 	/*
 	 * Fix up the by-block btree entry(s).
@@ -415,7 +420,7 @@ xfs_alloc_fixup_trees(
 		 */
 		if ((error = xfs_btree_delete(bno_cur, &i)))
 			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
 	} else {
 		/*
 		 * Update the by-block entry to start later|be shorter.
@@ -429,10 +434,10 @@ xfs_alloc_fixup_trees(
 		 */
 		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
 			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 0);
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
 		if ((error = xfs_btree_insert(bno_cur, &i)))
 			return error;
-		XFS_WANT_CORRUPTED_RETURN(i == 1);
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
 	}
 	return 0;
 }
@@ -682,7 +687,7 @@ xfs_alloc_ag_vextent_exact(
 	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
 	if (error)
 		goto error0;
-	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 	ASSERT(fbno <= args->agbno);
 
 	/*
@@ -783,7 +788,7 @@ xfs_alloc_find_best_extent(
 		error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
 		if (error)
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 		xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
 
 		/*
@@ -946,7 +951,7 @@ restart:
 			if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
 					&ltlen, &i)))
 				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 			if (ltlen >= args->minlen)
 				break;
 			if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
@@ -966,7 +971,7 @@ restart:
 		 */
 		if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 		xfs_alloc_compute_aligned(args, ltbno, ltlen,
 						&ltbnoa, &ltlena);
 		if (ltlena < args->minlen)
@@ -999,7 +1004,7 @@ restart:
 		cnt_cur->bc_ptrs[0] = besti;
 		if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 		ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
 		args->len = blen;
 		if (!xfs_alloc_fix_minleft(args)) {
@@ -1088,7 +1093,7 @@ restart:
 	if (bno_cur_lt) {
 		if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 		xfs_alloc_compute_aligned(args, ltbno, ltlen,
 						&ltbnoa, &ltlena);
 		if (ltlena >= args->minlen)
@@ -1104,7 +1109,7 @@ restart:
 	if (bno_cur_gt) {
 		if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 		xfs_alloc_compute_aligned(args, gtbno, gtlen,
 						&gtbnoa, &gtlena);
 		if (gtlena >= args->minlen)
@@ -1303,7 +1308,7 @@ restart:
 		error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
 		if (error)
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 
 		xfs_alloc_compute_aligned(args, fbno, flen,
 					  &rbno, &rlen);
@@ -1342,7 +1347,7 @@ restart:
 	 * This can't happen in the second case above.
 	 */
 	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
-	XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+	XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
 		(rlen <= flen && rbno + rlen <= fbno + flen), error0);
 	if (rlen < args->maxlen) {
 		xfs_agblock_t	bestfbno;
@@ -1362,13 +1367,13 @@ restart:
 			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
 					&i)))
 				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 			if (flen < bestrlen)
 				break;
 			xfs_alloc_compute_aligned(args, fbno, flen,
 						  &rbno, &rlen);
 			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
-			XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+			XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
 				(rlen <= flen && rbno + rlen <= fbno + flen),
 				error0);
 			if (rlen > bestrlen) {
@@ -1383,7 +1388,7 @@ restart:
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
 				&i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 		rlen = bestrlen;
 		rbno = bestrbno;
 		flen = bestflen;
@@ -1408,7 +1413,7 @@ restart:
 	if (!xfs_alloc_fix_minleft(args))
 		goto out_nominleft;
 	rlen = args->len;
-	XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
+	XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0);
 	/*
 	 * Allocate and initialize a cursor for the by-block tree.
 	 */
@@ -1422,7 +1427,7 @@ restart:
 	cnt_cur = bno_cur = NULL;
 	args->len = rlen;
 	args->agbno = rbno;
-	XFS_WANT_CORRUPTED_GOTO(
+	XFS_WANT_CORRUPTED_GOTO(args->mp,
 		args->agbno + args->len <=
 			be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
 		error0);
@@ -1467,7 +1472,7 @@ xfs_alloc_ag_vextent_small(
 	if (i) {
 		if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 	}
 	/*
 	 * Nothing in the btree, try the freelist.  Make sure
@@ -1493,7 +1498,7 @@ xfs_alloc_ag_vextent_small(
 		}
 		args->len = 1;
 		args->agbno = fbno;
-		XFS_WANT_CORRUPTED_GOTO(
+		XFS_WANT_CORRUPTED_GOTO(args->mp,
 			args->agbno + args->len <=
 				be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
 			error0);
@@ -1579,7 +1584,7 @@ xfs_free_ag_extent(
 		 */
 		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 		/*
 		 * It's not contiguous, though.
 		 */
@@ -1591,7 +1596,8 @@ xfs_free_ag_extent(
 			 * space was invalid, it's (partly) already free.
 			 * Very bad.
 			 */
-			XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0);
+			XFS_WANT_CORRUPTED_GOTO(mp,
+						ltbno + ltlen <= bno, error0);
 		}
 	}
 	/*
@@ -1606,7 +1612,7 @@ xfs_free_ag_extent(
 		 */
 		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 		/*
 		 * It's not contiguous, though.
 		 */
@@ -1618,7 +1624,7 @@ xfs_free_ag_extent(
 			 * space was invalid, it's (partly) already free.
 			 * Very bad.
 			 */
-			XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0);
+			XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0);
 		}
 	}
 	/*
@@ -1635,31 +1641,31 @@ xfs_free_ag_extent(
 		 */
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 		/*
 		 * Delete the old by-size entry on the right.
 		 */
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 		/*
 		 * Delete the old by-block entry for the right block.
 		 */
 		if ((error = xfs_btree_delete(bno_cur, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 		/*
 		 * Move the by-block cursor back to the left neighbor.
 		 */
 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 #ifdef DEBUG
 		/*
 		 * Check that this is the right record: delete didn't
@@ -1672,7 +1678,7 @@ xfs_free_ag_extent(
 			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
 					&i)))
 				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(
+			XFS_WANT_CORRUPTED_GOTO(mp,
 				i == 1 && xxbno == ltbno && xxlen == ltlen,
 				error0);
 		}
@@ -1695,17 +1701,17 @@ xfs_free_ag_extent(
 		 */
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 		/*
 		 * Back up the by-block cursor to the left neighbor, and
 		 * update its length.
 		 */
		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 		nbno = ltbno;
 		nlen = len + ltlen;
 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
@@ -1721,10 +1727,10 @@ xfs_free_ag_extent(
 		 */
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 		/*
 		 * Update the starting block and length of the right
 		 * neighbor in the by-block tree.
@@ -1743,7 +1749,7 @@ xfs_free_ag_extent(
 		nlen = len;
 		if ((error = xfs_btree_insert(bno_cur, &i)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 	}
 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
 	bno_cur = NULL;
@@ -1752,10 +1758,10 @@ xfs_free_ag_extent(
 	 */
 	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
 		goto error0;
-	XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0);
 	if ((error = xfs_btree_insert(cnt_cur, &i)))
 		goto error0;
-	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 	cnt_cur = NULL;
 
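The diffstat lists fs/xfs/xfs_error.h among the changed files, but its hunk
falls outside this excerpt. Based on the call sites above, the reworked
macros plausibly take the following shape — a sketch only, assuming the
existing XFS_ERROR_REPORT helper keeps its signature — with the new mount
pointer letting the corruption report identify the affected filesystem:

/* sketch of the assumed fs/xfs/xfs_error.h change; not copied from the patch */
#define XFS_WANT_CORRUPTED_GOTO(mp, x, l)	\
	{ \
		int fs_is_ok = (x); \
		ASSERT(fs_is_ok); \
		if (unlikely(!fs_is_ok)) { \
			XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
					 XFS_ERRLEVEL_LOW, mp); \
			error = -EFSCORRUPTED; \
			goto l; \
		} \
	}

#define XFS_WANT_CORRUPTED_RETURN(mp, x)	\
	{ \
		int fs_is_ok = (x); \
		ASSERT(fs_is_ok); \
		if (unlikely(!fs_is_ok)) { \
			XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
					 XFS_ERRLEVEL_LOW, mp); \
			return -EFSCORRUPTED; \
		} \
	}

This explains why every call site in this series mechanically gains an mp
(or args->mp) first argument.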
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 15105dbc9e28..04e79d57bca6 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -86,8 +86,83 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
86 int move_count); 86 int move_count);
87STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); 87STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
88 88
89/*
90 * attr3 block 'firstused' conversion helpers.
91 *
92 * firstused refers to the offset of the first used byte of the nameval region
93 * of an attr leaf block. The region starts at the tail of the block and expands
94 * backwards towards the middle. As such, firstused is initialized to the block
95 * size for an empty leaf block and is reduced from there.
96 *
97 * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k.
98 * The in-core firstused field is 32-bit and thus supports the maximum fsb size.
99 * The on-disk field is only 16-bit, however, and overflows at 64k. Since this
100 * only occurs at exactly 64k, we use zero as a magic on-disk value to represent
101 * the attr block size. The following helpers manage the conversion between the
102 * in-core and on-disk formats.
103 */
104
105static void
106xfs_attr3_leaf_firstused_from_disk(
107 struct xfs_da_geometry *geo,
108 struct xfs_attr3_icleaf_hdr *to,
109 struct xfs_attr_leafblock *from)
110{
111 struct xfs_attr3_leaf_hdr *hdr3;
112
113 if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
114 hdr3 = (struct xfs_attr3_leaf_hdr *) from;
115 to->firstused = be16_to_cpu(hdr3->firstused);
116 } else {
117 to->firstused = be16_to_cpu(from->hdr.firstused);
118 }
119
120 /*
121 * Convert from the magic fsb size value to actual blocksize. This
122 * should only occur for empty blocks when the block size overflows
123 * 16-bits.
124 */
125 if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) {
126 ASSERT(!to->count && !to->usedbytes);
127 ASSERT(geo->blksize > USHRT_MAX);
128 to->firstused = geo->blksize;
129 }
130}
131
132static void
133xfs_attr3_leaf_firstused_to_disk(
134 struct xfs_da_geometry *geo,
135 struct xfs_attr_leafblock *to,
136 struct xfs_attr3_icleaf_hdr *from)
137{
138 struct xfs_attr3_leaf_hdr *hdr3;
139 uint32_t firstused;
140
141 /* magic value should only be seen on disk */
142 ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF);
143
144 /*
145 * Scale down the 32-bit in-core firstused value to the 16-bit on-disk
146 * value. This only overflows at the max supported value of 64k. Use the
147 * magic on-disk value to represent block size in this case.
148 */
149 firstused = from->firstused;
150 if (firstused > USHRT_MAX) {
151 ASSERT(from->firstused == geo->blksize);
152 firstused = XFS_ATTR3_LEAF_NULLOFF;
153 }
154
155 if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
156 hdr3 = (struct xfs_attr3_leaf_hdr *) to;
157 hdr3->firstused = cpu_to_be16(firstused);
158 } else {
159 to->hdr.firstused = cpu_to_be16(firstused);
160 }
161}
162
89void 163void
90xfs_attr3_leaf_hdr_from_disk( 164xfs_attr3_leaf_hdr_from_disk(
165 struct xfs_da_geometry *geo,
91 struct xfs_attr3_icleaf_hdr *to, 166 struct xfs_attr3_icleaf_hdr *to,
92 struct xfs_attr_leafblock *from) 167 struct xfs_attr_leafblock *from)
93{ 168{
@@ -104,7 +179,7 @@ xfs_attr3_leaf_hdr_from_disk(
 		to->magic = be16_to_cpu(hdr3->info.hdr.magic);
 		to->count = be16_to_cpu(hdr3->count);
 		to->usedbytes = be16_to_cpu(hdr3->usedbytes);
-		to->firstused = be16_to_cpu(hdr3->firstused);
+		xfs_attr3_leaf_firstused_from_disk(geo, to, from);
 		to->holes = hdr3->holes;
 
 		for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
@@ -118,7 +193,7 @@ xfs_attr3_leaf_hdr_from_disk(
 	to->magic = be16_to_cpu(from->hdr.info.magic);
 	to->count = be16_to_cpu(from->hdr.count);
 	to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
-	to->firstused = be16_to_cpu(from->hdr.firstused);
+	xfs_attr3_leaf_firstused_from_disk(geo, to, from);
 	to->holes = from->hdr.holes;
 
 	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
@@ -129,10 +204,11 @@ xfs_attr3_leaf_hdr_from_disk(
 
 void
 xfs_attr3_leaf_hdr_to_disk(
+	struct xfs_da_geometry		*geo,
 	struct xfs_attr_leafblock	*to,
 	struct xfs_attr3_icleaf_hdr	*from)
 {
 	int	i;
 
 	ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
 		from->magic == XFS_ATTR3_LEAF_MAGIC);
@@ -145,7 +221,7 @@ xfs_attr3_leaf_hdr_to_disk(
 		hdr3->info.hdr.magic = cpu_to_be16(from->magic);
 		hdr3->count = cpu_to_be16(from->count);
 		hdr3->usedbytes = cpu_to_be16(from->usedbytes);
-		hdr3->firstused = cpu_to_be16(from->firstused);
+		xfs_attr3_leaf_firstused_to_disk(geo, to, from);
 		hdr3->holes = from->holes;
 		hdr3->pad1 = 0;
 
@@ -160,7 +236,7 @@ xfs_attr3_leaf_hdr_to_disk(
 	to->hdr.info.magic = cpu_to_be16(from->magic);
 	to->hdr.count = cpu_to_be16(from->count);
 	to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
-	to->hdr.firstused = cpu_to_be16(from->firstused);
+	xfs_attr3_leaf_firstused_to_disk(geo, to, from);
 	to->hdr.holes = from->holes;
 	to->hdr.pad1 = 0;
 
@@ -178,7 +254,7 @@ xfs_attr3_leaf_verify(
 	struct xfs_attr_leafblock *leaf = bp->b_addr;
 	struct xfs_attr3_icleaf_hdr ichdr;
 
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
 
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
@@ -757,9 +833,10 @@ xfs_attr_shortform_allfit(
 	struct xfs_attr3_icleaf_hdr leafhdr;
 	int			bytes;
 	int			i;
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
 
 	leaf = bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
 	entry = xfs_attr3_leaf_entryp(leaf);
 
 	bytes = sizeof(struct xfs_attr_sf_hdr);
@@ -812,7 +889,7 @@ xfs_attr3_leaf_to_shortform(
 	memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
 
 	leaf = (xfs_attr_leafblock_t *)tmpbuffer;
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
 	entry = xfs_attr3_leaf_entryp(leaf);
 
 	/* XXX (dgc): buffer is about to be marked stale - why zero it? */
@@ -923,7 +1000,7 @@ xfs_attr3_leaf_to_node(
 	btree = dp->d_ops->node_tree_p(node);
 
 	leaf = bp2->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf);
 	entries = xfs_attr3_leaf_entryp(leaf);
 
 	/* both on-disk, don't endian-flip twice */
@@ -988,7 +1065,7 @@ xfs_attr3_leaf_create(
 	}
 	ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
 
-	xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+	xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
 	xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
 
 	*bpp = bp;
@@ -1073,7 +1150,7 @@ xfs_attr3_leaf_add(
 	trace_xfs_attr_leaf_add(args);
 
 	leaf = bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
 	ASSERT(args->index >= 0 && args->index <= ichdr.count);
 	entsize = xfs_attr_leaf_newentsize(args, NULL);
 
@@ -1126,7 +1203,7 @@ xfs_attr3_leaf_add(
 	tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
 
 out_log_hdr:
-	xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+	xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
 	xfs_trans_log_buf(args->trans, bp,
 		XFS_DA_LOGRANGE(leaf, &leaf->hdr,
 				xfs_attr3_leaf_hdr_size(leaf)));
@@ -1294,7 +1371,7 @@ xfs_attr3_leaf_compact(
 					ichdr_dst->freemap[0].base;
 
 	/* write the header back to initialise the underlying buffer */
-	xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
+	xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst);
 
 	/*
 	 * Copy all entry's in the same (sorted) order,
@@ -1344,9 +1421,10 @@ xfs_attr_leaf_order(
 {
 	struct xfs_attr3_icleaf_hdr ichdr1;
 	struct xfs_attr3_icleaf_hdr ichdr2;
+	struct xfs_mount *mp = leaf1_bp->b_target->bt_mount;
 
-	xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr);
-	xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr);
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr);
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr);
 	return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
 }
 
@@ -1388,8 +1466,8 @@ xfs_attr3_leaf_rebalance(
 	ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
 	leaf1 = blk1->bp->b_addr;
 	leaf2 = blk2->bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
-	xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1);
+	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2);
 	ASSERT(ichdr2.count == 0);
 	args = state->args;
 
@@ -1490,8 +1568,8 @@ xfs_attr3_leaf_rebalance(
 						ichdr1.count, count);
 	}
 
-	xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
-	xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
+	xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1);
+	xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2);
 	xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
 	xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
 
@@ -1684,7 +1762,7 @@ xfs_attr3_leaf_toosmall(
 	 */
 	blk = &state->path.blk[ state->path.active-1 ];
 	leaf = blk->bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf);
 	bytes = xfs_attr3_leaf_hdr_size(leaf) +
 		ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
 		ichdr.usedbytes;
@@ -1740,7 +1818,7 @@ xfs_attr3_leaf_toosmall(
 		if (error)
 			return error;
 
-		xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
+		xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr);
 
 		bytes = state->args->geo->blksize -
 			(state->args->geo->blksize >> 2) -
@@ -1805,7 +1883,7 @@ xfs_attr3_leaf_remove(
 	trace_xfs_attr_leaf_remove(args);
 
 	leaf = bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
 
 	ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
 	ASSERT(args->index >= 0 && args->index < ichdr.count);
@@ -1918,12 +1996,11 @@ xfs_attr3_leaf_remove(
 			tmp = be16_to_cpu(entry->nameidx);
 		}
 		ichdr.firstused = tmp;
-		if (!ichdr.firstused)
-			ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
+		ASSERT(ichdr.firstused != 0);
 	} else {
 		ichdr.holes = 1;	/* mark as needing compaction */
 	}
-	xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+	xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
 	xfs_trans_log_buf(args->trans, bp,
 			  XFS_DA_LOGRANGE(leaf, &leaf->hdr,
 					  xfs_attr3_leaf_hdr_size(leaf)));
@@ -1957,8 +2034,8 @@ xfs_attr3_leaf_unbalance(
 
 	drop_leaf = drop_blk->bp->b_addr;
 	save_leaf = save_blk->bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf);
-	xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf);
+	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf);
+	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf);
 	entry = xfs_attr3_leaf_entryp(drop_leaf);
 
 	/*
@@ -2012,7 +2089,7 @@ xfs_attr3_leaf_unbalance(
 		tmphdr.firstused = state->args->geo->blksize;
 
 		/* write the header to the temp buffer to initialise it */
-		xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
+		xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr);
 
 		if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
 					 drop_blk->bp, &drophdr)) {
@@ -2039,7 +2116,7 @@ xfs_attr3_leaf_unbalance(
 		kmem_free(tmp_leaf);
 	}
 
-	xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
+	xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
 	xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
 			  state->args->geo->blksize - 1);
 
@@ -2085,7 +2162,7 @@ xfs_attr3_leaf_lookup_int(
 	trace_xfs_attr_leaf_lookup(args);
 
 	leaf = bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
 	entries = xfs_attr3_leaf_entryp(leaf);
 	ASSERT(ichdr.count < args->geo->blksize / 8);
 
@@ -2190,7 +2267,7 @@ xfs_attr3_leaf_getvalue(
 	int			valuelen;
 
 	leaf = bp->b_addr;
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
 	ASSERT(ichdr.count < args->geo->blksize / 8);
 	ASSERT(args->index < ichdr.count);
 
@@ -2391,8 +2468,9 @@ xfs_attr_leaf_lasthash(
 {
 	struct xfs_attr3_icleaf_hdr ichdr;
 	struct xfs_attr_leaf_entry *entries;
+	struct xfs_mount *mp = bp->b_target->bt_mount;
 
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr);
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr);
 	entries = xfs_attr3_leaf_entryp(bp->b_addr);
 	if (count)
 		*count = ichdr.count;
@@ -2486,7 +2564,7 @@ xfs_attr3_leaf_clearflag(
 	ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
 
 #ifdef DEBUG
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
 	ASSERT(args->index < ichdr.count);
 	ASSERT(args->index >= 0);
 
@@ -2550,7 +2628,7 @@ xfs_attr3_leaf_setflag(
 
 	leaf = bp->b_addr;
 #ifdef DEBUG
-	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
 	ASSERT(args->index < ichdr.count);
 	ASSERT(args->index >= 0);
 #endif
@@ -2629,11 +2707,11 @@ xfs_attr3_leaf_flipflags(
 	entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
 
 #ifdef DEBUG
-	xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1);
 	ASSERT(args->index < ichdr1.count);
 	ASSERT(args->index >= 0);
 
-	xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2);
 	ASSERT(args->index2 < ichdr2.count);
 	ASSERT(args->index2 >= 0);
 
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index e2929da7c3ba..025c4b820c03 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -100,9 +100,11 @@ int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
 int	xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
 			xfs_dablk_t bno, xfs_daddr_t mappedbno,
 			struct xfs_buf **bpp);
-void	xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
+void	xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo,
+				     struct xfs_attr3_icleaf_hdr *to,
 				     struct xfs_attr_leafblock *from);
-void	xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
+void	xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo,
+				   struct xfs_attr_leafblock *to,
 				   struct xfs_attr3_icleaf_hdr *from);
 
 #endif	/* __XFS_ATTR_LEAF_H__ */
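The firstused conversion helpers added to xfs_attr_leaf.c above hinge on one
boundary case: a 64k attr block size does not fit in the 16-bit on-disk
field, so zero stands in for "block size" on disk. A standalone sketch of
just that encode/decode rule (illustrative only; NULLOFF and BLKSIZE are
stand-ins for the kernel's XFS_ATTR3_LEAF_NULLOFF and geometry block size):

/* hypothetical standalone illustration, not kernel code */
#include <assert.h>
#include <stdint.h>

#define NULLOFF		0	/* stand-in for XFS_ATTR3_LEAF_NULLOFF */
#define BLKSIZE		65536	/* maximum attr3 block size (64k) */

static uint16_t firstused_to_disk(uint32_t incore)
{
	/* only 64k overflows 16 bits; encode it as the magic zero */
	return (incore > UINT16_MAX) ? NULLOFF : (uint16_t)incore;
}

static uint32_t firstused_from_disk(uint16_t ondisk)
{
	/* decode the magic zero back to the full block size */
	return (ondisk == NULLOFF) ? BLKSIZE : ondisk;
}

int main(void)
{
	/* round trip of an empty 64k leaf block survives the 16-bit field */
	assert(firstused_from_disk(firstused_to_disk(BLKSIZE)) == BLKSIZE);
	return 0;
}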
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 61ec015dca16..aeffeaaac0ec 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -244,30 +244,6 @@ xfs_bmap_forkoff_reset(
 	}
 }
 
-/*
- * Debug/sanity checking code
- */
-
-STATIC int
-xfs_bmap_sanity_check(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp,
-	int			level)
-{
-	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
-
-	if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
-	    block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
-		return 0;
-
-	if (be16_to_cpu(block->bb_level) != level ||
-	    be16_to_cpu(block->bb_numrecs) == 0 ||
-	    be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
-		return 0;
-
-	return 1;
-}
-
 #ifdef DEBUG
 STATIC struct xfs_buf *
 xfs_bmap_get_bp(
@@ -410,9 +386,6 @@ xfs_bmap_check_leaf_extents(
 				goto error_norelse;
 		}
 		block = XFS_BUF_TO_BLOCK(bp);
-		XFS_WANT_CORRUPTED_GOTO(
-			xfs_bmap_sanity_check(mp, bp, level),
-			error0);
 		if (level == 0)
 			break;
 
@@ -424,7 +397,8 @@ xfs_bmap_check_leaf_extents(
 		xfs_check_block(block, mp, 0, 0);
 		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
 		bno = be64_to_cpu(*pp);
-		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+		XFS_WANT_CORRUPTED_GOTO(mp,
+			XFS_FSB_SANITY_CHECK(mp, bno), error0);
 		if (bp_release) {
 			bp_release = 0;
 			xfs_trans_brelse(NULL, bp);
@@ -1029,7 +1003,7 @@ xfs_bmap_add_attrfork_btree(
 		if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
 			goto error0;
 		/* must be at least one entry */
-		XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
+		XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0);
 		if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
 			goto error0;
 		if (stat == 0) {
@@ -1311,14 +1285,12 @@ xfs_bmap_read_extents(
 		if (error)
 			return error;
 		block = XFS_BUF_TO_BLOCK(bp);
-		XFS_WANT_CORRUPTED_GOTO(
-			xfs_bmap_sanity_check(mp, bp, level),
-			error0);
 		if (level == 0)
 			break;
 		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
 		bno = be64_to_cpu(*pp);
-		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+		XFS_WANT_CORRUPTED_GOTO(mp,
+			XFS_FSB_SANITY_CHECK(mp, bno), error0);
 		xfs_trans_brelse(tp, bp);
 	}
 	/*
@@ -1345,9 +1317,6 @@ xfs_bmap_read_extents(
 					 XFS_ERRLEVEL_LOW, ip->i_mount, block);
 			goto error0;
 		}
-		XFS_WANT_CORRUPTED_GOTO(
-			xfs_bmap_sanity_check(mp, bp, 0),
-			error0);
 		/*
 		 * Read-ahead the next leaf block, if any.
 		 */
@@ -1755,7 +1724,9 @@ xfs_bmap_add_extent_delay_real(
 	xfs_filblks_t		temp=0;	/* value for da_new calculations */
 	xfs_filblks_t		temp2=0;/* value for da_new calculations */
 	int			tmp_rval;	/* partial logging flags */
+	struct xfs_mount	*mp;
 
+	mp = bma->tp ? bma->tp->t_mountp : NULL;
 	ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
 
 	ASSERT(bma->idx >= 0);
@@ -1866,15 +1837,15 @@ xfs_bmap_add_extent_delay_real(
 				RIGHT.br_blockcount, &i);
 		if (error)
 			goto done;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
 		error = xfs_btree_delete(bma->cur, &i);
 		if (error)
 			goto done;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
 		error = xfs_btree_decrement(bma->cur, 0, &i);
 		if (error)
 			goto done;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
 		error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
 				LEFT.br_startblock,
 				LEFT.br_blockcount +
@@ -1907,7 +1878,7 @@ xfs_bmap_add_extent_delay_real(
 					&i);
 			if (error)
 				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
 			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
 					LEFT.br_startblock,
 					LEFT.br_blockcount +
@@ -1938,7 +1909,7 @@ xfs_bmap_add_extent_delay_real(
 					RIGHT.br_blockcount, &i);
 			if (error)
 				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
 			error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
 					new->br_startblock,
 					PREV.br_blockcount +
@@ -1968,12 +1939,12 @@ xfs_bmap_add_extent_delay_real(
 					&i);
 			if (error)
 				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
 			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
 			error = xfs_btree_insert(bma->cur, &i);
 			if (error)
 				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
 		}
 		break;
 
@@ -2001,7 +1972,7 @@ xfs_bmap_add_extent_delay_real(
 					&i);
 			if (error)
 				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
 			error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
 					LEFT.br_startblock,
 					LEFT.br_blockcount +
@@ -2038,12 +2009,12 @@ xfs_bmap_add_extent_delay_real(
 					&i);
 			if (error)
 				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
 			bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
 			error = xfs_btree_insert(bma->cur, &i);
 			if (error)
 				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
 		}
 
 		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2084,7 +2055,7 @@ xfs_bmap_add_extent_delay_real(
2084 RIGHT.br_blockcount, &i); 2055 RIGHT.br_blockcount, &i);
2085 if (error) 2056 if (error)
2086 goto done; 2057 goto done;
2087 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2058 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2088 error = xfs_bmbt_update(bma->cur, new->br_startoff, 2059 error = xfs_bmbt_update(bma->cur, new->br_startoff,
2089 new->br_startblock, 2060 new->br_startblock,
2090 new->br_blockcount + 2061 new->br_blockcount +
@@ -2122,12 +2093,12 @@ xfs_bmap_add_extent_delay_real(
2122 &i); 2093 &i);
2123 if (error) 2094 if (error)
2124 goto done; 2095 goto done;
2125 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2096 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
2126 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; 2097 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
2127 error = xfs_btree_insert(bma->cur, &i); 2098 error = xfs_btree_insert(bma->cur, &i);
2128 if (error) 2099 if (error)
2129 goto done; 2100 goto done;
2130 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2101 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2131 } 2102 }
2132 2103
2133 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { 2104 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2191,12 +2162,12 @@ xfs_bmap_add_extent_delay_real(
2191 &i); 2162 &i);
2192 if (error) 2163 if (error)
2193 goto done; 2164 goto done;
2194 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2165 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
2195 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; 2166 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
2196 error = xfs_btree_insert(bma->cur, &i); 2167 error = xfs_btree_insert(bma->cur, &i);
2197 if (error) 2168 if (error)
2198 goto done; 2169 goto done;
2199 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2170 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2200 } 2171 }
2201 2172
2202 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { 2173 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2212,9 +2183,8 @@ xfs_bmap_add_extent_delay_real(
2212 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - 2183 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
2213 (bma->cur ? bma->cur->bc_private.b.allocated : 0)); 2184 (bma->cur ? bma->cur->bc_private.b.allocated : 0));
2214 if (diff > 0) { 2185 if (diff > 0) {
2215 error = xfs_icsb_modify_counters(bma->ip->i_mount, 2186 error = xfs_mod_fdblocks(bma->ip->i_mount,
2216 XFS_SBS_FDBLOCKS, 2187 -((int64_t)diff), false);
2217 -((int64_t)diff), 0);
2218 ASSERT(!error); 2188 ASSERT(!error);
2219 if (error) 2189 if (error)
2220 goto done; 2190 goto done;
@@ -2265,9 +2235,8 @@ xfs_bmap_add_extent_delay_real(
2265 temp += bma->cur->bc_private.b.allocated; 2235 temp += bma->cur->bc_private.b.allocated;
2266 ASSERT(temp <= da_old); 2236 ASSERT(temp <= da_old);
2267 if (temp < da_old) 2237 if (temp < da_old)
2268 xfs_icsb_modify_counters(bma->ip->i_mount, 2238 xfs_mod_fdblocks(bma->ip->i_mount,
2269 XFS_SBS_FDBLOCKS, 2239 (int64_t)(da_old - temp), false);
2270 (int64_t)(da_old - temp), 0);
2271 } 2240 }
2272 2241
2273 /* clear out the allocated field, done with it now in any case. */ 2242 /* clear out the allocated field, done with it now in any case. */
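The other recurring conversion swaps xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, delta, rsvd) for xfs_mod_fdblocks(mp, delta, rsvd), matching the move of the free-block count onto a generic per-cpu counter. A minimal sketch of such a helper, assuming an m_fdblocks percpu_counter in struct xfs_mount; the real version in xfs_mount.c additionally handles the reserve pool and batching near ENOSPC:

int
xfs_mod_fdblocks(
	struct xfs_mount	*mp,
	int64_t			delta,
	bool			rsvd)
{
	/* additions cannot fail, just fold them in */
	if (delta > 0) {
		percpu_counter_add(&mp->m_fdblocks, delta);
		return 0;
	}

	/* subtractions must not take the count below zero */
	percpu_counter_add(&mp->m_fdblocks, delta);
	if (percpu_counter_compare(&mp->m_fdblocks, 0) >= 0)
		return 0;

	/* back out; an rsvd caller would try the reserve pool first */
	percpu_counter_add(&mp->m_fdblocks, -delta);
	return -ENOSPC;
}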
@@ -2309,6 +2278,7 @@ xfs_bmap_add_extent_unwritten_real(
2309 /* left is 0, right is 1, prev is 2 */ 2278 /* left is 0, right is 1, prev is 2 */
2310 int rval=0; /* return value (logging flags) */ 2279 int rval=0; /* return value (logging flags) */
2311 int state = 0;/* state bits, accessed thru macros */ 2280 int state = 0;/* state bits, accessed thru macros */
2281 struct xfs_mount *mp = tp->t_mountp;
2312 2282
2313 *logflagsp = 0; 2283 *logflagsp = 0;
2314 2284
@@ -2421,19 +2391,19 @@ xfs_bmap_add_extent_unwritten_real(
2421 RIGHT.br_startblock, 2391 RIGHT.br_startblock,
2422 RIGHT.br_blockcount, &i))) 2392 RIGHT.br_blockcount, &i)))
2423 goto done; 2393 goto done;
2424 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2394 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2425 if ((error = xfs_btree_delete(cur, &i))) 2395 if ((error = xfs_btree_delete(cur, &i)))
2426 goto done; 2396 goto done;
2427 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2397 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2428 if ((error = xfs_btree_decrement(cur, 0, &i))) 2398 if ((error = xfs_btree_decrement(cur, 0, &i)))
2429 goto done; 2399 goto done;
2430 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2400 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2431 if ((error = xfs_btree_delete(cur, &i))) 2401 if ((error = xfs_btree_delete(cur, &i)))
2432 goto done; 2402 goto done;
2433 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2403 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2434 if ((error = xfs_btree_decrement(cur, 0, &i))) 2404 if ((error = xfs_btree_decrement(cur, 0, &i)))
2435 goto done; 2405 goto done;
2436 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2406 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2437 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 2407 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
2438 LEFT.br_startblock, 2408 LEFT.br_startblock,
2439 LEFT.br_blockcount + PREV.br_blockcount + 2409 LEFT.br_blockcount + PREV.br_blockcount +
@@ -2464,13 +2434,13 @@ xfs_bmap_add_extent_unwritten_real(
2464 PREV.br_startblock, PREV.br_blockcount, 2434 PREV.br_startblock, PREV.br_blockcount,
2465 &i))) 2435 &i)))
2466 goto done; 2436 goto done;
2467 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2437 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2468 if ((error = xfs_btree_delete(cur, &i))) 2438 if ((error = xfs_btree_delete(cur, &i)))
2469 goto done; 2439 goto done;
2470 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2440 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2471 if ((error = xfs_btree_decrement(cur, 0, &i))) 2441 if ((error = xfs_btree_decrement(cur, 0, &i)))
2472 goto done; 2442 goto done;
2473 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2443 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2474 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 2444 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
2475 LEFT.br_startblock, 2445 LEFT.br_startblock,
2476 LEFT.br_blockcount + PREV.br_blockcount, 2446 LEFT.br_blockcount + PREV.br_blockcount,
@@ -2499,13 +2469,13 @@ xfs_bmap_add_extent_unwritten_real(
2499 RIGHT.br_startblock, 2469 RIGHT.br_startblock,
2500 RIGHT.br_blockcount, &i))) 2470 RIGHT.br_blockcount, &i)))
2501 goto done; 2471 goto done;
2502 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2472 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2503 if ((error = xfs_btree_delete(cur, &i))) 2473 if ((error = xfs_btree_delete(cur, &i)))
2504 goto done; 2474 goto done;
2505 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2475 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2506 if ((error = xfs_btree_decrement(cur, 0, &i))) 2476 if ((error = xfs_btree_decrement(cur, 0, &i)))
2507 goto done; 2477 goto done;
2508 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2478 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2509 if ((error = xfs_bmbt_update(cur, new->br_startoff, 2479 if ((error = xfs_bmbt_update(cur, new->br_startoff,
2510 new->br_startblock, 2480 new->br_startblock,
2511 new->br_blockcount + RIGHT.br_blockcount, 2481 new->br_blockcount + RIGHT.br_blockcount,
@@ -2532,7 +2502,7 @@ xfs_bmap_add_extent_unwritten_real(
2532 new->br_startblock, new->br_blockcount, 2502 new->br_startblock, new->br_blockcount,
2533 &i))) 2503 &i)))
2534 goto done; 2504 goto done;
2535 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2505 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2536 if ((error = xfs_bmbt_update(cur, new->br_startoff, 2506 if ((error = xfs_bmbt_update(cur, new->br_startoff,
2537 new->br_startblock, new->br_blockcount, 2507 new->br_startblock, new->br_blockcount,
2538 newext))) 2508 newext)))
@@ -2569,7 +2539,7 @@ xfs_bmap_add_extent_unwritten_real(
2569 PREV.br_startblock, PREV.br_blockcount, 2539 PREV.br_startblock, PREV.br_blockcount,
2570 &i))) 2540 &i)))
2571 goto done; 2541 goto done;
2572 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2542 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2573 if ((error = xfs_bmbt_update(cur, 2543 if ((error = xfs_bmbt_update(cur,
2574 PREV.br_startoff + new->br_blockcount, 2544 PREV.br_startoff + new->br_blockcount,
2575 PREV.br_startblock + new->br_blockcount, 2545 PREV.br_startblock + new->br_blockcount,
@@ -2611,7 +2581,7 @@ xfs_bmap_add_extent_unwritten_real(
2611 PREV.br_startblock, PREV.br_blockcount, 2581 PREV.br_startblock, PREV.br_blockcount,
2612 &i))) 2582 &i)))
2613 goto done; 2583 goto done;
2614 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2584 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2615 if ((error = xfs_bmbt_update(cur, 2585 if ((error = xfs_bmbt_update(cur,
2616 PREV.br_startoff + new->br_blockcount, 2586 PREV.br_startoff + new->br_blockcount,
2617 PREV.br_startblock + new->br_blockcount, 2587 PREV.br_startblock + new->br_blockcount,
@@ -2621,7 +2591,7 @@ xfs_bmap_add_extent_unwritten_real(
2621 cur->bc_rec.b = *new; 2591 cur->bc_rec.b = *new;
2622 if ((error = xfs_btree_insert(cur, &i))) 2592 if ((error = xfs_btree_insert(cur, &i)))
2623 goto done; 2593 goto done;
2624 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2594 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2625 } 2595 }
2626 break; 2596 break;
2627 2597
@@ -2651,7 +2621,7 @@ xfs_bmap_add_extent_unwritten_real(
2651 PREV.br_startblock, 2621 PREV.br_startblock,
2652 PREV.br_blockcount, &i))) 2622 PREV.br_blockcount, &i)))
2653 goto done; 2623 goto done;
2654 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2624 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2655 if ((error = xfs_bmbt_update(cur, PREV.br_startoff, 2625 if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
2656 PREV.br_startblock, 2626 PREV.br_startblock,
2657 PREV.br_blockcount - new->br_blockcount, 2627 PREV.br_blockcount - new->br_blockcount,
@@ -2689,7 +2659,7 @@ xfs_bmap_add_extent_unwritten_real(
2689 PREV.br_startblock, PREV.br_blockcount, 2659 PREV.br_startblock, PREV.br_blockcount,
2690 &i))) 2660 &i)))
2691 goto done; 2661 goto done;
2692 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2662 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2693 if ((error = xfs_bmbt_update(cur, PREV.br_startoff, 2663 if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
2694 PREV.br_startblock, 2664 PREV.br_startblock,
2695 PREV.br_blockcount - new->br_blockcount, 2665 PREV.br_blockcount - new->br_blockcount,
@@ -2699,11 +2669,11 @@ xfs_bmap_add_extent_unwritten_real(
2699 new->br_startblock, new->br_blockcount, 2669 new->br_startblock, new->br_blockcount,
2700 &i))) 2670 &i)))
2701 goto done; 2671 goto done;
2702 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2672 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
2703 cur->bc_rec.b.br_state = XFS_EXT_NORM; 2673 cur->bc_rec.b.br_state = XFS_EXT_NORM;
2704 if ((error = xfs_btree_insert(cur, &i))) 2674 if ((error = xfs_btree_insert(cur, &i)))
2705 goto done; 2675 goto done;
2706 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2676 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2707 } 2677 }
2708 break; 2678 break;
2709 2679
@@ -2737,7 +2707,7 @@ xfs_bmap_add_extent_unwritten_real(
2737 PREV.br_startblock, PREV.br_blockcount, 2707 PREV.br_startblock, PREV.br_blockcount,
2738 &i))) 2708 &i)))
2739 goto done; 2709 goto done;
2740 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2710 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2741 /* new right extent - oldext */ 2711 /* new right extent - oldext */
2742 if ((error = xfs_bmbt_update(cur, r[1].br_startoff, 2712 if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
2743 r[1].br_startblock, r[1].br_blockcount, 2713 r[1].br_startblock, r[1].br_blockcount,
@@ -2749,7 +2719,7 @@ xfs_bmap_add_extent_unwritten_real(
2749 new->br_startoff - PREV.br_startoff; 2719 new->br_startoff - PREV.br_startoff;
2750 if ((error = xfs_btree_insert(cur, &i))) 2720 if ((error = xfs_btree_insert(cur, &i)))
2751 goto done; 2721 goto done;
2752 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2722 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2753 /* 2723 /*
2754 * Reset the cursor to the position of the new extent 2724 * Reset the cursor to the position of the new extent
2755 * we are about to insert as we can't trust it after 2725 * we are about to insert as we can't trust it after
@@ -2759,12 +2729,12 @@ xfs_bmap_add_extent_unwritten_real(
2759 new->br_startblock, new->br_blockcount, 2729 new->br_startblock, new->br_blockcount,
2760 &i))) 2730 &i)))
2761 goto done; 2731 goto done;
2762 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2732 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
2763 /* new middle extent - newext */ 2733 /* new middle extent - newext */
2764 cur->bc_rec.b.br_state = new->br_state; 2734 cur->bc_rec.b.br_state = new->br_state;
2765 if ((error = xfs_btree_insert(cur, &i))) 2735 if ((error = xfs_btree_insert(cur, &i)))
2766 goto done; 2736 goto done;
2767 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2737 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2768 } 2738 }
2769 break; 2739 break;
2770 2740
@@ -2944,8 +2914,8 @@ xfs_bmap_add_extent_hole_delay(
2944 } 2914 }
2945 if (oldlen != newlen) { 2915 if (oldlen != newlen) {
2946 ASSERT(oldlen > newlen); 2916 ASSERT(oldlen > newlen);
2947 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, 2917 xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen),
2948 (int64_t)(oldlen - newlen), 0); 2918 false);
2949 /* 2919 /*
2950 * Nothing to do for disk quota accounting here. 2920 * Nothing to do for disk quota accounting here.
2951 */ 2921 */
@@ -2968,7 +2938,9 @@ xfs_bmap_add_extent_hole_real(
2968 xfs_bmbt_irec_t right; /* right neighbor extent entry */ 2938 xfs_bmbt_irec_t right; /* right neighbor extent entry */
2969 int rval=0; /* return value (logging flags) */ 2939 int rval=0; /* return value (logging flags) */
2970 int state; /* state bits, accessed thru macros */ 2940 int state; /* state bits, accessed thru macros */
2941 struct xfs_mount *mp;
2971 2942
2943 mp = bma->tp ? bma->tp->t_mountp : NULL;
2972 ifp = XFS_IFORK_PTR(bma->ip, whichfork); 2944 ifp = XFS_IFORK_PTR(bma->ip, whichfork);
2973 2945
2974 ASSERT(bma->idx >= 0); 2946 ASSERT(bma->idx >= 0);
@@ -3056,15 +3028,15 @@ xfs_bmap_add_extent_hole_real(
3056 &i); 3028 &i);
3057 if (error) 3029 if (error)
3058 goto done; 3030 goto done;
3059 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3031 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3060 error = xfs_btree_delete(bma->cur, &i); 3032 error = xfs_btree_delete(bma->cur, &i);
3061 if (error) 3033 if (error)
3062 goto done; 3034 goto done;
3063 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3035 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3064 error = xfs_btree_decrement(bma->cur, 0, &i); 3036 error = xfs_btree_decrement(bma->cur, 0, &i);
3065 if (error) 3037 if (error)
3066 goto done; 3038 goto done;
3067 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3039 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3068 error = xfs_bmbt_update(bma->cur, left.br_startoff, 3040 error = xfs_bmbt_update(bma->cur, left.br_startoff,
3069 left.br_startblock, 3041 left.br_startblock,
3070 left.br_blockcount + 3042 left.br_blockcount +
@@ -3097,7 +3069,7 @@ xfs_bmap_add_extent_hole_real(
3097 &i); 3069 &i);
3098 if (error) 3070 if (error)
3099 goto done; 3071 goto done;
3100 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3072 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3101 error = xfs_bmbt_update(bma->cur, left.br_startoff, 3073 error = xfs_bmbt_update(bma->cur, left.br_startoff,
3102 left.br_startblock, 3074 left.br_startblock,
3103 left.br_blockcount + 3075 left.br_blockcount +
@@ -3131,7 +3103,7 @@ xfs_bmap_add_extent_hole_real(
3131 right.br_blockcount, &i); 3103 right.br_blockcount, &i);
3132 if (error) 3104 if (error)
3133 goto done; 3105 goto done;
3134 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3106 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3135 error = xfs_bmbt_update(bma->cur, new->br_startoff, 3107 error = xfs_bmbt_update(bma->cur, new->br_startoff,
3136 new->br_startblock, 3108 new->br_startblock,
3137 new->br_blockcount + 3109 new->br_blockcount +
@@ -3161,12 +3133,12 @@ xfs_bmap_add_extent_hole_real(
3161 new->br_blockcount, &i); 3133 new->br_blockcount, &i);
3162 if (error) 3134 if (error)
3163 goto done; 3135 goto done;
3164 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 3136 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
3165 bma->cur->bc_rec.b.br_state = new->br_state; 3137 bma->cur->bc_rec.b.br_state = new->br_state;
3166 error = xfs_btree_insert(bma->cur, &i); 3138 error = xfs_btree_insert(bma->cur, &i);
3167 if (error) 3139 if (error)
3168 goto done; 3140 goto done;
3169 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3141 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3170 } 3142 }
3171 break; 3143 break;
3172 } 3144 }
@@ -4160,18 +4132,15 @@ xfs_bmapi_reserve_delalloc(
4160 ASSERT(indlen > 0); 4132 ASSERT(indlen > 0);
4161 4133
4162 if (rt) { 4134 if (rt) {
4163 error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, 4135 error = xfs_mod_frextents(mp, -((int64_t)extsz));
4164 -((int64_t)extsz), 0);
4165 } else { 4136 } else {
4166 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 4137 error = xfs_mod_fdblocks(mp, -((int64_t)alen), false);
4167 -((int64_t)alen), 0);
4168 } 4138 }
4169 4139
4170 if (error) 4140 if (error)
4171 goto out_unreserve_quota; 4141 goto out_unreserve_quota;
4172 4142
4173 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 4143 error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false);
4174 -((int64_t)indlen), 0);
4175 if (error) 4144 if (error)
4176 goto out_unreserve_blocks; 4145 goto out_unreserve_blocks;
4177 4146
@@ -4198,9 +4167,9 @@ xfs_bmapi_reserve_delalloc(
4198 4167
4199out_unreserve_blocks: 4168out_unreserve_blocks:
4200 if (rt) 4169 if (rt)
4201 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0); 4170 xfs_mod_frextents(mp, extsz);
4202 else 4171 else
4203 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); 4172 xfs_mod_fdblocks(mp, alen, false);
4204out_unreserve_quota: 4173out_unreserve_quota:
4205 if (XFS_IS_QUOTA_ON(mp)) 4174 if (XFS_IS_QUOTA_ON(mp))
4206 xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? 4175 xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
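Taken together, the reservation path in xfs_bmapi_reserve_delalloc keeps its strict ordering: quota first, then the data space (realtime extents or free blocks), then the worst-case indirect blocks, with each failure unwinding exactly what was already taken. Schematically, a sketch of the control flow using only the calls visible above:

	error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
			rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
	if (error)
		return error;

	/* data space: realtime extents or ordinary free blocks */
	if (rt)
		error = xfs_mod_frextents(mp, -((int64_t)extsz));
	else
		error = xfs_mod_fdblocks(mp, -((int64_t)alen), false);
	if (error)
		goto out_unreserve_quota;

	/* worst-case bmbt blocks needed to map the new extent */
	error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false);
	if (error)
		goto out_unreserve_blocks;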
@@ -4801,7 +4770,7 @@ xfs_bmap_del_extent(
4801 got.br_startblock, got.br_blockcount, 4770 got.br_startblock, got.br_blockcount,
4802 &i))) 4771 &i)))
4803 goto done; 4772 goto done;
4804 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 4773 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
4805 } 4774 }
4806 da_old = da_new = 0; 4775 da_old = da_new = 0;
4807 } else { 4776 } else {
@@ -4835,7 +4804,7 @@ xfs_bmap_del_extent(
4835 } 4804 }
4836 if ((error = xfs_btree_delete(cur, &i))) 4805 if ((error = xfs_btree_delete(cur, &i)))
4837 goto done; 4806 goto done;
4838 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 4807 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
4839 break; 4808 break;
4840 4809
4841 case 2: 4810 case 2:
@@ -4935,7 +4904,8 @@ xfs_bmap_del_extent(
4935 got.br_startblock, 4904 got.br_startblock,
4936 temp, &i))) 4905 temp, &i)))
4937 goto done; 4906 goto done;
4938 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 4907 XFS_WANT_CORRUPTED_GOTO(mp,
4908 i == 1, done);
4939 /* 4909 /*
4940 * Update the btree record back 4910 * Update the btree record back
4941 * to the original value. 4911 * to the original value.
@@ -4956,7 +4926,7 @@ xfs_bmap_del_extent(
4956 error = -ENOSPC; 4926 error = -ENOSPC;
4957 goto done; 4927 goto done;
4958 } 4928 }
4959 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 4929 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
4960 } else 4930 } else
4961 flags |= xfs_ilog_fext(whichfork); 4931 flags |= xfs_ilog_fext(whichfork);
4962 XFS_IFORK_NEXT_SET(ip, whichfork, 4932 XFS_IFORK_NEXT_SET(ip, whichfork,
@@ -5012,10 +4982,8 @@ xfs_bmap_del_extent(
5012 * Nothing to do for disk quota accounting here. 4982 * Nothing to do for disk quota accounting here.
5013 */ 4983 */
5014 ASSERT(da_old >= da_new); 4984 ASSERT(da_old >= da_new);
5015 if (da_old > da_new) { 4985 if (da_old > da_new)
5016 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 4986 xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false);
5017 (int64_t)(da_old - da_new), 0);
5018 }
5019done: 4987done:
5020 *logflagsp = flags; 4988 *logflagsp = flags;
5021 return error; 4989 return error;
@@ -5284,14 +5252,13 @@ xfs_bunmapi(
5284 5252
5285 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); 5253 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
5286 do_div(rtexts, mp->m_sb.sb_rextsize); 5254 do_div(rtexts, mp->m_sb.sb_rextsize);
5287 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, 5255 xfs_mod_frextents(mp, (int64_t)rtexts);
5288 (int64_t)rtexts, 0);
5289 (void)xfs_trans_reserve_quota_nblks(NULL, 5256 (void)xfs_trans_reserve_quota_nblks(NULL,
5290 ip, -((long)del.br_blockcount), 0, 5257 ip, -((long)del.br_blockcount), 0,
5291 XFS_QMOPT_RES_RTBLKS); 5258 XFS_QMOPT_RES_RTBLKS);
5292 } else { 5259 } else {
5293 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 5260 xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount,
5294 (int64_t)del.br_blockcount, 0); 5261 false);
5295 (void)xfs_trans_reserve_quota_nblks(NULL, 5262 (void)xfs_trans_reserve_quota_nblks(NULL,
5296 ip, -((long)del.br_blockcount), 0, 5263 ip, -((long)del.br_blockcount), 0,
5297 XFS_QMOPT_RES_REGBLKS); 5264 XFS_QMOPT_RES_REGBLKS);
@@ -5453,6 +5420,7 @@ xfs_bmse_merge(
5453 struct xfs_bmbt_irec left; 5420 struct xfs_bmbt_irec left;
5454 xfs_filblks_t blockcount; 5421 xfs_filblks_t blockcount;
5455 int error, i; 5422 int error, i;
5423 struct xfs_mount *mp = ip->i_mount;
5456 5424
5457 xfs_bmbt_get_all(gotp, &got); 5425 xfs_bmbt_get_all(gotp, &got);
5458 xfs_bmbt_get_all(leftp, &left); 5426 xfs_bmbt_get_all(leftp, &left);
@@ -5487,19 +5455,19 @@ xfs_bmse_merge(
5487 got.br_blockcount, &i); 5455 got.br_blockcount, &i);
5488 if (error) 5456 if (error)
5489 return error; 5457 return error;
5490 XFS_WANT_CORRUPTED_RETURN(i == 1); 5458 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
5491 5459
5492 error = xfs_btree_delete(cur, &i); 5460 error = xfs_btree_delete(cur, &i);
5493 if (error) 5461 if (error)
5494 return error; 5462 return error;
5495 XFS_WANT_CORRUPTED_RETURN(i == 1); 5463 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
5496 5464
5497 /* lookup and update size of the previous extent */ 5465 /* lookup and update size of the previous extent */
5498 error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, 5466 error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
5499 left.br_blockcount, &i); 5467 left.br_blockcount, &i);
5500 if (error) 5468 if (error)
5501 return error; 5469 return error;
5502 XFS_WANT_CORRUPTED_RETURN(i == 1); 5470 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
5503 5471
5504 left.br_blockcount = blockcount; 5472 left.br_blockcount = blockcount;
5505 5473
@@ -5518,50 +5486,92 @@ xfs_bmse_shift_one(
5518 int *current_ext, 5486 int *current_ext,
5519 struct xfs_bmbt_rec_host *gotp, 5487 struct xfs_bmbt_rec_host *gotp,
5520 struct xfs_btree_cur *cur, 5488 struct xfs_btree_cur *cur,
5521 int *logflags) 5489 int *logflags,
5490 enum shift_direction direction)
5522{ 5491{
5523 struct xfs_ifork *ifp; 5492 struct xfs_ifork *ifp;
5493 struct xfs_mount *mp;
5524 xfs_fileoff_t startoff; 5494 xfs_fileoff_t startoff;
5525 struct xfs_bmbt_rec_host *leftp; 5495 struct xfs_bmbt_rec_host *adj_irecp;
5526 struct xfs_bmbt_irec got; 5496 struct xfs_bmbt_irec got;
5527 struct xfs_bmbt_irec left; 5497 struct xfs_bmbt_irec adj_irec;
5528 int error; 5498 int error;
5529 int i; 5499 int i;
5500 int total_extents;
5530 5501
5502 mp = ip->i_mount;
5531 ifp = XFS_IFORK_PTR(ip, whichfork); 5503 ifp = XFS_IFORK_PTR(ip, whichfork);
5504 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
5532 5505
5533 xfs_bmbt_get_all(gotp, &got); 5506 xfs_bmbt_get_all(gotp, &got);
5534 startoff = got.br_startoff - offset_shift_fsb;
5535 5507
5536 /* delalloc extents should be prevented by caller */ 5508 /* delalloc extents should be prevented by caller */
5537 XFS_WANT_CORRUPTED_RETURN(!isnullstartblock(got.br_startblock)); 5509 XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock));
5538 5510
5539 /* 5511 if (direction == SHIFT_LEFT) {
5540 * Check for merge if we've got an extent to the left, otherwise make 5512 startoff = got.br_startoff - offset_shift_fsb;
5541 * sure there's enough room at the start of the file for the shift. 5513
5542 */ 5514 /*
5543 if (*current_ext) { 5515 * Check for merge if we've got an extent to the left,
5544 /* grab the left extent and check for a large enough hole */ 5516 * otherwise make sure there's enough room at the start
5545 leftp = xfs_iext_get_ext(ifp, *current_ext - 1); 5517 * of the file for the shift.
5546 xfs_bmbt_get_all(leftp, &left); 5518 */
5519 if (!*current_ext) {
5520 if (got.br_startoff < offset_shift_fsb)
5521 return -EINVAL;
5522 goto update_current_ext;
5523 }
5524 /*
5525 * grab the left extent and check for a large
5526 * enough hole.
5527 */
5528 adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1);
5529 xfs_bmbt_get_all(adj_irecp, &adj_irec);
5547 5530
5548 if (startoff < left.br_startoff + left.br_blockcount) 5531 if (startoff <
5532 adj_irec.br_startoff + adj_irec.br_blockcount)
5549 return -EINVAL; 5533 return -EINVAL;
5550 5534
5551 /* check whether to merge the extent or shift it down */ 5535 /* check whether to merge the extent or shift it down */
5552 if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) { 5536 if (xfs_bmse_can_merge(&adj_irec, &got,
5537 offset_shift_fsb)) {
5553 return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, 5538 return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
5554 *current_ext, gotp, leftp, cur, 5539 *current_ext, gotp, adj_irecp,
5555 logflags); 5540 cur, logflags);
5556 } 5541 }
5557 } else if (got.br_startoff < offset_shift_fsb) 5542 } else {
5558 return -EINVAL; 5543 startoff = got.br_startoff + offset_shift_fsb;
5559 5544 /* nothing to move if this is the last extent */
5545 if (*current_ext >= (total_extents - 1))
5546 goto update_current_ext;
5547 /*
5548 * If this is not the last extent in the file, make sure there
 5549 * is enough room between the current extent and the next
 5550 * extent to accommodate the shift.
5551 */
5552 adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1);
5553 xfs_bmbt_get_all(adj_irecp, &adj_irec);
5554 if (startoff + got.br_blockcount > adj_irec.br_startoff)
5555 return -EINVAL;
5556 /*
5557 * Unlike a left shift (which involves a hole punch),
5558 * a right shift does not modify extent neighbors
5559 * in any way. We should never find mergeable extents
 5560 * in this scenario. Check anyway and warn if we
5561 * encounter two extents that could be one.
5562 */
5563 if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb))
5564 WARN_ON_ONCE(1);
5565 }
5560 /* 5566 /*
5561 * Increment the extent index for the next iteration, update the start 5567 * Increment the extent index for the next iteration, update the start
5562 * offset of the in-core extent and update the btree if applicable. 5568 * offset of the in-core extent and update the btree if applicable.
5563 */ 5569 */
5564 (*current_ext)++; 5570update_current_ext:
5571 if (direction == SHIFT_LEFT)
5572 (*current_ext)++;
5573 else
5574 (*current_ext)--;
5565 xfs_bmbt_set_startoff(gotp, startoff); 5575 xfs_bmbt_set_startoff(gotp, startoff);
5566 *logflags |= XFS_ILOG_CORE; 5576 *logflags |= XFS_ILOG_CORE;
5567 if (!cur) { 5577 if (!cur) {
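The new right-shift branch only has to check that the shifted extent does not run into its right-hand neighbour. Pulled out of context, the test reduces to one comparison; a hypothetical helper (not part of the patch) using the usual xfs_bmbt_irec fields:

static bool
xfs_bmse_right_shift_overlaps(
	struct xfs_bmbt_irec	*got,	/* extent being shifted */
	struct xfs_bmbt_irec	*next,	/* its right-hand neighbour */
	xfs_fileoff_t		shift)	/* offset_shift_fsb */
{
	xfs_fileoff_t	new_startoff = got->br_startoff + shift;

	/* the shifted extent must end at or before the next one begins */
	return new_startoff + got->br_blockcount > next->br_startoff;
}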
@@ -5573,18 +5583,18 @@ xfs_bmse_shift_one(
5573 got.br_blockcount, &i); 5583 got.br_blockcount, &i);
5574 if (error) 5584 if (error)
5575 return error; 5585 return error;
5576 XFS_WANT_CORRUPTED_RETURN(i == 1); 5586 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
5577 5587
5578 got.br_startoff = startoff; 5588 got.br_startoff = startoff;
5579 return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, 5589 return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
5580 got.br_blockcount, got.br_state); 5590 got.br_blockcount, got.br_state);
5581} 5591}
5582 5592
5583/* 5593/*
5584 * Shift extent records to the left to cover a hole. 5594 * Shift extent records to the left/right to cover/create a hole.
5585 * 5595 *
5586 * The maximum number of extents to be shifted in a single operation is 5596 * The maximum number of extents to be shifted in a single operation is
 5587 * @num_exts. @start_fsb specifies the file offset to start the shift and the 5597 * @num_exts. @stop_fsb specifies the file offset at which to stop the shift and the
5588 * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb 5598 * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb
5589 * is the length by which each extent is shifted. If there is no hole to shift 5599 * is the length by which each extent is shifted. If there is no hole to shift
5590 * the extents into, this will be considered invalid operation and we abort 5600 * the extents into, this will be considered invalid operation and we abort
@@ -5594,12 +5604,13 @@ int
5594xfs_bmap_shift_extents( 5604xfs_bmap_shift_extents(
5595 struct xfs_trans *tp, 5605 struct xfs_trans *tp,
5596 struct xfs_inode *ip, 5606 struct xfs_inode *ip,
5597 xfs_fileoff_t start_fsb, 5607 xfs_fileoff_t *next_fsb,
5598 xfs_fileoff_t offset_shift_fsb, 5608 xfs_fileoff_t offset_shift_fsb,
5599 int *done, 5609 int *done,
5600 xfs_fileoff_t *next_fsb, 5610 xfs_fileoff_t stop_fsb,
5601 xfs_fsblock_t *firstblock, 5611 xfs_fsblock_t *firstblock,
5602 struct xfs_bmap_free *flist, 5612 struct xfs_bmap_free *flist,
5613 enum shift_direction direction,
5603 int num_exts) 5614 int num_exts)
5604{ 5615{
5605 struct xfs_btree_cur *cur = NULL; 5616 struct xfs_btree_cur *cur = NULL;
@@ -5609,10 +5620,11 @@ xfs_bmap_shift_extents(
5609 struct xfs_ifork *ifp; 5620 struct xfs_ifork *ifp;
5610 xfs_extnum_t nexts = 0; 5621 xfs_extnum_t nexts = 0;
5611 xfs_extnum_t current_ext; 5622 xfs_extnum_t current_ext;
5623 xfs_extnum_t total_extents;
5624 xfs_extnum_t stop_extent;
5612 int error = 0; 5625 int error = 0;
5613 int whichfork = XFS_DATA_FORK; 5626 int whichfork = XFS_DATA_FORK;
5614 int logflags = 0; 5627 int logflags = 0;
5615 int total_extents;
5616 5628
5617 if (unlikely(XFS_TEST_ERROR( 5629 if (unlikely(XFS_TEST_ERROR(
5618 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 5630 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5628,6 +5640,8 @@ xfs_bmap_shift_extents(
5628 5640
5629 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 5641 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
5630 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 5642 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
5643 ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
5644 ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT);
5631 5645
5632 ifp = XFS_IFORK_PTR(ip, whichfork); 5646 ifp = XFS_IFORK_PTR(ip, whichfork);
5633 if (!(ifp->if_flags & XFS_IFEXTENTS)) { 5647 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -5645,43 +5659,83 @@ xfs_bmap_shift_extents(
5645 } 5659 }
5646 5660
5647 /* 5661 /*
5662 * There may be delalloc extents in the data fork before the range we
5663 * are collapsing out, so we cannot use the count of real extents here.
5664 * Instead we have to calculate it from the incore fork.
5665 */
5666 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
5667 if (total_extents == 0) {
5668 *done = 1;
5669 goto del_cursor;
5670 }
5671
5672 /*
 5673 * On the first right-shift call, we need to initialize next_fsb
5674 */
5675 if (*next_fsb == NULLFSBLOCK) {
5676 gotp = xfs_iext_get_ext(ifp, total_extents - 1);
5677 xfs_bmbt_get_all(gotp, &got);
5678 *next_fsb = got.br_startoff;
5679 if (stop_fsb > *next_fsb) {
5680 *done = 1;
5681 goto del_cursor;
5682 }
5683 }
5684
 5685 /* Look up the extent index at which we have to stop */
5686 if (direction == SHIFT_RIGHT) {
5687 gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent);
5688 /* Make stop_extent exclusive of shift range */
5689 stop_extent--;
5690 } else
5691 stop_extent = total_extents;
5692
5693 /*
5648 * Look up the extent index for the fsb where we start shifting. We can 5694 * Look up the extent index for the fsb where we start shifting. We can
5649 * henceforth iterate with current_ext as extent list changes are locked 5695 * henceforth iterate with current_ext as extent list changes are locked
5650 * out via ilock. 5696 * out via ilock.
5651 * 5697 *
5652 * gotp can be null in 2 cases: 1) if there are no extents or 2) 5698 * gotp can be null in 2 cases: 1) if there are no extents or 2)
5653 * start_fsb lies in a hole beyond which there are no extents. Either 5699 * *next_fsb lies in a hole beyond which there are no extents. Either
5654 * way, we are done. 5700 * way, we are done.
5655 */ 5701 */
5656 gotp = xfs_iext_bno_to_ext(ifp, start_fsb, &current_ext); 5702 gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, &current_ext);
5657 if (!gotp) { 5703 if (!gotp) {
5658 *done = 1; 5704 *done = 1;
5659 goto del_cursor; 5705 goto del_cursor;
5660 } 5706 }
5661 5707
5662 /* 5708 /* some sanity checking before we finally start shifting extents */
5663 * There may be delalloc extents in the data fork before the range we 5709 if ((direction == SHIFT_LEFT && current_ext >= stop_extent) ||
5664 * are collapsing out, so we cannot use the count of real extents here. 5710 (direction == SHIFT_RIGHT && current_ext <= stop_extent)) {
5665 * Instead we have to calculate it from the incore fork. 5711 error = -EIO;
5666 */ 5712 goto del_cursor;
5667 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); 5713 }
5668 while (nexts++ < num_exts && current_ext < total_extents) { 5714
5715 while (nexts++ < num_exts) {
5669 error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, 5716 error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
5670 &current_ext, gotp, cur, &logflags); 5717 &current_ext, gotp, cur, &logflags,
5718 direction);
5671 if (error) 5719 if (error)
5672 goto del_cursor; 5720 goto del_cursor;
5721 /*
5722 * If there was an extent merge during the shift, the extent
 5723 * count can change. Update the total and grab the next record.
5724 */
5725 if (direction == SHIFT_LEFT) {
5726 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
5727 stop_extent = total_extents;
5728 }
5673 5729
5674 /* update total extent count and grab the next record */ 5730 if (current_ext == stop_extent) {
5675 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); 5731 *done = 1;
5676 if (current_ext >= total_extents) 5732 *next_fsb = NULLFSBLOCK;
5677 break; 5733 break;
5734 }
5678 gotp = xfs_iext_get_ext(ifp, current_ext); 5735 gotp = xfs_iext_get_ext(ifp, current_ext);
5679 } 5736 }
5680 5737
5681 /* Check if we are done */ 5738 if (!*done) {
5682 if (current_ext == total_extents) {
5683 *done = 1;
5684 } else if (next_fsb) {
5685 xfs_bmbt_get_all(gotp, &got); 5739 xfs_bmbt_get_all(gotp, &got);
5686 *next_fsb = got.br_startoff; 5740 *next_fsb = got.br_startoff;
5687 } 5741 }
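The two shift directions map onto the two range operations fallocate(2) builds on this helper: FALLOC_FL_COLLAPSE_RANGE shifts the tail of the file left over a punched hole, and the new FALLOC_FL_INSERT_RANGE shifts it right to open a hole up. A userspace sketch, assuming the new flag is present in the installed uapi headers:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

/*
 * Open a gap of len bytes at offset: everything from offset onwards is
 * shifted right (the SHIFT_RIGHT path above). Both offset and len must
 * be aligned to the filesystem block size.
 */
static int insert_range(int fd, off_t offset, off_t len)
{
	return fallocate(fd, FALLOC_FL_INSERT_RANGE, offset, len);
}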
@@ -5696,3 +5750,189 @@ del_cursor:
5696 5750
5697 return error; 5751 return error;
5698} 5752}
5753
5754/*
 5755 * Splits an extent into two extents at the split_fsb block such that
 5756 * split_fsb becomes the first block of the new extent. @current_ext is
 5757 * the target extent to be split. If split_fsb lies in a hole or on the
 5758 * first block of an extent, just return 0.
5759 */
5760STATIC int
5761xfs_bmap_split_extent_at(
5762 struct xfs_trans *tp,
5763 struct xfs_inode *ip,
5764 xfs_fileoff_t split_fsb,
5765 xfs_fsblock_t *firstfsb,
5766 struct xfs_bmap_free *free_list)
5767{
5768 int whichfork = XFS_DATA_FORK;
5769 struct xfs_btree_cur *cur = NULL;
5770 struct xfs_bmbt_rec_host *gotp;
5771 struct xfs_bmbt_irec got;
5772 struct xfs_bmbt_irec new; /* split extent */
5773 struct xfs_mount *mp = ip->i_mount;
5774 struct xfs_ifork *ifp;
5775 xfs_fsblock_t gotblkcnt; /* new block count for got */
5776 xfs_extnum_t current_ext;
5777 int error = 0;
5778 int logflags = 0;
5779 int i = 0;
5780
5781 if (unlikely(XFS_TEST_ERROR(
5782 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
5783 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
5784 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
5785 XFS_ERROR_REPORT("xfs_bmap_split_extent_at",
5786 XFS_ERRLEVEL_LOW, mp);
5787 return -EFSCORRUPTED;
5788 }
5789
5790 if (XFS_FORCED_SHUTDOWN(mp))
5791 return -EIO;
5792
5793 ifp = XFS_IFORK_PTR(ip, whichfork);
5794 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
5795 /* Read in all the extents */
5796 error = xfs_iread_extents(tp, ip, whichfork);
5797 if (error)
5798 return error;
5799 }
5800
5801 /*
5802 * gotp can be null in 2 cases: 1) if there are no extents
5803 * or 2) split_fsb lies in a hole beyond which there are
5804 * no extents. Either way, we are done.
5805 */
5806 gotp = xfs_iext_bno_to_ext(ifp, split_fsb, &current_ext);
5807 if (!gotp)
5808 return 0;
5809
5810 xfs_bmbt_get_all(gotp, &got);
5811
5812 /*
 5813 * Check whether split_fsb lies in a hole or at the start boundary
 5814 * offset of the extent.
5815 */
5816 if (got.br_startoff >= split_fsb)
5817 return 0;
5818
5819 gotblkcnt = split_fsb - got.br_startoff;
5820 new.br_startoff = split_fsb;
5821 new.br_startblock = got.br_startblock + gotblkcnt;
5822 new.br_blockcount = got.br_blockcount - gotblkcnt;
5823 new.br_state = got.br_state;
5824
5825 if (ifp->if_flags & XFS_IFBROOT) {
5826 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
5827 cur->bc_private.b.firstblock = *firstfsb;
5828 cur->bc_private.b.flist = free_list;
5829 cur->bc_private.b.flags = 0;
5830 error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
5831 got.br_startblock,
5832 got.br_blockcount,
5833 &i);
5834 if (error)
5835 goto del_cursor;
5836 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
5837 }
5838
5839 xfs_bmbt_set_blockcount(gotp, gotblkcnt);
5840 got.br_blockcount = gotblkcnt;
5841
5842 logflags = XFS_ILOG_CORE;
5843 if (cur) {
5844 error = xfs_bmbt_update(cur, got.br_startoff,
5845 got.br_startblock,
5846 got.br_blockcount,
5847 got.br_state);
5848 if (error)
5849 goto del_cursor;
5850 } else
5851 logflags |= XFS_ILOG_DEXT;
5852
5853 /* Add new extent */
5854 current_ext++;
5855 xfs_iext_insert(ip, current_ext, 1, &new, 0);
5856 XFS_IFORK_NEXT_SET(ip, whichfork,
5857 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
5858
5859 if (cur) {
5860 error = xfs_bmbt_lookup_eq(cur, new.br_startoff,
5861 new.br_startblock, new.br_blockcount,
5862 &i);
5863 if (error)
5864 goto del_cursor;
5865 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor);
5866 cur->bc_rec.b.br_state = new.br_state;
5867
5868 error = xfs_btree_insert(cur, &i);
5869 if (error)
5870 goto del_cursor;
5871 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
5872 }
5873
5874 /*
5875 * Convert to a btree if necessary.
5876 */
5877 if (xfs_bmap_needs_btree(ip, whichfork)) {
5878 int tmp_logflags; /* partial log flag return val */
5879
5880 ASSERT(cur == NULL);
5881 error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list,
5882 &cur, 0, &tmp_logflags, whichfork);
5883 logflags |= tmp_logflags;
5884 }
5885
5886del_cursor:
5887 if (cur) {
5888 cur->bc_private.b.allocated = 0;
5889 xfs_btree_del_cursor(cur,
5890 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
5891 }
5892
5893 if (logflags)
5894 xfs_trans_log_inode(tp, ip, logflags);
5895 return error;
5896}
5897
5898int
5899xfs_bmap_split_extent(
5900 struct xfs_inode *ip,
5901 xfs_fileoff_t split_fsb)
5902{
5903 struct xfs_mount *mp = ip->i_mount;
5904 struct xfs_trans *tp;
5905 struct xfs_bmap_free free_list;
5906 xfs_fsblock_t firstfsb;
5907 int committed;
5908 int error;
5909
5910 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
5911 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
5912 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
5913 if (error) {
5914 xfs_trans_cancel(tp, 0);
5915 return error;
5916 }
5917
5918 xfs_ilock(ip, XFS_ILOCK_EXCL);
5919 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
5920
5921 xfs_bmap_init(&free_list, &firstfsb);
5922
5923 error = xfs_bmap_split_extent_at(tp, ip, split_fsb,
5924 &firstfsb, &free_list);
5925 if (error)
5926 goto out;
5927
5928 error = xfs_bmap_finish(&tp, &free_list, &committed);
5929 if (error)
5930 goto out;
5931
5932 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
5933
5934
5935out:
5936 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
5937 return error;
5938}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b9d8a499d2c4..6aaa0c1c7200 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -166,6 +166,11 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
166 */ 166 */
167#define XFS_BMAP_MAX_SHIFT_EXTENTS 1 167#define XFS_BMAP_MAX_SHIFT_EXTENTS 1
168 168
169enum shift_direction {
170 SHIFT_LEFT = 0,
171 SHIFT_RIGHT,
172};
173
169#ifdef DEBUG 174#ifdef DEBUG
170void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, 175void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
171 int whichfork, unsigned long caller_ip); 176 int whichfork, unsigned long caller_ip);
@@ -211,8 +216,10 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
211 xfs_extnum_t num); 216 xfs_extnum_t num);
212uint xfs_default_attroffset(struct xfs_inode *ip); 217uint xfs_default_attroffset(struct xfs_inode *ip);
213int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, 218int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
214 xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb, 219 xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
215 int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock, 220 int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
216 struct xfs_bmap_free *flist, int num_exts); 221 struct xfs_bmap_free *flist, enum shift_direction direction,
222 int num_exts);
223int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
217 224
218#endif /* __XFS_BMAP_H__ */ 225#endif /* __XFS_BMAP_H__ */
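With the reworked prototype, callers drive the shift incrementally: next_fsb is in/out state, stop_fsb bounds the walk and direction selects collapse versus insert semantics. A hedged sketch of the calling convention, loosely modelled on the xfs_bmap_util.c callers, with transaction setup and teardown elided:

	int		done = 0;
	xfs_fileoff_t	next_fsb = NULLFSBLOCK;	/* right shift: start at the last extent */
	xfs_fileoff_t	stop_fsb = XFS_B_TO_FSB(mp, offset);

	while (!done) {
		/* ... allocate and reserve a transaction, join the inode ... */
		error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
				&done, stop_fsb, &first_block, &free_list,
				SHIFT_RIGHT, XFS_BMAP_MAX_SHIFT_EXTENTS);
		if (error)
			break;
		/* ... finish the free list and commit the transaction ... */
	}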
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 81cad433df85..c72283dd8d44 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -168,7 +168,7 @@ xfs_btree_check_lptr(
168 xfs_fsblock_t bno, /* btree block disk address */ 168 xfs_fsblock_t bno, /* btree block disk address */
169 int level) /* btree block level */ 169 int level) /* btree block level */
170{ 170{
171 XFS_WANT_CORRUPTED_RETURN( 171 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
172 level > 0 && 172 level > 0 &&
173 bno != NULLFSBLOCK && 173 bno != NULLFSBLOCK &&
174 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); 174 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
@@ -187,7 +187,7 @@ xfs_btree_check_sptr(
187{ 187{
188 xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; 188 xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks;
189 189
190 XFS_WANT_CORRUPTED_RETURN( 190 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
191 level > 0 && 191 level > 0 &&
192 bno != NULLAGBLOCK && 192 bno != NULLAGBLOCK &&
193 bno != 0 && 193 bno != 0 &&
@@ -1825,7 +1825,7 @@ xfs_btree_lookup(
1825 error = xfs_btree_increment(cur, 0, &i); 1825 error = xfs_btree_increment(cur, 0, &i);
1826 if (error) 1826 if (error)
1827 goto error0; 1827 goto error0;
1828 XFS_WANT_CORRUPTED_RETURN(i == 1); 1828 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1829 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 1829 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1830 *stat = 1; 1830 *stat = 1;
1831 return 0; 1831 return 0;
@@ -2285,7 +2285,7 @@ xfs_btree_rshift(
2285 if (error) 2285 if (error)
2286 goto error0; 2286 goto error0;
2287 i = xfs_btree_lastrec(tcur, level); 2287 i = xfs_btree_lastrec(tcur, level);
2288 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 2288 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
2289 2289
2290 error = xfs_btree_increment(tcur, level, &i); 2290 error = xfs_btree_increment(tcur, level, &i);
2291 if (error) 2291 if (error)
@@ -3138,7 +3138,7 @@ xfs_btree_insert(
3138 goto error0; 3138 goto error0;
3139 } 3139 }
3140 3140
3141 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3141 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3142 level++; 3142 level++;
3143 3143
3144 /* 3144 /*
@@ -3582,15 +3582,15 @@ xfs_btree_delrec(
3582 * Actually any entry but the first would suffice. 3582 * Actually any entry but the first would suffice.
3583 */ 3583 */
3584 i = xfs_btree_lastrec(tcur, level); 3584 i = xfs_btree_lastrec(tcur, level);
3585 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3585 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3586 3586
3587 error = xfs_btree_increment(tcur, level, &i); 3587 error = xfs_btree_increment(tcur, level, &i);
3588 if (error) 3588 if (error)
3589 goto error0; 3589 goto error0;
3590 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3590 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3591 3591
3592 i = xfs_btree_lastrec(tcur, level); 3592 i = xfs_btree_lastrec(tcur, level);
3593 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3593 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3594 3594
3595 /* Grab a pointer to the block. */ 3595 /* Grab a pointer to the block. */
3596 right = xfs_btree_get_block(tcur, level, &rbp); 3596 right = xfs_btree_get_block(tcur, level, &rbp);
@@ -3634,12 +3634,12 @@ xfs_btree_delrec(
3634 rrecs = xfs_btree_get_numrecs(right); 3634 rrecs = xfs_btree_get_numrecs(right);
3635 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 3635 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3636 i = xfs_btree_firstrec(tcur, level); 3636 i = xfs_btree_firstrec(tcur, level);
3637 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3637 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3638 3638
3639 error = xfs_btree_decrement(tcur, level, &i); 3639 error = xfs_btree_decrement(tcur, level, &i);
3640 if (error) 3640 if (error)
3641 goto error0; 3641 goto error0;
3642 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3642 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3643 } 3643 }
3644 } 3644 }
3645 3645
@@ -3653,13 +3653,13 @@ xfs_btree_delrec(
3653 * previous block. 3653 * previous block.
3654 */ 3654 */
3655 i = xfs_btree_firstrec(tcur, level); 3655 i = xfs_btree_firstrec(tcur, level);
3656 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3656 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3657 3657
3658 error = xfs_btree_decrement(tcur, level, &i); 3658 error = xfs_btree_decrement(tcur, level, &i);
3659 if (error) 3659 if (error)
3660 goto error0; 3660 goto error0;
3661 i = xfs_btree_firstrec(tcur, level); 3661 i = xfs_btree_firstrec(tcur, level);
3662 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3662 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3663 3663
3664 /* Grab a pointer to the block. */ 3664 /* Grab a pointer to the block. */
3665 left = xfs_btree_get_block(tcur, level, &lbp); 3665 left = xfs_btree_get_block(tcur, level, &lbp);
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 9cb0115c6bd1..2385f8cd08ab 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -538,12 +538,12 @@ xfs_da3_root_split(
538 oldroot = blk1->bp->b_addr; 538 oldroot = blk1->bp->b_addr;
539 if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || 539 if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
540 oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { 540 oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
541 struct xfs_da3_icnode_hdr nodehdr; 541 struct xfs_da3_icnode_hdr icnodehdr;
542 542
543 dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot); 543 dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot);
544 btree = dp->d_ops->node_tree_p(oldroot); 544 btree = dp->d_ops->node_tree_p(oldroot);
545 size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot); 545 size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
546 level = nodehdr.level; 546 level = icnodehdr.level;
547 547
548 /* 548 /*
549 * we are about to copy oldroot to bp, so set up the type 549 * we are about to copy oldroot to bp, so set up the type
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 0a49b0286372..74bcbabfa523 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -725,7 +725,13 @@ struct xfs_attr3_icleaf_hdr {
725 __uint16_t magic; 725 __uint16_t magic;
726 __uint16_t count; 726 __uint16_t count;
727 __uint16_t usedbytes; 727 __uint16_t usedbytes;
728 __uint16_t firstused; 728 /*
729 * firstused is 32-bit here instead of 16-bit like the on-disk variant
730 * to support maximum fsb size of 64k without overflow issues throughout
731 * the attr code. Instead, the overflow condition is handled on
732 * conversion to/from disk.
733 */
734 __uint32_t firstused;
729 __u8 holes; 735 __u8 holes;
730 struct { 736 struct {
731 __uint16_t base; 737 __uint16_t base;
@@ -734,6 +740,12 @@ struct xfs_attr3_icleaf_hdr {
734}; 740};
735 741
736/* 742/*
 743 * Special value to represent the fs block size in the leaf header firstused field.
 744 * Only used when the block size overflows the 2 bytes available on disk.
745 */
746#define XFS_ATTR3_LEAF_NULLOFF 0
747
748/*
737 * Flags used in the leaf_entry[i].flags field. 749 * Flags used in the leaf_entry[i].flags field.
738 * NOTE: the INCOMPLETE bit must not collide with the flags bits specified 750 * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
739 * on the system call, they are "or"ed together for various operations. 751 * on the system call, they are "or"ed together for various operations.
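Because the incore field is now wider than the on-disk one, the header to/from-disk conversions must special-case an empty leaf on a 64k-block filesystem, where firstused equals the block size and no longer fits in 16 bits; XFS_ATTR3_LEAF_NULLOFF stands in for it on disk. A sketch of the conversion rule with hypothetical helper names; the real logic sits in the xfs_attr3_leaf header conversion routines in xfs_attr_leaf.c:

/* incore (32-bit) -> disk (16-bit) */
static __uint16_t
firstused_to_disk(struct xfs_mount *mp, __uint32_t firstused)
{
	if (firstused > USHRT_MAX) {
		/* only possible for an empty 64k block */
		ASSERT(firstused == mp->m_attr_geo->blksize);
		return XFS_ATTR3_LEAF_NULLOFF;
	}
	return firstused;
}

/* disk (16-bit) -> incore (32-bit) */
static __uint32_t
firstused_from_disk(struct xfs_mount *mp, __uint16_t firstused)
{
	if (firstused == XFS_ATTR3_LEAF_NULLOFF)
		return mp->m_attr_geo->blksize;
	return firstused;
}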
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 5ff31be9b1cd..de1ea16f5748 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -89,7 +89,7 @@ __xfs_dir3_data_check(
89 * so just ensure that the count falls somewhere inside the 89 * so just ensure that the count falls somewhere inside the
90 * block right now. 90 * block right now.
91 */ 91 */
92 XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) < 92 XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(btp->count) <
93 ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)); 93 ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
94 break; 94 break;
95 case cpu_to_be32(XFS_DIR3_DATA_MAGIC): 95 case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
@@ -107,21 +107,21 @@ __xfs_dir3_data_check(
107 bf = ops->data_bestfree_p(hdr); 107 bf = ops->data_bestfree_p(hdr);
108 count = lastfree = freeseen = 0; 108 count = lastfree = freeseen = 0;
109 if (!bf[0].length) { 109 if (!bf[0].length) {
110 XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); 110 XFS_WANT_CORRUPTED_RETURN(mp, !bf[0].offset);
111 freeseen |= 1 << 0; 111 freeseen |= 1 << 0;
112 } 112 }
113 if (!bf[1].length) { 113 if (!bf[1].length) {
114 XFS_WANT_CORRUPTED_RETURN(!bf[1].offset); 114 XFS_WANT_CORRUPTED_RETURN(mp, !bf[1].offset);
115 freeseen |= 1 << 1; 115 freeseen |= 1 << 1;
116 } 116 }
117 if (!bf[2].length) { 117 if (!bf[2].length) {
118 XFS_WANT_CORRUPTED_RETURN(!bf[2].offset); 118 XFS_WANT_CORRUPTED_RETURN(mp, !bf[2].offset);
119 freeseen |= 1 << 2; 119 freeseen |= 1 << 2;
120 } 120 }
121 121
122 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >= 122 XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[0].length) >=
123 be16_to_cpu(bf[1].length)); 123 be16_to_cpu(bf[1].length));
124 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >= 124 XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[1].length) >=
125 be16_to_cpu(bf[2].length)); 125 be16_to_cpu(bf[2].length));
126 /* 126 /*
127 * Loop over the data/unused entries. 127 * Loop over the data/unused entries.
@@ -134,18 +134,18 @@ __xfs_dir3_data_check(
134 * doesn't need to be there. 134 * doesn't need to be there.
135 */ 135 */
136 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { 136 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
137 XFS_WANT_CORRUPTED_RETURN(lastfree == 0); 137 XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0);
138 XFS_WANT_CORRUPTED_RETURN( 138 XFS_WANT_CORRUPTED_RETURN(mp,
139 be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == 139 be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
140 (char *)dup - (char *)hdr); 140 (char *)dup - (char *)hdr);
141 dfp = xfs_dir2_data_freefind(hdr, bf, dup); 141 dfp = xfs_dir2_data_freefind(hdr, bf, dup);
142 if (dfp) { 142 if (dfp) {
143 i = (int)(dfp - bf); 143 i = (int)(dfp - bf);
144 XFS_WANT_CORRUPTED_RETURN( 144 XFS_WANT_CORRUPTED_RETURN(mp,
145 (freeseen & (1 << i)) == 0); 145 (freeseen & (1 << i)) == 0);
146 freeseen |= 1 << i; 146 freeseen |= 1 << i;
147 } else { 147 } else {
148 XFS_WANT_CORRUPTED_RETURN( 148 XFS_WANT_CORRUPTED_RETURN(mp,
149 be16_to_cpu(dup->length) <= 149 be16_to_cpu(dup->length) <=
150 be16_to_cpu(bf[2].length)); 150 be16_to_cpu(bf[2].length));
151 } 151 }
@@ -160,13 +160,13 @@ __xfs_dir3_data_check(
160 * The linear search is crude but this is DEBUG code. 160 * The linear search is crude but this is DEBUG code.
161 */ 161 */
162 dep = (xfs_dir2_data_entry_t *)p; 162 dep = (xfs_dir2_data_entry_t *)p;
163 XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0); 163 XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0);
164 XFS_WANT_CORRUPTED_RETURN( 164 XFS_WANT_CORRUPTED_RETURN(mp,
165 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); 165 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
166 XFS_WANT_CORRUPTED_RETURN( 166 XFS_WANT_CORRUPTED_RETURN(mp,
167 be16_to_cpu(*ops->data_entry_tag_p(dep)) == 167 be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
168 (char *)dep - (char *)hdr); 168 (char *)dep - (char *)hdr);
169 XFS_WANT_CORRUPTED_RETURN( 169 XFS_WANT_CORRUPTED_RETURN(mp,
170 ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX); 170 ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
171 count++; 171 count++;
172 lastfree = 0; 172 lastfree = 0;
@@ -183,14 +183,15 @@ __xfs_dir3_data_check(
183 be32_to_cpu(lep[i].hashval) == hash) 183 be32_to_cpu(lep[i].hashval) == hash)
184 break; 184 break;
185 } 185 }
186 XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); 186 XFS_WANT_CORRUPTED_RETURN(mp,
187 i < be32_to_cpu(btp->count));
187 } 188 }
188 p += ops->data_entsize(dep->namelen); 189 p += ops->data_entsize(dep->namelen);
189 } 190 }
190 /* 191 /*
191 * Need to have seen all the entries and all the bestfree slots. 192 * Need to have seen all the entries and all the bestfree slots.
192 */ 193 */
193 XFS_WANT_CORRUPTED_RETURN(freeseen == 7); 194 XFS_WANT_CORRUPTED_RETURN(mp, freeseen == 7);
194 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || 195 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
195 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { 196 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
196 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { 197 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
@@ -198,13 +199,13 @@ __xfs_dir3_data_check(
198 cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) 199 cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
199 stale++; 200 stale++;
200 if (i > 0) 201 if (i > 0)
201 XFS_WANT_CORRUPTED_RETURN( 202 XFS_WANT_CORRUPTED_RETURN(mp,
202 be32_to_cpu(lep[i].hashval) >= 203 be32_to_cpu(lep[i].hashval) >=
203 be32_to_cpu(lep[i - 1].hashval)); 204 be32_to_cpu(lep[i - 1].hashval));
204 } 205 }
205 XFS_WANT_CORRUPTED_RETURN(count == 206 XFS_WANT_CORRUPTED_RETURN(mp, count ==
206 be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); 207 be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
207 XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale)); 208 XFS_WANT_CORRUPTED_RETURN(mp, stale == be32_to_cpu(btp->stale));
208 } 209 }
209 return 0; 210 return 0;
210} 211}
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 8eb718979383..4daaa662337b 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -264,68 +264,6 @@ typedef struct xfs_dsb {
264 /* must be padded to 64 bit alignment */ 264 /* must be padded to 64 bit alignment */
265} xfs_dsb_t; 265} xfs_dsb_t;
266 266
267/*
268 * Sequence number values for the fields.
269 */
270typedef enum {
271 XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
272 XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
273 XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
274 XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
275 XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
276 XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
277 XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
278 XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
279 XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
280 XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
281 XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
282 XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
283 XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
284 XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
285 XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
286 XFS_SBS_PQUOTINO, XFS_SBS_LSN,
287 XFS_SBS_FIELDCOUNT
288} xfs_sb_field_t;
289
290/*
291 * Mask values, defined based on the xfs_sb_field_t values.
292 * Only define the ones we're using.
293 */
294#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x)
295#define XFS_SB_UUID XFS_SB_MVAL(UUID)
296#define XFS_SB_FNAME XFS_SB_MVAL(FNAME)
297#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO)
298#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO)
299#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO)
300#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM)
301#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO)
302#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO)
303#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS)
304#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN)
305#define XFS_SB_UNIT XFS_SB_MVAL(UNIT)
306#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH)
307#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT)
308#define XFS_SB_IFREE XFS_SB_MVAL(IFREE)
309#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
310#define XFS_SB_FEATURES2 (XFS_SB_MVAL(FEATURES2) | \
311 XFS_SB_MVAL(BAD_FEATURES2))
312#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
313#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
314#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
315#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
316#define XFS_SB_CRC XFS_SB_MVAL(CRC)
317#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO)
318#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
319#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
320#define XFS_SB_MOD_BITS \
321 (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
322 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
323 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
324 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
325 XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \
326 XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \
327 XFS_SB_PQUOTINO)
328
329 267
330/* 268/*
331 * Misc. Flags - warning - these will be cleared by xfs_repair unless 269 * Misc. Flags - warning - these will be cleared by xfs_repair unless
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 116ef1ddb3e3..07349a183a11 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -376,7 +376,8 @@ xfs_ialloc_ag_alloc(
376 */ 376 */
377 newlen = args.mp->m_ialloc_inos; 377 newlen = args.mp->m_ialloc_inos;
378 if (args.mp->m_maxicount && 378 if (args.mp->m_maxicount &&
379 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) 379 percpu_counter_read(&args.mp->m_icount) + newlen >
380 args.mp->m_maxicount)
380 return -ENOSPC; 381 return -ENOSPC;
381 args.minlen = args.maxlen = args.mp->m_ialloc_blks; 382 args.minlen = args.maxlen = args.mp->m_ialloc_blks;
382 /* 383 /*
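
The replacement of mp->m_sb.sb_icount with percpu_counter_read() here is deliberate: the generic per-CPU counter gives a cheap but possibly slightly stale value, which is fine for this heuristic maxicount check, while percpu_counter_sum() (used when the superblock is logged, later in this series) folds in every CPU's delta for an exact answer. A rough user-space sketch of that trade-off, assuming a fixed CPU count and no real concurrency:

/* pcpu.c - sketch of the approximate-read vs exact-sum trade-off. */
#include <stdio.h>

#define NR_CPUS	4
#define BATCH	32	/* deltas smaller than this stay CPU-local */

struct pcpu_counter {
	long long count;		/* global total, updated in batches */
	long long local[NR_CPUS];	/* unsynchronised per-CPU deltas */
};

/* Cheap read: may be stale by up to NR_CPUS * BATCH. */
static long long pcpu_read(struct pcpu_counter *c)
{
	return c->count;
}

/* Exact read: folds in every per-CPU delta (costly on many CPUs). */
static long long pcpu_sum(struct pcpu_counter *c)
{
	long long sum = c->count;
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += c->local[cpu];
	return sum;
}

static void pcpu_add(struct pcpu_counter *c, int cpu, long long delta)
{
	c->local[cpu] += delta;
	if (c->local[cpu] >= BATCH || c->local[cpu] <= -BATCH) {
		c->count += c->local[cpu];	/* the kernel locks here */
		c->local[cpu] = 0;
	}
}

int main(void)
{
	struct pcpu_counter icount = { 0 };

	for (int i = 0; i < 10; i++)
		pcpu_add(&icount, i % NR_CPUS, 3);
	/* approx=0, exact=30: all deltas are still CPU-local. */
	printf("approx=%lld exact=%lld\n", pcpu_read(&icount), pcpu_sum(&icount));
	return 0;
}
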
@@ -700,7 +701,7 @@ xfs_ialloc_next_rec(
700 error = xfs_inobt_get_rec(cur, rec, &i); 701 error = xfs_inobt_get_rec(cur, rec, &i);
701 if (error) 702 if (error)
702 return error; 703 return error;
703 XFS_WANT_CORRUPTED_RETURN(i == 1); 704 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
704 } 705 }
705 706
706 return 0; 707 return 0;
@@ -724,7 +725,7 @@ xfs_ialloc_get_rec(
724 error = xfs_inobt_get_rec(cur, rec, &i); 725 error = xfs_inobt_get_rec(cur, rec, &i);
725 if (error) 726 if (error)
726 return error; 727 return error;
727 XFS_WANT_CORRUPTED_RETURN(i == 1); 728 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
728 } 729 }
729 730
730 return 0; 731 return 0;
@@ -783,12 +784,12 @@ xfs_dialloc_ag_inobt(
783 error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); 784 error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
784 if (error) 785 if (error)
785 goto error0; 786 goto error0;
786 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 787 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
787 788
788 error = xfs_inobt_get_rec(cur, &rec, &j); 789 error = xfs_inobt_get_rec(cur, &rec, &j);
789 if (error) 790 if (error)
790 goto error0; 791 goto error0;
791 XFS_WANT_CORRUPTED_GOTO(j == 1, error0); 792 XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0);
792 793
793 if (rec.ir_freecount > 0) { 794 if (rec.ir_freecount > 0) {
794 /* 795 /*
@@ -944,19 +945,19 @@ newino:
944 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); 945 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
945 if (error) 946 if (error)
946 goto error0; 947 goto error0;
947 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 948 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
948 949
949 for (;;) { 950 for (;;) {
950 error = xfs_inobt_get_rec(cur, &rec, &i); 951 error = xfs_inobt_get_rec(cur, &rec, &i);
951 if (error) 952 if (error)
952 goto error0; 953 goto error0;
953 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 954 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
954 if (rec.ir_freecount > 0) 955 if (rec.ir_freecount > 0)
955 break; 956 break;
956 error = xfs_btree_increment(cur, 0, &i); 957 error = xfs_btree_increment(cur, 0, &i);
957 if (error) 958 if (error)
958 goto error0; 959 goto error0;
959 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 960 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
960 } 961 }
961 962
962alloc_inode: 963alloc_inode:
@@ -1016,7 +1017,7 @@ xfs_dialloc_ag_finobt_near(
1016 error = xfs_inobt_get_rec(lcur, rec, &i); 1017 error = xfs_inobt_get_rec(lcur, rec, &i);
1017 if (error) 1018 if (error)
1018 return error; 1019 return error;
1019 XFS_WANT_CORRUPTED_RETURN(i == 1); 1020 XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1);
1020 1021
1021 /* 1022 /*
1022 * See if we've landed in the parent inode record. The finobt 1023 * See if we've landed in the parent inode record. The finobt
@@ -1039,10 +1040,10 @@ xfs_dialloc_ag_finobt_near(
1039 error = xfs_inobt_get_rec(rcur, &rrec, &j); 1040 error = xfs_inobt_get_rec(rcur, &rrec, &j);
1040 if (error) 1041 if (error)
1041 goto error_rcur; 1042 goto error_rcur;
1042 XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur); 1043 XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur);
1043 } 1044 }
1044 1045
1045 XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur); 1046 XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur);
1046 if (i == 1 && j == 1) { 1047 if (i == 1 && j == 1) {
1047 /* 1048 /*
1048 * Both the left and right records are valid. Choose the closer 1049 * Both the left and right records are valid. Choose the closer
@@ -1095,7 +1096,7 @@ xfs_dialloc_ag_finobt_newino(
1095 error = xfs_inobt_get_rec(cur, rec, &i); 1096 error = xfs_inobt_get_rec(cur, rec, &i);
1096 if (error) 1097 if (error)
1097 return error; 1098 return error;
1098 XFS_WANT_CORRUPTED_RETURN(i == 1); 1099 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1099 return 0; 1100 return 0;
1100 } 1101 }
1101 } 1102 }
@@ -1106,12 +1107,12 @@ xfs_dialloc_ag_finobt_newino(
1106 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); 1107 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
1107 if (error) 1108 if (error)
1108 return error; 1109 return error;
1109 XFS_WANT_CORRUPTED_RETURN(i == 1); 1110 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1110 1111
1111 error = xfs_inobt_get_rec(cur, rec, &i); 1112 error = xfs_inobt_get_rec(cur, rec, &i);
1112 if (error) 1113 if (error)
1113 return error; 1114 return error;
1114 XFS_WANT_CORRUPTED_RETURN(i == 1); 1115 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1115 1116
1116 return 0; 1117 return 0;
1117} 1118}
@@ -1133,19 +1134,19 @@ xfs_dialloc_ag_update_inobt(
1133 error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); 1134 error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
1134 if (error) 1135 if (error)
1135 return error; 1136 return error;
1136 XFS_WANT_CORRUPTED_RETURN(i == 1); 1137 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1137 1138
1138 error = xfs_inobt_get_rec(cur, &rec, &i); 1139 error = xfs_inobt_get_rec(cur, &rec, &i);
1139 if (error) 1140 if (error)
1140 return error; 1141 return error;
1141 XFS_WANT_CORRUPTED_RETURN(i == 1); 1142 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1142 ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % 1143 ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
1143 XFS_INODES_PER_CHUNK) == 0); 1144 XFS_INODES_PER_CHUNK) == 0);
1144 1145
1145 rec.ir_free &= ~XFS_INOBT_MASK(offset); 1146 rec.ir_free &= ~XFS_INOBT_MASK(offset);
1146 rec.ir_freecount--; 1147 rec.ir_freecount--;
1147 1148
1148 XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && 1149 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) &&
1149 (rec.ir_freecount == frec->ir_freecount)); 1150 (rec.ir_freecount == frec->ir_freecount));
1150 1151
1151 return xfs_inobt_update(cur, &rec); 1152 return xfs_inobt_update(cur, &rec);
@@ -1340,7 +1341,8 @@ xfs_dialloc(
1340 * inode. 1341 * inode.
1341 */ 1342 */
1342 if (mp->m_maxicount && 1343 if (mp->m_maxicount &&
1343 mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) { 1344 percpu_counter_read(&mp->m_icount) + mp->m_ialloc_inos >
1345 mp->m_maxicount) {
1344 noroom = 1; 1346 noroom = 1;
1345 okalloc = 0; 1347 okalloc = 0;
1346 } 1348 }
@@ -1475,14 +1477,14 @@ xfs_difree_inobt(
1475 __func__, error); 1477 __func__, error);
1476 goto error0; 1478 goto error0;
1477 } 1479 }
1478 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1480 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1479 error = xfs_inobt_get_rec(cur, &rec, &i); 1481 error = xfs_inobt_get_rec(cur, &rec, &i);
1480 if (error) { 1482 if (error) {
1481 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", 1483 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1482 __func__, error); 1484 __func__, error);
1483 goto error0; 1485 goto error0;
1484 } 1486 }
1485 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1487 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1486 /* 1488 /*
1487 * Get the offset in the inode chunk. 1489 * Get the offset in the inode chunk.
1488 */ 1490 */
@@ -1592,7 +1594,7 @@ xfs_difree_finobt(
1592 * freed an inode in a previously fully allocated chunk. If not, 1594 * freed an inode in a previously fully allocated chunk. If not,
1593 * something is out of sync. 1595 * something is out of sync.
1594 */ 1596 */
1595 XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error); 1597 XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
1596 1598
1597 error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, 1599 error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
1598 ibtrec->ir_free, &i); 1600 ibtrec->ir_free, &i);
@@ -1613,12 +1615,12 @@ xfs_difree_finobt(
1613 error = xfs_inobt_get_rec(cur, &rec, &i); 1615 error = xfs_inobt_get_rec(cur, &rec, &i);
1614 if (error) 1616 if (error)
1615 goto error; 1617 goto error;
1616 XFS_WANT_CORRUPTED_GOTO(i == 1, error); 1618 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
1617 1619
1618 rec.ir_free |= XFS_INOBT_MASK(offset); 1620 rec.ir_free |= XFS_INOBT_MASK(offset);
1619 rec.ir_freecount++; 1621 rec.ir_freecount++;
1620 1622
1621 XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) && 1623 XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) &&
1622 (rec.ir_freecount == ibtrec->ir_freecount), 1624 (rec.ir_freecount == ibtrec->ir_freecount),
1623 error); 1625 error);
1624 1626
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index b0a5fe95a3e2..dc4bfc5d88fc 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -111,14 +111,6 @@ xfs_mount_validate_sb(
111 bool check_inprogress, 111 bool check_inprogress,
112 bool check_version) 112 bool check_version)
113{ 113{
114
115 /*
116 * If the log device and data device have the
117 * same device number, the log is internal.
118 * Consequently, the sb_logstart should be non-zero. If
119 * we have a zero sb_logstart in this case, we may be trying to mount
120 * a volume filesystem in a non-volume manner.
121 */
122 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 114 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
123 xfs_warn(mp, "bad magic number"); 115 xfs_warn(mp, "bad magic number");
124 return -EWRONGFS; 116 return -EWRONGFS;
@@ -743,17 +735,15 @@ xfs_initialize_perag_data(
743 btree += pag->pagf_btreeblks; 735 btree += pag->pagf_btreeblks;
744 xfs_perag_put(pag); 736 xfs_perag_put(pag);
745 } 737 }
746 /* 738
747 * Overwrite incore superblock counters with just-read data 739 /* Overwrite incore superblock counters with just-read data */
748 */
749 spin_lock(&mp->m_sb_lock); 740 spin_lock(&mp->m_sb_lock);
750 sbp->sb_ifree = ifree; 741 sbp->sb_ifree = ifree;
751 sbp->sb_icount = ialloc; 742 sbp->sb_icount = ialloc;
752 sbp->sb_fdblocks = bfree + bfreelst + btree; 743 sbp->sb_fdblocks = bfree + bfreelst + btree;
753 spin_unlock(&mp->m_sb_lock); 744 spin_unlock(&mp->m_sb_lock);
754 745
755 /* Fixup the per-cpu counters as well. */ 746 xfs_reinit_percpu_counters(mp);
756 xfs_icsb_reinit_counters(mp);
757 747
758 return 0; 748 return 0;
759} 749}
@@ -771,6 +761,10 @@ xfs_log_sb(
771 struct xfs_mount *mp = tp->t_mountp; 761 struct xfs_mount *mp = tp->t_mountp;
772 struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0); 762 struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0);
773 763
764 mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
765 mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
766 mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
767
774 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); 768 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
775 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); 769 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
776 xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); 770 xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1d8eef9cf0f5..a56960dd1684 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1232,6 +1232,117 @@ xfs_vm_releasepage(
1232 return try_to_free_buffers(page); 1232 return try_to_free_buffers(page);
1233} 1233}
1234 1234
1235/*
1236 * When we map a DIO buffer, we may need to attach an ioend that describes the
1237 * type of write IO we are doing. This passes to the completion function the
1238 * operations it needs to perform. If the mapping is for an overwrite wholly
1239 * within the EOF then we don't need an ioend and so we don't allocate one.
1240 * This avoids the unnecessary overhead of allocating and freeing ioends for
1241 * workloads that don't require transactions on IO completion.
1242 *
1243 * If we get multiple mappings in a single IO, we might be mapping different
1244 * types. But because the direct IO can only have a single private pointer, we
1245 * need to ensure that:
1246 *
1247 * a) i) the ioend spans the entire region of unwritten mappings; or
1248 * ii) the ioend spans all the mappings that cross or are beyond EOF; and
1249 * b) if it contains unwritten extents, it is *permanently* marked as such
1250 *
1251 * We could do this by chaining ioends like buffered IO does, but we only
1252 * actually get one IO completion callback from the direct IO, and that spans
1253 * the entire IO regardless of how many mappings and IOs are needed to complete
1254 * the DIO. There is only going to be one reference to the ioend and its life
 1255 * cycle is constrained by the DIO completion code. Hence we don't need
1256 * reference counting here.
1257 */
1258static void
1259xfs_map_direct(
1260 struct inode *inode,
1261 struct buffer_head *bh_result,
1262 struct xfs_bmbt_irec *imap,
1263 xfs_off_t offset)
1264{
1265 struct xfs_ioend *ioend;
1266 xfs_off_t size = bh_result->b_size;
1267 int type;
1268
1269 if (ISUNWRITTEN(imap))
1270 type = XFS_IO_UNWRITTEN;
1271 else
1272 type = XFS_IO_OVERWRITE;
1273
1274 trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
1275
1276 if (bh_result->b_private) {
1277 ioend = bh_result->b_private;
1278 ASSERT(ioend->io_size > 0);
1279 ASSERT(offset >= ioend->io_offset);
1280 if (offset + size > ioend->io_offset + ioend->io_size)
1281 ioend->io_size = offset - ioend->io_offset + size;
1282
1283 if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
1284 ioend->io_type = XFS_IO_UNWRITTEN;
1285
1286 trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
1287 ioend->io_size, ioend->io_type,
1288 imap);
1289 } else if (type == XFS_IO_UNWRITTEN ||
1290 offset + size > i_size_read(inode)) {
1291 ioend = xfs_alloc_ioend(inode, type);
1292 ioend->io_offset = offset;
1293 ioend->io_size = size;
1294
1295 bh_result->b_private = ioend;
1296 set_buffer_defer_completion(bh_result);
1297
1298 trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
1299 imap);
1300 } else {
1301 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1302 imap);
1303 }
1304}
1305
1306/*
 1307 * If this is O_DIRECT or the mpage code calling, tell them how large the mapping
1308 * is, so that we can avoid repeated get_blocks calls.
1309 *
1310 * If the mapping spans EOF, then we have to break the mapping up as the mapping
1311 * for blocks beyond EOF must be marked new so that sub block regions can be
1312 * correctly zeroed. We can't do this for mappings within EOF unless the mapping
1313 * was just allocated or is unwritten, otherwise the callers would overwrite
1314 * existing data with zeros. Hence we have to split the mapping into a range up
1315 * to and including EOF, and a second mapping for beyond EOF.
1316 */
1317static void
1318xfs_map_trim_size(
1319 struct inode *inode,
1320 sector_t iblock,
1321 struct buffer_head *bh_result,
1322 struct xfs_bmbt_irec *imap,
1323 xfs_off_t offset,
1324 ssize_t size)
1325{
1326 xfs_off_t mapping_size;
1327
1328 mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
1329 mapping_size <<= inode->i_blkbits;
1330
1331 ASSERT(mapping_size > 0);
1332 if (mapping_size > size)
1333 mapping_size = size;
1334 if (offset < i_size_read(inode) &&
1335 offset + mapping_size >= i_size_read(inode)) {
1336 /* limit mapping to block that spans EOF */
1337 mapping_size = roundup_64(i_size_read(inode) - offset,
1338 1 << inode->i_blkbits);
1339 }
1340 if (mapping_size > LONG_MAX)
1341 mapping_size = LONG_MAX;
1342
1343 bh_result->b_size = mapping_size;
1344}
1345
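
A worked instance of the trimming above may help; the numbers are invented. With 4096-byte blocks, i_size = 10000 and a mapping starting at byte 8192, the mapping must be cut back to the single block that spans EOF so the tail can be marked new and zeroed correctly:

/* trim.c - worked model of the xfs_map_trim_size() arithmetic above. */
#include <stdio.h>

int main(void)
{
	int blkbits = 12;			/* 4096-byte blocks */
	long long blksize = 1LL << blkbits;
	long long i_size = 10000;		/* file size in bytes */
	long long offset = 8192;		/* IO offset in bytes */
	long long mapping_size = 4 * blksize;	/* extent-derived length */
	long long requested = 2 * blksize;	/* bh_result->b_size on entry */

	if (mapping_size > requested)
		mapping_size = requested;
	if (offset < i_size && offset + mapping_size >= i_size) {
		/* limit mapping to the block that spans EOF */
		mapping_size = ((i_size - offset) + blksize - 1)
				/ blksize * blksize;
	}
	printf("trimmed mapping: %lld bytes\n", mapping_size);	/* 4096 */
	return 0;
}
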
1235STATIC int 1346STATIC int
1236__xfs_get_blocks( 1347__xfs_get_blocks(
1237 struct inode *inode, 1348 struct inode *inode,
@@ -1320,31 +1431,37 @@ __xfs_get_blocks(
1320 1431
1321 xfs_iunlock(ip, lockmode); 1432 xfs_iunlock(ip, lockmode);
1322 } 1433 }
1323 1434 trace_xfs_get_blocks_alloc(ip, offset, size,
1324 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); 1435 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1436 : XFS_IO_DELALLOC, &imap);
1325 } else if (nimaps) { 1437 } else if (nimaps) {
1326 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); 1438 trace_xfs_get_blocks_found(ip, offset, size,
1439 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1440 : XFS_IO_OVERWRITE, &imap);
1327 xfs_iunlock(ip, lockmode); 1441 xfs_iunlock(ip, lockmode);
1328 } else { 1442 } else {
1329 trace_xfs_get_blocks_notfound(ip, offset, size); 1443 trace_xfs_get_blocks_notfound(ip, offset, size);
1330 goto out_unlock; 1444 goto out_unlock;
1331 } 1445 }
1332 1446
1447 /* trim mapping down to size requested */
1448 if (direct || size > (1 << inode->i_blkbits))
1449 xfs_map_trim_size(inode, iblock, bh_result,
1450 &imap, offset, size);
1451
1452 /*
1453 * For unwritten extents do not report a disk address in the buffered
1454 * read case (treat as if we're reading into a hole).
1455 */
1333 if (imap.br_startblock != HOLESTARTBLOCK && 1456 if (imap.br_startblock != HOLESTARTBLOCK &&
1334 imap.br_startblock != DELAYSTARTBLOCK) { 1457 imap.br_startblock != DELAYSTARTBLOCK &&
1335 /* 1458 (create || !ISUNWRITTEN(&imap))) {
1336 * For unwritten extents do not report a disk address on 1459 xfs_map_buffer(inode, bh_result, &imap, offset);
1337 * the read case (treat as if we're reading into a hole). 1460 if (ISUNWRITTEN(&imap))
1338 */
1339 if (create || !ISUNWRITTEN(&imap))
1340 xfs_map_buffer(inode, bh_result, &imap, offset);
1341 if (create && ISUNWRITTEN(&imap)) {
1342 if (direct) {
1343 bh_result->b_private = inode;
1344 set_buffer_defer_completion(bh_result);
1345 }
1346 set_buffer_unwritten(bh_result); 1461 set_buffer_unwritten(bh_result);
1347 } 1462 /* direct IO needs special help */
1463 if (create && direct)
1464 xfs_map_direct(inode, bh_result, &imap, offset);
1348 } 1465 }
1349 1466
1350 /* 1467 /*
@@ -1377,39 +1494,6 @@ __xfs_get_blocks(
1377 } 1494 }
1378 } 1495 }
1379 1496
1380 /*
1381 * If this is O_DIRECT or the mpage code calling tell them how large
1382 * the mapping is, so that we can avoid repeated get_blocks calls.
1383 *
1384 * If the mapping spans EOF, then we have to break the mapping up as the
1385 * mapping for blocks beyond EOF must be marked new so that sub block
1386 * regions can be correctly zeroed. We can't do this for mappings within
1387 * EOF unless the mapping was just allocated or is unwritten, otherwise
1388 * the callers would overwrite existing data with zeros. Hence we have
1389 * to split the mapping into a range up to and including EOF, and a
1390 * second mapping for beyond EOF.
1391 */
1392 if (direct || size > (1 << inode->i_blkbits)) {
1393 xfs_off_t mapping_size;
1394
1395 mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
1396 mapping_size <<= inode->i_blkbits;
1397
1398 ASSERT(mapping_size > 0);
1399 if (mapping_size > size)
1400 mapping_size = size;
1401 if (offset < i_size_read(inode) &&
1402 offset + mapping_size >= i_size_read(inode)) {
1403 /* limit mapping to block that spans EOF */
1404 mapping_size = roundup_64(i_size_read(inode) - offset,
1405 1 << inode->i_blkbits);
1406 }
1407 if (mapping_size > LONG_MAX)
1408 mapping_size = LONG_MAX;
1409
1410 bh_result->b_size = mapping_size;
1411 }
1412
1413 return 0; 1497 return 0;
1414 1498
1415out_unlock: 1499out_unlock:
@@ -1440,9 +1524,11 @@ xfs_get_blocks_direct(
1440/* 1524/*
1441 * Complete a direct I/O write request. 1525 * Complete a direct I/O write request.
1442 * 1526 *
1443 * If the private argument is non-NULL __xfs_get_blocks signals us that we 1527 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
1444 * need to issue a transaction to convert the range from unwritten to written 1528 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
1445 * extents. 1529 * wholly within the EOF and so there is nothing for us to do. Note that in this
1530 * case the completion can be called in interrupt context, whereas if we have an
1531 * ioend we will always be called in task context (i.e. from a workqueue).
1446 */ 1532 */
1447STATIC void 1533STATIC void
1448xfs_end_io_direct_write( 1534xfs_end_io_direct_write(
@@ -1454,43 +1540,71 @@ xfs_end_io_direct_write(
1454 struct inode *inode = file_inode(iocb->ki_filp); 1540 struct inode *inode = file_inode(iocb->ki_filp);
1455 struct xfs_inode *ip = XFS_I(inode); 1541 struct xfs_inode *ip = XFS_I(inode);
1456 struct xfs_mount *mp = ip->i_mount; 1542 struct xfs_mount *mp = ip->i_mount;
1543 struct xfs_ioend *ioend = private;
1457 1544
1458 if (XFS_FORCED_SHUTDOWN(mp)) 1545 trace_xfs_gbmap_direct_endio(ip, offset, size,
1546 ioend ? ioend->io_type : 0, NULL);
1547
1548 if (!ioend) {
1549 ASSERT(offset + size <= i_size_read(inode));
1459 return; 1550 return;
1551 }
1552
1553 if (XFS_FORCED_SHUTDOWN(mp))
1554 goto out_end_io;
1460 1555
1461 /* 1556 /*
1462 * While the generic direct I/O code updates the inode size, it does 1557 * dio completion end_io functions are only called on writes if more
 1463 * so only after the end_io handler is called, which means our 1558 * than 0 bytes were written.
1464 * end_io handler thinks the on-disk size is outside the in-core
1465 * size. To prevent this just update it a little bit earlier here.
1466 */ 1559 */
1560 ASSERT(size > 0);
1561
1562 /*
1563 * The ioend only maps whole blocks, while the IO may be sector aligned.
1564 * Hence the ioend offset/size may not match the IO offset/size exactly.
1565 * Because we don't map overwrites within EOF into the ioend, the offset
1566 * may not match, but only if the endio spans EOF. Either way, write
1567 * the IO sizes into the ioend so that completion processing does the
1568 * right thing.
1569 */
1570 ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
1571 ioend->io_size = size;
1572 ioend->io_offset = offset;
1573
1574 /*
1575 * The ioend tells us whether we are doing unwritten extent conversion
1576 * or an append transaction that updates the on-disk file size. These
 1577 * cases are the only ones where we might *potentially* need
1578 * to update the VFS inode size.
1579 *
1580 * We need to update the in-core inode size here so that we don't end up
1581 * with the on-disk inode size being outside the in-core inode size. We
1582 * have no other method of updating EOF for AIO, so always do it here
1583 * if necessary.
1584 *
1585 * We need to lock the test/set EOF update as we can be racing with
1586 * other IO completions here to update the EOF. Failing to serialise
1587 * here can result in EOF moving backwards and Bad Things Happen when
1588 * that occurs.
1589 */
1590 spin_lock(&ip->i_flags_lock);
1467 if (offset + size > i_size_read(inode)) 1591 if (offset + size > i_size_read(inode))
1468 i_size_write(inode, offset + size); 1592 i_size_write(inode, offset + size);
1593 spin_unlock(&ip->i_flags_lock);
1469 1594
1470 /* 1595 /*
1471 * For direct I/O we do not know if we need to allocate blocks or not, 1596 * If we are doing an append IO that needs to update the EOF on disk,
1472 * so we can't preallocate an append transaction, as that results in 1597 * do the transaction reserve now so we can use common end io
1473 * nested reservations and log space deadlocks. Hence allocate the 1598 * processing. Stashing the error (if there is one) in the ioend will
1474 * transaction here. While this is sub-optimal and can block IO 1599 * result in the ioend processing passing on the error if it is
 1475 * completion for some time, we're stuck with doing it this way until 1600 * possible, as we can't return it from here.
1476 * we can pass the ioend to the direct IO allocation callbacks and
1477 * avoid nesting that way.
1478 */ 1601 */
1479 if (private && size > 0) { 1602 if (ioend->io_type == XFS_IO_OVERWRITE)
1480 xfs_iomap_write_unwritten(ip, offset, size); 1603 ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
1481 } else if (offset + size > ip->i_d.di_size) {
1482 struct xfs_trans *tp;
1483 int error;
1484
1485 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
1486 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
1487 if (error) {
1488 xfs_trans_cancel(tp, 0);
1489 return;
1490 }
1491 1604
1492 xfs_setfilesize(ip, tp, offset, size); 1605out_end_io:
1493 } 1606 xfs_end_io(&ioend->io_work);
1607 return;
1494} 1608}
1495 1609
1496STATIC ssize_t 1610STATIC ssize_t
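
The i_flags_lock usage above guards a classic test-and-set pair: if the comparison and the store are not atomic with respect to other completions, a completion carrying a smaller end offset can lose the race yet write last, moving EOF backwards. A minimal user-space model of the serialised update, with a pthread mutex standing in for the spinlock:

/* eof_race.c - model of the serialised EOF update described above. */
#include <pthread.h>
#include <stdio.h>

static long long i_size;
static pthread_mutex_t size_lock = PTHREAD_MUTEX_INITIALIZER;

/* Hold the lock across both the comparison and the store, exactly
 * as the i_size_read()/i_size_write() pair is bracketed above. */
static void *completion(void *arg)
{
	long long end = (long long)(size_t)arg;

	pthread_mutex_lock(&size_lock);
	if (end > i_size)
		i_size = end;
	pthread_mutex_unlock(&size_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	/* Two IO completions race to extend EOF. */
	pthread_create(&a, NULL, completion, (void *)(size_t)8192);
	pthread_create(&b, NULL, completion, (void *)(size_t)4096);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("i_size = %lld\n", i_size);	/* always 8192 when serialised */
	return 0;
}
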
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 83af4c149635..f9c1c64782d3 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -132,9 +132,10 @@ xfs_attr3_leaf_inactive(
132 int size; 132 int size;
133 int tmp; 133 int tmp;
134 int i; 134 int i;
135 struct xfs_mount *mp = bp->b_target->bt_mount;
135 136
136 leaf = bp->b_addr; 137 leaf = bp->b_addr;
137 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 138 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
138 139
139 /* 140 /*
140 * Count the number of "remote" value extents. 141 * Count the number of "remote" value extents.
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a43d370d2c58..65fb37a18e92 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -225,6 +225,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
225 int error, i; 225 int error, i;
226 struct xfs_buf *bp; 226 struct xfs_buf *bp;
227 struct xfs_inode *dp = context->dp; 227 struct xfs_inode *dp = context->dp;
228 struct xfs_mount *mp = dp->i_mount;
228 229
229 trace_xfs_attr_node_list(context); 230 trace_xfs_attr_node_list(context);
230 231
@@ -256,7 +257,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
256 case XFS_ATTR_LEAF_MAGIC: 257 case XFS_ATTR_LEAF_MAGIC:
257 case XFS_ATTR3_LEAF_MAGIC: 258 case XFS_ATTR3_LEAF_MAGIC:
258 leaf = bp->b_addr; 259 leaf = bp->b_addr;
259 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); 260 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo,
261 &leafhdr, leaf);
260 entries = xfs_attr3_leaf_entryp(leaf); 262 entries = xfs_attr3_leaf_entryp(leaf);
261 if (cursor->hashval > be32_to_cpu( 263 if (cursor->hashval > be32_to_cpu(
262 entries[leafhdr.count - 1].hashval)) { 264 entries[leafhdr.count - 1].hashval)) {
@@ -340,7 +342,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
340 xfs_trans_brelse(NULL, bp); 342 xfs_trans_brelse(NULL, bp);
341 return error; 343 return error;
342 } 344 }
343 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); 345 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
344 if (context->seen_enough || leafhdr.forw == 0) 346 if (context->seen_enough || leafhdr.forw == 0)
345 break; 347 break;
346 cursor->blkno = leafhdr.forw; 348 cursor->blkno = leafhdr.forw;
@@ -368,11 +370,12 @@ xfs_attr3_leaf_list_int(
368 struct xfs_attr_leaf_entry *entry; 370 struct xfs_attr_leaf_entry *entry;
369 int retval; 371 int retval;
370 int i; 372 int i;
373 struct xfs_mount *mp = context->dp->i_mount;
371 374
372 trace_xfs_attr_list_leaf(context); 375 trace_xfs_attr_list_leaf(context);
373 376
374 leaf = bp->b_addr; 377 leaf = bp->b_addr;
375 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 378 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
376 entries = xfs_attr3_leaf_entryp(leaf); 379 entries = xfs_attr3_leaf_entryp(leaf);
377 380
378 cursor = context->cursor; 381 cursor = context->cursor;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 22a5dcb70b32..a52bbd3abc7d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1376,22 +1376,19 @@ out:
1376} 1376}
1377 1377
1378/* 1378/*
1379 * xfs_collapse_file_space() 1379 * @next_fsb will keep track of the extent currently undergoing shift.
1380 * This routine frees disk space and shift extent for the given file. 1380 * @stop_fsb will keep track of the extent at which we have to stop.
1381 * The first thing we do is to free data blocks in the specified range 1381 * If we are shifting left, we will start with block (offset + len) and
 1382 * by calling xfs_free_file_space(). It would also sync dirty data 1382 * shift each extent up to the last extent.
1383 * and invalidate page cache over the region on which collapse range 1383 * If we are shifting right, we will start with last extent inside file space
1384 * is working. And Shift extent records to the left to cover a hole. 1384 * and continue until we reach the block corresponding to offset.
1385 * RETURNS:
1386 * 0 on success
1387 * errno on error
1388 *
1389 */ 1385 */
1390int 1386static int
1391xfs_collapse_file_space( 1387xfs_shift_file_space(
1392 struct xfs_inode *ip, 1388 struct xfs_inode *ip,
1393 xfs_off_t offset, 1389 xfs_off_t offset,
1394 xfs_off_t len) 1390 xfs_off_t len,
1391 enum shift_direction direction)
1395{ 1392{
1396 int done = 0; 1393 int done = 0;
1397 struct xfs_mount *mp = ip->i_mount; 1394 struct xfs_mount *mp = ip->i_mount;
@@ -1400,21 +1397,26 @@ xfs_collapse_file_space(
1400 struct xfs_bmap_free free_list; 1397 struct xfs_bmap_free free_list;
1401 xfs_fsblock_t first_block; 1398 xfs_fsblock_t first_block;
1402 int committed; 1399 int committed;
1403 xfs_fileoff_t start_fsb; 1400 xfs_fileoff_t stop_fsb;
1404 xfs_fileoff_t next_fsb; 1401 xfs_fileoff_t next_fsb;
1405 xfs_fileoff_t shift_fsb; 1402 xfs_fileoff_t shift_fsb;
1406 1403
1407 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 1404 ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
1408 1405
1409 trace_xfs_collapse_file_space(ip); 1406 if (direction == SHIFT_LEFT) {
1407 next_fsb = XFS_B_TO_FSB(mp, offset + len);
1408 stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
1409 } else {
1410 /*
1411 * If right shift, delegate the work of initialization of
1412 * next_fsb to xfs_bmap_shift_extent as it has ilock held.
1413 */
1414 next_fsb = NULLFSBLOCK;
1415 stop_fsb = XFS_B_TO_FSB(mp, offset);
1416 }
1410 1417
1411 next_fsb = XFS_B_TO_FSB(mp, offset + len);
1412 shift_fsb = XFS_B_TO_FSB(mp, len); 1418 shift_fsb = XFS_B_TO_FSB(mp, len);
1413 1419
1414 error = xfs_free_file_space(ip, offset, len);
1415 if (error)
1416 return error;
1417
1418 /* 1420 /*
1419 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation 1421 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
1420 * into the accessible region of the file. 1422 * into the accessible region of the file.
@@ -1427,20 +1429,28 @@ xfs_collapse_file_space(
1427 1429
1428 /* 1430 /*
1429 * Writeback and invalidate cache for the remainder of the file as we're 1431 * Writeback and invalidate cache for the remainder of the file as we're
1430 * about to shift down every extent from the collapse range to EOF. The 1432 * about to shift down every extent from offset to EOF.
1431 * free of the collapse range above might have already done some of
1432 * this, but we shouldn't rely on it to do anything outside of the range
1433 * that was freed.
1434 */ 1433 */
1435 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 1434 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1436 offset + len, -1); 1435 offset, -1);
1437 if (error) 1436 if (error)
1438 return error; 1437 return error;
1439 error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, 1438 error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
1440 (offset + len) >> PAGE_CACHE_SHIFT, -1); 1439 offset >> PAGE_CACHE_SHIFT, -1);
1441 if (error) 1440 if (error)
1442 return error; 1441 return error;
1443 1442
1443 /*
 1444 * The extent shifting code works on extent granularity. So, if
 1445 * stop_fsb is not the starting block of an extent, we need to split
1446 * the extent at stop_fsb.
1447 */
1448 if (direction == SHIFT_RIGHT) {
1449 error = xfs_bmap_split_extent(ip, stop_fsb);
1450 if (error)
1451 return error;
1452 }
1453
1444 while (!error && !done) { 1454 while (!error && !done) {
1445 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); 1455 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1446 /* 1456 /*
@@ -1464,7 +1474,7 @@ xfs_collapse_file_space(
1464 if (error) 1474 if (error)
1465 goto out; 1475 goto out;
1466 1476
1467 xfs_trans_ijoin(tp, ip, 0); 1477 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1468 1478
1469 xfs_bmap_init(&free_list, &first_block); 1479 xfs_bmap_init(&free_list, &first_block);
1470 1480
@@ -1472,10 +1482,9 @@ xfs_collapse_file_space(
1472 * We are using the write transaction in which max 2 bmbt 1482 * We are using the write transaction in which max 2 bmbt
1473 * updates are allowed 1483 * updates are allowed
1474 */ 1484 */
1475 start_fsb = next_fsb; 1485 error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
1476 error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb, 1486 &done, stop_fsb, &first_block, &free_list,
1477 &done, &next_fsb, &first_block, &free_list, 1487 direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
1478 XFS_BMAP_MAX_SHIFT_EXTENTS);
1479 if (error) 1488 if (error)
1480 goto out; 1489 goto out;
1481 1490
@@ -1484,18 +1493,70 @@ xfs_collapse_file_space(
1484 goto out; 1493 goto out;
1485 1494
1486 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1495 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1487 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1488 } 1496 }
1489 1497
1490 return error; 1498 return error;
1491 1499
1492out: 1500out:
1493 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 1501 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1494 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1495 return error; 1502 return error;
1496} 1503}
1497 1504
1498/* 1505/*
1506 * xfs_collapse_file_space()
 1507 * This routine frees disk space and shifts extents for the given file.
 1508 * The first thing we do is free the data blocks in the specified range
 1509 * by calling xfs_free_file_space(), which also syncs dirty data
 1510 * and invalidates the page cache over the region on which the collapse
 1511 * range is working. Then we shift extent records left to cover the hole.
1512 * RETURNS:
1513 * 0 on success
1514 * errno on error
1515 *
1516 */
1517int
1518xfs_collapse_file_space(
1519 struct xfs_inode *ip,
1520 xfs_off_t offset,
1521 xfs_off_t len)
1522{
1523 int error;
1524
1525 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1526 trace_xfs_collapse_file_space(ip);
1527
1528 error = xfs_free_file_space(ip, offset, len);
1529 if (error)
1530 return error;
1531
1532 return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT);
1533}
1534
1535/*
1536 * xfs_insert_file_space()
 1537 * This routine creates hole space by shifting extents for the given file.
 1538 * The first thing we do is sync dirty data and invalidate the page cache
 1539 * over the region on which the insert range is working. Then we split an
 1540 * extent into two at the given offset by calling xfs_bmap_split_extent,
 1541 * and shift all extent records lying between [offset,
 1542 * last allocated extent] to the right to make room for the hole.
1543 * RETURNS:
1544 * 0 on success
1545 * errno on error
1546 */
1547int
1548xfs_insert_file_space(
1549 struct xfs_inode *ip,
1550 loff_t offset,
1551 loff_t len)
1552{
1553 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1554 trace_xfs_insert_file_space(ip);
1555
1556 return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT);
1557}
1558
1559/*
1499 * We need to check that the format of the data fork in the temporary inode is 1560 * We need to check that the format of the data fork in the temporary inode is
1500 * valid for the target inode before doing the swap. This is not a problem with 1561 * valid for the target inode before doing the swap. This is not a problem with
1501 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized 1562 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
@@ -1599,13 +1660,6 @@ xfs_swap_extent_flush(
1599 /* Verify O_DIRECT for ftmp */ 1660 /* Verify O_DIRECT for ftmp */
1600 if (VFS_I(ip)->i_mapping->nrpages) 1661 if (VFS_I(ip)->i_mapping->nrpages)
1601 return -EINVAL; 1662 return -EINVAL;
1602
1603 /*
1604 * Don't try to swap extents on mmap()d files because we can't lock
1605 * out races against page faults safely.
1606 */
1607 if (mapping_mapped(VFS_I(ip)->i_mapping))
1608 return -EBUSY;
1609 return 0; 1663 return 0;
1610} 1664}
1611 1665
@@ -1633,13 +1687,14 @@ xfs_swap_extents(
1633 } 1687 }
1634 1688
1635 /* 1689 /*
1636 * Lock up the inodes against other IO and truncate to begin with. 1690 * Lock the inodes against other IO, page faults and truncate to
1637 * Then we can ensure the inodes are flushed and have no page cache 1691 * begin with. Then we can ensure the inodes are flushed and have no
1638 * safely. Once we have done this we can take the ilocks and do the rest 1692 * page cache safely. Once we have done this we can take the ilocks and
1639 * of the checks. 1693 * do the rest of the checks.
1640 */ 1694 */
1641 lock_flags = XFS_IOLOCK_EXCL; 1695 lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1642 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); 1696 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
1697 xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
1643 1698
1644 /* Verify that both files have the same format */ 1699 /* Verify that both files have the same format */
1645 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { 1700 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
@@ -1666,8 +1721,16 @@ xfs_swap_extents(
1666 xfs_trans_cancel(tp, 0); 1721 xfs_trans_cancel(tp, 0);
1667 goto out_unlock; 1722 goto out_unlock;
1668 } 1723 }
1724
1725 /*
 1726 * Lock and join the inodes to the transaction so that transaction commit
1727 * or cancel will unlock the inodes from this point onwards.
1728 */
1669 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); 1729 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1670 lock_flags |= XFS_ILOCK_EXCL; 1730 lock_flags |= XFS_ILOCK_EXCL;
1731 xfs_trans_ijoin(tp, ip, lock_flags);
1732 xfs_trans_ijoin(tp, tip, lock_flags);
1733
1671 1734
1672 /* Verify all data are being swapped */ 1735 /* Verify all data are being swapped */
1673 if (sxp->sx_offset != 0 || 1736 if (sxp->sx_offset != 0 ||
@@ -1720,9 +1783,6 @@ xfs_swap_extents(
1720 goto out_trans_cancel; 1783 goto out_trans_cancel;
1721 } 1784 }
1722 1785
1723 xfs_trans_ijoin(tp, ip, lock_flags);
1724 xfs_trans_ijoin(tp, tip, lock_flags);
1725
1726 /* 1786 /*
1727 * Before we've swapped the forks, lets set the owners of the forks 1787 * Before we've swapped the forks, lets set the owners of the forks
1728 * appropriately. We have to do this as we are demand paging the btree 1788 * appropriately. We have to do this as we are demand paging the btree
@@ -1856,5 +1916,5 @@ out_unlock:
1856 1916
1857out_trans_cancel: 1917out_trans_cancel:
1858 xfs_trans_cancel(tp, 0); 1918 xfs_trans_cancel(tp, 0);
1859 goto out_unlock; 1919 goto out;
1860} 1920}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 736429a72a12..af97d9a1dfb4 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -63,6 +63,8 @@ int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
63 xfs_off_t len); 63 xfs_off_t len);
64int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, 64int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
65 xfs_off_t len); 65 xfs_off_t len);
66int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
67 xfs_off_t len);
66 68
67/* EOF block manipulation functions */ 69/* EOF block manipulation functions */
68bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); 70bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 507d96a57ac7..092d652bc03d 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -537,9 +537,9 @@ xfs_buf_item_push(
537 537
538 /* has a previous flush failed due to IO errors? */ 538 /* has a previous flush failed due to IO errors? */
539 if ((bp->b_flags & XBF_WRITE_FAIL) && 539 if ((bp->b_flags & XBF_WRITE_FAIL) &&
540 ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { 540 ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
541 xfs_warn(bp->b_target->bt_mount, 541 xfs_warn(bp->b_target->bt_mount,
542"Detected failing async write on buffer block 0x%llx. Retrying async write.", 542"Failing async write on buffer block 0x%llx. Retrying async write.",
543 (long long)bp->b_bn); 543 (long long)bp->b_bn);
544 } 544 }
545 545
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 799e5a2d334d..e85a9519a5ae 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -84,7 +84,7 @@ xfs_trim_extents(
84 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); 84 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
85 if (error) 85 if (error)
86 goto out_del_cursor; 86 goto out_del_cursor;
87 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); 87 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor);
88 ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); 88 ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
89 89
90 /* 90 /*
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 3ee186ac1093..338e50bbfd1e 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -131,7 +131,7 @@ xfs_error_report(
131{ 131{
132 if (level <= xfs_error_level) { 132 if (level <= xfs_error_level) {
133 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, 133 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
134 "Internal error %s at line %d of file %s. Caller %pF", 134 "Internal error %s at line %d of file %s. Caller %pS",
135 tag, linenum, filename, ra); 135 tag, linenum, filename, ra);
136 136
137 xfs_stack_trace(); 137 xfs_stack_trace();
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 279a76e52791..c0394ed126fc 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -40,25 +40,25 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
40/* 40/*
41 * Macros to set EFSCORRUPTED & return/branch. 41 * Macros to set EFSCORRUPTED & return/branch.
42 */ 42 */
43#define XFS_WANT_CORRUPTED_GOTO(x,l) \ 43#define XFS_WANT_CORRUPTED_GOTO(mp, x, l) \
44 { \ 44 { \
45 int fs_is_ok = (x); \ 45 int fs_is_ok = (x); \
46 ASSERT(fs_is_ok); \ 46 ASSERT(fs_is_ok); \
47 if (unlikely(!fs_is_ok)) { \ 47 if (unlikely(!fs_is_ok)) { \
48 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ 48 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
49 XFS_ERRLEVEL_LOW, NULL); \ 49 XFS_ERRLEVEL_LOW, mp); \
50 error = -EFSCORRUPTED; \ 50 error = -EFSCORRUPTED; \
51 goto l; \ 51 goto l; \
52 } \ 52 } \
53 } 53 }
54 54
55#define XFS_WANT_CORRUPTED_RETURN(x) \ 55#define XFS_WANT_CORRUPTED_RETURN(mp, x) \
56 { \ 56 { \
57 int fs_is_ok = (x); \ 57 int fs_is_ok = (x); \
58 ASSERT(fs_is_ok); \ 58 ASSERT(fs_is_ok); \
59 if (unlikely(!fs_is_ok)) { \ 59 if (unlikely(!fs_is_ok)) { \
60 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ 60 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
61 XFS_ERRLEVEL_LOW, NULL); \ 61 XFS_ERRLEVEL_LOW, mp); \
62 return -EFSCORRUPTED; \ 62 return -EFSCORRUPTED; \
63 } \ 63 } \
64 } 64 }
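
The only functional change to these macros is threading the mount pointer through to XFS_ERROR_REPORT, so corruption reports identify the filesystem instead of passing NULL. A simplified user-space sketch of the macro pattern itself, with a string standing in for struct xfs_mount and EFSCORRUPTED hard-coded to its Linux value (117, EUCLEAN); this mirrors the btree-record lookup callers shown in the xfs_ialloc.c hunks above:

/* corrupt.c - simplified stand-ins for the kernel macros above. */
#include <stdio.h>

#define EFSCORRUPTED 117	/* EUCLEAN on Linux */

#define WANT_CORRUPTED_RETURN(mp, x)				\
	do {							\
		if (!(x)) {					\
			fprintf(stderr, "%s: corruption\n", (mp)); \
			return -EFSCORRUPTED;			\
		}						\
	} while (0)

#define WANT_CORRUPTED_GOTO(mp, x, l)				\
	do {							\
		if (!(x)) {					\
			fprintf(stderr, "%s: corruption\n", (mp)); \
			error = -EFSCORRUPTED;			\
			goto l;					\
		}						\
	} while (0)

/* Mirrors the "lookup, then assert exactly one record found" shape. */
static int lookup_record(const char *mp, int found)
{
	int error = 0;

	WANT_CORRUPTED_RETURN(mp, found == 1);
	WANT_CORRUPTED_GOTO(mp, found == 1, out);
out:
	return error;
}

int main(void)
{
	printf("%d\n", lookup_record("sda1", 0));	/* -117 */
	return 0;
}
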
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f12ad0a8585..8121e75352ee 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -559,7 +559,7 @@ restart:
559 if (error <= 0) 559 if (error <= 0)
560 return error; 560 return error;
561 561
562 error = xfs_break_layouts(inode, iolock); 562 error = xfs_break_layouts(inode, iolock, true);
563 if (error) 563 if (error)
564 return error; 564 return error;
565 565
@@ -569,21 +569,42 @@ restart:
569 * write. If zeroing is needed and we are currently holding the 569 * write. If zeroing is needed and we are currently holding the
570 * iolock shared, we need to update it to exclusive which implies 570 * iolock shared, we need to update it to exclusive which implies
571 * having to redo all checks before. 571 * having to redo all checks before.
572 *
573 * We need to serialise against EOF updates that occur in IO
574 * completions here. We want to make sure that nobody is changing the
575 * size while we do this check until we have placed an IO barrier (i.e.
576 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
577 * The spinlock effectively forms a memory barrier once we have the
578 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
579 * and hence be able to correctly determine if we need to run zeroing.
572 */ 580 */
581 spin_lock(&ip->i_flags_lock);
573 if (iocb->ki_pos > i_size_read(inode)) { 582 if (iocb->ki_pos > i_size_read(inode)) {
574 bool zero = false; 583 bool zero = false;
575 584
585 spin_unlock(&ip->i_flags_lock);
576 if (*iolock == XFS_IOLOCK_SHARED) { 586 if (*iolock == XFS_IOLOCK_SHARED) {
577 xfs_rw_iunlock(ip, *iolock); 587 xfs_rw_iunlock(ip, *iolock);
578 *iolock = XFS_IOLOCK_EXCL; 588 *iolock = XFS_IOLOCK_EXCL;
579 xfs_rw_ilock(ip, *iolock); 589 xfs_rw_ilock(ip, *iolock);
580 iov_iter_reexpand(from, count); 590 iov_iter_reexpand(from, count);
591
592 /*
593 * We now have an IO submission barrier in place, but
594 * AIO can do EOF updates during IO completion and hence
595 * we now need to wait for all of them to drain. Non-AIO
596 * DIO will have drained before we are given the
597 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
598 * no-op.
599 */
600 inode_dio_wait(inode);
581 goto restart; 601 goto restart;
582 } 602 }
583 error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); 603 error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
584 if (error) 604 if (error)
585 return error; 605 return error;
586 } 606 } else
607 spin_unlock(&ip->i_flags_lock);
587 608
588 /* 609 /*
589 * Updating the timestamps will grab the ilock again from 610 * Updating the timestamps will grab the ilock again from
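
Note the shape of the upgrade path above: a reader-writer lock cannot be upgraded in place, so the code drops the shared iolock, retakes it exclusive and jumps back to restart to redo every check under the stronger lock (and, with this patch, drains in-flight AIO first). A generic sketch of that drop-upgrade-retry pattern using a pthread rwlock, illustrative only:

/* upgrade.c - sketch of the drop/retake-exclusive/recheck pattern. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t iolock = PTHREAD_RWLOCK_INITIALIZER;
static long long i_size = 4096;

static int write_checks(long long pos, bool *exclusive)
{
restart:
	/* ... all precondition checks are redone from here ... */
	if (pos > i_size && !*exclusive) {
		/* Can't upgrade in place: drop, retake exclusive, redo. */
		pthread_rwlock_unlock(&iolock);
		pthread_rwlock_wrlock(&iolock);
		*exclusive = true;
		goto restart;
	}
	return 0;
}

int main(void)
{
	bool exclusive = false;

	pthread_rwlock_rdlock(&iolock);
	write_checks(8192, &exclusive);	/* ends up holding it exclusive */
	pthread_rwlock_unlock(&iolock);
	printf("exclusive=%d\n", exclusive);
	return 0;
}
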
@@ -645,6 +666,8 @@ xfs_file_dio_aio_write(
645 int iolock; 666 int iolock;
646 size_t count = iov_iter_count(from); 667 size_t count = iov_iter_count(from);
647 loff_t pos = iocb->ki_pos; 668 loff_t pos = iocb->ki_pos;
669 loff_t end;
670 struct iov_iter data;
648 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 671 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
649 mp->m_rtdev_targp : mp->m_ddev_targp; 672 mp->m_rtdev_targp : mp->m_ddev_targp;
650 673
@@ -685,10 +708,11 @@ xfs_file_dio_aio_write(
685 goto out; 708 goto out;
686 count = iov_iter_count(from); 709 count = iov_iter_count(from);
687 pos = iocb->ki_pos; 710 pos = iocb->ki_pos;
711 end = pos + count - 1;
688 712
689 if (mapping->nrpages) { 713 if (mapping->nrpages) {
690 ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 714 ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
691 pos, pos + count - 1); 715 pos, end);
692 if (ret) 716 if (ret)
693 goto out; 717 goto out;
694 /* 718 /*
@@ -698,7 +722,7 @@ xfs_file_dio_aio_write(
698 */ 722 */
699 ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, 723 ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
700 pos >> PAGE_CACHE_SHIFT, 724 pos >> PAGE_CACHE_SHIFT,
701 (pos + count - 1) >> PAGE_CACHE_SHIFT); 725 end >> PAGE_CACHE_SHIFT);
702 WARN_ON_ONCE(ret); 726 WARN_ON_ONCE(ret);
703 ret = 0; 727 ret = 0;
704 } 728 }
@@ -715,8 +739,22 @@ xfs_file_dio_aio_write(
715 } 739 }
716 740
717 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); 741 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
718 ret = generic_file_direct_write(iocb, from, pos);
719 742
743 data = *from;
744 ret = mapping->a_ops->direct_IO(iocb, &data, pos);
745
746 /* see generic_file_direct_write() for why this is necessary */
747 if (mapping->nrpages) {
748 invalidate_inode_pages2_range(mapping,
749 pos >> PAGE_CACHE_SHIFT,
750 end >> PAGE_CACHE_SHIFT);
751 }
752
753 if (ret > 0) {
754 pos += ret;
755 iov_iter_advance(from, ret);
756 iocb->ki_pos = pos;
757 }
720out: 758out:
721 xfs_rw_iunlock(ip, iolock); 759 xfs_rw_iunlock(ip, iolock);
722 760
@@ -822,6 +860,11 @@ xfs_file_write_iter(
822 return ret; 860 return ret;
823} 861}
824 862
863#define XFS_FALLOC_FL_SUPPORTED \
864 (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
865 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
866 FALLOC_FL_INSERT_RANGE)
867
825STATIC long 868STATIC long
826xfs_file_fallocate( 869xfs_file_fallocate(
827 struct file *file, 870 struct file *file,
@@ -835,18 +878,21 @@ xfs_file_fallocate(
835 enum xfs_prealloc_flags flags = 0; 878 enum xfs_prealloc_flags flags = 0;
836 uint iolock = XFS_IOLOCK_EXCL; 879 uint iolock = XFS_IOLOCK_EXCL;
837 loff_t new_size = 0; 880 loff_t new_size = 0;
881 bool do_file_insert = 0;
838 882
839 if (!S_ISREG(inode->i_mode)) 883 if (!S_ISREG(inode->i_mode))
840 return -EINVAL; 884 return -EINVAL;
841 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 885 if (mode & ~XFS_FALLOC_FL_SUPPORTED)
842 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
843 return -EOPNOTSUPP; 886 return -EOPNOTSUPP;
844 887
845 xfs_ilock(ip, iolock); 888 xfs_ilock(ip, iolock);
846 error = xfs_break_layouts(inode, &iolock); 889 error = xfs_break_layouts(inode, &iolock, false);
847 if (error) 890 if (error)
848 goto out_unlock; 891 goto out_unlock;
849 892
893 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
894 iolock |= XFS_MMAPLOCK_EXCL;
895
850 if (mode & FALLOC_FL_PUNCH_HOLE) { 896 if (mode & FALLOC_FL_PUNCH_HOLE) {
851 error = xfs_free_file_space(ip, offset, len); 897 error = xfs_free_file_space(ip, offset, len);
852 if (error) 898 if (error)
@@ -873,6 +919,27 @@ xfs_file_fallocate(
873 error = xfs_collapse_file_space(ip, offset, len); 919 error = xfs_collapse_file_space(ip, offset, len);
874 if (error) 920 if (error)
875 goto out_unlock; 921 goto out_unlock;
922 } else if (mode & FALLOC_FL_INSERT_RANGE) {
923 unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
924
925 new_size = i_size_read(inode) + len;
926 if (offset & blksize_mask || len & blksize_mask) {
927 error = -EINVAL;
928 goto out_unlock;
929 }
930
931 /* check the new inode size does not wrap through zero */
932 if (new_size > inode->i_sb->s_maxbytes) {
933 error = -EFBIG;
934 goto out_unlock;
935 }
936
937 /* Offset should be less than i_size */
938 if (offset >= i_size_read(inode)) {
939 error = -EINVAL;
940 goto out_unlock;
941 }
942 do_file_insert = 1;
876 } else { 943 } else {
877 flags |= XFS_PREALLOC_SET; 944 flags |= XFS_PREALLOC_SET;
878 945
@@ -907,8 +974,19 @@ xfs_file_fallocate(
907 iattr.ia_valid = ATTR_SIZE; 974 iattr.ia_valid = ATTR_SIZE;
908 iattr.ia_size = new_size; 975 iattr.ia_size = new_size;
909 error = xfs_setattr_size(ip, &iattr); 976 error = xfs_setattr_size(ip, &iattr);
977 if (error)
978 goto out_unlock;
910 } 979 }
911 980
981 /*
982 * Perform hole insertion now that the file size has been
983 * updated so that if we crash during the operation we don't
 984 * leave shifted extents past EOF and hence lose access to
985 * the data that is contained within them.
986 */
987 if (do_file_insert)
988 error = xfs_insert_file_space(ip, offset, len);
989
912out_unlock: 990out_unlock:
913 xfs_iunlock(ip, iolock); 991 xfs_iunlock(ip, iolock);
914 return error; 992 return error;
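
From user space the new functionality is just another fallocate(2) mode; per the checks added above, offset and len must be block aligned and offset must lie below i_size. A hedged usage sketch, assuming kernel and libc headers that define FALLOC_FL_INSERT_RANGE:

/* insert_range.c - shift data at and beyond 64KiB right by 64KiB. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDWR);
	/* offset and len must both be multiples of the fs block size,
	 * and offset must be strictly less than the file size. */
	if (fd < 0 || fallocate(fd, FALLOC_FL_INSERT_RANGE, 65536, 65536) < 0) {
		perror("insert range");
		return 1;
	}
	close(fd);
	return 0;
}
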
@@ -997,20 +1075,6 @@ xfs_file_mmap(
997} 1075}
998 1076
999/* 1077/*
1000 * mmap()d file has taken write protection fault and is being made
1001 * writable. We can set the page state up correctly for a writable
1002 * page, which means we can do correct delalloc accounting (ENOSPC
1003 * checking!) and unwritten extent mapping.
1004 */
1005STATIC int
1006xfs_vm_page_mkwrite(
1007 struct vm_area_struct *vma,
1008 struct vm_fault *vmf)
1009{
1010 return block_page_mkwrite(vma, vmf, xfs_get_blocks);
1011}
1012
1013/*
1014 * This type is designed to indicate the type of offset we would like 1078 * This type is designed to indicate the type of offset we would like
1015 * to search from page cache for xfs_seek_hole_data(). 1079 * to search from page cache for xfs_seek_hole_data().
1016 */ 1080 */
@@ -1385,6 +1449,55 @@ xfs_file_llseek(
1385 } 1449 }
1386} 1450}
1387 1451
1452/*
1453 * Locking for serialisation of IO during page faults. This results in a lock
1454 * ordering of:
1455 *
1456 * mmap_sem (MM)
1457 * i_mmap_lock (XFS - truncate serialisation)
1458 * page_lock (MM)
1459 * i_lock (XFS - extent map serialisation)
1460 */
1461STATIC int
1462xfs_filemap_fault(
1463 struct vm_area_struct *vma,
1464 struct vm_fault *vmf)
1465{
1466 struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
1467 int error;
1468
1469 trace_xfs_filemap_fault(ip);
1470
1471 xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1472 error = filemap_fault(vma, vmf);
1473 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1474
1475 return error;
1476}
1477
1478/*
1479 * mmap()d file has taken write protection fault and is being made writable. We
1480 * can set the page state up correctly for a writable page, which means we can
1481 * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
1482 * mapping.
1483 */
1484STATIC int
1485xfs_filemap_page_mkwrite(
1486 struct vm_area_struct *vma,
1487 struct vm_fault *vmf)
1488{
1489 struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
1490 int error;
1491
1492 trace_xfs_filemap_page_mkwrite(ip);
1493
1494 xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1495 error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
1496 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1497
1498 return error;
1499}
1500
1388const struct file_operations xfs_file_operations = { 1501const struct file_operations xfs_file_operations = {
1389 .llseek = xfs_file_llseek, 1502 .llseek = xfs_file_llseek,
1390 .read_iter = xfs_file_read_iter, 1503 .read_iter = xfs_file_read_iter,
@@ -1415,7 +1528,7 @@ const struct file_operations xfs_dir_file_operations = {
1415}; 1528};
1416 1529
1417static const struct vm_operations_struct xfs_file_vm_ops = { 1530static const struct vm_operations_struct xfs_file_vm_ops = {
1418 .fault = filemap_fault, 1531 .fault = xfs_filemap_fault,
1419 .map_pages = filemap_map_pages, 1532 .map_pages = filemap_map_pages,
1420 .page_mkwrite = xfs_vm_page_mkwrite, 1533 .page_mkwrite = xfs_filemap_page_mkwrite,
1421}; 1534};
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index a2e86e8a0fea..8f9f854376c6 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -322,7 +322,7 @@ xfs_filestream_lookup_ag(
322 322
323 pip = xfs_filestream_get_parent(ip); 323 pip = xfs_filestream_get_parent(ip);
324 if (!pip) 324 if (!pip)
325 goto out; 325 return NULLAGNUMBER;
326 326
327 mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); 327 mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
328 if (mru) { 328 if (mru) {
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 74efe5b760dc..cb7e8a29dfb6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -637,12 +637,13 @@ xfs_fs_counts(
637 xfs_mount_t *mp, 637 xfs_mount_t *mp,
638 xfs_fsop_counts_t *cnt) 638 xfs_fsop_counts_t *cnt)
639{ 639{
640 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); 640 cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
641 cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
642 cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
643 XFS_ALLOC_SET_ASIDE(mp);
644
641 spin_lock(&mp->m_sb_lock); 645 spin_lock(&mp->m_sb_lock);
642 cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
643 cnt->freertx = mp->m_sb.sb_frextents; 646 cnt->freertx = mp->m_sb.sb_frextents;
644 cnt->freeino = mp->m_sb.sb_ifree;
645 cnt->allocino = mp->m_sb.sb_icount;
646 spin_unlock(&mp->m_sb_lock); 647 spin_unlock(&mp->m_sb_lock);
647 return 0; 648 return 0;
648} 649}
@@ -692,14 +693,9 @@ xfs_reserve_blocks(
692 * what to do. This means that the amount of free space can 693 * what to do. This means that the amount of free space can
693 * change while we do this, so we need to retry if we end up 694 * change while we do this, so we need to retry if we end up
694 * trying to reserve more space than is available. 695 * trying to reserve more space than is available.
695 *
696 * We also use the xfs_mod_incore_sb() interface so that we
697 * don't have to care about whether per cpu counter are
698 * enabled, disabled or even compiled in....
699 */ 696 */
700retry: 697retry:
701 spin_lock(&mp->m_sb_lock); 698 spin_lock(&mp->m_sb_lock);
702 xfs_icsb_sync_counters_locked(mp, 0);
703 699
704 /* 700 /*
705 * If our previous reservation was larger than the current value, 701 * If our previous reservation was larger than the current value,
@@ -716,7 +712,8 @@ retry:
716 } else { 712 } else {
717 __int64_t free; 713 __int64_t free;
718 714
719 free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); 715 free = percpu_counter_sum(&mp->m_fdblocks) -
716 XFS_ALLOC_SET_ASIDE(mp);
720 if (!free) 717 if (!free)
721 goto out; /* ENOSPC and fdblks_delta = 0 */ 718 goto out; /* ENOSPC and fdblks_delta = 0 */
722 719
@@ -755,8 +752,7 @@ out:
755 * the extra reserve blocks from the reserve..... 752 * the extra reserve blocks from the reserve.....
756 */ 753 */
757 int error; 754 int error;
758 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 755 error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
759 fdblks_delta, 0);
760 if (error == -ENOSPC) 756 if (error == -ENOSPC)
761 goto retry; 757 goto retry;
762 } 758 }
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 9771b7ef62ed..76a9f2783282 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -439,11 +439,11 @@ again:
439 *ipp = ip; 439 *ipp = ip;
440 440
441 /* 441 /*
 442 * If we have a real type for an on-disk inode, we can set ops(&unlock) 442 * If we have a real type for an on-disk inode, we can set up the inode
443 * now. If it's a new inode being created, xfs_ialloc will handle it. 443 * now. If it's a new inode being created, xfs_ialloc will handle it.
444 */ 444 */
445 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) 445 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
446 xfs_setup_inode(ip); 446 xfs_setup_existing_inode(ip);
447 return 0; 447 return 0;
448 448
449out_error_or_again: 449out_error_or_again:
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6163767aa856..d6ebc85192b7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -117,24 +117,34 @@ xfs_ilock_attr_map_shared(
117} 117}
118 118
119/* 119/*
120 * The xfs inode contains 2 locks: a multi-reader lock called the 120 * The xfs inode contains 3 multi-reader locks: the i_iolock, the i_mmap_lock and
121 * i_iolock and a multi-reader lock called the i_lock. This routine 121 * the i_lock. This routine allows various combinations of the locks to be
122 * allows either or both of the locks to be obtained. 122 * obtained.
123 * 123 *
124 * The 2 locks should always be ordered so that the IO lock is 124 * The 3 locks should always be ordered so that the IO lock is obtained first,
125 * obtained first in order to prevent deadlock. 125 * the mmap lock second and the ilock last in order to prevent deadlock.
126 * 126 *
127 * ip -- the inode being locked 127 * Basic locking order:
128 * lock_flags -- this parameter indicates the inode's locks 128 *
129 * to be locked. It can be: 129 * i_iolock -> i_mmap_lock -> page_lock -> i_lock
130 * XFS_IOLOCK_SHARED, 130 *
131 * XFS_IOLOCK_EXCL, 131 * mmap_sem locking order:
132 * XFS_ILOCK_SHARED, 132 *
133 * XFS_ILOCK_EXCL, 133 * i_iolock -> page lock -> mmap_sem
134 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, 134 * mmap_sem -> i_mmap_lock -> page_lock
135 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, 135 *
136 XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, 136 * The difference in mmap_sem locking order means that we cannot hold the
137 XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL 137 * i_mmap_lock over syscall-based read(2)/write(2) IO. These IO paths can
138 * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
139 * in get_user_pages() to map the user pages into the kernel address space for
140 * direct IO. Similarly the i_iolock cannot be taken inside a page fault because
141 * page faults already hold the mmap_sem.
142 *
143 * Hence to serialise fully against both syscall and mmap based IO, we need to
144 * take both the i_iolock and the i_mmap_lock. Both locks should *only* be
145 * taken in places where we need to invalidate the page cache in a race
146 * free manner (e.g. truncate, hole punch and other extent manipulation
147 * functions).
138 */ 148 */
139void 149void
140xfs_ilock( 150xfs_ilock(
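
The reworked comment above is the heart of this series: three rwsem-style locks with a fixed order, where syscall IO enters at the iolock level and page faults enter at the mmaplock level. A compilable userspace sketch of those rules, using POSIX rwlocks as stand-ins for mrlock_t (all names here are illustrative):

#include <pthread.h>

struct ilocks {
	pthread_rwlock_t iolock;	/* serialises syscall IO */
	pthread_rwlock_t mmaplock;	/* serialises page faults */
	pthread_rwlock_t ilock;		/* protects inode metadata */
};

/* Extent manipulation (truncate, hole punch): exclude both syscall IO and
 * page faults, so take iolock then mmaplock, and only then the ilock for
 * the metadata change itself. */
static void extent_manipulation(struct ilocks *l)
{
	pthread_rwlock_wrlock(&l->iolock);
	pthread_rwlock_wrlock(&l->mmaplock);
	/* ... wait for dio, invalidate the page cache ... */
	pthread_rwlock_wrlock(&l->ilock);
	/* ... modify the extent map ... */
	pthread_rwlock_unlock(&l->ilock);
	pthread_rwlock_unlock(&l->mmaplock);
	pthread_rwlock_unlock(&l->iolock);
}

/* Page fault: the caller already holds mmap_sem, so only the mmaplock may
 * be taken here; touching the iolock would invert the documented order. */
static void page_fault(struct ilocks *l)
{
	pthread_rwlock_rdlock(&l->mmaplock);
	/* ... fault the page in ... */
	pthread_rwlock_unlock(&l->mmaplock);
}

int main(void)
{
	struct ilocks l = {
		PTHREAD_RWLOCK_INITIALIZER,
		PTHREAD_RWLOCK_INITIALIZER,
		PTHREAD_RWLOCK_INITIALIZER,
	};

	extent_manipulation(&l);
	page_fault(&l);
	return 0;
}
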
@@ -150,6 +160,8 @@ xfs_ilock(
150 */ 160 */
151 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != 161 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
152 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); 162 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
163 ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
164 (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
153 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != 165 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
154 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); 166 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
155 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); 167 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -159,6 +171,11 @@ xfs_ilock(
159 else if (lock_flags & XFS_IOLOCK_SHARED) 171 else if (lock_flags & XFS_IOLOCK_SHARED)
160 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); 172 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
161 173
174 if (lock_flags & XFS_MMAPLOCK_EXCL)
175 mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
176 else if (lock_flags & XFS_MMAPLOCK_SHARED)
177 mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
178
162 if (lock_flags & XFS_ILOCK_EXCL) 179 if (lock_flags & XFS_ILOCK_EXCL)
163 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); 180 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
164 else if (lock_flags & XFS_ILOCK_SHARED) 181 else if (lock_flags & XFS_ILOCK_SHARED)
@@ -191,6 +208,8 @@ xfs_ilock_nowait(
191 */ 208 */
192 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != 209 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
193 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); 210 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
211 ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
212 (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
194 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != 213 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
195 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); 214 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
196 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); 215 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -202,21 +221,35 @@ xfs_ilock_nowait(
202 if (!mrtryaccess(&ip->i_iolock)) 221 if (!mrtryaccess(&ip->i_iolock))
203 goto out; 222 goto out;
204 } 223 }
224
225 if (lock_flags & XFS_MMAPLOCK_EXCL) {
226 if (!mrtryupdate(&ip->i_mmaplock))
227 goto out_undo_iolock;
228 } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
229 if (!mrtryaccess(&ip->i_mmaplock))
230 goto out_undo_iolock;
231 }
232
205 if (lock_flags & XFS_ILOCK_EXCL) { 233 if (lock_flags & XFS_ILOCK_EXCL) {
206 if (!mrtryupdate(&ip->i_lock)) 234 if (!mrtryupdate(&ip->i_lock))
207 goto out_undo_iolock; 235 goto out_undo_mmaplock;
208 } else if (lock_flags & XFS_ILOCK_SHARED) { 236 } else if (lock_flags & XFS_ILOCK_SHARED) {
209 if (!mrtryaccess(&ip->i_lock)) 237 if (!mrtryaccess(&ip->i_lock))
210 goto out_undo_iolock; 238 goto out_undo_mmaplock;
211 } 239 }
212 return 1; 240 return 1;
213 241
214 out_undo_iolock: 242out_undo_mmaplock:
243 if (lock_flags & XFS_MMAPLOCK_EXCL)
244 mrunlock_excl(&ip->i_mmaplock);
245 else if (lock_flags & XFS_MMAPLOCK_SHARED)
246 mrunlock_shared(&ip->i_mmaplock);
247out_undo_iolock:
215 if (lock_flags & XFS_IOLOCK_EXCL) 248 if (lock_flags & XFS_IOLOCK_EXCL)
216 mrunlock_excl(&ip->i_iolock); 249 mrunlock_excl(&ip->i_iolock);
217 else if (lock_flags & XFS_IOLOCK_SHARED) 250 else if (lock_flags & XFS_IOLOCK_SHARED)
218 mrunlock_shared(&ip->i_iolock); 251 mrunlock_shared(&ip->i_iolock);
219 out: 252out:
220 return 0; 253 return 0;
221} 254}
222 255
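
xfs_ilock_nowait() above acquires in lock order and unwinds in reverse when any trylock fails, so the caller ends up holding either all of the requested locks or none of them. The same shape in userspace (illustrative names):

#include <pthread.h>
#include <stdbool.h>

static bool ilock_nowait(pthread_rwlock_t *iolock,
			 pthread_rwlock_t *mmaplock,
			 pthread_rwlock_t *ilock)
{
	if (pthread_rwlock_trywrlock(iolock) != 0)
		return false;
	if (pthread_rwlock_trywrlock(mmaplock) != 0)
		goto out_undo_iolock;
	if (pthread_rwlock_trywrlock(ilock) != 0)
		goto out_undo_mmaplock;
	return true;			/* all three held */

out_undo_mmaplock:
	pthread_rwlock_unlock(mmaplock);
out_undo_iolock:
	pthread_rwlock_unlock(iolock);
	return false;			/* nothing held */
}

int main(void)
{
	pthread_rwlock_t io = PTHREAD_RWLOCK_INITIALIZER;
	pthread_rwlock_t mm = PTHREAD_RWLOCK_INITIALIZER;
	pthread_rwlock_t i = PTHREAD_RWLOCK_INITIALIZER;

	return ilock_nowait(&io, &mm, &i) ? 0 : 1;
}
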
@@ -244,6 +277,8 @@ xfs_iunlock(
244 */ 277 */
245 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != 278 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
246 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); 279 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
280 ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
281 (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
247 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != 282 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
248 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); 283 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
249 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); 284 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -254,6 +289,11 @@ xfs_iunlock(
254 else if (lock_flags & XFS_IOLOCK_SHARED) 289 else if (lock_flags & XFS_IOLOCK_SHARED)
255 mrunlock_shared(&ip->i_iolock); 290 mrunlock_shared(&ip->i_iolock);
256 291
292 if (lock_flags & XFS_MMAPLOCK_EXCL)
293 mrunlock_excl(&ip->i_mmaplock);
294 else if (lock_flags & XFS_MMAPLOCK_SHARED)
295 mrunlock_shared(&ip->i_mmaplock);
296
257 if (lock_flags & XFS_ILOCK_EXCL) 297 if (lock_flags & XFS_ILOCK_EXCL)
258 mrunlock_excl(&ip->i_lock); 298 mrunlock_excl(&ip->i_lock);
259 else if (lock_flags & XFS_ILOCK_SHARED) 299 else if (lock_flags & XFS_ILOCK_SHARED)
@@ -271,11 +311,14 @@ xfs_ilock_demote(
271 xfs_inode_t *ip, 311 xfs_inode_t *ip,
272 uint lock_flags) 312 uint lock_flags)
273{ 313{
274 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); 314 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
275 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); 315 ASSERT((lock_flags &
316 ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
276 317
277 if (lock_flags & XFS_ILOCK_EXCL) 318 if (lock_flags & XFS_ILOCK_EXCL)
278 mrdemote(&ip->i_lock); 319 mrdemote(&ip->i_lock);
320 if (lock_flags & XFS_MMAPLOCK_EXCL)
321 mrdemote(&ip->i_mmaplock);
279 if (lock_flags & XFS_IOLOCK_EXCL) 322 if (lock_flags & XFS_IOLOCK_EXCL)
280 mrdemote(&ip->i_iolock); 323 mrdemote(&ip->i_iolock);
281 324
@@ -294,6 +337,12 @@ xfs_isilocked(
294 return rwsem_is_locked(&ip->i_lock.mr_lock); 337 return rwsem_is_locked(&ip->i_lock.mr_lock);
295 } 338 }
296 339
340 if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
341 if (!(lock_flags & XFS_MMAPLOCK_SHARED))
342 return !!ip->i_mmaplock.mr_writer;
343 return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
344 }
345
297 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { 346 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
298 if (!(lock_flags & XFS_IOLOCK_SHARED)) 347 if (!(lock_flags & XFS_IOLOCK_SHARED))
299 return !!ip->i_iolock.mr_writer; 348 return !!ip->i_iolock.mr_writer;
@@ -314,14 +363,27 @@ int xfs_lock_delays;
314#endif 363#endif
315 364
316/* 365/*
317 * Bump the subclass so xfs_lock_inodes() acquires each lock with 366 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
318 * a different value 367 * value. This shouldn't be called for page fault locking, but we also need to
368 * ensure we don't overrun the number of lockdep subclasses for the iolock or
369 * mmaplock as that is limited to 12 by the mmap lock lockdep annotations.
319 */ 370 */
320static inline int 371static inline int
321xfs_lock_inumorder(int lock_mode, int subclass) 372xfs_lock_inumorder(int lock_mode, int subclass)
322{ 373{
323 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) 374 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
375 ASSERT(subclass + XFS_LOCK_INUMORDER <
376 (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT)));
324 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; 377 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
378 }
379
380 if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
381 ASSERT(subclass + XFS_LOCK_INUMORDER <
382 (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT)));
383 lock_mode |= (subclass + XFS_LOCK_INUMORDER) <<
384 XFS_MMAPLOCK_SHIFT;
385 }
386
325 if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) 387 if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
326 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; 388 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
327 389
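
The new assertions above guard the bit-packing: each lock class owns a small field in the flag word starting at its shift, so a subclass is only valid if it fits the gap before the next class's shift. A self-contained sketch of the encode/decode arithmetic (values mirror the xfs_inode.h changes later in this patch; the base XFS_LOCK_INUMORDER offset is omitted for brevity):

#include <assert.h>
#include <stdio.h>

#define IOLOCK_SHIFT		16
#define MMAPLOCK_SHIFT		20
#define ILOCK_SHIFT		24

#define IOLOCK_DEP_MASK		0x000f0000u
#define MMAPLOCK_DEP_MASK	0x00f00000u

static unsigned int lock_inumorder(unsigned int lock_mode,
				   unsigned int subclass)
{
	/* iolock subclass must fit in bits 16-19, i.e. 0..15 */
	assert(subclass < (1u << (MMAPLOCK_SHIFT - IOLOCK_SHIFT)));
	lock_mode |= subclass << IOLOCK_SHIFT;

	/* mmaplock subclass must fit in bits 20-23 */
	assert(subclass < (1u << (ILOCK_SHIFT - MMAPLOCK_SHIFT)));
	lock_mode |= subclass << MMAPLOCK_SHIFT;

	return lock_mode;
}

int main(void)
{
	unsigned int m = lock_inumorder(0, 3);

	printf("iolock dep %u, mmaplock dep %u\n",
	       (m & IOLOCK_DEP_MASK) >> IOLOCK_SHIFT,
	       (m & MMAPLOCK_DEP_MASK) >> MMAPLOCK_SHIFT);	/* 3, 3 */
	return 0;
}
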
@@ -329,15 +391,14 @@ xfs_lock_inumorder(int lock_mode, int subclass)
329} 391}
330 392
331/* 393/*
332 * The following routine will lock n inodes in exclusive mode. 394 * The following routine will lock n inodes in exclusive mode. We assume the
333 * We assume the caller calls us with the inodes in i_ino order. 395 * caller calls us with the inodes in i_ino order.
334 * 396 *
335 * We need to detect deadlock where an inode that we lock 397 * We need to detect deadlock where an inode that we lock is in the AIL and we
336 * is in the AIL and we start waiting for another inode that is locked 398 * start waiting for another inode that is locked by a thread in a long running
337 * by a thread in a long running transaction (such as truncate). This can 399 * transaction (such as truncate). This can result in deadlock since the long
338 * result in deadlock since the long running trans might need to wait 400 * running trans might need to wait for the inode we just locked in order to
339 * for the inode we just locked in order to push the tail and free space 401 * push the tail and free space in the log.
340 * in the log.
341 */ 402 */
342void 403void
343xfs_lock_inodes( 404xfs_lock_inodes(
@@ -348,30 +409,27 @@ xfs_lock_inodes(
348 int attempts = 0, i, j, try_lock; 409 int attempts = 0, i, j, try_lock;
349 xfs_log_item_t *lp; 410 xfs_log_item_t *lp;
350 411
351 ASSERT(ips && (inodes >= 2)); /* we need at least two */ 412 /* currently supports between 2 and 5 inodes */
413 ASSERT(ips && inodes >= 2 && inodes <= 5);
352 414
353 try_lock = 0; 415 try_lock = 0;
354 i = 0; 416 i = 0;
355
356again: 417again:
357 for (; i < inodes; i++) { 418 for (; i < inodes; i++) {
358 ASSERT(ips[i]); 419 ASSERT(ips[i]);
359 420
360 if (i && (ips[i] == ips[i-1])) /* Already locked */ 421 if (i && (ips[i] == ips[i - 1])) /* Already locked */
361 continue; 422 continue;
362 423
363 /* 424 /*
364 * If try_lock is not set yet, make sure all locked inodes 425 * If try_lock is not set yet, make sure all locked inodes are
365 * are not in the AIL. 426 * not in the AIL. If any are, set try_lock to be used later.
366 * If any are, set try_lock to be used later.
367 */ 427 */
368
369 if (!try_lock) { 428 if (!try_lock) {
370 for (j = (i - 1); j >= 0 && !try_lock; j--) { 429 for (j = (i - 1); j >= 0 && !try_lock; j--) {
371 lp = (xfs_log_item_t *)ips[j]->i_itemp; 430 lp = (xfs_log_item_t *)ips[j]->i_itemp;
372 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { 431 if (lp && (lp->li_flags & XFS_LI_IN_AIL))
373 try_lock++; 432 try_lock++;
374 }
375 } 433 }
376 } 434 }
377 435
@@ -381,51 +439,42 @@ again:
381 * we can't get any, we must release all we have 439 * we can't get any, we must release all we have
382 * and try again. 440 * and try again.
383 */ 441 */
442 if (!try_lock) {
443 xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
444 continue;
445 }
446
447 /* try_lock means we have an inode locked that is in the AIL. */
448 ASSERT(i != 0);
449 if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
450 continue;
384 451
385 if (try_lock) { 452 /*
386 /* try_lock must be 0 if i is 0. */ 453 * Unlock all previous guys and try again. xfs_iunlock will try
454 * to push the tail if the inode is in the AIL.
455 */
456 attempts++;
457 for (j = i - 1; j >= 0; j--) {
387 /* 458 /*
388 * try_lock means we have an inode locked 459 * Check to see if we've already unlocked this one. Not
389 * that is in the AIL. 460 * the first one going back, and the inode ptr is the
461 * same.
390 */ 462 */
391 ASSERT(i != 0); 463 if (j != (i - 1) && ips[j] == ips[j + 1])
392 if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { 464 continue;
393 attempts++; 465
394 466 xfs_iunlock(ips[j], lock_mode);
395 /* 467 }
396 * Unlock all previous guys and try again.
397 * xfs_iunlock will try to push the tail
398 * if the inode is in the AIL.
399 */
400
401 for(j = i - 1; j >= 0; j--) {
402
403 /*
404 * Check to see if we've already
405 * unlocked this one.
406 * Not the first one going back,
407 * and the inode ptr is the same.
408 */
409 if ((j != (i - 1)) && ips[j] ==
410 ips[j+1])
411 continue;
412
413 xfs_iunlock(ips[j], lock_mode);
414 }
415 468
416 if ((attempts % 5) == 0) { 469 if ((attempts % 5) == 0) {
417 delay(1); /* Don't just spin the CPU */ 470 delay(1); /* Don't just spin the CPU */
418#ifdef DEBUG 471#ifdef DEBUG
419 xfs_lock_delays++; 472 xfs_lock_delays++;
420#endif 473#endif
421 }
422 i = 0;
423 try_lock = 0;
424 goto again;
425 }
426 } else {
427 xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
428 } 474 }
475 i = 0;
476 try_lock = 0;
477 goto again;
429 } 478 }
430 479
431#ifdef DEBUG 480#ifdef DEBUG
@@ -440,10 +489,10 @@ again:
440} 489}
441 490
442/* 491/*
443 * xfs_lock_two_inodes() can only be used to lock one type of lock 492 * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
444 * at a time - the iolock or the ilock, but not both at once. If 493 * the iolock, the mmaplock or the ilock, but never more than one class at once.
445 * we lock both at once, lockdep will report false positives saying 494 * If we lock two classes together, lockdep will report false positives saying we
446 * we have violated locking orders. 495 * have violated locking orders.
447 */ 496 */
448void 497void
449xfs_lock_two_inodes( 498xfs_lock_two_inodes(
@@ -455,8 +504,12 @@ xfs_lock_two_inodes(
455 int attempts = 0; 504 int attempts = 0;
456 xfs_log_item_t *lp; 505 xfs_log_item_t *lp;
457 506
458 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) 507 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
459 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); 508 ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
509 ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
510 } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
511 ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
512
460 ASSERT(ip0->i_ino != ip1->i_ino); 513 ASSERT(ip0->i_ino != ip1->i_ino);
461 514
462 if (ip0->i_ino > ip1->i_ino) { 515 if (ip0->i_ino > ip1->i_ino) {
@@ -818,7 +871,7 @@ xfs_ialloc(
818 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 871 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
819 xfs_trans_log_inode(tp, ip, flags); 872 xfs_trans_log_inode(tp, ip, flags);
820 873
821 /* now that we have an i_mode we can setup inode ops and unlock */ 874 /* now that we have an i_mode we can setup the inode structure */
822 xfs_setup_inode(ip); 875 xfs_setup_inode(ip);
823 876
824 *ipp = ip; 877 *ipp = ip;
@@ -1235,12 +1288,14 @@ xfs_create(
1235 xfs_trans_cancel(tp, cancel_flags); 1288 xfs_trans_cancel(tp, cancel_flags);
1236 out_release_inode: 1289 out_release_inode:
1237 /* 1290 /*
1238 * Wait until after the current transaction is aborted to 1291 * Wait until after the current transaction is aborted to finish the
1239 * release the inode. This prevents recursive transactions 1292 * setup of the inode and release the inode. This prevents recursive
1240 * and deadlocks from xfs_inactive. 1293 * transactions and deadlocks from xfs_inactive.
1241 */ 1294 */
1242 if (ip) 1295 if (ip) {
1296 xfs_finish_inode_setup(ip);
1243 IRELE(ip); 1297 IRELE(ip);
1298 }
1244 1299
1245 xfs_qm_dqrele(udqp); 1300 xfs_qm_dqrele(udqp);
1246 xfs_qm_dqrele(gdqp); 1301 xfs_qm_dqrele(gdqp);
@@ -1345,12 +1400,14 @@ xfs_create_tmpfile(
1345 xfs_trans_cancel(tp, cancel_flags); 1400 xfs_trans_cancel(tp, cancel_flags);
1346 out_release_inode: 1401 out_release_inode:
1347 /* 1402 /*
1348 * Wait until after the current transaction is aborted to 1403 * Wait until after the current transaction is aborted to finish the
1349 * release the inode. This prevents recursive transactions 1404 * setup of the inode and release the inode. This prevents recursive
1350 * and deadlocks from xfs_inactive. 1405 * transactions and deadlocks from xfs_inactive.
1351 */ 1406 */
1352 if (ip) 1407 if (ip) {
1408 xfs_finish_inode_setup(ip);
1353 IRELE(ip); 1409 IRELE(ip);
1410 }
1354 1411
1355 xfs_qm_dqrele(udqp); 1412 xfs_qm_dqrele(udqp);
1356 xfs_qm_dqrele(gdqp); 1413 xfs_qm_dqrele(gdqp);
@@ -2611,19 +2668,22 @@ xfs_remove(
2611/* 2668/*
2612 * Enter all inodes for a rename transaction into a sorted array. 2669 * Enter all inodes for a rename transaction into a sorted array.
2613 */ 2670 */
2671#define __XFS_SORT_INODES 5
2614STATIC void 2672STATIC void
2615xfs_sort_for_rename( 2673xfs_sort_for_rename(
2616 xfs_inode_t *dp1, /* in: old (source) directory inode */ 2674 struct xfs_inode *dp1, /* in: old (source) directory inode */
2617 xfs_inode_t *dp2, /* in: new (target) directory inode */ 2675 struct xfs_inode *dp2, /* in: new (target) directory inode */
2618 xfs_inode_t *ip1, /* in: inode of old entry */ 2676 struct xfs_inode *ip1, /* in: inode of old entry */
2619 xfs_inode_t *ip2, /* in: inode of new entry, if it 2677 struct xfs_inode *ip2, /* in: inode of new entry */
2620 already exists, NULL otherwise. */ 2678 struct xfs_inode *wip, /* in: whiteout inode */
2621 xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ 2679 struct xfs_inode **i_tab,/* out: sorted array of inodes */
2622 int *num_inodes) /* out: number of inodes in array */ 2680 int *num_inodes) /* in/out: inodes in array */
2623{ 2681{
2624 xfs_inode_t *temp;
2625 int i, j; 2682 int i, j;
2626 2683
2684 ASSERT(*num_inodes == __XFS_SORT_INODES);
2685 memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
2686
2627 /* 2687 /*
2628 * i_tab contains a list of pointers to inodes. We initialize 2688 * i_tab contains a list of pointers to inodes. We initialize
2629 * the table here & we'll sort it. We will then use it to 2689 * the table here & we'll sort it. We will then use it to
@@ -2631,25 +2691,24 @@ xfs_sort_for_rename(
2631 * 2691 *
2632 * Note that the table may contain duplicates. e.g., dp1 == dp2. 2692 * Note that the table may contain duplicates. e.g., dp1 == dp2.
2633 */ 2693 */
2634 i_tab[0] = dp1; 2694 i = 0;
2635 i_tab[1] = dp2; 2695 i_tab[i++] = dp1;
2636 i_tab[2] = ip1; 2696 i_tab[i++] = dp2;
2637 if (ip2) { 2697 i_tab[i++] = ip1;
2638 *num_inodes = 4; 2698 if (ip2)
2639 i_tab[3] = ip2; 2699 i_tab[i++] = ip2;
2640 } else { 2700 if (wip)
2641 *num_inodes = 3; 2701 i_tab[i++] = wip;
2642 i_tab[3] = NULL; 2702 *num_inodes = i;
2643 }
2644 2703
2645 /* 2704 /*
2646 * Sort the elements via bubble sort. (Remember, there are at 2705 * Sort the elements via bubble sort. (Remember, there are at
2647 * most 4 elements to sort, so this is adequate.) 2706 * most 5 elements to sort, so this is adequate.)
2648 */ 2707 */
2649 for (i = 0; i < *num_inodes; i++) { 2708 for (i = 0; i < *num_inodes; i++) {
2650 for (j = 1; j < *num_inodes; j++) { 2709 for (j = 1; j < *num_inodes; j++) {
2651 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { 2710 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
2652 temp = i_tab[j]; 2711 struct xfs_inode *temp = i_tab[j];
2653 i_tab[j] = i_tab[j-1]; 2712 i_tab[j] = i_tab[j-1];
2654 i_tab[j-1] = temp; 2713 i_tab[j-1] = temp;
2655 } 2714 }
@@ -2657,6 +2716,31 @@ xfs_sort_for_rename(
2657 } 2716 }
2658} 2717}
2659 2718
2719static int
2720xfs_finish_rename(
2721 struct xfs_trans *tp,
2722 struct xfs_bmap_free *free_list)
2723{
2724 int committed = 0;
2725 int error;
2726
2727 /*
2728 * If this is a synchronous mount, make sure that the rename transaction
2729 * goes to disk before returning to the user.
2730 */
2731 if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2732 xfs_trans_set_sync(tp);
2733
2734 error = xfs_bmap_finish(&tp, free_list, &committed);
2735 if (error) {
2736 xfs_bmap_cancel(free_list);
2737 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
2738 return error;
2739 }
2740
2741 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2742}
2743
2660/* 2744/*
2661 * xfs_cross_rename() 2745 * xfs_cross_rename()
2662 * 2746 *
@@ -2685,14 +2769,14 @@ xfs_cross_rename(
2685 ip2->i_ino, 2769 ip2->i_ino,
2686 first_block, free_list, spaceres); 2770 first_block, free_list, spaceres);
2687 if (error) 2771 if (error)
2688 goto out; 2772 goto out_trans_abort;
2689 2773
2690 /* Swap inode number for dirent in second parent */ 2774 /* Swap inode number for dirent in second parent */
2691 error = xfs_dir_replace(tp, dp2, name2, 2775 error = xfs_dir_replace(tp, dp2, name2,
2692 ip1->i_ino, 2776 ip1->i_ino,
2693 first_block, free_list, spaceres); 2777 first_block, free_list, spaceres);
2694 if (error) 2778 if (error)
2695 goto out; 2779 goto out_trans_abort;
2696 2780
2697 /* 2781 /*
2698 * If we're renaming one or more directories across different parents, 2782 * If we're renaming one or more directories across different parents,
@@ -2707,16 +2791,16 @@ xfs_cross_rename(
2707 dp1->i_ino, first_block, 2791 dp1->i_ino, first_block,
2708 free_list, spaceres); 2792 free_list, spaceres);
2709 if (error) 2793 if (error)
2710 goto out; 2794 goto out_trans_abort;
2711 2795
2712 /* transfer ip2 ".." reference to dp1 */ 2796 /* transfer ip2 ".." reference to dp1 */
2713 if (!S_ISDIR(ip1->i_d.di_mode)) { 2797 if (!S_ISDIR(ip1->i_d.di_mode)) {
2714 error = xfs_droplink(tp, dp2); 2798 error = xfs_droplink(tp, dp2);
2715 if (error) 2799 if (error)
2716 goto out; 2800 goto out_trans_abort;
2717 error = xfs_bumplink(tp, dp1); 2801 error = xfs_bumplink(tp, dp1);
2718 if (error) 2802 if (error)
2719 goto out; 2803 goto out_trans_abort;
2720 } 2804 }
2721 2805
2722 /* 2806 /*
@@ -2734,16 +2818,16 @@ xfs_cross_rename(
2734 dp2->i_ino, first_block, 2818 dp2->i_ino, first_block,
2735 free_list, spaceres); 2819 free_list, spaceres);
2736 if (error) 2820 if (error)
2737 goto out; 2821 goto out_trans_abort;
2738 2822
2739 /* transfer ip1 ".." reference to dp2 */ 2823 /* transfer ip1 ".." reference to dp2 */
2740 if (!S_ISDIR(ip2->i_d.di_mode)) { 2824 if (!S_ISDIR(ip2->i_d.di_mode)) {
2741 error = xfs_droplink(tp, dp1); 2825 error = xfs_droplink(tp, dp1);
2742 if (error) 2826 if (error)
2743 goto out; 2827 goto out_trans_abort;
2744 error = xfs_bumplink(tp, dp2); 2828 error = xfs_bumplink(tp, dp2);
2745 if (error) 2829 if (error)
2746 goto out; 2830 goto out_trans_abort;
2747 } 2831 }
2748 2832
2749 /* 2833 /*
@@ -2771,66 +2855,108 @@ xfs_cross_rename(
2771 } 2855 }
2772 xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2856 xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2773 xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); 2857 xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
2774out: 2858 return xfs_finish_rename(tp, free_list);
2859
2860out_trans_abort:
2861 xfs_bmap_cancel(free_list);
2862 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
2775 return error; 2863 return error;
2776} 2864}
2777 2865
2778/* 2866/*
2867 * xfs_rename_alloc_whiteout()
2868 *
2869 * Return a referenced, unlinked, unlocked inode that can be used as a
2870 * whiteout in a rename transaction. We use a tmpfile inode here so that if we
2871 * crash between allocating the inode and linking it into the rename transaction
2872 * recovery will free the inode and we won't leak it.
2873 */
2874static int
2875xfs_rename_alloc_whiteout(
2876 struct xfs_inode *dp,
2877 struct xfs_inode **wip)
2878{
2879 struct xfs_inode *tmpfile;
2880 int error;
2881
2882 error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile);
2883 if (error)
2884 return error;
2885
2886 /* Satisfy xfs_bumplink that this is a real tmpfile */
2887 xfs_finish_inode_setup(tmpfile);
2888 VFS_I(tmpfile)->i_state |= I_LINKABLE;
2889
2890 *wip = tmpfile;
2891 return 0;
2892}
2893
2894/*
2779 * xfs_rename 2895 * xfs_rename
2780 */ 2896 */
2781int 2897int
2782xfs_rename( 2898xfs_rename(
2783 xfs_inode_t *src_dp, 2899 struct xfs_inode *src_dp,
2784 struct xfs_name *src_name, 2900 struct xfs_name *src_name,
2785 xfs_inode_t *src_ip, 2901 struct xfs_inode *src_ip,
2786 xfs_inode_t *target_dp, 2902 struct xfs_inode *target_dp,
2787 struct xfs_name *target_name, 2903 struct xfs_name *target_name,
2788 xfs_inode_t *target_ip, 2904 struct xfs_inode *target_ip,
2789 unsigned int flags) 2905 unsigned int flags)
2790{ 2906{
2791 xfs_trans_t *tp = NULL; 2907 struct xfs_mount *mp = src_dp->i_mount;
2792 xfs_mount_t *mp = src_dp->i_mount; 2908 struct xfs_trans *tp;
2793 int new_parent; /* moving to a new dir */ 2909 struct xfs_bmap_free free_list;
2794 int src_is_directory; /* src_name is a directory */ 2910 xfs_fsblock_t first_block;
2795 int error; 2911 struct xfs_inode *wip = NULL; /* whiteout inode */
2796 xfs_bmap_free_t free_list; 2912 struct xfs_inode *inodes[__XFS_SORT_INODES];
2797 xfs_fsblock_t first_block; 2913 int num_inodes = __XFS_SORT_INODES;
2798 int cancel_flags; 2914 bool new_parent = (src_dp != target_dp);
2799 int committed; 2915 bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
2800 xfs_inode_t *inodes[4]; 2916 int cancel_flags = 0;
2801 int spaceres; 2917 int spaceres;
2802 int num_inodes; 2918 int error;
2803 2919
2804 trace_xfs_rename(src_dp, target_dp, src_name, target_name); 2920 trace_xfs_rename(src_dp, target_dp, src_name, target_name);
2805 2921
2806 new_parent = (src_dp != target_dp); 2922 if ((flags & RENAME_EXCHANGE) && !target_ip)
2807 src_is_directory = S_ISDIR(src_ip->i_d.di_mode); 2923 return -EINVAL;
2808 2924
2809 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, 2925 /*
2926 * If we are doing a whiteout operation, allocate the whiteout inode
2927 * we will be placing at the target and ensure the type is set
2928 * appropriately.
2929 */
2930 if (flags & RENAME_WHITEOUT) {
2931 ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
2932 error = xfs_rename_alloc_whiteout(target_dp, &wip);
2933 if (error)
2934 return error;
2935
2936 /* setup target dirent info as whiteout */
2937 src_name->type = XFS_DIR3_FT_CHRDEV;
2938 }
2939
2940 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
2810 inodes, &num_inodes); 2941 inodes, &num_inodes);
2811 2942
2812 xfs_bmap_init(&free_list, &first_block);
2813 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); 2943 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
2814 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2815 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); 2944 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2816 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); 2945 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
2817 if (error == -ENOSPC) { 2946 if (error == -ENOSPC) {
2818 spaceres = 0; 2947 spaceres = 0;
2819 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); 2948 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
2820 } 2949 }
2821 if (error) { 2950 if (error)
2822 xfs_trans_cancel(tp, 0); 2951 goto out_trans_cancel;
2823 goto std_return; 2952 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2824 }
2825 2953
2826 /* 2954 /*
2827 * Attach the dquots to the inodes 2955 * Attach the dquots to the inodes
2828 */ 2956 */
2829 error = xfs_qm_vop_rename_dqattach(inodes); 2957 error = xfs_qm_vop_rename_dqattach(inodes);
2830 if (error) { 2958 if (error)
2831 xfs_trans_cancel(tp, cancel_flags); 2959 goto out_trans_cancel;
2832 goto std_return;
2833 }
2834 2960
2835 /* 2961 /*
2836 * Lock all the participating inodes. Depending upon whether 2962 * Lock all the participating inodes. Depending upon whether
@@ -2851,6 +2977,8 @@ xfs_rename(
2851 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); 2977 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
2852 if (target_ip) 2978 if (target_ip)
2853 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); 2979 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
2980 if (wip)
2981 xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
2854 2982
2855 /* 2983 /*
2856 * If we are using project inheritance, we only allow renames 2984 * If we are using project inheritance, we only allow renames
@@ -2860,24 +2988,16 @@ xfs_rename(
2860 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 2988 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2861 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { 2989 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
2862 error = -EXDEV; 2990 error = -EXDEV;
2863 goto error_return; 2991 goto out_trans_cancel;
2864 } 2992 }
2865 2993
2866 /* 2994 xfs_bmap_init(&free_list, &first_block);
2867 * Handle RENAME_EXCHANGE flags 2995
2868 */ 2996 /* RENAME_EXCHANGE is unique from here on. */
2869 if (flags & RENAME_EXCHANGE) { 2997 if (flags & RENAME_EXCHANGE)
2870 if (target_ip == NULL) { 2998 return xfs_cross_rename(tp, src_dp, src_name, src_ip,
2871 error = -EINVAL; 2999 target_dp, target_name, target_ip,
2872 goto error_return; 3000 &free_list, &first_block, spaceres);
2873 }
2874 error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
2875 target_dp, target_name, target_ip,
2876 &free_list, &first_block, spaceres);
2877 if (error)
2878 goto abort_return;
2879 goto finish_rename;
2880 }
2881 3001
2882 /* 3002 /*
2883 * Set up the target. 3003 * Set up the target.
@@ -2890,7 +3010,7 @@ xfs_rename(
2890 if (!spaceres) { 3010 if (!spaceres) {
2891 error = xfs_dir_canenter(tp, target_dp, target_name); 3011 error = xfs_dir_canenter(tp, target_dp, target_name);
2892 if (error) 3012 if (error)
2893 goto error_return; 3013 goto out_trans_cancel;
2894 } 3014 }
2895 /* 3015 /*
2896 * If target does not exist and the rename crosses 3016 * If target does not exist and the rename crosses
@@ -2901,9 +3021,9 @@ xfs_rename(
2901 src_ip->i_ino, &first_block, 3021 src_ip->i_ino, &first_block,
2902 &free_list, spaceres); 3022 &free_list, spaceres);
2903 if (error == -ENOSPC) 3023 if (error == -ENOSPC)
2904 goto error_return; 3024 goto out_bmap_cancel;
2905 if (error) 3025 if (error)
2906 goto abort_return; 3026 goto out_trans_abort;
2907 3027
2908 xfs_trans_ichgtime(tp, target_dp, 3028 xfs_trans_ichgtime(tp, target_dp,
2909 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3029 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2911,7 +3031,7 @@ xfs_rename(
2911 if (new_parent && src_is_directory) { 3031 if (new_parent && src_is_directory) {
2912 error = xfs_bumplink(tp, target_dp); 3032 error = xfs_bumplink(tp, target_dp);
2913 if (error) 3033 if (error)
2914 goto abort_return; 3034 goto out_trans_abort;
2915 } 3035 }
2916 } else { /* target_ip != NULL */ 3036 } else { /* target_ip != NULL */
2917 /* 3037 /*
@@ -2926,7 +3046,7 @@ xfs_rename(
2926 if (!(xfs_dir_isempty(target_ip)) || 3046 if (!(xfs_dir_isempty(target_ip)) ||
2927 (target_ip->i_d.di_nlink > 2)) { 3047 (target_ip->i_d.di_nlink > 2)) {
2928 error = -EEXIST; 3048 error = -EEXIST;
2929 goto error_return; 3049 goto out_trans_cancel;
2930 } 3050 }
2931 } 3051 }
2932 3052
@@ -2943,7 +3063,7 @@ xfs_rename(
2943 src_ip->i_ino, 3063 src_ip->i_ino,
2944 &first_block, &free_list, spaceres); 3064 &first_block, &free_list, spaceres);
2945 if (error) 3065 if (error)
2946 goto abort_return; 3066 goto out_trans_abort;
2947 3067
2948 xfs_trans_ichgtime(tp, target_dp, 3068 xfs_trans_ichgtime(tp, target_dp,
2949 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3069 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2954,7 +3074,7 @@ xfs_rename(
2954 */ 3074 */
2955 error = xfs_droplink(tp, target_ip); 3075 error = xfs_droplink(tp, target_ip);
2956 if (error) 3076 if (error)
2957 goto abort_return; 3077 goto out_trans_abort;
2958 3078
2959 if (src_is_directory) { 3079 if (src_is_directory) {
2960 /* 3080 /*
@@ -2962,7 +3082,7 @@ xfs_rename(
2962 */ 3082 */
2963 error = xfs_droplink(tp, target_ip); 3083 error = xfs_droplink(tp, target_ip);
2964 if (error) 3084 if (error)
2965 goto abort_return; 3085 goto out_trans_abort;
2966 } 3086 }
2967 } /* target_ip != NULL */ 3087 } /* target_ip != NULL */
2968 3088
@@ -2979,7 +3099,7 @@ xfs_rename(
2979 &first_block, &free_list, spaceres); 3099 &first_block, &free_list, spaceres);
2980 ASSERT(error != -EEXIST); 3100 ASSERT(error != -EEXIST);
2981 if (error) 3101 if (error)
2982 goto abort_return; 3102 goto out_trans_abort;
2983 } 3103 }
2984 3104
2985 /* 3105 /*
@@ -3005,49 +3125,67 @@ xfs_rename(
3005 */ 3125 */
3006 error = xfs_droplink(tp, src_dp); 3126 error = xfs_droplink(tp, src_dp);
3007 if (error) 3127 if (error)
3008 goto abort_return; 3128 goto out_trans_abort;
3009 } 3129 }
3010 3130
3011 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, 3131 /*
3132 * For whiteouts, we only need to update the source dirent with the
3133 * inode number of the whiteout inode rather than removing it
3134 * altogether.
3135 */
3136 if (wip) {
3137 error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
3012 &first_block, &free_list, spaceres); 3138 &first_block, &free_list, spaceres);
3139 } else
3140 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
3141 &first_block, &free_list, spaceres);
3013 if (error) 3142 if (error)
3014 goto abort_return; 3143 goto out_trans_abort;
3015
3016 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3017 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3018 if (new_parent)
3019 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3020 3144
3021finish_rename:
3022 /* 3145 /*
3023 * If this is a synchronous mount, make sure that the 3146 * For whiteouts, we need to bump the link count on the whiteout inode.
3024 * rename transaction goes to disk before returning to 3147 * This means that failures all the way up to this point leave the inode
3025 * the user. 3148 * on the unlinked list and so cleanup is a simple matter of dropping
3149 * the remaining reference to it. If we fail here after bumping the link
3150 * count, we're shutting down the filesystem so we'll never see the
3151 * intermediate state on disk.
3026 */ 3152 */
3027 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { 3153 if (wip) {
3028 xfs_trans_set_sync(tp); 3154 ASSERT(wip->i_d.di_nlink == 0);
3029 } 3155 error = xfs_bumplink(tp, wip);
3156 if (error)
3157 goto out_trans_abort;
3158 error = xfs_iunlink_remove(tp, wip);
3159 if (error)
3160 goto out_trans_abort;
3161 xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3030 3162
3031 error = xfs_bmap_finish(&tp, &free_list, &committed); 3163 /*
3032 if (error) { 3164 * Now we have a real link, clear the "I'm a tmpfile" state
3033 xfs_bmap_cancel(&free_list); 3165 * flag from the inode so it doesn't accidentally get misused in
3034 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | 3166 * future.
3035 XFS_TRANS_ABORT)); 3167 */
3036 goto std_return; 3168 VFS_I(wip)->i_state &= ~I_LINKABLE;
3037 } 3169 }
3038 3170
3039 /* 3171 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3040 * trans_commit will unlock src_ip, target_ip & decrement 3172 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3041 * the vnode references. 3173 if (new_parent)
3042 */ 3174 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3043 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3044 3175
3045 abort_return: 3176 error = xfs_finish_rename(tp, &free_list);
3177 if (wip)
3178 IRELE(wip);
3179 return error;
3180
3181out_trans_abort:
3046 cancel_flags |= XFS_TRANS_ABORT; 3182 cancel_flags |= XFS_TRANS_ABORT;
3047 error_return: 3183out_bmap_cancel:
3048 xfs_bmap_cancel(&free_list); 3184 xfs_bmap_cancel(&free_list);
3185out_trans_cancel:
3049 xfs_trans_cancel(tp, cancel_flags); 3186 xfs_trans_cancel(tp, cancel_flags);
3050 std_return: 3187 if (wip)
3188 IRELE(wip);
3051 return error; 3189 return error;
3052} 3190}
3053 3191
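
From userspace, the path added above is reached via renameat2() with RENAME_WHITEOUT: the source is atomically renamed to the target and a 0:0 character-device "whiteout" dirent is left at the source, which overlay filesystems use to mask lower-layer entries. A sketch assuming glibc's renameat2() wrapper (glibc 2.28 and later; older systems need syscall(SYS_renameat2, ...)) and a kernel/filesystem supporting the flag:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

#ifndef RENAME_WHITEOUT
#define RENAME_WHITEOUT (1 << 2)	/* from <linux/fs.h> */
#endif

int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return 2;
	}

	if (renameat2(AT_FDCWD, argv[1], AT_FDCWD, argv[2],
		      RENAME_WHITEOUT) != 0) {
		fprintf(stderr, "renameat2: %s\n", strerror(errno));
		return 1;
	}
	return 0;	/* argv[1] is now a whiteout, argv[2] the renamed file */
}
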
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index a1cd55f3f351..8f22d20368d8 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -56,6 +56,7 @@ typedef struct xfs_inode {
56 struct xfs_inode_log_item *i_itemp; /* logging information */ 56 struct xfs_inode_log_item *i_itemp; /* logging information */
57 mrlock_t i_lock; /* inode lock */ 57 mrlock_t i_lock; /* inode lock */
58 mrlock_t i_iolock; /* inode IO lock */ 58 mrlock_t i_iolock; /* inode IO lock */
59 mrlock_t i_mmaplock; /* inode mmap IO lock */
59 atomic_t i_pincount; /* inode pin count */ 60 atomic_t i_pincount; /* inode pin count */
60 spinlock_t i_flags_lock; /* inode i_flags lock */ 61 spinlock_t i_flags_lock; /* inode i_flags lock */
61 /* Miscellaneous state. */ 62 /* Miscellaneous state. */
@@ -263,15 +264,20 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
263#define XFS_IOLOCK_SHARED (1<<1) 264#define XFS_IOLOCK_SHARED (1<<1)
264#define XFS_ILOCK_EXCL (1<<2) 265#define XFS_ILOCK_EXCL (1<<2)
265#define XFS_ILOCK_SHARED (1<<3) 266#define XFS_ILOCK_SHARED (1<<3)
267#define XFS_MMAPLOCK_EXCL (1<<4)
268#define XFS_MMAPLOCK_SHARED (1<<5)
266 269
267#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ 270#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
268 | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) 271 | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
272 | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)
269 273
270#define XFS_LOCK_FLAGS \ 274#define XFS_LOCK_FLAGS \
271 { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ 275 { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \
272 { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ 276 { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \
273 { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ 277 { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \
274 { XFS_ILOCK_SHARED, "ILOCK_SHARED" } 278 { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \
279 { XFS_MMAPLOCK_EXCL, "MMAPLOCK_EXCL" }, \
280 { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" }
275 281
276 282
277/* 283/*
@@ -302,17 +308,26 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
302#define XFS_IOLOCK_SHIFT 16 308#define XFS_IOLOCK_SHIFT 16
303#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) 309#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
304 310
311#define XFS_MMAPLOCK_SHIFT 20
312
305#define XFS_ILOCK_SHIFT 24 313#define XFS_ILOCK_SHIFT 24
306#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) 314#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
307#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) 315#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
308#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) 316#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
309 317
310#define XFS_IOLOCK_DEP_MASK 0x00ff0000 318#define XFS_IOLOCK_DEP_MASK 0x000f0000
319#define XFS_MMAPLOCK_DEP_MASK 0x00f00000
311#define XFS_ILOCK_DEP_MASK 0xff000000 320#define XFS_ILOCK_DEP_MASK 0xff000000
312#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK) 321#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \
322 XFS_MMAPLOCK_DEP_MASK | \
323 XFS_ILOCK_DEP_MASK)
313 324
314#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) 325#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) \
315#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 326 >> XFS_IOLOCK_SHIFT)
327#define XFS_MMAPLOCK_DEP(flags) (((flags) & XFS_MMAPLOCK_DEP_MASK) \
328 >> XFS_MMAPLOCK_SHIFT)
329#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \
330 >> XFS_ILOCK_SHIFT)
316 331
317/* 332/*
318 * For multiple groups support: if S_ISGID bit is set in the parent 333 * For multiple groups support: if S_ISGID bit is set in the parent
@@ -391,6 +406,28 @@ int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
391int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); 406int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
392 407
393 408
409/* from xfs_iops.c */
410/*
411 * When setting up a newly allocated inode, we need to call
412 * xfs_finish_inode_setup() once the inode is fully instantiated at
413 * the VFS level to prevent the rest of the world seeing the inode
414 * before we've completed instantiation. Otherwise we can do it
415 * the moment the inode lookup is complete.
416 */
417extern void xfs_setup_inode(struct xfs_inode *ip);
418static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
419{
420 xfs_iflags_clear(ip, XFS_INEW);
421 barrier();
422 unlock_new_inode(VFS_I(ip));
423}
424
425static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
426{
427 xfs_setup_inode(ip);
428 xfs_finish_inode_setup(ip);
429}
430
394#define IHOLD(ip) \ 431#define IHOLD(ip) \
395do { \ 432do { \
396 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ 433 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
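
The split above is a publish-after-init pattern: xfs_finish_inode_setup() clears XFS_INEW only once the inode is fully instantiated, with barrier() keeping the flag clear ordered before unlock_new_inode() wakes any waiters. A userspace analogue using C11 release/acquire ordering (illustrative names):

#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	int payload;		/* plain data, initialised first */
	atomic_bool published;	/* the "no longer INEW" flag */
};

static void finish_setup(struct obj *o, int value)
{
	o->payload = value;
	/* release: all prior stores are visible before the flag flips */
	atomic_store_explicit(&o->published, true, memory_order_release);
}

static bool try_use(struct obj *o, int *out)
{
	/* acquire: pairs with the release store in finish_setup() */
	if (!atomic_load_explicit(&o->published, memory_order_acquire))
		return false;	/* still being set up; don't touch it */
	*out = o->payload;
	return true;
}

int main(void)
{
	struct obj o = { 0 };
	int v;

	finish_setup(&o, 42);
	return (try_use(&o, &v) && v == 42) ? 0 : 1;
}
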
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ac4feae45eb3..5f4a396f5186 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -631,7 +631,7 @@ xfs_ioc_space(
631 631
632 if (filp->f_flags & O_DSYNC) 632 if (filp->f_flags & O_DSYNC)
633 flags |= XFS_PREALLOC_SYNC; 633 flags |= XFS_PREALLOC_SYNC;
634 if (ioflags & XFS_IO_INVIS) 634 if (ioflags & XFS_IO_INVIS)
635 flags |= XFS_PREALLOC_INVISIBLE; 635 flags |= XFS_PREALLOC_INVISIBLE;
636 636
637 error = mnt_want_write_file(filp); 637 error = mnt_want_write_file(filp);
@@ -639,10 +639,13 @@ xfs_ioc_space(
639 return error; 639 return error;
640 640
641 xfs_ilock(ip, iolock); 641 xfs_ilock(ip, iolock);
642 error = xfs_break_layouts(inode, &iolock); 642 error = xfs_break_layouts(inode, &iolock, false);
643 if (error) 643 if (error)
644 goto out_unlock; 644 goto out_unlock;
645 645
646 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
647 iolock |= XFS_MMAPLOCK_EXCL;
648
646 switch (bf->l_whence) { 649 switch (bf->l_whence) {
647 case 0: /*SEEK_SET*/ 650 case 0: /*SEEK_SET*/
648 break; 651 break;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ccb1dd0d509e..38e633bad8c2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -460,8 +460,7 @@ xfs_iomap_prealloc_size(
460 alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), 460 alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
461 alloc_blocks); 461 alloc_blocks);
462 462
463 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); 463 freesp = percpu_counter_read_positive(&mp->m_fdblocks);
464 freesp = mp->m_sb.sb_fdblocks;
465 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { 464 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
466 shift = 2; 465 shift = 2;
467 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) 466 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index e53a90331422..2f1839e4dd1b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -187,6 +187,8 @@ xfs_generic_create(
187 else 187 else
188 d_instantiate(dentry, inode); 188 d_instantiate(dentry, inode);
189 189
190 xfs_finish_inode_setup(ip);
191
190 out_free_acl: 192 out_free_acl:
191 if (default_acl) 193 if (default_acl)
192 posix_acl_release(default_acl); 194 posix_acl_release(default_acl);
@@ -195,6 +197,7 @@ xfs_generic_create(
195 return error; 197 return error;
196 198
197 out_cleanup_inode: 199 out_cleanup_inode:
200 xfs_finish_inode_setup(ip);
198 if (!tmpfile) 201 if (!tmpfile)
199 xfs_cleanup_inode(dir, inode, dentry); 202 xfs_cleanup_inode(dir, inode, dentry);
200 iput(inode); 203 iput(inode);
@@ -367,9 +370,11 @@ xfs_vn_symlink(
367 goto out_cleanup_inode; 370 goto out_cleanup_inode;
368 371
369 d_instantiate(dentry, inode); 372 d_instantiate(dentry, inode);
373 xfs_finish_inode_setup(cip);
370 return 0; 374 return 0;
371 375
372 out_cleanup_inode: 376 out_cleanup_inode:
377 xfs_finish_inode_setup(cip);
373 xfs_cleanup_inode(dir, inode, dentry); 378 xfs_cleanup_inode(dir, inode, dentry);
374 iput(inode); 379 iput(inode);
375 out: 380 out:
@@ -389,7 +394,7 @@ xfs_vn_rename(
389 struct xfs_name oname; 394 struct xfs_name oname;
390 struct xfs_name nname; 395 struct xfs_name nname;
391 396
392 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) 397 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
393 return -EINVAL; 398 return -EINVAL;
394 399
395 /* if we are exchanging files, we need to set i_mode of both files */ 400 /* if we are exchanging files, we need to set i_mode of both files */
@@ -766,6 +771,7 @@ xfs_setattr_size(
766 return error; 771 return error;
767 772
768 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 773 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
774 ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
769 ASSERT(S_ISREG(ip->i_d.di_mode)); 775 ASSERT(S_ISREG(ip->i_d.di_mode));
770 ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| 776 ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
771 ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); 777 ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
@@ -829,55 +835,27 @@ xfs_setattr_size(
829 inode_dio_wait(inode); 835 inode_dio_wait(inode);
830 836
831 /* 837 /*
832 * Do all the page cache truncate work outside the transaction context 838 * We've already locked out new page faults, so now we can safely remove
833 * as the "lock" order is page lock->log space reservation. i.e. 839 * pages from the page cache knowing they won't get refaulted until we
834 * locking pages inside the transaction can ABBA deadlock with 840 * drop the XFS_MMAPLOCK_EXCL lock after the extent manipulations are
835 * writeback. We have to do the VFS inode size update before we truncate 841 * complete. The truncate_setsize() call also cleans partial EOF page
836 * the pagecache, however, to avoid racing with page faults beyond the 842 * PTEs on extending truncates and hence ensures sub-page block size
837 * new EOF they are not serialised against truncate operations except by 843 * filesystems are correctly handled, too.
838 * page locks and size updates.
839 * 844 *
840 * Hence we are in a situation where a truncate can fail with ENOMEM 845 * We have to do all the page cache truncate work outside the
841 * from xfs_trans_reserve(), but having already truncated the in-memory 846 * transaction context as the "lock" order is page lock->log space
842 * version of the file (i.e. made user visible changes). There's not 847 * reservation as defined by extent allocation in the writeback path.
843 * much we can do about this, except to hope that the caller sees ENOMEM 848 * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
844 * and retries the truncate operation. 849 * having already truncated the in-memory version of the file (i.e. made
850 * user visible changes). There's not much we can do about this, except
851 * to hope that the caller sees ENOMEM and retries the truncate
852 * operation.
845 */ 853 */
846 error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); 854 error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
847 if (error) 855 if (error)
848 return error; 856 return error;
849 truncate_setsize(inode, newsize); 857 truncate_setsize(inode, newsize);
850 858
851 /*
852 * The "we can't serialise against page faults" pain gets worse.
853 *
854 * If the file is mapped then we have to clean the page at the old EOF
855 * when extending the file. Extending the file can expose changes the
856 * underlying page mapping (e.g. from beyond EOF to a hole or
857 * unwritten), and so on the next attempt to write to that page we need
858 * to remap it for write. i.e. we need .page_mkwrite() to be called.
859 * Hence we need to clean the page to clean the pte and so a new write
860 * fault will be triggered appropriately.
861 *
862 * If we do it before we change the inode size, then we can race with a
863 * page fault that maps the page with exactly the same problem. If we do
864 * it after we change the file size, then a new page fault can come in
865 * and allocate space before we've run the rest of the truncate
866 * transaction. That's kinda grotesque, but it's better than have data
867 * over a hole, and so that's the lesser evil that has been chosen here.
868 *
869 * The real solution, however, is to have some mechanism for locking out
870 * page faults while a truncate is in progress.
871 */
872 if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) {
873 error = filemap_write_and_wait_range(
874 VFS_I(ip)->i_mapping,
875 round_down(oldsize, PAGE_CACHE_SIZE),
876 round_up(oldsize, PAGE_CACHE_SIZE) - 1);
877 if (error)
878 return error;
879 }
880
881 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); 859 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
882 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); 860 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
883 if (error) 861 if (error)
@@ -975,9 +953,13 @@ xfs_vn_setattr(
975 uint iolock = XFS_IOLOCK_EXCL; 953 uint iolock = XFS_IOLOCK_EXCL;
976 954
977 xfs_ilock(ip, iolock); 955 xfs_ilock(ip, iolock);
978 error = xfs_break_layouts(dentry->d_inode, &iolock); 956 error = xfs_break_layouts(dentry->d_inode, &iolock, true);
979 if (!error) 957 if (!error) {
958 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
959 iolock |= XFS_MMAPLOCK_EXCL;
960
980 error = xfs_setattr_size(ip, iattr); 961 error = xfs_setattr_size(ip, iattr);
962 }
981 xfs_iunlock(ip, iolock); 963 xfs_iunlock(ip, iolock);
982 } else { 964 } else {
983 error = xfs_setattr_nonsize(ip, iattr, 0); 965 error = xfs_setattr_nonsize(ip, iattr, 0);
@@ -1228,16 +1210,12 @@ xfs_diflags_to_iflags(
1228} 1210}
1229 1211
1230/* 1212/*
1231 * Initialize the Linux inode, set up the operation vectors and 1213 * Initialize the Linux inode and set up the operation vectors.
1232 * unlock the inode.
1233 * 1214 *
1234 * When reading existing inodes from disk this is called directly 1215 * When reading existing inodes from disk this is called directly from xfs_iget;
1235 * from xfs_iget, when creating a new inode it is called from 1216 * when creating a new inode it is called from xfs_ialloc after setting up the
1236 * xfs_ialloc after setting up the inode. 1217 * inode. These callers have different criteria for clearing XFS_INEW, so leave
1237 * 1218 * it up to the caller to deal with unlocking the inode appropriately.
1238 * We are always called with an uninitialised linux inode here.
1239 * We need to initialise the necessary fields and take a reference
1240 * on it.
1241 */ 1219 */
1242void 1220void
1243xfs_setup_inode( 1221xfs_setup_inode(
@@ -1324,9 +1302,4 @@ xfs_setup_inode(
1324 inode_has_no_xattr(inode); 1302 inode_has_no_xattr(inode);
1325 cache_no_acl(inode); 1303 cache_no_acl(inode);
1326 } 1304 }
1327
1328 xfs_iflags_clear(ip, XFS_INEW);
1329 barrier();
1330
1331 unlock_new_inode(inode);
1332} 1305}
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index ea7a98e9cb70..a0f84abb0d09 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -25,8 +25,6 @@ extern const struct file_operations xfs_dir_file_operations;
25 25
26extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); 26extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
27 27
28extern void xfs_setup_inode(struct xfs_inode *);
29
30/* 28/*
31 * Internal setattr interfaces. 29 * Internal setattr interfaces.
32 */ 30 */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 82e314258f73..80429891dc9b 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -229,7 +229,7 @@ xfs_bulkstat_grab_ichunk(
229 error = xfs_inobt_get_rec(cur, irec, &stat); 229 error = xfs_inobt_get_rec(cur, irec, &stat);
230 if (error) 230 if (error)
231 return error; 231 return error;
232 XFS_WANT_CORRUPTED_RETURN(stat == 1); 232 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1);
233 233
234 /* Check if the record contains the inode in request */ 234 /* Check if the record contains the inode in request */
235 if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) { 235 if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index c31d2c2eadc4..7c7842c85a08 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -116,15 +116,6 @@ typedef __uint64_t __psunsigned_t;
116#undef XFS_NATIVE_HOST 116#undef XFS_NATIVE_HOST
117#endif 117#endif
118 118
119/*
120 * Feature macros (disable/enable)
121 */
122#ifdef CONFIG_SMP
123#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
124#else
125#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
126#endif
127
128#define irix_sgid_inherit xfs_params.sgid_inherit.val 119#define irix_sgid_inherit xfs_params.sgid_inherit.val
129#define irix_symlink_mode xfs_params.symlink_mode.val 120#define irix_symlink_mode xfs_params.symlink_mode.val
130#define xfs_panic_mask xfs_params.panic_mask.val 121#define xfs_panic_mask xfs_params.panic_mask.val
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a5a945fc3bdc..4f5784f85a5b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -4463,10 +4463,10 @@ xlog_do_recover(
4463 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); 4463 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
4464 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 4464 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
4465 ASSERT(xfs_sb_good_version(sbp)); 4465 ASSERT(xfs_sb_good_version(sbp));
4466 xfs_reinit_percpu_counters(log->l_mp);
4467
4466 xfs_buf_relse(bp); 4468 xfs_buf_relse(bp);
4467 4469
4468 /* We've re-read the superblock so re-initialize per-cpu counters */
4469 xfs_icsb_reinit_counters(log->l_mp);
4470 4470
4471 xlog_recover_check_summary(log); 4471 xlog_recover_check_summary(log);
4472 4472
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 4fa80e63eea2..2ce7ee3b4ec1 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,18 +43,6 @@
43#include "xfs_sysfs.h" 43#include "xfs_sysfs.h"
44 44
45 45
46#ifdef HAVE_PERCPU_SB
47STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
48 int);
49STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
50 int);
51STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
52#else
53
54#define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
55#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
56#endif
57
58static DEFINE_MUTEX(xfs_uuid_table_mutex); 46static DEFINE_MUTEX(xfs_uuid_table_mutex);
59static int xfs_uuid_table_size; 47static int xfs_uuid_table_size;
60static uuid_t *xfs_uuid_table; 48static uuid_t *xfs_uuid_table;
@@ -347,8 +335,7 @@ reread:
347 goto reread; 335 goto reread;
348 } 336 }
349 337
350 /* Initialize per-cpu counters */ 338 xfs_reinit_percpu_counters(mp);
351 xfs_icsb_reinit_counters(mp);
352 339
353 /* no need to be quiet anymore, so reset the buf ops */ 340 /* no need to be quiet anymore, so reset the buf ops */
354 bp->b_ops = &xfs_sb_buf_ops; 341 bp->b_ops = &xfs_sb_buf_ops;
@@ -1087,8 +1074,6 @@ xfs_log_sbcount(xfs_mount_t *mp)
1087 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) 1074 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
1088 return 0; 1075 return 0;
1089 1076
1090 xfs_icsb_sync_counters(mp, 0);
1091
1092 /* 1077 /*
1093 * we don't need to do this if we are updating the superblock 1078 * we don't need to do this if we are updating the superblock
1094 * counters on every modification. 1079 * counters on every modification.
@@ -1099,253 +1084,136 @@ xfs_log_sbcount(xfs_mount_t *mp)
1099 return xfs_sync_sb(mp, true); 1084 return xfs_sync_sb(mp, true);
1100} 1085}
1101 1086
1102/* 1087int
1103 * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply 1088xfs_mod_icount(
1104 * a delta to a specified field in the in-core superblock. Simply 1089 struct xfs_mount *mp,
1105 * switch on the field indicated and apply the delta to that field. 1090 int64_t delta)
1106 * Fields are not allowed to dip below zero, so if the delta would
1107 * do this do not apply it and return EINVAL.
1108 *
1109 * The m_sb_lock must be held when this routine is called.
1110 */
1111STATIC int
1112xfs_mod_incore_sb_unlocked(
1113 xfs_mount_t *mp,
1114 xfs_sb_field_t field,
1115 int64_t delta,
1116 int rsvd)
1117{ 1091{
1118 int scounter; /* short counter for 32 bit fields */ 1092 /* deltas are +/-64, hence the large batch size of 128. */
1119 long long lcounter; /* long counter for 64 bit fields */ 1093 __percpu_counter_add(&mp->m_icount, delta, 128);
1120 long long res_used, rem; 1094 if (percpu_counter_compare(&mp->m_icount, 0) < 0) {
1121
1122 /*
1123 * With the in-core superblock spin lock held, switch
1124 * on the indicated field. Apply the delta to the
1125 * proper field. If the fields value would dip below
1126 * 0, then do not apply the delta and return EINVAL.
1127 */
1128 switch (field) {
1129 case XFS_SBS_ICOUNT:
1130 lcounter = (long long)mp->m_sb.sb_icount;
1131 lcounter += delta;
1132 if (lcounter < 0) {
1133 ASSERT(0);
1134 return -EINVAL;
1135 }
1136 mp->m_sb.sb_icount = lcounter;
1137 return 0;
1138 case XFS_SBS_IFREE:
1139 lcounter = (long long)mp->m_sb.sb_ifree;
1140 lcounter += delta;
1141 if (lcounter < 0) {
1142 ASSERT(0);
1143 return -EINVAL;
1144 }
1145 mp->m_sb.sb_ifree = lcounter;
1146 return 0;
1147 case XFS_SBS_FDBLOCKS:
1148 lcounter = (long long)
1149 mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1150 res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
1151
1152 if (delta > 0) { /* Putting blocks back */
1153 if (res_used > delta) {
1154 mp->m_resblks_avail += delta;
1155 } else {
1156 rem = delta - res_used;
1157 mp->m_resblks_avail = mp->m_resblks;
1158 lcounter += rem;
1159 }
1160 } else { /* Taking blocks away */
1161 lcounter += delta;
1162 if (lcounter >= 0) {
1163 mp->m_sb.sb_fdblocks = lcounter +
1164 XFS_ALLOC_SET_ASIDE(mp);
1165 return 0;
1166 }
1167
1168 /*
1169 * We are out of blocks, use any available reserved
1170 * blocks if were allowed to.
1171 */
1172 if (!rsvd)
1173 return -ENOSPC;
1174
1175 lcounter = (long long)mp->m_resblks_avail + delta;
1176 if (lcounter >= 0) {
1177 mp->m_resblks_avail = lcounter;
1178 return 0;
1179 }
1180 printk_once(KERN_WARNING
1181 "Filesystem \"%s\": reserve blocks depleted! "
1182 "Consider increasing reserve pool size.",
1183 mp->m_fsname);
1184 return -ENOSPC;
1185 }
1186
1187 mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
1188 return 0;
1189 case XFS_SBS_FREXTENTS:
1190 lcounter = (long long)mp->m_sb.sb_frextents;
1191 lcounter += delta;
1192 if (lcounter < 0) {
1193 return -ENOSPC;
1194 }
1195 mp->m_sb.sb_frextents = lcounter;
1196 return 0;
1197 case XFS_SBS_DBLOCKS:
1198 lcounter = (long long)mp->m_sb.sb_dblocks;
1199 lcounter += delta;
1200 if (lcounter < 0) {
1201 ASSERT(0);
1202 return -EINVAL;
1203 }
1204 mp->m_sb.sb_dblocks = lcounter;
1205 return 0;
1206 case XFS_SBS_AGCOUNT:
1207 scounter = mp->m_sb.sb_agcount;
1208 scounter += delta;
1209 if (scounter < 0) {
1210 ASSERT(0);
1211 return -EINVAL;
1212 }
1213 mp->m_sb.sb_agcount = scounter;
1214 return 0;
1215 case XFS_SBS_IMAX_PCT:
1216 scounter = mp->m_sb.sb_imax_pct;
1217 scounter += delta;
1218 if (scounter < 0) {
1219 ASSERT(0);
1220 return -EINVAL;
1221 }
1222 mp->m_sb.sb_imax_pct = scounter;
1223 return 0;
1224 case XFS_SBS_REXTSIZE:
1225 scounter = mp->m_sb.sb_rextsize;
1226 scounter += delta;
1227 if (scounter < 0) {
1228 ASSERT(0);
1229 return -EINVAL;
1230 }
1231 mp->m_sb.sb_rextsize = scounter;
1232 return 0;
1233 case XFS_SBS_RBMBLOCKS:
1234 scounter = mp->m_sb.sb_rbmblocks;
1235 scounter += delta;
1236 if (scounter < 0) {
1237 ASSERT(0);
1238 return -EINVAL;
1239 }
1240 mp->m_sb.sb_rbmblocks = scounter;
1241 return 0;
1242 case XFS_SBS_RBLOCKS:
1243 lcounter = (long long)mp->m_sb.sb_rblocks;
1244 lcounter += delta;
1245 if (lcounter < 0) {
1246 ASSERT(0);
1247 return -EINVAL;
1248 }
1249 mp->m_sb.sb_rblocks = lcounter;
1250 return 0;
1251 case XFS_SBS_REXTENTS:
1252 lcounter = (long long)mp->m_sb.sb_rextents;
1253 lcounter += delta;
1254 if (lcounter < 0) {
1255 ASSERT(0);
1256 return -EINVAL;
1257 }
1258 mp->m_sb.sb_rextents = lcounter;
1259 return 0;
1260 case XFS_SBS_REXTSLOG:
1261 scounter = mp->m_sb.sb_rextslog;
1262 scounter += delta;
1263 if (scounter < 0) {
1264 ASSERT(0);
1265 return -EINVAL;
1266 }
1267 mp->m_sb.sb_rextslog = scounter;
1268 return 0;
1269 default:
1270 ASSERT(0); 1095 ASSERT(0);
1096 percpu_counter_add(&mp->m_icount, -delta);
1271 return -EINVAL; 1097 return -EINVAL;
1272 } 1098 }
1099 return 0;
1273} 1100}
1274 1101
1275/*
1276 * xfs_mod_incore_sb() is used to change a field in the in-core
1277 * superblock structure by the specified delta. This modification
1278 * is protected by the m_sb_lock. Just use the xfs_mod_incore_sb_unlocked()
1279 * routine to do the work.
1280 */
1281int 1102int
1282xfs_mod_incore_sb( 1103xfs_mod_ifree(
1283 struct xfs_mount *mp, 1104 struct xfs_mount *mp,
1284 xfs_sb_field_t field, 1105 int64_t delta)
1285 int64_t delta,
1286 int rsvd)
1287{ 1106{
1288 int status; 1107 percpu_counter_add(&mp->m_ifree, delta);
1289 1108 if (percpu_counter_compare(&mp->m_ifree, 0) < 0) {
1290#ifdef HAVE_PERCPU_SB 1109 ASSERT(0);
1291 ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS); 1110 percpu_counter_add(&mp->m_ifree, -delta);
1292#endif 1111 return -EINVAL;
1293 spin_lock(&mp->m_sb_lock); 1112 }
1294 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); 1113 return 0;
1295 spin_unlock(&mp->m_sb_lock);
1296
1297 return status;
1298} 1114}
1299 1115
1300/*
1301 * Change more than one field in the in-core superblock structure at a time.
1302 *
1303 * The fields and changes to those fields are specified in the array of
1304 * xfs_mod_sb structures passed in. Either all of the specified deltas
1305 * will be applied or none of them will. If any modified field dips below 0,
1306 * then all modifications will be backed out and EINVAL will be returned.
1307 *
1308 * Note that this function may not be used for the superblock values that
1309 * are tracked with the in-memory per-cpu counters - a direct call to
1310 * xfs_icsb_modify_counters is required for these.
1311 */
1312int 1116int
1313xfs_mod_incore_sb_batch( 1117xfs_mod_fdblocks(
1314 struct xfs_mount *mp, 1118 struct xfs_mount *mp,
1315 xfs_mod_sb_t *msb, 1119 int64_t delta,
1316 uint nmsb, 1120 bool rsvd)
1317 int rsvd)
1318{ 1121{
1319 xfs_mod_sb_t *msbp; 1122 int64_t lcounter;
1320 int error = 0; 1123 long long res_used;
1124 s32 batch;
1125
1126 if (delta > 0) {
1127 /*
1128 * If the reserve pool is depleted, put blocks back into it
1129 * first. Most of the time the pool is full.
1130 */
1131 if (likely(mp->m_resblks == mp->m_resblks_avail)) {
1132 percpu_counter_add(&mp->m_fdblocks, delta);
1133 return 0;
1134 }
1135
1136 spin_lock(&mp->m_sb_lock);
1137 res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
1138
1139 if (res_used > delta) {
1140 mp->m_resblks_avail += delta;
1141 } else {
1142 delta -= res_used;
1143 mp->m_resblks_avail = mp->m_resblks;
1144 percpu_counter_add(&mp->m_fdblocks, delta);
1145 }
1146 spin_unlock(&mp->m_sb_lock);
1147 return 0;
1148 }
1321 1149
1322 /* 1150 /*
1323 * Loop through the array of mod structures and apply each individually. 1151 * Taking blocks away, need to be more accurate the closer we
1324 * If any fail, then back out all those which have already been applied. 1152 * are to zero.
1325 * Do all of this within the scope of the m_sb_lock so that all of the 1153 *
1326 * changes will be atomic. 1154 * batch size is set to a maximum of 1024 blocks - if we are
1155 * allocating or freeing extents larger than this then we aren't
1156 * going to be hammering the counter lock so a lock per update
1157 * is not a problem.
1158 *
1159 * If the counter has a value of less than 2 * max batch size,
1160 * then make everything serialise as we are really close to
1161 * ENOSPC.
1162 */
1163#define __BATCH 1024
1164 if (percpu_counter_compare(&mp->m_fdblocks, 2 * __BATCH) < 0)
1165 batch = 1;
1166 else
1167 batch = __BATCH;
1168#undef __BATCH
1169
1170 __percpu_counter_add(&mp->m_fdblocks, delta, batch);
1171 if (percpu_counter_compare(&mp->m_fdblocks,
1172 XFS_ALLOC_SET_ASIDE(mp)) >= 0) {
1173 /* we had space! */
1174 return 0;
1175 }
1176
1177 /*
1178 * lock up the sb for dipping into reserves before releasing the space
1179 * that took us to ENOSPC.
1327 */ 1180 */
1328 spin_lock(&mp->m_sb_lock); 1181 spin_lock(&mp->m_sb_lock);
1329 for (msbp = msb; msbp < (msb + nmsb); msbp++) { 1182 percpu_counter_add(&mp->m_fdblocks, -delta);
1330 ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || 1183 if (!rsvd)
1331 msbp->msb_field > XFS_SBS_FDBLOCKS); 1184 goto fdblocks_enospc;
1332 1185
1333 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, 1186 lcounter = (long long)mp->m_resblks_avail + delta;
1334 msbp->msb_delta, rsvd); 1187 if (lcounter >= 0) {
1335 if (error) 1188 mp->m_resblks_avail = lcounter;
1336 goto unwind; 1189 spin_unlock(&mp->m_sb_lock);
1190 return 0;
1337 } 1191 }
1192 printk_once(KERN_WARNING
1193 "Filesystem \"%s\": reserve blocks depleted! "
1194 "Consider increasing reserve pool size.",
1195 mp->m_fsname);
1196fdblocks_enospc:
1338 spin_unlock(&mp->m_sb_lock); 1197 spin_unlock(&mp->m_sb_lock);
1339 return 0; 1198 return -ENOSPC;
1199}
1340 1200
1341unwind: 1201int
1342 while (--msbp >= msb) { 1202xfs_mod_frextents(
1343 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, 1203 struct xfs_mount *mp,
1344 -msbp->msb_delta, rsvd); 1204 int64_t delta)
1345 ASSERT(error == 0); 1205{
1346 } 1206 int64_t lcounter;
1207 int ret = 0;
1208
1209 spin_lock(&mp->m_sb_lock);
1210 lcounter = mp->m_sb.sb_frextents + delta;
1211 if (lcounter < 0)
1212 ret = -ENOSPC;
1213 else
1214 mp->m_sb.sb_frextents = lcounter;
1347 spin_unlock(&mp->m_sb_lock); 1215 spin_unlock(&mp->m_sb_lock);
1348 return error; 1216 return ret;
1349} 1217}
1350 1218
1351/* 1219/*
@@ -1407,573 +1275,3 @@ xfs_dev_is_read_only(
1407 } 1275 }
1408 return 0; 1276 return 0;
1409} 1277}
1410
1411#ifdef HAVE_PERCPU_SB
1412/*
1413 * Per-cpu incore superblock counters
1414 *
1415 * Simple concept, difficult implementation
1416 *
1417 * Basically, replace the incore superblock counters with a distributed per cpu
1418 * counter for contended fields (e.g. free block count).
1419 *
1420 * Difficulties arise in that the incore sb is used for ENOSPC checking, and
1421 * hence needs to be accurately read when we are running low on space. Hence
1422 * there is a method to enable and disable the per-cpu counters based on how
1423 * much "stuff" is available in them.
1424 *
1425 * Basically, a counter is enabled if there is enough free resource to justify
1426 * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
1427 * ENOSPC), then we disable the counters to synchronise all callers and
1428 * re-distribute the available resources.
1429 *
1430 * If, once we redistributed the available resources, we still get a failure,
1431 * we disable the per-cpu counter and go through the slow path.
1432 *
1433 * The slow path is the current xfs_mod_incore_sb() function. This means that
1434 * when we disable a per-cpu counter, we need to drain its resources back to
1435 * the global superblock. We do this after disabling the counter to prevent
1436 * more threads from queueing up on the counter.
1437 *
1438 * Essentially, this means that we still need a lock in the fast path to enable
1439 * synchronisation between the global counters and the per-cpu counters. This
1440 * is not a problem because the lock will be local to a CPU almost all the time
1441 * and have little contention except when we get to ENOSPC conditions.
1442 *
1443 * Basically, this lock becomes a barrier that enables us to lock out the fast
1444 * path while we do things like enabling and disabling counters and
1445 * synchronising the counters.
1446 *
1447 * Locking rules:
1448 *
1449 * 1. m_sb_lock before picking up per-cpu locks
1450 * 2. per-cpu locks always picked up via for_each_online_cpu() order
1451 * 3. accurate counter sync requires m_sb_lock + per cpu locks
1452 * 4. modifying per-cpu counters requires holding per-cpu lock
1453 * 5. modifying global counters requires holding m_sb_lock
1454 * 6. enabling or disabling a counter requires holding the m_sb_lock
1455 * and _none_ of the per-cpu locks.
1456 *
1457 * Disabled counters are only ever re-enabled by a balance operation
1458 * that results in more free resources per CPU than a given threshold.
1459 * To ensure counters don't remain disabled, they are rebalanced when
1460 * the global resource goes above a higher threshold (i.e. some hysteresis
1461 * is present to prevent thrashing).
1462 */
1463
1464#ifdef CONFIG_HOTPLUG_CPU
1465/*
1466 * hot-plug CPU notifier support.
1467 *
1468 * We need a notifier per filesystem as we need to be able to identify
1469 * the filesystem to balance the counters out. This is achieved by
1470 * having a notifier block embedded in the xfs_mount_t and doing pointer
1471 * magic to get the mount pointer from the notifier block address.
1472 */
1473STATIC int
1474xfs_icsb_cpu_notify(
1475 struct notifier_block *nfb,
1476 unsigned long action,
1477 void *hcpu)
1478{
1479 xfs_icsb_cnts_t *cntp;
1480 xfs_mount_t *mp;
1481
1482 mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier);
1483 cntp = (xfs_icsb_cnts_t *)
1484 per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
1485 switch (action) {
1486 case CPU_UP_PREPARE:
1487 case CPU_UP_PREPARE_FROZEN:
1488 /* Easy Case - initialize the area and locks, and
1489 * then rebalance when online does everything else for us. */
1490 memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
1491 break;
1492 case CPU_ONLINE:
1493 case CPU_ONLINE_FROZEN:
1494 xfs_icsb_lock(mp);
1495 xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
1496 xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
1497 xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
1498 xfs_icsb_unlock(mp);
1499 break;
1500 case CPU_DEAD:
1501 case CPU_DEAD_FROZEN:
1502 /* Disable all the counters, then fold the dead cpu's
1503 * count into the total on the global superblock and
1504 * re-enable the counters. */
1505 xfs_icsb_lock(mp);
1506 spin_lock(&mp->m_sb_lock);
1507 xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT);
1508 xfs_icsb_disable_counter(mp, XFS_SBS_IFREE);
1509 xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS);
1510
1511 mp->m_sb.sb_icount += cntp->icsb_icount;
1512 mp->m_sb.sb_ifree += cntp->icsb_ifree;
1513 mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks;
1514
1515 memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
1516
1517 xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0);
1518 xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0);
1519 xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0);
1520 spin_unlock(&mp->m_sb_lock);
1521 xfs_icsb_unlock(mp);
1522 break;
1523 }
1524
1525 return NOTIFY_OK;
1526}
1527#endif /* CONFIG_HOTPLUG_CPU */
1528
1529int
1530xfs_icsb_init_counters(
1531 xfs_mount_t *mp)
1532{
1533 xfs_icsb_cnts_t *cntp;
1534 int i;
1535
1536 mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
1537 if (mp->m_sb_cnts == NULL)
1538 return -ENOMEM;
1539
1540 for_each_online_cpu(i) {
1541 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1542 memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
1543 }
1544
1545 mutex_init(&mp->m_icsb_mutex);
1546
1547 /*
1548 * start with all counters disabled so that the
1549 * initial balance kicks us off correctly
1550 */
1551 mp->m_icsb_counters = -1;
1552
1553#ifdef CONFIG_HOTPLUG_CPU
1554 mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
1555 mp->m_icsb_notifier.priority = 0;
1556 register_hotcpu_notifier(&mp->m_icsb_notifier);
1557#endif /* CONFIG_HOTPLUG_CPU */
1558
1559 return 0;
1560}
1561
1562void
1563xfs_icsb_reinit_counters(
1564 xfs_mount_t *mp)
1565{
1566 xfs_icsb_lock(mp);
1567 /*
1568 * start with all counters disabled so that the
1569 * initial balance kicks us off correctly
1570 */
1571 mp->m_icsb_counters = -1;
1572 xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
1573 xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
1574 xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
1575 xfs_icsb_unlock(mp);
1576}
1577
1578void
1579xfs_icsb_destroy_counters(
1580 xfs_mount_t *mp)
1581{
1582 if (mp->m_sb_cnts) {
1583 unregister_hotcpu_notifier(&mp->m_icsb_notifier);
1584 free_percpu(mp->m_sb_cnts);
1585 }
1586 mutex_destroy(&mp->m_icsb_mutex);
1587}
1588
1589STATIC void
1590xfs_icsb_lock_cntr(
1591 xfs_icsb_cnts_t *icsbp)
1592{
1593 while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) {
1594 ndelay(1000);
1595 }
1596}
1597
1598STATIC void
1599xfs_icsb_unlock_cntr(
1600 xfs_icsb_cnts_t *icsbp)
1601{
1602 clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags);
1603}
1604
1605
1606STATIC void
1607xfs_icsb_lock_all_counters(
1608 xfs_mount_t *mp)
1609{
1610 xfs_icsb_cnts_t *cntp;
1611 int i;
1612
1613 for_each_online_cpu(i) {
1614 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1615 xfs_icsb_lock_cntr(cntp);
1616 }
1617}
1618
1619STATIC void
1620xfs_icsb_unlock_all_counters(
1621 xfs_mount_t *mp)
1622{
1623 xfs_icsb_cnts_t *cntp;
1624 int i;
1625
1626 for_each_online_cpu(i) {
1627 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1628 xfs_icsb_unlock_cntr(cntp);
1629 }
1630}
1631
1632STATIC void
1633xfs_icsb_count(
1634 xfs_mount_t *mp,
1635 xfs_icsb_cnts_t *cnt,
1636 int flags)
1637{
1638 xfs_icsb_cnts_t *cntp;
1639 int i;
1640
1641 memset(cnt, 0, sizeof(xfs_icsb_cnts_t));
1642
1643 if (!(flags & XFS_ICSB_LAZY_COUNT))
1644 xfs_icsb_lock_all_counters(mp);
1645
1646 for_each_online_cpu(i) {
1647 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1648 cnt->icsb_icount += cntp->icsb_icount;
1649 cnt->icsb_ifree += cntp->icsb_ifree;
1650 cnt->icsb_fdblocks += cntp->icsb_fdblocks;
1651 }
1652
1653 if (!(flags & XFS_ICSB_LAZY_COUNT))
1654 xfs_icsb_unlock_all_counters(mp);
1655}
1656
1657STATIC int
1658xfs_icsb_counter_disabled(
1659 xfs_mount_t *mp,
1660 xfs_sb_field_t field)
1661{
1662 ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
1663 return test_bit(field, &mp->m_icsb_counters);
1664}
1665
1666STATIC void
1667xfs_icsb_disable_counter(
1668 xfs_mount_t *mp,
1669 xfs_sb_field_t field)
1670{
1671 xfs_icsb_cnts_t cnt;
1672
1673 ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
1674
1675 /*
1676 * If we are already disabled, then there is nothing to do
1677 * here. We check before locking all the counters to avoid
1678 * the expensive lock operation when being called in the
1679 * slow path and the counter is already disabled. This is
1680 * safe because the only time we set or clear this state is under
1681 * the m_icsb_mutex.
1682 */
1683 if (xfs_icsb_counter_disabled(mp, field))
1684 return;
1685
1686 xfs_icsb_lock_all_counters(mp);
1687 if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
1688 /* drain back to superblock */
1689
1690 xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT);
1691 switch(field) {
1692 case XFS_SBS_ICOUNT:
1693 mp->m_sb.sb_icount = cnt.icsb_icount;
1694 break;
1695 case XFS_SBS_IFREE:
1696 mp->m_sb.sb_ifree = cnt.icsb_ifree;
1697 break;
1698 case XFS_SBS_FDBLOCKS:
1699 mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
1700 break;
1701 default:
1702 BUG();
1703 }
1704 }
1705
1706 xfs_icsb_unlock_all_counters(mp);
1707}
1708
1709STATIC void
1710xfs_icsb_enable_counter(
1711 xfs_mount_t *mp,
1712 xfs_sb_field_t field,
1713 uint64_t count,
1714 uint64_t resid)
1715{
1716 xfs_icsb_cnts_t *cntp;
1717 int i;
1718
1719 ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
1720
1721 xfs_icsb_lock_all_counters(mp);
1722 for_each_online_cpu(i) {
1723 cntp = per_cpu_ptr(mp->m_sb_cnts, i);
1724 switch (field) {
1725 case XFS_SBS_ICOUNT:
1726 cntp->icsb_icount = count + resid;
1727 break;
1728 case XFS_SBS_IFREE:
1729 cntp->icsb_ifree = count + resid;
1730 break;
1731 case XFS_SBS_FDBLOCKS:
1732 cntp->icsb_fdblocks = count + resid;
1733 break;
1734 default:
1735 BUG();
1736 break;
1737 }
1738 resid = 0;
1739 }
1740 clear_bit(field, &mp->m_icsb_counters);
1741 xfs_icsb_unlock_all_counters(mp);
1742}
1743
1744void
1745xfs_icsb_sync_counters_locked(
1746 xfs_mount_t *mp,
1747 int flags)
1748{
1749 xfs_icsb_cnts_t cnt;
1750
1751 xfs_icsb_count(mp, &cnt, flags);
1752
1753 if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
1754 mp->m_sb.sb_icount = cnt.icsb_icount;
1755 if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
1756 mp->m_sb.sb_ifree = cnt.icsb_ifree;
1757 if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
1758 mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
1759}
1760
1761/*
1762 * Accurate update of per-cpu counters to incore superblock
1763 */
1764void
1765xfs_icsb_sync_counters(
1766 xfs_mount_t *mp,
1767 int flags)
1768{
1769 spin_lock(&mp->m_sb_lock);
1770 xfs_icsb_sync_counters_locked(mp, flags);
1771 spin_unlock(&mp->m_sb_lock);
1772}
1773
1774/*
1775 * Balance and enable/disable counters as necessary.
1776 *
1777 * Thresholds for re-enabling counters are somewhat magic. inode counts are
1778 * chosen to be the same number as single on disk allocation chunk per CPU, and
1779 * free blocks is something far enough zero that we aren't going thrash when we
1780 * get near ENOSPC. We also need to supply a minimum we require per cpu to
1781 * prevent looping endlessly when xfs_alloc_space asks for more than will
1782 * be distributed to a single CPU but each CPU has enough blocks to be
1783 * reenabled.
1784 *
1785 * Note that we can be called when counters are already disabled.
1786 * xfs_icsb_disable_counter() optimises the counter locking in this case to
1787 * prevent locking every per-cpu counter needlessly.
1788 */
1789
1790#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64
1791#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
1792 (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp))
1793STATIC void
1794xfs_icsb_balance_counter_locked(
1795 xfs_mount_t *mp,
1796 xfs_sb_field_t field,
1797 int min_per_cpu)
1798{
1799 uint64_t count, resid;
1800 int weight = num_online_cpus();
1801 uint64_t min = (uint64_t)min_per_cpu;
1802
1803 /* disable counter and sync counter */
1804 xfs_icsb_disable_counter(mp, field);
1805
1806 /* update counters - first CPU gets residual*/
1807 switch (field) {
1808 case XFS_SBS_ICOUNT:
1809 count = mp->m_sb.sb_icount;
1810 resid = do_div(count, weight);
1811 if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
1812 return;
1813 break;
1814 case XFS_SBS_IFREE:
1815 count = mp->m_sb.sb_ifree;
1816 resid = do_div(count, weight);
1817 if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
1818 return;
1819 break;
1820 case XFS_SBS_FDBLOCKS:
1821 count = mp->m_sb.sb_fdblocks;
1822 resid = do_div(count, weight);
1823 if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp)))
1824 return;
1825 break;
1826 default:
1827 BUG();
1828 count = resid = 0; /* quiet, gcc */
1829 break;
1830 }
1831
1832 xfs_icsb_enable_counter(mp, field, count, resid);
1833}
1834
1835STATIC void
1836xfs_icsb_balance_counter(
1837 xfs_mount_t *mp,
1838 xfs_sb_field_t fields,
1839 int min_per_cpu)
1840{
1841 spin_lock(&mp->m_sb_lock);
1842 xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu);
1843 spin_unlock(&mp->m_sb_lock);
1844}
1845
1846int
1847xfs_icsb_modify_counters(
1848 xfs_mount_t *mp,
1849 xfs_sb_field_t field,
1850 int64_t delta,
1851 int rsvd)
1852{
1853 xfs_icsb_cnts_t *icsbp;
1854 long long lcounter; /* long counter for 64 bit fields */
1855 int ret = 0;
1856
1857 might_sleep();
1858again:
1859 preempt_disable();
1860 icsbp = this_cpu_ptr(mp->m_sb_cnts);
1861
1862 /*
1863 * if the counter is disabled, go to slow path
1864 */
1865 if (unlikely(xfs_icsb_counter_disabled(mp, field)))
1866 goto slow_path;
1867 xfs_icsb_lock_cntr(icsbp);
1868 if (unlikely(xfs_icsb_counter_disabled(mp, field))) {
1869 xfs_icsb_unlock_cntr(icsbp);
1870 goto slow_path;
1871 }
1872
1873 switch (field) {
1874 case XFS_SBS_ICOUNT:
1875 lcounter = icsbp->icsb_icount;
1876 lcounter += delta;
1877 if (unlikely(lcounter < 0))
1878 goto balance_counter;
1879 icsbp->icsb_icount = lcounter;
1880 break;
1881
1882 case XFS_SBS_IFREE:
1883 lcounter = icsbp->icsb_ifree;
1884 lcounter += delta;
1885 if (unlikely(lcounter < 0))
1886 goto balance_counter;
1887 icsbp->icsb_ifree = lcounter;
1888 break;
1889
1890 case XFS_SBS_FDBLOCKS:
1891 BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
1892
1893 lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1894 lcounter += delta;
1895 if (unlikely(lcounter < 0))
1896 goto balance_counter;
1897 icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
1898 break;
1899 default:
1900 BUG();
1901 break;
1902 }
1903 xfs_icsb_unlock_cntr(icsbp);
1904 preempt_enable();
1905 return 0;
1906
1907slow_path:
1908 preempt_enable();
1909
1910 /*
1911 * serialise with a mutex so we don't burn lots of cpu on
1912 * the superblock lock. We still need to hold the superblock
1913 * lock, however, when we modify the global structures.
1914 */
1915 xfs_icsb_lock(mp);
1916
1917 /*
1918 * Now running atomically.
1919 *
1920 * If the counter is enabled, someone has beaten us to rebalancing.
1921 * Drop the lock and try again in the fast path....
1922 */
1923 if (!(xfs_icsb_counter_disabled(mp, field))) {
1924 xfs_icsb_unlock(mp);
1925 goto again;
1926 }
1927
1928 /*
1929 * The counter is currently disabled. Because we are
1930 * running atomically here, we know a rebalance cannot
1931 * be in progress. Hence we can go straight to operating
1932 * on the global superblock. We do not call xfs_mod_incore_sb()
1933 * here even though we need to get the m_sb_lock. Doing so
1934 * will cause us to re-enter this function and deadlock.
1935 * Hence we get the m_sb_lock ourselves and then call
1936 * xfs_mod_incore_sb_unlocked() as the unlocked path operates
1937 * directly on the global counters.
1938 */
1939 spin_lock(&mp->m_sb_lock);
1940 ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1941 spin_unlock(&mp->m_sb_lock);
1942
1943 /*
1944 * Now that we've modified the global superblock, we
1945 * may be able to re-enable the distributed counters
1946 * (e.g. lots of space just got freed). After that
1947 * we are done.
1948 */
1949 if (ret != -ENOSPC)
1950 xfs_icsb_balance_counter(mp, field, 0);
1951 xfs_icsb_unlock(mp);
1952 return ret;
1953
1954balance_counter:
1955 xfs_icsb_unlock_cntr(icsbp);
1956 preempt_enable();
1957
1958 /*
1959 * We may have multiple threads here if multiple per-cpu
1960 * counters run dry at the same time. This will mean we can
1961 * do more balances than strictly necessary but it is not
1962 * the common slowpath case.
1963 */
1964 xfs_icsb_lock(mp);
1965
1966 /*
1967 * running atomically.
1968 *
1969 * This will leave the counter in the correct state for future
1970 * accesses. After the rebalance, we simply try again and our retry
1971 * will either succeed through the fast path or slow path without
1972 * another balance operation being required.
1973 */
1974 xfs_icsb_balance_counter(mp, field, delta);
1975 xfs_icsb_unlock(mp);
1976 goto again;
1977}
1978
1979#endif
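
The replacement code above swaps the hand-rolled icsb machinery for the kernel's generic percpu_counter API; the interesting detail is the variable batch size in xfs_mod_fdblocks(), where updates stay cheap and per-cpu while the counter is far from zero, then collapse to batch = 1 so every change hits the global count once precision matters for ENOSPC detection. A self-contained sketch of that pattern, with illustrative names and without the XFS reserve-pool handling:

	#include <linux/errno.h>
	#include <linux/percpu_counter.h>

	#define EX_BATCH	1024	/* large batch: cheap, imprecise updates */

	/* Hypothetical free-space counter update following the same scheme. */
	static int ex_mod_free(struct percpu_counter *free, s64 delta)
	{
		s32 batch = EX_BATCH;

		/* near zero, serialise every update so the count stays exact */
		if (percpu_counter_compare(free, 2 * EX_BATCH) < 0)
			batch = 1;

		__percpu_counter_add(free, delta, batch);
		if (percpu_counter_compare(free, 0) >= 0)
			return 0;	/* we had space */

		/* undo the update and fail; XFS would try the reserve pool here */
		percpu_counter_add(free, -delta);
		return -ENOSPC;
	}
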
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 0d8abd6364d9..8c995a2ccb6f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -18,8 +18,6 @@
18#ifndef __XFS_MOUNT_H__ 18#ifndef __XFS_MOUNT_H__
19#define __XFS_MOUNT_H__ 19#define __XFS_MOUNT_H__
20 20
21#ifdef __KERNEL__
22
23struct xlog; 21struct xlog;
24struct xfs_inode; 22struct xfs_inode;
25struct xfs_mru_cache; 23struct xfs_mru_cache;
@@ -29,44 +27,6 @@ struct xfs_quotainfo;
29struct xfs_dir_ops; 27struct xfs_dir_ops;
30struct xfs_da_geometry; 28struct xfs_da_geometry;
31 29
32#ifdef HAVE_PERCPU_SB
33
34/*
35 * Valid per-cpu incore superblock counters. Note that if you add new counters,
36 * you may need to define new counter disabled bit field descriptors as there
37 * are more possible fields in the superblock that can fit in a bitfield on a
38 * 32 bit platform. The XFS_SBS_* values for the current current counters just
39 * fit.
40 */
41typedef struct xfs_icsb_cnts {
42 uint64_t icsb_fdblocks;
43 uint64_t icsb_ifree;
44 uint64_t icsb_icount;
45 unsigned long icsb_flags;
46} xfs_icsb_cnts_t;
47
48#define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */
49
50#define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */
51
52extern int xfs_icsb_init_counters(struct xfs_mount *);
53extern void xfs_icsb_reinit_counters(struct xfs_mount *);
54extern void xfs_icsb_destroy_counters(struct xfs_mount *);
55extern void xfs_icsb_sync_counters(struct xfs_mount *, int);
56extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
57extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
58 int64_t, int);
59
60#else
61#define xfs_icsb_init_counters(mp) (0)
62#define xfs_icsb_destroy_counters(mp) do { } while (0)
63#define xfs_icsb_reinit_counters(mp) do { } while (0)
64#define xfs_icsb_sync_counters(mp, flags) do { } while (0)
65#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
66#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
67 xfs_mod_incore_sb(mp, field, delta, rsvd)
68#endif
69
70/* dynamic preallocation free space thresholds, 5% down to 1% */ 30/* dynamic preallocation free space thresholds, 5% down to 1% */
71enum { 31enum {
72 XFS_LOWSP_1_PCNT = 0, 32 XFS_LOWSP_1_PCNT = 0,
@@ -81,8 +41,13 @@ typedef struct xfs_mount {
81 struct super_block *m_super; 41 struct super_block *m_super;
82 xfs_tid_t m_tid; /* next unused tid for fs */ 42 xfs_tid_t m_tid; /* next unused tid for fs */
83 struct xfs_ail *m_ail; /* fs active log item list */ 43 struct xfs_ail *m_ail; /* fs active log item list */
84 xfs_sb_t m_sb; /* copy of fs superblock */ 44
45 struct xfs_sb m_sb; /* copy of fs superblock */
85 spinlock_t m_sb_lock; /* sb counter lock */ 46 spinlock_t m_sb_lock; /* sb counter lock */
47 struct percpu_counter m_icount; /* allocated inodes counter */
48 struct percpu_counter m_ifree; /* free inodes counter */
49 struct percpu_counter m_fdblocks; /* free block counter */
50
86 struct xfs_buf *m_sb_bp; /* buffer for superblock */ 51 struct xfs_buf *m_sb_bp; /* buffer for superblock */
87 char *m_fsname; /* filesystem name */ 52 char *m_fsname; /* filesystem name */
88 int m_fsname_len; /* strlen of fs name */ 53 int m_fsname_len; /* strlen of fs name */
@@ -152,12 +117,6 @@ typedef struct xfs_mount {
152 const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ 117 const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
153 uint m_chsize; /* size of next field */ 118 uint m_chsize; /* size of next field */
154 atomic_t m_active_trans; /* number trans frozen */ 119 atomic_t m_active_trans; /* number trans frozen */
155#ifdef HAVE_PERCPU_SB
156 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
157 unsigned long m_icsb_counters; /* disabled per-cpu counters */
158 struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */
159 struct mutex m_icsb_mutex; /* balancer sync lock */
160#endif
161 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ 120 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
162 struct delayed_work m_reclaim_work; /* background inode reclaim */ 121 struct delayed_work m_reclaim_work; /* background inode reclaim */
163 struct delayed_work m_eofblocks_work; /* background eof blocks 122 struct delayed_work m_eofblocks_work; /* background eof blocks
@@ -301,35 +260,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
301} 260}
302 261
303/* 262/*
304 * Per-cpu superblock locking functions
305 */
306#ifdef HAVE_PERCPU_SB
307static inline void
308xfs_icsb_lock(xfs_mount_t *mp)
309{
310 mutex_lock(&mp->m_icsb_mutex);
311}
312
313static inline void
314xfs_icsb_unlock(xfs_mount_t *mp)
315{
316 mutex_unlock(&mp->m_icsb_mutex);
317}
318#else
319#define xfs_icsb_lock(mp)
320#define xfs_icsb_unlock(mp)
321#endif
322
323/*
324 * This structure is for use by the xfs_mod_incore_sb_batch() routine.
325 * xfs_growfs can specify a few fields which are more than int limit
326 */
327typedef struct xfs_mod_sb {
328 xfs_sb_field_t msb_field; /* Field to modify, see below */
329 int64_t msb_delta; /* Change to make to specified field */
330} xfs_mod_sb_t;
331
332/*
333 * Per-ag incore structure, copies of information in agf and agi, to improve the 263 * Per-ag incore structure, copies of information in agf and agi, to improve the
334 * performance of allocation group selection. 264 * performance of allocation group selection.
335 */ 265 */
@@ -383,11 +313,14 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
383extern int xfs_mountfs(xfs_mount_t *mp); 313extern int xfs_mountfs(xfs_mount_t *mp);
384extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, 314extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
385 xfs_agnumber_t *maxagi); 315 xfs_agnumber_t *maxagi);
386
387extern void xfs_unmountfs(xfs_mount_t *); 316extern void xfs_unmountfs(xfs_mount_t *);
388extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 317
389extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, 318extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta);
390 uint, int); 319extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta);
320extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
321 bool reserved);
322extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
323
391extern int xfs_mount_log_sb(xfs_mount_t *); 324extern int xfs_mount_log_sb(xfs_mount_t *);
392extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); 325extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
393extern int xfs_readsb(xfs_mount_t *, int); 326extern int xfs_readsb(xfs_mount_t *, int);
@@ -399,6 +332,4 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
399 332
400extern void xfs_set_low_space_thresholds(struct xfs_mount *); 333extern void xfs_set_low_space_thresholds(struct xfs_mount *);
401 334
402#endif /* __KERNEL__ */
403
404#endif /* __XFS_MOUNT_H__ */ 335#endif /* __XFS_MOUNT_H__ */
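
With xfs_mod_incore_sb() and the xfs_mod_sb batch interface gone, each counter gets a dedicated helper with an explicit prototype, as declared above. A hypothetical call site as a sketch (the helper is from the patch, the wrapper is not): consuming blocks means passing a negative delta, and reserved == true permits dipping into the reserve pool:

	/* Sketch: consume nblocks from free space, reserve pool allowed. */
	static int ex_reserve_blocks(struct xfs_mount *mp, int64_t nblocks)
	{
		return xfs_mod_fdblocks(mp, -nblocks, true);
	}
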
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 30ecca3037e3..f8a674d7f092 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -437,7 +437,7 @@ xfs_mru_cache_insert(
437 if (!mru || !mru->lists) 437 if (!mru || !mru->lists)
438 return -EINVAL; 438 return -EINVAL;
439 439
440 if (radix_tree_preload(GFP_KERNEL)) 440 if (radix_tree_preload(GFP_NOFS))
441 return -ENOMEM; 441 return -ENOMEM;
442 442
443 INIT_LIST_HEAD(&elem->list_node); 443 INIT_LIST_HEAD(&elem->list_node);
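
The GFP_KERNEL to GFP_NOFS change above matters because this insert can run in filesystem context, where a GFP_KERNEL allocation could recurse into filesystem reclaim. The standard preload/insert pairing looks like this (a sketch with illustrative names; a real caller such as xfs_mru_cache additionally serialises inserts with its own lock):

	#include <linux/errno.h>
	#include <linux/radix-tree.h>

	static RADIX_TREE(ex_tree, GFP_NOFS);	/* node allocations avoid fs reclaim */

	static int ex_insert(unsigned long index, void *item)
	{
		int error;

		/* preallocate nodes so the insert itself needn't allocate */
		if (radix_tree_preload(GFP_NOFS))
			return -ENOMEM;

		error = radix_tree_insert(&ex_tree, index, item);
		radix_tree_preload_end();	/* re-enables preemption */
		return error;
	}
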
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 365dd57ea760..981a657eca39 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -31,7 +31,8 @@
31int 31int
32xfs_break_layouts( 32xfs_break_layouts(
33 struct inode *inode, 33 struct inode *inode,
34 uint *iolock) 34 uint *iolock,
35 bool with_imutex)
35{ 36{
36 struct xfs_inode *ip = XFS_I(inode); 37 struct xfs_inode *ip = XFS_I(inode);
37 int error; 38 int error;
@@ -40,8 +41,12 @@ xfs_break_layouts(
40 41
41 while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { 42 while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
42 xfs_iunlock(ip, *iolock); 43 xfs_iunlock(ip, *iolock);
44 if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
45 mutex_unlock(&inode->i_mutex);
43 error = break_layout(inode, true); 46 error = break_layout(inode, true);
44 *iolock = XFS_IOLOCK_EXCL; 47 *iolock = XFS_IOLOCK_EXCL;
48 if (with_imutex)
49 mutex_lock(&inode->i_mutex);
45 xfs_ilock(ip, *iolock); 50 xfs_ilock(ip, *iolock);
46 } 51 }
47 52
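
The new with_imutex argument encodes a lock-ordering rule: break_layout(inode, true) can sleep waiting for the client to return its layout, so a caller that holds inode->i_mutex must drop it together with the iolock and retake both in i_mutex -> iolock order. A hypothetical caller, sketched under the assumption that it already follows that ordering:

	/* Sketch: fallocate-style prologue holding i_mutex plus the XFS iolock. */
	static int ex_fallocate_prologue(struct inode *inode, struct xfs_inode *ip)
	{
		uint iolock = XFS_IOLOCK_EXCL;
		int error;

		mutex_lock(&inode->i_mutex);
		xfs_ilock(ip, iolock);

		/* may drop and retake both locks, preserving the order above */
		error = xfs_break_layouts(inode, &iolock, true);
		if (error) {
			xfs_iunlock(ip, iolock);
			mutex_unlock(&inode->i_mutex);
		}
		return error;
	}
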
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index b7fbfce660f6..8147ac108820 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -8,9 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
8int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, 8int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
9 struct iattr *iattr); 9 struct iattr *iattr);
10 10
11int xfs_break_layouts(struct inode *inode, uint *iolock); 11int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex);
12#else 12#else
13static inline int xfs_break_layouts(struct inode *inode, uint *iolock) 13static inline int
14xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex)
14{ 15{
15 return 0; 16 return 0;
16} 17}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index fbbb9e62e274..5538468c7f63 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -719,6 +719,7 @@ xfs_qm_qino_alloc(
719 xfs_trans_t *tp; 719 xfs_trans_t *tp;
720 int error; 720 int error;
721 int committed; 721 int committed;
722 bool need_alloc = true;
722 723
723 *ip = NULL; 724 *ip = NULL;
724 /* 725 /*
@@ -747,6 +748,7 @@ xfs_qm_qino_alloc(
747 return error; 748 return error;
748 mp->m_sb.sb_gquotino = NULLFSINO; 749 mp->m_sb.sb_gquotino = NULLFSINO;
749 mp->m_sb.sb_pquotino = NULLFSINO; 750 mp->m_sb.sb_pquotino = NULLFSINO;
751 need_alloc = false;
750 } 752 }
751 } 753 }
752 754
@@ -758,7 +760,7 @@ xfs_qm_qino_alloc(
758 return error; 760 return error;
759 } 761 }
760 762
761 if (!*ip) { 763 if (need_alloc) {
762 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, 764 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
763 &committed); 765 &committed);
764 if (error) { 766 if (error) {
@@ -794,11 +796,14 @@ xfs_qm_qino_alloc(
794 spin_unlock(&mp->m_sb_lock); 796 spin_unlock(&mp->m_sb_lock);
795 xfs_log_sb(tp); 797 xfs_log_sb(tp);
796 798
797 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 799 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
800 if (error) {
801 ASSERT(XFS_FORCED_SHUTDOWN(mp));
798 xfs_alert(mp, "%s failed (error %d)!", __func__, error); 802 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
799 return error;
800 } 803 }
801 return 0; 804 if (need_alloc)
805 xfs_finish_inode_setup(*ip);
806 return error;
802} 807}
803 808
804 809
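
The need_alloc flag supports a two-phase inode setup: a freshly allocated quota inode is only made fully visible via xfs_finish_inode_setup() once the transaction has committed (or failed), so an error in the commit path cannot recurse into inode teardown on a half-constructed inode. The shape of the pattern, with hypothetical helpers standing in for the transaction plumbing:

	struct ex_ctx;				/* hypothetical transaction context */
	int ex_ialloc(struct ex_ctx *ctx, struct xfs_inode **ipp);
	int ex_commit(struct ex_ctx *ctx);

	/* Sketch: finish VFS-visible setup only after the transaction ends. */
	static int ex_create(struct ex_ctx *ctx)
	{
		struct xfs_inode *ip = NULL;
		int error;

		error = ex_ialloc(ctx, &ip);		/* phase 1: in transaction */
		if (!error)
			error = ex_commit(ctx);
		if (ip)
			xfs_finish_inode_setup(ip);	/* phase 2: always runs */
		return error;
	}
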
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8fcc4ccc5c79..5f357ca97e76 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -109,8 +109,6 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
109#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ 109#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
110#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ 110#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
111#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ 111#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
112#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
113#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
114#define MNTOPT_DISCARD "discard" /* Discard unused blocks */ 112#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
115#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ 113#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
116 114
@@ -361,28 +359,10 @@ xfs_parseargs(
361 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 359 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
362 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); 360 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
363 mp->m_qflags &= ~XFS_GQUOTA_ENFD; 361 mp->m_qflags &= ~XFS_GQUOTA_ENFD;
364 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
365 xfs_warn(mp,
366 "delaylog is the default now, option is deprecated.");
367 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
368 xfs_warn(mp,
369 "nodelaylog support has been removed, option is deprecated.");
370 } else if (!strcmp(this_char, MNTOPT_DISCARD)) { 362 } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
371 mp->m_flags |= XFS_MOUNT_DISCARD; 363 mp->m_flags |= XFS_MOUNT_DISCARD;
372 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { 364 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
373 mp->m_flags &= ~XFS_MOUNT_DISCARD; 365 mp->m_flags &= ~XFS_MOUNT_DISCARD;
374 } else if (!strcmp(this_char, "ihashsize")) {
375 xfs_warn(mp,
376 "ihashsize no longer used, option is deprecated.");
377 } else if (!strcmp(this_char, "osyncisdsync")) {
378 xfs_warn(mp,
379 "osyncisdsync has no effect, option is deprecated.");
380 } else if (!strcmp(this_char, "osyncisosync")) {
381 xfs_warn(mp,
382 "osyncisosync has no effect, option is deprecated.");
383 } else if (!strcmp(this_char, "irixsgid")) {
384 xfs_warn(mp,
385 "irixsgid is now a sysctl(2) variable, option is deprecated.");
386 } else { 366 } else {
387 xfs_warn(mp, "unknown mount option [%s].", this_char); 367 xfs_warn(mp, "unknown mount option [%s].", this_char);
388 return -EINVAL; 368 return -EINVAL;
@@ -986,6 +966,8 @@ xfs_fs_inode_init_once(
986 atomic_set(&ip->i_pincount, 0); 966 atomic_set(&ip->i_pincount, 0);
987 spin_lock_init(&ip->i_flags_lock); 967 spin_lock_init(&ip->i_flags_lock);
988 968
969 mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
970 "xfsino", ip->i_ino);
989 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, 971 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
990 "xfsino", ip->i_ino); 972 "xfsino", ip->i_ino);
991} 973}
@@ -1033,23 +1015,6 @@ xfs_free_fsname(
1033 kfree(mp->m_logname); 1015 kfree(mp->m_logname);
1034} 1016}
1035 1017
1036STATIC void
1037xfs_fs_put_super(
1038 struct super_block *sb)
1039{
1040 struct xfs_mount *mp = XFS_M(sb);
1041
1042 xfs_filestream_unmount(mp);
1043 xfs_unmountfs(mp);
1044
1045 xfs_freesb(mp);
1046 xfs_icsb_destroy_counters(mp);
1047 xfs_destroy_mount_workqueues(mp);
1048 xfs_close_devices(mp);
1049 xfs_free_fsname(mp);
1050 kfree(mp);
1051}
1052
1053STATIC int 1018STATIC int
1054xfs_fs_sync_fs( 1019xfs_fs_sync_fs(
1055 struct super_block *sb, 1020 struct super_block *sb,
@@ -1085,6 +1050,9 @@ xfs_fs_statfs(
1085 xfs_sb_t *sbp = &mp->m_sb; 1050 xfs_sb_t *sbp = &mp->m_sb;
1086 struct xfs_inode *ip = XFS_I(dentry->d_inode); 1051 struct xfs_inode *ip = XFS_I(dentry->d_inode);
1087 __uint64_t fakeinos, id; 1052 __uint64_t fakeinos, id;
1053 __uint64_t icount;
1054 __uint64_t ifree;
1055 __uint64_t fdblocks;
1088 xfs_extlen_t lsize; 1056 xfs_extlen_t lsize;
1089 __int64_t ffree; 1057 __int64_t ffree;
1090 1058
@@ -1095,17 +1063,21 @@ xfs_fs_statfs(
1095 statp->f_fsid.val[0] = (u32)id; 1063 statp->f_fsid.val[0] = (u32)id;
1096 statp->f_fsid.val[1] = (u32)(id >> 32); 1064 statp->f_fsid.val[1] = (u32)(id >> 32);
1097 1065
1098 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); 1066 icount = percpu_counter_sum(&mp->m_icount);
1067 ifree = percpu_counter_sum(&mp->m_ifree);
1068 fdblocks = percpu_counter_sum(&mp->m_fdblocks);
1099 1069
1100 spin_lock(&mp->m_sb_lock); 1070 spin_lock(&mp->m_sb_lock);
1101 statp->f_bsize = sbp->sb_blocksize; 1071 statp->f_bsize = sbp->sb_blocksize;
1102 lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; 1072 lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
1103 statp->f_blocks = sbp->sb_dblocks - lsize; 1073 statp->f_blocks = sbp->sb_dblocks - lsize;
1104 statp->f_bfree = statp->f_bavail = 1074 spin_unlock(&mp->m_sb_lock);
1105 sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); 1075
1076 statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1077 statp->f_bavail = statp->f_bfree;
1078
1106 fakeinos = statp->f_bfree << sbp->sb_inopblog; 1079 fakeinos = statp->f_bfree << sbp->sb_inopblog;
1107 statp->f_files = 1080 statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
1108 MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
1109 if (mp->m_maxicount) 1081 if (mp->m_maxicount)
1110 statp->f_files = min_t(typeof(statp->f_files), 1082 statp->f_files = min_t(typeof(statp->f_files),
1111 statp->f_files, 1083 statp->f_files,
@@ -1117,10 +1089,9 @@ xfs_fs_statfs(
1117 sbp->sb_icount); 1089 sbp->sb_icount);
1118 1090
1119 /* make sure statp->f_ffree does not underflow */ 1091 /* make sure statp->f_ffree does not underflow */
1120 ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1092 ffree = statp->f_files - (icount - ifree);
1121 statp->f_ffree = max_t(__int64_t, ffree, 0); 1093 statp->f_ffree = max_t(__int64_t, ffree, 0);
1122 1094
1123 spin_unlock(&mp->m_sb_lock);
1124 1095
1125 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 1096 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1126 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == 1097 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
@@ -1256,6 +1227,12 @@ xfs_fs_remount(
1256 1227
1257 /* ro -> rw */ 1228 /* ro -> rw */
1258 if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { 1229 if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
1230 if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
1231 xfs_warn(mp,
1232 "ro->rw transition prohibited on norecovery mount");
1233 return -EINVAL;
1234 }
1235
1259 mp->m_flags &= ~XFS_MOUNT_RDONLY; 1236 mp->m_flags &= ~XFS_MOUNT_RDONLY;
1260 1237
1261 /* 1238 /*
@@ -1401,6 +1378,51 @@ xfs_finish_flags(
1401 return 0; 1378 return 0;
1402} 1379}
1403 1380
1381static int
1382xfs_init_percpu_counters(
1383 struct xfs_mount *mp)
1384{
1385 int error;
1386
1387 error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
1388 if (error)
1389 return -ENOMEM;
1390
1391 error = percpu_counter_init(&mp->m_ifree, 0, GFP_KERNEL);
1392 if (error)
1393 goto free_icount;
1394
1395 error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
1396 if (error)
1397 goto free_ifree;
1398
1399 return 0;
1400
1401free_ifree:
1402 percpu_counter_destroy(&mp->m_ifree);
1403free_icount:
1404 percpu_counter_destroy(&mp->m_icount);
1405 return -ENOMEM;
1406}
1407
1408void
1409xfs_reinit_percpu_counters(
1410 struct xfs_mount *mp)
1411{
1412 percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
1413 percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
1414 percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
1415}
1416
1417static void
1418xfs_destroy_percpu_counters(
1419 struct xfs_mount *mp)
1420{
1421 percpu_counter_destroy(&mp->m_icount);
1422 percpu_counter_destroy(&mp->m_ifree);
1423 percpu_counter_destroy(&mp->m_fdblocks);
1424}
1425
1404STATIC int 1426STATIC int
1405xfs_fs_fill_super( 1427xfs_fs_fill_super(
1406 struct super_block *sb, 1428 struct super_block *sb,
@@ -1449,7 +1471,7 @@ xfs_fs_fill_super(
1449 if (error) 1471 if (error)
1450 goto out_close_devices; 1472 goto out_close_devices;
1451 1473
1452 error = xfs_icsb_init_counters(mp); 1474 error = xfs_init_percpu_counters(mp);
1453 if (error) 1475 if (error)
1454 goto out_destroy_workqueues; 1476 goto out_destroy_workqueues;
1455 1477
@@ -1507,7 +1529,7 @@ xfs_fs_fill_super(
1507 out_free_sb: 1529 out_free_sb:
1508 xfs_freesb(mp); 1530 xfs_freesb(mp);
1509 out_destroy_counters: 1531 out_destroy_counters:
1510 xfs_icsb_destroy_counters(mp); 1532 xfs_destroy_percpu_counters(mp);
1511out_destroy_workqueues: 1533out_destroy_workqueues:
1512 xfs_destroy_mount_workqueues(mp); 1534 xfs_destroy_mount_workqueues(mp);
1513 out_close_devices: 1535 out_close_devices:
@@ -1524,6 +1546,24 @@ out_destroy_workqueues:
1524 goto out_free_sb; 1546 goto out_free_sb;
1525} 1547}
1526 1548
1549STATIC void
1550xfs_fs_put_super(
1551 struct super_block *sb)
1552{
1553 struct xfs_mount *mp = XFS_M(sb);
1554
1555 xfs_notice(mp, "Unmounting Filesystem");
1556 xfs_filestream_unmount(mp);
1557 xfs_unmountfs(mp);
1558
1559 xfs_freesb(mp);
1560 xfs_destroy_percpu_counters(mp);
1561 xfs_destroy_mount_workqueues(mp);
1562 xfs_close_devices(mp);
1563 xfs_free_fsname(mp);
1564 kfree(mp);
1565}
1566
1527STATIC struct dentry * 1567STATIC struct dentry *
1528xfs_fs_mount( 1568xfs_fs_mount(
1529 struct file_system_type *fs_type, 1569 struct file_system_type *fs_type,
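
Two details in the xfs_super.c hunks above are worth noting: statfs now derives exact values with percpu_counter_sum() instead of syncing the icsb counters under m_sb_lock, and the lock is held only for fields that still live in the superblock proper. The accuracy/cost trade-off between the two percpu_counter read interfaces, sketched:

	#include <linux/percpu_counter.h>

	/* Exact but O(nr_cpus): folds in every cpu's delta; right for statfs. */
	static inline s64 ex_exact(struct percpu_counter *c)
	{
		return percpu_counter_sum(c);
	}

	/* Cheap but approximate (clamped at zero): fine for heuristics. */
	static inline s64 ex_approx(struct percpu_counter *c)
	{
		return percpu_counter_read_positive(c);
	}
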
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 2b830c2f322e..499058fea303 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -72,6 +72,8 @@ extern const struct export_operations xfs_export_operations;
72extern const struct xattr_handler *xfs_xattr_handlers[]; 72extern const struct xattr_handler *xfs_xattr_handlers[];
73extern const struct quotactl_ops xfs_quotactl_operations; 73extern const struct quotactl_ops xfs_quotactl_operations;
74 74
75extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
76
75#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 77#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
76 78
77#endif /* __XFS_SUPER_H__ */ 79#endif /* __XFS_SUPER_H__ */
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 25791df6f638..3df411eadb86 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -177,7 +177,7 @@ xfs_symlink(
177 int pathlen; 177 int pathlen;
178 struct xfs_bmap_free free_list; 178 struct xfs_bmap_free free_list;
179 xfs_fsblock_t first_block; 179 xfs_fsblock_t first_block;
180 bool unlock_dp_on_error = false; 180 bool unlock_dp_on_error = false;
181 uint cancel_flags; 181 uint cancel_flags;
182 int committed; 182 int committed;
183 xfs_fileoff_t first_fsb; 183 xfs_fileoff_t first_fsb;
@@ -221,7 +221,7 @@ xfs_symlink(
221 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, 221 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
222 &udqp, &gdqp, &pdqp); 222 &udqp, &gdqp, &pdqp);
223 if (error) 223 if (error)
224 goto std_return; 224 return error;
225 225
226 tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); 226 tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
227 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 227 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
@@ -241,7 +241,7 @@ xfs_symlink(
241 } 241 }
242 if (error) { 242 if (error) {
243 cancel_flags = 0; 243 cancel_flags = 0;
244 goto error_return; 244 goto out_trans_cancel;
245 } 245 }
246 246
247 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 247 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
@@ -252,7 +252,7 @@ xfs_symlink(
252 */ 252 */
253 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { 253 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
254 error = -EPERM; 254 error = -EPERM;
255 goto error_return; 255 goto out_trans_cancel;
256 } 256 }
257 257
258 /* 258 /*
@@ -261,7 +261,7 @@ xfs_symlink(
261 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, 261 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
262 pdqp, resblks, 1, 0); 262 pdqp, resblks, 1, 0);
263 if (error) 263 if (error)
264 goto error_return; 264 goto out_trans_cancel;
265 265
266 /* 266 /*
267 * Check for ability to enter directory entry, if no space reserved. 267 * Check for ability to enter directory entry, if no space reserved.
@@ -269,7 +269,7 @@ xfs_symlink(
269 if (!resblks) { 269 if (!resblks) {
270 error = xfs_dir_canenter(tp, dp, link_name); 270 error = xfs_dir_canenter(tp, dp, link_name);
271 if (error) 271 if (error)
272 goto error_return; 272 goto out_trans_cancel;
273 } 273 }
274 /* 274 /*
275 * Initialize the bmap freelist prior to calling either 275 * Initialize the bmap freelist prior to calling either
@@ -282,15 +282,14 @@ xfs_symlink(
282 */ 282 */
283 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, 283 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
284 prid, resblks > 0, &ip, NULL); 284 prid, resblks > 0, &ip, NULL);
285 if (error) { 285 if (error)
286 if (error == -ENOSPC) 286 goto out_trans_cancel;
287 goto error_return;
288 goto error1;
289 }
290 287
291 /* 288 /*
292 * An error after we've joined dp to the transaction will result in the 289 * Now we join the directory inode to the transaction. We do not do it
293 * transaction cancel unlocking dp so don't do it explicitly in the 290 * earlier because xfs_dir_ialloc might commit the previous transaction
291 * (and release all the locks). An error from here on will result in
292 * the transaction cancel unlocking dp so don't do it explicitly in the
294 * error path. 293 * error path.
295 */ 294 */
296 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 295 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
@@ -330,7 +329,7 @@ xfs_symlink(
330 XFS_BMAPI_METADATA, &first_block, resblks, 329 XFS_BMAPI_METADATA, &first_block, resblks,
331 mval, &nmaps, &free_list); 330 mval, &nmaps, &free_list);
332 if (error) 331 if (error)
333 goto error2; 332 goto out_bmap_cancel;
334 333
335 if (resblks) 334 if (resblks)
336 resblks -= fs_blocks; 335 resblks -= fs_blocks;
@@ -348,7 +347,7 @@ xfs_symlink(
348 BTOBB(byte_cnt), 0); 347 BTOBB(byte_cnt), 0);
349 if (!bp) { 348 if (!bp) {
350 error = -ENOMEM; 349 error = -ENOMEM;
351 goto error2; 350 goto out_bmap_cancel;
352 } 351 }
353 bp->b_ops = &xfs_symlink_buf_ops; 352 bp->b_ops = &xfs_symlink_buf_ops;
354 353
@@ -378,7 +377,7 @@ xfs_symlink(
378 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, 377 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
379 &first_block, &free_list, resblks); 378 &first_block, &free_list, resblks);
380 if (error) 379 if (error)
381 goto error2; 380 goto out_bmap_cancel;
382 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 381 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
383 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 382 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
384 383
@@ -392,10 +391,13 @@ xfs_symlink(
392 } 391 }
393 392
394 error = xfs_bmap_finish(&tp, &free_list, &committed); 393 error = xfs_bmap_finish(&tp, &free_list, &committed);
395 if (error) { 394 if (error)
396 goto error2; 395 goto out_bmap_cancel;
397 } 396
398 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 397 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
398 if (error)
399 goto out_release_inode;
400
399 xfs_qm_dqrele(udqp); 401 xfs_qm_dqrele(udqp);
400 xfs_qm_dqrele(gdqp); 402 xfs_qm_dqrele(gdqp);
401 xfs_qm_dqrele(pdqp); 403 xfs_qm_dqrele(pdqp);
@@ -403,20 +405,28 @@ xfs_symlink(
403 *ipp = ip; 405 *ipp = ip;
404 return 0; 406 return 0;
405 407
406 error2: 408out_bmap_cancel:
407 IRELE(ip);
408 error1:
409 xfs_bmap_cancel(&free_list); 409 xfs_bmap_cancel(&free_list);
410 cancel_flags |= XFS_TRANS_ABORT; 410 cancel_flags |= XFS_TRANS_ABORT;
411 error_return: 411out_trans_cancel:
412 xfs_trans_cancel(tp, cancel_flags); 412 xfs_trans_cancel(tp, cancel_flags);
413out_release_inode:
414 /*
415 * Wait until after the current transaction is aborted to finish the
416 * setup of the inode and release the inode. This prevents recursive
417 * transactions and deadlocks from xfs_inactive.
418 */
419 if (ip) {
420 xfs_finish_inode_setup(ip);
421 IRELE(ip);
422 }
423
413 xfs_qm_dqrele(udqp); 424 xfs_qm_dqrele(udqp);
414 xfs_qm_dqrele(gdqp); 425 xfs_qm_dqrele(gdqp);
415 xfs_qm_dqrele(pdqp); 426 xfs_qm_dqrele(pdqp);
416 427
417 if (unlock_dp_on_error) 428 if (unlock_dp_on_error)
418 xfs_iunlock(dp, XFS_ILOCK_EXCL); 429 xfs_iunlock(dp, XFS_ILOCK_EXCL);
419 std_return:
420 return error; 430 return error;
421} 431}
422 432
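
The relabelled error path above (error1/error2/error_return becoming out_bmap_cancel/out_trans_cancel/out_release_inode) follows the usual kernel unwind idiom: one label per teardown step, placed in reverse order of setup, so each failure point jumps to exactly the cleanup it needs. In miniature, with hypothetical steps:

	static int ex_step_a(void) { return 0; }	/* hypothetical setup */
	static void ex_undo_a(void) { }
	static int ex_step_b(void) { return 0; }

	static int ex_setup(void)
	{
		int error;

		error = ex_step_a();
		if (error)
			goto out;
		error = ex_step_b();
		if (error)
			goto out_undo_a;
		return 0;

	out_undo_a:
		ex_undo_a();
	out:
		return error;
	}
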
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 51372e34d988..615781bf4ee5 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -115,7 +115,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class,
115 __entry->refcount = refcount; 115 __entry->refcount = refcount;
116 __entry->caller_ip = caller_ip; 116 __entry->caller_ip = caller_ip;
117 ), 117 ),
118 TP_printk("dev %d:%d agno %u refcount %d caller %pf", 118 TP_printk("dev %d:%d agno %u refcount %d caller %ps",
119 MAJOR(__entry->dev), MINOR(__entry->dev), 119 MAJOR(__entry->dev), MINOR(__entry->dev),
120 __entry->agno, 120 __entry->agno,
121 __entry->refcount, 121 __entry->refcount,
@@ -239,7 +239,7 @@ TRACE_EVENT(xfs_iext_insert,
239 __entry->caller_ip = caller_ip; 239 __entry->caller_ip = caller_ip;
240 ), 240 ),
241 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " 241 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
242 "offset %lld block %lld count %lld flag %d caller %pf", 242 "offset %lld block %lld count %lld flag %d caller %ps",
243 MAJOR(__entry->dev), MINOR(__entry->dev), 243 MAJOR(__entry->dev), MINOR(__entry->dev),
244 __entry->ino, 244 __entry->ino,
245 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), 245 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
@@ -283,7 +283,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
283 __entry->caller_ip = caller_ip; 283 __entry->caller_ip = caller_ip;
284 ), 284 ),
285 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " 285 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
286 "offset %lld block %lld count %lld flag %d caller %pf", 286 "offset %lld block %lld count %lld flag %d caller %ps",
287 MAJOR(__entry->dev), MINOR(__entry->dev), 287 MAJOR(__entry->dev), MINOR(__entry->dev),
288 __entry->ino, 288 __entry->ino,
289 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), 289 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
@@ -329,7 +329,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
329 __entry->caller_ip = caller_ip; 329 __entry->caller_ip = caller_ip;
330 ), 330 ),
331 TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " 331 TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d "
332 "lock %d flags %s caller %pf", 332 "lock %d flags %s caller %ps",
333 MAJOR(__entry->dev), MINOR(__entry->dev), 333 MAJOR(__entry->dev), MINOR(__entry->dev),
334 (unsigned long long)__entry->bno, 334 (unsigned long long)__entry->bno,
335 __entry->nblks, 335 __entry->nblks,
@@ -402,7 +402,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
402 __entry->caller_ip = caller_ip; 402 __entry->caller_ip = caller_ip;
403 ), 403 ),
404 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " 404 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
405 "lock %d flags %s caller %pf", 405 "lock %d flags %s caller %ps",
406 MAJOR(__entry->dev), MINOR(__entry->dev), 406 MAJOR(__entry->dev), MINOR(__entry->dev),
407 (unsigned long long)__entry->bno, 407 (unsigned long long)__entry->bno,
408 __entry->buffer_length, 408 __entry->buffer_length,
@@ -447,7 +447,7 @@ TRACE_EVENT(xfs_buf_ioerror,
447 __entry->caller_ip = caller_ip; 447 __entry->caller_ip = caller_ip;
448 ), 448 ),
449 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " 449 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
450 "lock %d error %d flags %s caller %pf", 450 "lock %d error %d flags %s caller %ps",
451 MAJOR(__entry->dev), MINOR(__entry->dev), 451 MAJOR(__entry->dev), MINOR(__entry->dev),
452 (unsigned long long)__entry->bno, 452 (unsigned long long)__entry->bno,
453 __entry->buffer_length, 453 __entry->buffer_length,
@@ -613,7 +613,7 @@ DECLARE_EVENT_CLASS(xfs_lock_class,
613 __entry->lock_flags = lock_flags; 613 __entry->lock_flags = lock_flags;
614 __entry->caller_ip = caller_ip; 614 __entry->caller_ip = caller_ip;
615 ), 615 ),
616 TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", 616 TP_printk("dev %d:%d ino 0x%llx flags %s caller %ps",
617 MAJOR(__entry->dev), MINOR(__entry->dev), 617 MAJOR(__entry->dev), MINOR(__entry->dev),
618 __entry->ino, 618 __entry->ino,
619 __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), 619 __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS),
@@ -664,6 +664,7 @@ DEFINE_INODE_EVENT(xfs_alloc_file_space);
664DEFINE_INODE_EVENT(xfs_free_file_space); 664DEFINE_INODE_EVENT(xfs_free_file_space);
665DEFINE_INODE_EVENT(xfs_zero_file_space); 665DEFINE_INODE_EVENT(xfs_zero_file_space);
666DEFINE_INODE_EVENT(xfs_collapse_file_space); 666DEFINE_INODE_EVENT(xfs_collapse_file_space);
667DEFINE_INODE_EVENT(xfs_insert_file_space);
667DEFINE_INODE_EVENT(xfs_readdir); 668DEFINE_INODE_EVENT(xfs_readdir);
668#ifdef CONFIG_XFS_POSIX_ACL 669#ifdef CONFIG_XFS_POSIX_ACL
669DEFINE_INODE_EVENT(xfs_get_acl); 670DEFINE_INODE_EVENT(xfs_get_acl);
@@ -685,6 +686,9 @@ DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
685DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); 686DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
686DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); 687DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
687 688
689DEFINE_INODE_EVENT(xfs_filemap_fault);
690DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
691
688DECLARE_EVENT_CLASS(xfs_iref_class, 692DECLARE_EVENT_CLASS(xfs_iref_class,
689 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), 693 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
690 TP_ARGS(ip, caller_ip), 694 TP_ARGS(ip, caller_ip),
@@ -702,7 +706,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
702 __entry->pincount = atomic_read(&ip->i_pincount); 706 __entry->pincount = atomic_read(&ip->i_pincount);
703 __entry->caller_ip = caller_ip; 707 __entry->caller_ip = caller_ip;
704 ), 708 ),
705 TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf", 709 TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %ps",
706 MAJOR(__entry->dev), MINOR(__entry->dev), 710 MAJOR(__entry->dev), MINOR(__entry->dev),
707 __entry->ino, 711 __entry->ino,
708 __entry->count, 712 __entry->count,
@@ -1217,6 +1221,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
1217DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); 1221DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
1218DEFINE_IOMAP_EVENT(xfs_get_blocks_found); 1222DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
1219DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); 1223DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
1224DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
1225DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
1226DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
1227DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
1228DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
1220 1229
1221DECLARE_EVENT_CLASS(xfs_simple_io_class, 1230DECLARE_EVENT_CLASS(xfs_simple_io_class,
1222 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 1231 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1333,7 +1342,7 @@ TRACE_EVENT(xfs_bunmap,
1333 __entry->flags = flags; 1342 __entry->flags = flags;
1334 ), 1343 ),
1335 TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx" 1344 TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx"
1336 "flags %s caller %pf", 1345 "flags %s caller %ps",
1337 MAJOR(__entry->dev), MINOR(__entry->dev), 1346 MAJOR(__entry->dev), MINOR(__entry->dev),
1338 __entry->ino, 1347 __entry->ino,
1339 __entry->size, 1348 __entry->size,
@@ -1466,7 +1475,7 @@ TRACE_EVENT(xfs_agf,
1466 ), 1475 ),
1467 TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " 1476 TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u "
1468 "levels b %u c %u flfirst %u fllast %u flcount %u " 1477 "levels b %u c %u flfirst %u fllast %u flcount %u "
1469 "freeblks %u longest %u caller %pf", 1478 "freeblks %u longest %u caller %ps",
1470 MAJOR(__entry->dev), MINOR(__entry->dev), 1479 MAJOR(__entry->dev), MINOR(__entry->dev),
1471 __entry->agno, 1480 __entry->agno,
1472 __print_flags(__entry->flags, "|", XFS_AGF_FLAGS), 1481 __print_flags(__entry->flags, "|", XFS_AGF_FLAGS),
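Note on the %pf to %ps conversions throughout this file: these trace points record a raw return address (caller_ip), and %ps is the vsprintf specifier that symbolizes a plain code address, while %pf expects a function descriptor, which most architectures do not use. A minimal sketch of the same idea (the function name is illustrative, not part of the patch):

	#include <linux/kernel.h>

	/* Hypothetical illustration: %ps resolves a plain code address to
	 * its symbol name (e.g. "xfs_symlink"), matching what the trace
	 * points above store in caller_ip via _RET_IP_.
	 */
	static noinline void example_report_caller(void)
	{
		printk(KERN_DEBUG "called from %ps\n", (void *)_RET_IP_);
	}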
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index eb90cd59a0ec..220ef2c906b2 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -173,7 +173,7 @@ xfs_trans_reserve(
173 uint rtextents) 173 uint rtextents)
174{ 174{
175 int error = 0; 175 int error = 0;
176 int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; 176 bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
177 177
178 /* Mark this thread as being in a transaction */ 178 /* Mark this thread as being in a transaction */
179 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); 179 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
@@ -184,8 +184,7 @@ xfs_trans_reserve(
184 * fail if the count would go below zero. 184 * fail if the count would go below zero.
185 */ 185 */
186 if (blocks > 0) { 186 if (blocks > 0) {
187 error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, 187 error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
188 -((int64_t)blocks), rsvd);
189 if (error != 0) { 188 if (error != 0) {
190 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 189 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
191 return -ENOSPC; 190 return -ENOSPC;
@@ -236,8 +235,7 @@ xfs_trans_reserve(
236 * fail if the count would go below zero. 235 * fail if the count would go below zero.
237 */ 236 */
238 if (rtextents > 0) { 237 if (rtextents > 0) {
239 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS, 238 error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
240 -((int64_t)rtextents), rsvd);
241 if (error) { 239 if (error) {
242 error = -ENOSPC; 240 error = -ENOSPC;
243 goto undo_log; 241 goto undo_log;
@@ -268,8 +266,7 @@ undo_log:
268 266
269undo_blocks: 267undo_blocks:
270 if (blocks > 0) { 268 if (blocks > 0) {
271 xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, 269 xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
272 (int64_t)blocks, rsvd);
273 tp->t_blk_res = 0; 270 tp->t_blk_res = 0;
274 } 271 }
275 272
@@ -488,6 +485,54 @@ xfs_trans_apply_sb_deltas(
488 sizeof(sbp->sb_frextents) - 1); 485 sizeof(sbp->sb_frextents) - 1);
489} 486}
490 487
488STATIC int
489xfs_sb_mod8(
490 uint8_t *field,
491 int8_t delta)
492{
493 int8_t counter = *field;
494
495 counter += delta;
496 if (counter < 0) {
497 ASSERT(0);
498 return -EINVAL;
499 }
500 *field = counter;
501 return 0;
502}
503
504STATIC int
505xfs_sb_mod32(
506 uint32_t *field,
507 int32_t delta)
508{
509 int32_t counter = *field;
510
511 counter += delta;
512 if (counter < 0) {
513 ASSERT(0);
514 return -EINVAL;
515 }
516 *field = counter;
517 return 0;
518}
519
520STATIC int
521xfs_sb_mod64(
522 uint64_t *field,
523 int64_t delta)
524{
525 int64_t counter = *field;
526
527 counter += delta;
528 if (counter < 0) {
529 ASSERT(0);
530 return -EINVAL;
531 }
532 *field = counter;
533 return 0;
534}
535
491/* 536/*
492 * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations 537 * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations
493 * and apply superblock counter changes to the in-core superblock. The 538 * and apply superblock counter changes to the in-core superblock. The
@@ -495,13 +540,6 @@ xfs_trans_apply_sb_deltas(
495 * applied to the in-core superblock. The idea is that that has already been 540 * applied to the in-core superblock. The idea is that that has already been
496 * done. 541 * done.
497 * 542 *
498 * This is done efficiently with a single call to xfs_mod_incore_sb_batch().
 499 * However, we have to ensure that we modify each superblock field only
 500 * once because the application of the delta values may not be atomic. That can
 501 * lead to ENOSPC races occurring if we have two separate modifications of the
502 * free space counter to put back the entire reservation and then take away
503 * what we used.
504 *
505 * If we are not logging superblock counters, then the inode allocated/free and 543 * If we are not logging superblock counters, then the inode allocated/free and
506 * used block counts are not updated in the on disk superblock. In this case, 544 * used block counts are not updated in the on disk superblock. In this case,
507 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we 545 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
@@ -509,21 +547,15 @@ xfs_trans_apply_sb_deltas(
509 */ 547 */
510void 548void
511xfs_trans_unreserve_and_mod_sb( 549xfs_trans_unreserve_and_mod_sb(
512 xfs_trans_t *tp) 550 struct xfs_trans *tp)
513{ 551{
514 xfs_mod_sb_t msb[9]; /* If you add cases, add entries */ 552 struct xfs_mount *mp = tp->t_mountp;
515 xfs_mod_sb_t *msbp; 553 bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
516 xfs_mount_t *mp = tp->t_mountp; 554 int64_t blkdelta = 0;
517 /* REFERENCED */ 555 int64_t rtxdelta = 0;
518 int error; 556 int64_t idelta = 0;
519 int rsvd; 557 int64_t ifreedelta = 0;
520 int64_t blkdelta = 0; 558 int error;
521 int64_t rtxdelta = 0;
522 int64_t idelta = 0;
523 int64_t ifreedelta = 0;
524
525 msbp = msb;
526 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
527 559
528 /* calculate deltas */ 560 /* calculate deltas */
529 if (tp->t_blk_res > 0) 561 if (tp->t_blk_res > 0)
@@ -547,97 +579,115 @@ xfs_trans_unreserve_and_mod_sb(
547 579
548 /* apply the per-cpu counters */ 580 /* apply the per-cpu counters */
549 if (blkdelta) { 581 if (blkdelta) {
550 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 582 error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
551 blkdelta, rsvd);
552 if (error) 583 if (error)
553 goto out; 584 goto out;
554 } 585 }
555 586
556 if (idelta) { 587 if (idelta) {
557 error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, 588 error = xfs_mod_icount(mp, idelta);
558 idelta, rsvd);
559 if (error) 589 if (error)
560 goto out_undo_fdblocks; 590 goto out_undo_fdblocks;
561 } 591 }
562 592
563 if (ifreedelta) { 593 if (ifreedelta) {
564 error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, 594 error = xfs_mod_ifree(mp, ifreedelta);
565 ifreedelta, rsvd);
566 if (error) 595 if (error)
567 goto out_undo_icount; 596 goto out_undo_icount;
568 } 597 }
569 598
599 if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
600 return;
601
570 /* apply remaining deltas */ 602 /* apply remaining deltas */
571 if (rtxdelta != 0) { 603 spin_lock(&mp->m_sb_lock);
572 msbp->msb_field = XFS_SBS_FREXTENTS; 604 if (rtxdelta) {
573 msbp->msb_delta = rtxdelta; 605 error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta);
574 msbp++; 606 if (error)
607 goto out_undo_ifree;
575 } 608 }
576 609
577 if (tp->t_flags & XFS_TRANS_SB_DIRTY) { 610 if (tp->t_dblocks_delta != 0) {
578 if (tp->t_dblocks_delta != 0) { 611 error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta);
579 msbp->msb_field = XFS_SBS_DBLOCKS; 612 if (error)
580 msbp->msb_delta = tp->t_dblocks_delta; 613 goto out_undo_frextents;
581 msbp++;
582 }
583 if (tp->t_agcount_delta != 0) {
584 msbp->msb_field = XFS_SBS_AGCOUNT;
585 msbp->msb_delta = tp->t_agcount_delta;
586 msbp++;
587 }
588 if (tp->t_imaxpct_delta != 0) {
589 msbp->msb_field = XFS_SBS_IMAX_PCT;
590 msbp->msb_delta = tp->t_imaxpct_delta;
591 msbp++;
592 }
593 if (tp->t_rextsize_delta != 0) {
594 msbp->msb_field = XFS_SBS_REXTSIZE;
595 msbp->msb_delta = tp->t_rextsize_delta;
596 msbp++;
597 }
598 if (tp->t_rbmblocks_delta != 0) {
599 msbp->msb_field = XFS_SBS_RBMBLOCKS;
600 msbp->msb_delta = tp->t_rbmblocks_delta;
601 msbp++;
602 }
603 if (tp->t_rblocks_delta != 0) {
604 msbp->msb_field = XFS_SBS_RBLOCKS;
605 msbp->msb_delta = tp->t_rblocks_delta;
606 msbp++;
607 }
608 if (tp->t_rextents_delta != 0) {
609 msbp->msb_field = XFS_SBS_REXTENTS;
610 msbp->msb_delta = tp->t_rextents_delta;
611 msbp++;
612 }
613 if (tp->t_rextslog_delta != 0) {
614 msbp->msb_field = XFS_SBS_REXTSLOG;
615 msbp->msb_delta = tp->t_rextslog_delta;
616 msbp++;
617 }
618 } 614 }
619 615 if (tp->t_agcount_delta != 0) {
620 /* 616 error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta);
621 * If we need to change anything, do it.
622 */
623 if (msbp > msb) {
624 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
625 (uint)(msbp - msb), rsvd);
626 if (error) 617 if (error)
627 goto out_undo_ifreecount; 618 goto out_undo_dblocks;
628 } 619 }
629 620 if (tp->t_imaxpct_delta != 0) {
621 error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta);
622 if (error)
623 goto out_undo_agcount;
624 }
625 if (tp->t_rextsize_delta != 0) {
626 error = xfs_sb_mod32(&mp->m_sb.sb_rextsize,
627 tp->t_rextsize_delta);
628 if (error)
629 goto out_undo_imaxpct;
630 }
631 if (tp->t_rbmblocks_delta != 0) {
632 error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks,
633 tp->t_rbmblocks_delta);
634 if (error)
635 goto out_undo_rextsize;
636 }
637 if (tp->t_rblocks_delta != 0) {
638 error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta);
639 if (error)
640 goto out_undo_rbmblocks;
641 }
642 if (tp->t_rextents_delta != 0) {
643 error = xfs_sb_mod64(&mp->m_sb.sb_rextents,
644 tp->t_rextents_delta);
645 if (error)
646 goto out_undo_rblocks;
647 }
648 if (tp->t_rextslog_delta != 0) {
649 error = xfs_sb_mod8(&mp->m_sb.sb_rextslog,
650 tp->t_rextslog_delta);
651 if (error)
652 goto out_undo_rextents;
653 }
654 spin_unlock(&mp->m_sb_lock);
630 return; 655 return;
631 656
632out_undo_ifreecount: 657out_undo_rextents:
658 if (tp->t_rextents_delta)
659 xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta);
660out_undo_rblocks:
661 if (tp->t_rblocks_delta)
662 xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta);
663out_undo_rbmblocks:
664 if (tp->t_rbmblocks_delta)
665 xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta);
666out_undo_rextsize:
667 if (tp->t_rextsize_delta)
668 xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta);
669out_undo_imaxpct:
670	if (tp->t_imaxpct_delta)
671 xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta);
672out_undo_agcount:
673 if (tp->t_agcount_delta)
674 xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta);
675out_undo_dblocks:
676 if (tp->t_dblocks_delta)
677 xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta);
678out_undo_frextents:
679 if (rtxdelta)
680 xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta);
681out_undo_ifree:
682 spin_unlock(&mp->m_sb_lock);
633 if (ifreedelta) 683 if (ifreedelta)
634 xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd); 684 xfs_mod_ifree(mp, -ifreedelta);
635out_undo_icount: 685out_undo_icount:
636 if (idelta) 686 if (idelta)
637 xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd); 687 xfs_mod_icount(mp, -idelta);
638out_undo_fdblocks: 688out_undo_fdblocks:
639 if (blkdelta) 689 if (blkdelta)
640 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); 690 xfs_mod_fdblocks(mp, -blkdelta, rsvd);
641out: 691out:
642 ASSERT(error == 0); 692 ASSERT(error == 0);
643 return; 693 return;
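The rework above replaces the xfs_mod_sb_t batch array with direct per-field updates under m_sb_lock, each paired with an undo label so that a failed update unwinds every earlier one in reverse order and no delta is ever applied twice. A standalone sketch of that validate-and-unwind idiom (user-space C, all names illustrative, not from the patch):

	#include <errno.h>
	#include <stdint.h>

	/* Modify a counter only if the result stays non-negative,
	 * mirroring the xfs_sb_mod64() helpers above.
	 */
	static int mod_counter(int64_t *field, int64_t delta)
	{
		int64_t v = *field + delta;

		if (v < 0)		/* would underflow: reject */
			return -EINVAL;
		*field = v;
		return 0;
	}

	struct counters { int64_t a, b, c; };

	static int apply_deltas(struct counters *s,
				int64_t da, int64_t db, int64_t dc)
	{
		int error;

		error = mod_counter(&s->a, da);
		if (error)
			goto out;
		error = mod_counter(&s->b, db);
		if (error)
			goto undo_a;
		error = mod_counter(&s->c, dc);
		if (error)
			goto undo_b;
		return 0;

	undo_b:
		mod_counter(&s->b, -db);	/* restores a known-valid value */
	undo_a:
		mod_counter(&s->a, -da);
	out:
		return error;
	}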
diff --git a/include/linux/falloc.h b/include/linux/falloc.h
index 31591686ac2d..996111000a8c 100644
--- a/include/linux/falloc.h
+++ b/include/linux/falloc.h
@@ -21,4 +21,10 @@ struct space_resv {
21#define FS_IOC_RESVSP _IOW('X', 40, struct space_resv) 21#define FS_IOC_RESVSP _IOW('X', 40, struct space_resv)
22#define FS_IOC_RESVSP64 _IOW('X', 42, struct space_resv) 22#define FS_IOC_RESVSP64 _IOW('X', 42, struct space_resv)
23 23
24#define FALLOC_FL_SUPPORTED_MASK (FALLOC_FL_KEEP_SIZE | \
25 FALLOC_FL_PUNCH_HOLE | \
26 FALLOC_FL_COLLAPSE_RANGE | \
27 FALLOC_FL_ZERO_RANGE | \
28 FALLOC_FL_INSERT_RANGE)
29
24#endif /* _FALLOC_H_ */ 30#endif /* _FALLOC_H_ */
diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
index d1197ae3723c..3e445a760f14 100644
--- a/include/uapi/linux/falloc.h
+++ b/include/uapi/linux/falloc.h
@@ -41,4 +41,21 @@
41 */ 41 */
42#define FALLOC_FL_ZERO_RANGE 0x10 42#define FALLOC_FL_ZERO_RANGE 0x10
43 43
44/*
 45 * FALLOC_FL_INSERT_RANGE is used to insert space within the file size without
 46 * overwriting any existing data. The contents of the file beyond offset are
 47 * shifted to the right by len bytes to create a hole. As such, this
48 * operation will increase the size of the file by len bytes.
49 *
50 * Different filesystems may implement different limitations on the granularity
51 * of the operation. Most will limit operations to filesystem block size
52 * boundaries, but this boundary may be larger or smaller depending on
53 * the filesystem and/or the configuration of the filesystem or file.
54 *
55 * Attempting to insert space using this flag at OR beyond the end of
56 * the file is considered an illegal operation - just use ftruncate(2) or
 57 * fallocate(2) with mode 0 for such operations.
58 */
59#define FALLOC_FL_INSERT_RANGE 0x20
60
44#endif /* _UAPI_FALLOC_H_ */ 61#endif /* _UAPI_FALLOC_H_ */
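For context, the new flag is exercised from user space through fallocate(2). A minimal sketch, assuming an existing file named testfile and a block-aligned offset and length (both illustrative):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("testfile", O_RDWR);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Shift everything from offset 4096 onwards right by 65536
		 * bytes, growing the file by that amount. Per the comment
		 * above, offset and len normally must be aligned to the
		 * filesystem block size and the range must start before EOF.
		 */
		if (fallocate(fd, FALLOC_FL_INSERT_RANGE, 4096, 65536) < 0)
			perror("fallocate(FALLOC_FL_INSERT_RANGE)");
		close(fd);
		return 0;
	}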