Diffstat (limited to 'fs/xfs')
79 files changed, 1738 insertions, 1467 deletions
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 0fdd4109c624..6e247a99f5db 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -160,30 +160,38 @@ typedef struct xfs_agi { | |||
160 | * still being referenced. | 160 | * still being referenced. |
161 | */ | 161 | */ |
162 | __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; | 162 | __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; |
163 | 163 | /* | |
164 | * This marks the end of logging region 1 and start of logging region 2. | ||
165 | */ | ||
164 | uuid_t agi_uuid; /* uuid of filesystem */ | 166 | uuid_t agi_uuid; /* uuid of filesystem */ |
165 | __be32 agi_crc; /* crc of agi sector */ | 167 | __be32 agi_crc; /* crc of agi sector */ |
166 | __be32 agi_pad32; | 168 | __be32 agi_pad32; |
167 | __be64 agi_lsn; /* last write sequence */ | 169 | __be64 agi_lsn; /* last write sequence */ |
168 | 170 | ||
171 | __be32 agi_free_root; /* root of the free inode btree */ | ||
172 | __be32 agi_free_level;/* levels in free inode btree */ | ||
173 | |||
169 | /* structure must be padded to 64 bit alignment */ | 174 | /* structure must be padded to 64 bit alignment */ |
170 | } xfs_agi_t; | 175 | } xfs_agi_t; |
171 | 176 | ||
172 | #define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) | 177 | #define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) |
173 | 178 | ||
174 | #define XFS_AGI_MAGICNUM 0x00000001 | 179 | #define XFS_AGI_MAGICNUM (1 << 0) |
175 | #define XFS_AGI_VERSIONNUM 0x00000002 | 180 | #define XFS_AGI_VERSIONNUM (1 << 1) |
176 | #define XFS_AGI_SEQNO 0x00000004 | 181 | #define XFS_AGI_SEQNO (1 << 2) |
177 | #define XFS_AGI_LENGTH 0x00000008 | 182 | #define XFS_AGI_LENGTH (1 << 3) |
178 | #define XFS_AGI_COUNT 0x00000010 | 183 | #define XFS_AGI_COUNT (1 << 4) |
179 | #define XFS_AGI_ROOT 0x00000020 | 184 | #define XFS_AGI_ROOT (1 << 5) |
180 | #define XFS_AGI_LEVEL 0x00000040 | 185 | #define XFS_AGI_LEVEL (1 << 6) |
181 | #define XFS_AGI_FREECOUNT 0x00000080 | 186 | #define XFS_AGI_FREECOUNT (1 << 7) |
182 | #define XFS_AGI_NEWINO 0x00000100 | 187 | #define XFS_AGI_NEWINO (1 << 8) |
183 | #define XFS_AGI_DIRINO 0x00000200 | 188 | #define XFS_AGI_DIRINO (1 << 9) |
184 | #define XFS_AGI_UNLINKED 0x00000400 | 189 | #define XFS_AGI_UNLINKED (1 << 10) |
185 | #define XFS_AGI_NUM_BITS 11 | 190 | #define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ |
186 | #define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1) | 191 | #define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) |
192 | #define XFS_AGI_FREE_ROOT (1 << 11) | ||
193 | #define XFS_AGI_FREE_LEVEL (1 << 12) | ||
194 | #define XFS_AGI_NUM_BITS_R2 13 | ||
187 | 195 | ||
188 | /* disk block (xfs_daddr_t) in the AG */ | 196 | /* disk block (xfs_daddr_t) in the AG */ |
189 | #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) | 197 | #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) |
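The xfs_ag.h hunk above splits the on-disk AGI into two logging regions and rewrites the logging flags as bit shifts. A stand-alone user-space sketch (constants copied from the hunk; the main() harness and assertions are illustrative only, not kernel code) confirming the new masks line up with the old literal values:

#include <assert.h>

#define XFS_AGI_NUM_BITS_R1	11
#define XFS_AGI_ALL_BITS_R1	((1 << XFS_AGI_NUM_BITS_R1) - 1)
#define XFS_AGI_FREE_ROOT	(1 << 11)
#define XFS_AGI_FREE_LEVEL	(1 << 12)
#define XFS_AGI_NUM_BITS_R2	13

int main(void)
{
	/* region 1 still covers the original eleven fields (old literal 0x7ff) */
	assert(XFS_AGI_ALL_BITS_R1 == 0x7ff);
	/* the two free inode btree fields take the next two bits for region 2 */
	assert((XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL) == 0x1800);
	assert(((1 << XFS_AGI_NUM_BITS_R2) - 1) ==
	       (XFS_AGI_ALL_BITS_R1 | XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL));
	return 0;
}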
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index cc1eadcbb049..8358f1ded94d 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -70,7 +70,6 @@ xfs_allocbt_alloc_block( | |||
70 | struct xfs_btree_cur *cur, | 70 | struct xfs_btree_cur *cur, |
71 | union xfs_btree_ptr *start, | 71 | union xfs_btree_ptr *start, |
72 | union xfs_btree_ptr *new, | 72 | union xfs_btree_ptr *new, |
73 | int length, | ||
74 | int *stat) | 73 | int *stat) |
75 | { | 74 | { |
76 | int error; | 75 | int error; |
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 75df77d09f75..0479c32c5eb1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1344,6 +1344,14 @@ __xfs_get_blocks( | |||
1344 | /* | 1344 | /* |
1345 | * If this is O_DIRECT or the mpage code calling tell them how large | 1345 | * If this is O_DIRECT or the mpage code calling tell them how large |
1346 | * the mapping is, so that we can avoid repeated get_blocks calls. | 1346 | * the mapping is, so that we can avoid repeated get_blocks calls. |
1347 | * | ||
1348 | * If the mapping spans EOF, then we have to break the mapping up as the | ||
1349 | * mapping for blocks beyond EOF must be marked new so that sub block | ||
1350 | * regions can be correctly zeroed. We can't do this for mappings within | ||
1351 | * EOF unless the mapping was just allocated or is unwritten, otherwise | ||
1352 | * the callers would overwrite existing data with zeros. Hence we have | ||
1353 | * to split the mapping into a range up to and including EOF, and a | ||
1354 | * second mapping for beyond EOF. | ||
1347 | */ | 1355 | */ |
1348 | if (direct || size > (1 << inode->i_blkbits)) { | 1356 | if (direct || size > (1 << inode->i_blkbits)) { |
1349 | xfs_off_t mapping_size; | 1357 | xfs_off_t mapping_size; |
@@ -1354,6 +1362,12 @@ __xfs_get_blocks( | |||
1354 | ASSERT(mapping_size > 0); | 1362 | ASSERT(mapping_size > 0); |
1355 | if (mapping_size > size) | 1363 | if (mapping_size > size) |
1356 | mapping_size = size; | 1364 | mapping_size = size; |
1365 | if (offset < i_size_read(inode) && | ||
1366 | offset + mapping_size >= i_size_read(inode)) { | ||
1367 | /* limit mapping to block that spans EOF */ | ||
1368 | mapping_size = roundup_64(i_size_read(inode) - offset, | ||
1369 | 1 << inode->i_blkbits); | ||
1370 | } | ||
1357 | if (mapping_size > LONG_MAX) | 1371 | if (mapping_size > LONG_MAX) |
1358 | mapping_size = LONG_MAX; | 1372 | mapping_size = LONG_MAX; |
1359 | 1373 | ||
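The added comment and code above split any mapping that crosses EOF so that only the blocks beyond EOF are reported as new. A quick stand-alone rendering of that arithmetic (all values are hypothetical; roundup_ull stands in for the kernel's roundup_64):

#include <stdio.h>

static unsigned long long roundup_ull(unsigned long long x, unsigned long long y)
{
	return ((x + y - 1) / y) * y;		/* round x up to a multiple of y */
}

int main(void)
{
	unsigned long long isize = 10000;	/* hypothetical i_size_read(inode) */
	unsigned long long offset = 8192;	/* start of the requested mapping */
	unsigned long long mapping_size = 32768;
	unsigned long long blocksize = 4096;	/* 1 << inode->i_blkbits */

	/* same test as the hunk: the mapping starts inside EOF and spans it */
	if (offset < isize && offset + mapping_size >= isize)
		mapping_size = roundup_ull(isize - offset, blocksize);

	printf("%llu\n", mapping_size);		/* 4096: stop at the block spanning EOF */
	return 0;
}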
@@ -1566,6 +1580,16 @@ xfs_vm_write_failed( | |||
1566 | 1580 | ||
1567 | xfs_vm_kill_delalloc_range(inode, block_offset, | 1581 | xfs_vm_kill_delalloc_range(inode, block_offset, |
1568 | block_offset + bh->b_size); | 1582 | block_offset + bh->b_size); |
1583 | |||
1584 | /* | ||
1585 | * This buffer does not contain data anymore. make sure anyone | ||
1586 | * who finds it knows that for certain. | ||
1587 | */ | ||
1588 | clear_buffer_delay(bh); | ||
1589 | clear_buffer_uptodate(bh); | ||
1590 | clear_buffer_mapped(bh); | ||
1591 | clear_buffer_new(bh); | ||
1592 | clear_buffer_dirty(bh); | ||
1569 | } | 1593 | } |
1570 | 1594 | ||
1571 | } | 1595 | } |
@@ -1599,12 +1623,21 @@ xfs_vm_write_begin( | |||
1599 | status = __block_write_begin(page, pos, len, xfs_get_blocks); | 1623 | status = __block_write_begin(page, pos, len, xfs_get_blocks); |
1600 | if (unlikely(status)) { | 1624 | if (unlikely(status)) { |
1601 | struct inode *inode = mapping->host; | 1625 | struct inode *inode = mapping->host; |
1626 | size_t isize = i_size_read(inode); | ||
1602 | 1627 | ||
1603 | xfs_vm_write_failed(inode, page, pos, len); | 1628 | xfs_vm_write_failed(inode, page, pos, len); |
1604 | unlock_page(page); | 1629 | unlock_page(page); |
1605 | 1630 | ||
1606 | if (pos + len > i_size_read(inode)) | 1631 | /* |
1607 | truncate_pagecache(inode, i_size_read(inode)); | 1632 | * If the write is beyond EOF, we only want to kill blocks |
1633 | * allocated in this write, not blocks that were previously | ||
1634 | * written successfully. | ||
1635 | */ | ||
1636 | if (pos + len > isize) { | ||
1637 | ssize_t start = max_t(ssize_t, pos, isize); | ||
1638 | |||
1639 | truncate_pagecache_range(inode, start, pos + len); | ||
1640 | } | ||
1608 | 1641 | ||
1609 | page_cache_release(page); | 1642 | page_cache_release(page); |
1610 | page = NULL; | 1643 | page = NULL; |
@@ -1615,9 +1648,12 @@ xfs_vm_write_begin( | |||
1615 | } | 1648 | } |
1616 | 1649 | ||
1617 | /* | 1650 | /* |
1618 | * On failure, we only need to kill delalloc blocks beyond EOF because they | 1651 | * On failure, we only need to kill delalloc blocks beyond EOF in the range of |
1619 | * will never be written. For blocks within EOF, generic_write_end() zeros them | 1652 | * this specific write because they will never be written. Previous writes |
1620 | * so they are safe to leave alone and be written with all the other valid data. | 1653 | * beyond EOF where block allocation succeeded do not need to be trashed, so |
1654 | * only new blocks from this write should be trashed. For blocks within | ||
1655 | * EOF, generic_write_end() zeros them so they are safe to leave alone and be | ||
1656 | * written with all the other valid data. | ||
1621 | */ | 1657 | */ |
1622 | STATIC int | 1658 | STATIC int |
1623 | xfs_vm_write_end( | 1659 | xfs_vm_write_end( |
@@ -1640,8 +1676,11 @@ xfs_vm_write_end( | |||
1640 | loff_t to = pos + len; | 1676 | loff_t to = pos + len; |
1641 | 1677 | ||
1642 | if (to > isize) { | 1678 | if (to > isize) { |
1643 | truncate_pagecache(inode, isize); | 1679 | /* only kill blocks in this write beyond EOF */ |
1680 | if (pos > isize) | ||
1681 | isize = pos; | ||
1644 | xfs_vm_kill_delalloc_range(inode, isize, to); | 1682 | xfs_vm_kill_delalloc_range(inode, isize, to); |
1683 | truncate_pagecache_range(inode, isize, to); | ||
1645 | } | 1684 | } |
1646 | } | 1685 | } |
1647 | return ret; | 1686 | return ret; |
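Both failure paths above (a write_begin error and a short write_end) now trim the page cache only over the part of this write that extended beyond the old EOF, leaving previously written blocks alone. A small sketch of that range choice (numbers are hypothetical; the printf stands in for the truncate_pagecache_range() call):

#include <stdio.h>

int main(void)
{
	long long isize = 6000;			/* i_size_read(inode) before the write */
	long long pos = 4096, len = 8192;	/* the failed or short write */

	if (pos + len > isize) {
		long long start = pos > isize ? pos : isize;	/* max_t(ssize_t, pos, isize) */

		printf("truncate [%lld, %lld)\n", start, pos + len);	/* [6000, 12288) */
	}
	return 0;
}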
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 86f482e5798f..1fc1f06277da 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -187,7 +187,7 @@ xfs_attr_calc_size( | |||
187 | * Out of line attribute, cannot double split, but | 187 | * Out of line attribute, cannot double split, but |
188 | * make room for the attribute value itself. | 188 | * make room for the attribute value itself. |
189 | */ | 189 | */ |
190 | uint dblocks = XFS_B_TO_FSB(mp, args->valuelen); | 190 | uint dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen); |
191 | nblks += dblocks; | 191 | nblks += dblocks; |
192 | nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); | 192 | nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); |
193 | } | 193 | } |
@@ -604,11 +604,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) | |||
604 | 604 | ||
605 | trace_xfs_attr_leaf_replace(args); | 605 | trace_xfs_attr_leaf_replace(args); |
606 | 606 | ||
607 | /* save the attribute state for later removal*/ | ||
607 | args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ | 608 | args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ |
608 | args->blkno2 = args->blkno; /* set 2nd entry info*/ | 609 | args->blkno2 = args->blkno; /* set 2nd entry info*/ |
609 | args->index2 = args->index; | 610 | args->index2 = args->index; |
610 | args->rmtblkno2 = args->rmtblkno; | 611 | args->rmtblkno2 = args->rmtblkno; |
611 | args->rmtblkcnt2 = args->rmtblkcnt; | 612 | args->rmtblkcnt2 = args->rmtblkcnt; |
613 | args->rmtvaluelen2 = args->rmtvaluelen; | ||
614 | |||
615 | /* | ||
616 | * clear the remote attr state now that it is saved so that the | ||
617 | * values reflect the state of the attribute we are about to | ||
618 | * add, not the attribute we just found and will remove later. | ||
619 | */ | ||
620 | args->rmtblkno = 0; | ||
621 | args->rmtblkcnt = 0; | ||
622 | args->rmtvaluelen = 0; | ||
612 | } | 623 | } |
613 | 624 | ||
614 | /* | 625 | /* |
@@ -700,6 +711,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) | |||
700 | args->blkno = args->blkno2; | 711 | args->blkno = args->blkno2; |
701 | args->rmtblkno = args->rmtblkno2; | 712 | args->rmtblkno = args->rmtblkno2; |
702 | args->rmtblkcnt = args->rmtblkcnt2; | 713 | args->rmtblkcnt = args->rmtblkcnt2; |
714 | args->rmtvaluelen = args->rmtvaluelen2; | ||
703 | if (args->rmtblkno) { | 715 | if (args->rmtblkno) { |
704 | error = xfs_attr_rmtval_remove(args); | 716 | error = xfs_attr_rmtval_remove(args); |
705 | if (error) | 717 | if (error) |
@@ -905,13 +917,22 @@ restart: | |||
905 | 917 | ||
906 | trace_xfs_attr_node_replace(args); | 918 | trace_xfs_attr_node_replace(args); |
907 | 919 | ||
920 | /* save the attribute state for later removal*/ | ||
908 | args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ | 921 | args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ |
909 | args->blkno2 = args->blkno; /* set 2nd entry info*/ | 922 | args->blkno2 = args->blkno; /* set 2nd entry info*/ |
910 | args->index2 = args->index; | 923 | args->index2 = args->index; |
911 | args->rmtblkno2 = args->rmtblkno; | 924 | args->rmtblkno2 = args->rmtblkno; |
912 | args->rmtblkcnt2 = args->rmtblkcnt; | 925 | args->rmtblkcnt2 = args->rmtblkcnt; |
926 | args->rmtvaluelen2 = args->rmtvaluelen; | ||
927 | |||
928 | /* | ||
929 | * clear the remote attr state now that it is saved so that the | ||
930 | * values reflect the state of the attribute we are about to | ||
931 | * add, not the attribute we just found and will remove later. | ||
932 | */ | ||
913 | args->rmtblkno = 0; | 933 | args->rmtblkno = 0; |
914 | args->rmtblkcnt = 0; | 934 | args->rmtblkcnt = 0; |
935 | args->rmtvaluelen = 0; | ||
915 | } | 936 | } |
916 | 937 | ||
917 | retval = xfs_attr3_leaf_add(blk->bp, state->args); | 938 | retval = xfs_attr3_leaf_add(blk->bp, state->args); |
@@ -1039,6 +1060,7 @@ restart: | |||
1039 | args->blkno = args->blkno2; | 1060 | args->blkno = args->blkno2; |
1040 | args->rmtblkno = args->rmtblkno2; | 1061 | args->rmtblkno = args->rmtblkno2; |
1041 | args->rmtblkcnt = args->rmtblkcnt2; | 1062 | args->rmtblkcnt = args->rmtblkcnt2; |
1063 | args->rmtvaluelen = args->rmtvaluelen2; | ||
1042 | if (args->rmtblkno) { | 1064 | if (args->rmtblkno) { |
1043 | error = xfs_attr_rmtval_remove(args); | 1065 | error = xfs_attr_rmtval_remove(args); |
1044 | if (error) | 1066 | if (error) |
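Both replace paths above now park the old attribute's remote-value state in the *2 fields and zero the primary fields before the new value is added, then swap the saved state back when the old value is removed. A compact stand-alone illustration of that shuffle (the struct is a stand-in mirroring a few xfs_da_args fields, not the kernel definition):

#include <stdio.h>

struct attr_args {
	unsigned int rmtblkno, rmtblkcnt, rmtvaluelen;
	unsigned int rmtblkno2, rmtblkcnt2, rmtvaluelen2;
};

static void save_and_clear_remote_state(struct attr_args *args)
{
	/* save the old attribute's remote state for later removal */
	args->rmtblkno2 = args->rmtblkno;
	args->rmtblkcnt2 = args->rmtblkcnt;
	args->rmtvaluelen2 = args->rmtvaluelen;

	/* clear so the fields describe only the attribute about to be added */
	args->rmtblkno = 0;
	args->rmtblkcnt = 0;
	args->rmtvaluelen = 0;
}

static void restore_saved_remote_state(struct attr_args *args)
{
	/* before removing the old attribute, bring its state back */
	args->rmtblkno = args->rmtblkno2;
	args->rmtblkcnt = args->rmtblkcnt2;
	args->rmtvaluelen = args->rmtvaluelen2;
}

int main(void)
{
	struct attr_args a = { .rmtblkno = 1, .rmtblkcnt = 3, .rmtvaluelen = 8192 };

	save_and_clear_remote_state(&a);
	/* ...the new value is written while the primary fields are clear... */
	restore_saved_remote_state(&a);
	printf("%u %u %u\n", a.rmtblkno, a.rmtblkcnt, a.rmtvaluelen);	/* 1 3 8192 */
	return 0;
}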
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index fe9587fab17a..511c283459b1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -1229,6 +1229,7 @@ xfs_attr3_leaf_add_work( | |||
1229 | name_rmt->valueblk = 0; | 1229 | name_rmt->valueblk = 0; |
1230 | args->rmtblkno = 1; | 1230 | args->rmtblkno = 1; |
1231 | args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); | 1231 | args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); |
1232 | args->rmtvaluelen = args->valuelen; | ||
1232 | } | 1233 | } |
1233 | xfs_trans_log_buf(args->trans, bp, | 1234 | xfs_trans_log_buf(args->trans, bp, |
1234 | XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), | 1235 | XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), |
@@ -2167,11 +2168,11 @@ xfs_attr3_leaf_lookup_int( | |||
2167 | if (!xfs_attr_namesp_match(args->flags, entry->flags)) | 2168 | if (!xfs_attr_namesp_match(args->flags, entry->flags)) |
2168 | continue; | 2169 | continue; |
2169 | args->index = probe; | 2170 | args->index = probe; |
2170 | args->valuelen = be32_to_cpu(name_rmt->valuelen); | 2171 | args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); |
2171 | args->rmtblkno = be32_to_cpu(name_rmt->valueblk); | 2172 | args->rmtblkno = be32_to_cpu(name_rmt->valueblk); |
2172 | args->rmtblkcnt = xfs_attr3_rmt_blocks( | 2173 | args->rmtblkcnt = xfs_attr3_rmt_blocks( |
2173 | args->dp->i_mount, | 2174 | args->dp->i_mount, |
2174 | args->valuelen); | 2175 | args->rmtvaluelen); |
2175 | return XFS_ERROR(EEXIST); | 2176 | return XFS_ERROR(EEXIST); |
2176 | } | 2177 | } |
2177 | } | 2178 | } |
@@ -2220,19 +2221,19 @@ xfs_attr3_leaf_getvalue( | |||
2220 | name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); | 2221 | name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); |
2221 | ASSERT(name_rmt->namelen == args->namelen); | 2222 | ASSERT(name_rmt->namelen == args->namelen); |
2222 | ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); | 2223 | ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); |
2223 | valuelen = be32_to_cpu(name_rmt->valuelen); | 2224 | args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); |
2224 | args->rmtblkno = be32_to_cpu(name_rmt->valueblk); | 2225 | args->rmtblkno = be32_to_cpu(name_rmt->valueblk); |
2225 | args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, | 2226 | args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, |
2226 | valuelen); | 2227 | args->rmtvaluelen); |
2227 | if (args->flags & ATTR_KERNOVAL) { | 2228 | if (args->flags & ATTR_KERNOVAL) { |
2228 | args->valuelen = valuelen; | 2229 | args->valuelen = args->rmtvaluelen; |
2229 | return 0; | 2230 | return 0; |
2230 | } | 2231 | } |
2231 | if (args->valuelen < valuelen) { | 2232 | if (args->valuelen < args->rmtvaluelen) { |
2232 | args->valuelen = valuelen; | 2233 | args->valuelen = args->rmtvaluelen; |
2233 | return XFS_ERROR(ERANGE); | 2234 | return XFS_ERROR(ERANGE); |
2234 | } | 2235 | } |
2235 | args->valuelen = valuelen; | 2236 | args->valuelen = args->rmtvaluelen; |
2236 | } | 2237 | } |
2237 | return 0; | 2238 | return 0; |
2238 | } | 2239 | } |
@@ -2519,7 +2520,7 @@ xfs_attr3_leaf_clearflag( | |||
2519 | ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); | 2520 | ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); |
2520 | name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); | 2521 | name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); |
2521 | name_rmt->valueblk = cpu_to_be32(args->rmtblkno); | 2522 | name_rmt->valueblk = cpu_to_be32(args->rmtblkno); |
2522 | name_rmt->valuelen = cpu_to_be32(args->valuelen); | 2523 | name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); |
2523 | xfs_trans_log_buf(args->trans, bp, | 2524 | xfs_trans_log_buf(args->trans, bp, |
2524 | XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); | 2525 | XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); |
2525 | } | 2526 | } |
@@ -2677,7 +2678,7 @@ xfs_attr3_leaf_flipflags( | |||
2677 | ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); | 2678 | ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); |
2678 | name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); | 2679 | name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); |
2679 | name_rmt->valueblk = cpu_to_be32(args->rmtblkno); | 2680 | name_rmt->valueblk = cpu_to_be32(args->rmtblkno); |
2680 | name_rmt->valuelen = cpu_to_be32(args->valuelen); | 2681 | name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); |
2681 | xfs_trans_log_buf(args->trans, bp1, | 2682 | xfs_trans_log_buf(args->trans, bp1, |
2682 | XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); | 2683 | XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); |
2683 | } | 2684 | } |
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 01db96f60cf0..833fe5d98d80 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -447,6 +447,7 @@ xfs_attr3_leaf_list_int( | |||
447 | args.dp = context->dp; | 447 | args.dp = context->dp; |
448 | args.whichfork = XFS_ATTR_FORK; | 448 | args.whichfork = XFS_ATTR_FORK; |
449 | args.valuelen = valuelen; | 449 | args.valuelen = valuelen; |
450 | args.rmtvaluelen = valuelen; | ||
450 | args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); | 451 | args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); |
451 | args.rmtblkno = be32_to_cpu(name_rmt->valueblk); | 452 | args.rmtblkno = be32_to_cpu(name_rmt->valueblk); |
452 | args.rmtblkcnt = xfs_attr3_rmt_blocks( | 453 | args.rmtblkcnt = xfs_attr3_rmt_blocks( |
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 6e37823e2932..0f0679a134e2 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -68,7 +68,6 @@ xfs_attr3_rmt_blocks( | |||
68 | */ | 68 | */ |
69 | static bool | 69 | static bool |
70 | xfs_attr3_rmt_hdr_ok( | 70 | xfs_attr3_rmt_hdr_ok( |
71 | struct xfs_mount *mp, | ||
72 | void *ptr, | 71 | void *ptr, |
73 | xfs_ino_t ino, | 72 | xfs_ino_t ino, |
74 | uint32_t offset, | 73 | uint32_t offset, |
@@ -251,7 +250,7 @@ xfs_attr_rmtval_copyout( | |||
251 | byte_cnt = min(*valuelen, byte_cnt); | 250 | byte_cnt = min(*valuelen, byte_cnt); |
252 | 251 | ||
253 | if (xfs_sb_version_hascrc(&mp->m_sb)) { | 252 | if (xfs_sb_version_hascrc(&mp->m_sb)) { |
254 | if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset, | 253 | if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset, |
255 | byte_cnt, bno)) { | 254 | byte_cnt, bno)) { |
256 | xfs_alert(mp, | 255 | xfs_alert(mp, |
257 | "remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", | 256 | "remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", |
@@ -337,7 +336,7 @@ xfs_attr_rmtval_get( | |||
337 | struct xfs_buf *bp; | 336 | struct xfs_buf *bp; |
338 | xfs_dablk_t lblkno = args->rmtblkno; | 337 | xfs_dablk_t lblkno = args->rmtblkno; |
339 | __uint8_t *dst = args->value; | 338 | __uint8_t *dst = args->value; |
340 | int valuelen = args->valuelen; | 339 | int valuelen; |
341 | int nmap; | 340 | int nmap; |
342 | int error; | 341 | int error; |
343 | int blkcnt = args->rmtblkcnt; | 342 | int blkcnt = args->rmtblkcnt; |
@@ -347,7 +346,9 @@ xfs_attr_rmtval_get( | |||
347 | trace_xfs_attr_rmtval_get(args); | 346 | trace_xfs_attr_rmtval_get(args); |
348 | 347 | ||
349 | ASSERT(!(args->flags & ATTR_KERNOVAL)); | 348 | ASSERT(!(args->flags & ATTR_KERNOVAL)); |
349 | ASSERT(args->rmtvaluelen == args->valuelen); | ||
350 | 350 | ||
351 | valuelen = args->rmtvaluelen; | ||
351 | while (valuelen > 0) { | 352 | while (valuelen > 0) { |
352 | nmap = ATTR_RMTVALUE_MAPSIZE; | 353 | nmap = ATTR_RMTVALUE_MAPSIZE; |
353 | error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, | 354 | error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, |
@@ -415,7 +416,7 @@ xfs_attr_rmtval_set( | |||
415 | * attributes have headers, we can't just do a straight byte to FSB | 416 | * attributes have headers, we can't just do a straight byte to FSB |
416 | * conversion and have to take the header space into account. | 417 | * conversion and have to take the header space into account. |
417 | */ | 418 | */ |
418 | blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); | 419 | blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); |
419 | error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, | 420 | error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, |
420 | XFS_ATTR_FORK); | 421 | XFS_ATTR_FORK); |
421 | if (error) | 422 | if (error) |
@@ -480,7 +481,7 @@ xfs_attr_rmtval_set( | |||
480 | */ | 481 | */ |
481 | lblkno = args->rmtblkno; | 482 | lblkno = args->rmtblkno; |
482 | blkcnt = args->rmtblkcnt; | 483 | blkcnt = args->rmtblkcnt; |
483 | valuelen = args->valuelen; | 484 | valuelen = args->rmtvaluelen; |
484 | while (valuelen > 0) { | 485 | while (valuelen > 0) { |
485 | struct xfs_buf *bp; | 486 | struct xfs_buf *bp; |
486 | xfs_daddr_t dblkno; | 487 | xfs_daddr_t dblkno; |
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5b6092ef51ef..1ff0da6e2bf9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -94,7 +94,7 @@ xfs_bmap_compute_maxlevels( | |||
94 | maxleafents = MAXAEXTNUM; | 94 | maxleafents = MAXAEXTNUM; |
95 | sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); | 95 | sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); |
96 | } | 96 | } |
97 | maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0); | 97 | maxrootrecs = xfs_bmdr_maxrecs(sz, 0); |
98 | minleafrecs = mp->m_bmap_dmnr[0]; | 98 | minleafrecs = mp->m_bmap_dmnr[0]; |
99 | minnoderecs = mp->m_bmap_dmnr[1]; | 99 | minnoderecs = mp->m_bmap_dmnr[1]; |
100 | maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; | 100 | maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; |
@@ -233,7 +233,6 @@ xfs_default_attroffset( | |||
233 | */ | 233 | */ |
234 | STATIC void | 234 | STATIC void |
235 | xfs_bmap_forkoff_reset( | 235 | xfs_bmap_forkoff_reset( |
236 | xfs_mount_t *mp, | ||
237 | xfs_inode_t *ip, | 236 | xfs_inode_t *ip, |
238 | int whichfork) | 237 | int whichfork) |
239 | { | 238 | { |
@@ -905,7 +904,7 @@ xfs_bmap_local_to_extents_empty( | |||
905 | ASSERT(ifp->if_bytes == 0); | 904 | ASSERT(ifp->if_bytes == 0); |
906 | ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); | 905 | ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); |
907 | 906 | ||
908 | xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); | 907 | xfs_bmap_forkoff_reset(ip, whichfork); |
909 | ifp->if_flags &= ~XFS_IFINLINE; | 908 | ifp->if_flags &= ~XFS_IFINLINE; |
910 | ifp->if_flags |= XFS_IFEXTENTS; | 909 | ifp->if_flags |= XFS_IFEXTENTS; |
911 | XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); | 910 | XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); |
@@ -1675,7 +1674,6 @@ xfs_bmap_isaeof( | |||
1675 | */ | 1674 | */ |
1676 | int | 1675 | int |
1677 | xfs_bmap_last_offset( | 1676 | xfs_bmap_last_offset( |
1678 | struct xfs_trans *tp, | ||
1679 | struct xfs_inode *ip, | 1677 | struct xfs_inode *ip, |
1680 | xfs_fileoff_t *last_block, | 1678 | xfs_fileoff_t *last_block, |
1681 | int whichfork) | 1679 | int whichfork) |
@@ -3517,6 +3515,67 @@ xfs_bmap_adjacent( | |||
3517 | #undef ISVALID | 3515 | #undef ISVALID |
3518 | } | 3516 | } |
3519 | 3517 | ||
3518 | static int | ||
3519 | xfs_bmap_longest_free_extent( | ||
3520 | struct xfs_trans *tp, | ||
3521 | xfs_agnumber_t ag, | ||
3522 | xfs_extlen_t *blen, | ||
3523 | int *notinit) | ||
3524 | { | ||
3525 | struct xfs_mount *mp = tp->t_mountp; | ||
3526 | struct xfs_perag *pag; | ||
3527 | xfs_extlen_t longest; | ||
3528 | int error = 0; | ||
3529 | |||
3530 | pag = xfs_perag_get(mp, ag); | ||
3531 | if (!pag->pagf_init) { | ||
3532 | error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK); | ||
3533 | if (error) | ||
3534 | goto out; | ||
3535 | |||
3536 | if (!pag->pagf_init) { | ||
3537 | *notinit = 1; | ||
3538 | goto out; | ||
3539 | } | ||
3540 | } | ||
3541 | |||
3542 | longest = xfs_alloc_longest_free_extent(mp, pag); | ||
3543 | if (*blen < longest) | ||
3544 | *blen = longest; | ||
3545 | |||
3546 | out: | ||
3547 | xfs_perag_put(pag); | ||
3548 | return error; | ||
3549 | } | ||
3550 | |||
3551 | static void | ||
3552 | xfs_bmap_select_minlen( | ||
3553 | struct xfs_bmalloca *ap, | ||
3554 | struct xfs_alloc_arg *args, | ||
3555 | xfs_extlen_t *blen, | ||
3556 | int notinit) | ||
3557 | { | ||
3558 | if (notinit || *blen < ap->minlen) { | ||
3559 | /* | ||
3560 | * Since we did a BUF_TRYLOCK above, it is possible that | ||
3561 | * there is space for this request. | ||
3562 | */ | ||
3563 | args->minlen = ap->minlen; | ||
3564 | } else if (*blen < args->maxlen) { | ||
3565 | /* | ||
3566 | * If the best seen length is less than the request length, | ||
3567 | * use the best as the minimum. | ||
3568 | */ | ||
3569 | args->minlen = *blen; | ||
3570 | } else { | ||
3571 | /* | ||
3572 | * Otherwise we've seen an extent as big as maxlen, use that | ||
3573 | * as the minimum. | ||
3574 | */ | ||
3575 | args->minlen = args->maxlen; | ||
3576 | } | ||
3577 | } | ||
3578 | |||
3520 | STATIC int | 3579 | STATIC int |
3521 | xfs_bmap_btalloc_nullfb( | 3580 | xfs_bmap_btalloc_nullfb( |
3522 | struct xfs_bmalloca *ap, | 3581 | struct xfs_bmalloca *ap, |
@@ -3524,111 +3583,74 @@ xfs_bmap_btalloc_nullfb( | |||
3524 | xfs_extlen_t *blen) | 3583 | xfs_extlen_t *blen) |
3525 | { | 3584 | { |
3526 | struct xfs_mount *mp = ap->ip->i_mount; | 3585 | struct xfs_mount *mp = ap->ip->i_mount; |
3527 | struct xfs_perag *pag; | ||
3528 | xfs_agnumber_t ag, startag; | 3586 | xfs_agnumber_t ag, startag; |
3529 | int notinit = 0; | 3587 | int notinit = 0; |
3530 | int error; | 3588 | int error; |
3531 | 3589 | ||
3532 | if (ap->userdata && xfs_inode_is_filestream(ap->ip)) | 3590 | args->type = XFS_ALLOCTYPE_START_BNO; |
3533 | args->type = XFS_ALLOCTYPE_NEAR_BNO; | ||
3534 | else | ||
3535 | args->type = XFS_ALLOCTYPE_START_BNO; | ||
3536 | args->total = ap->total; | 3591 | args->total = ap->total; |
3537 | 3592 | ||
3538 | /* | ||
3539 | * Search for an allocation group with a single extent large enough | ||
3540 | * for the request. If one isn't found, then adjust the minimum | ||
3541 | * allocation size to the largest space found. | ||
3542 | */ | ||
3543 | startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); | 3593 | startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); |
3544 | if (startag == NULLAGNUMBER) | 3594 | if (startag == NULLAGNUMBER) |
3545 | startag = ag = 0; | 3595 | startag = ag = 0; |
3546 | 3596 | ||
3547 | pag = xfs_perag_get(mp, ag); | ||
3548 | while (*blen < args->maxlen) { | 3597 | while (*blen < args->maxlen) { |
3549 | if (!pag->pagf_init) { | 3598 | error = xfs_bmap_longest_free_extent(args->tp, ag, blen, |
3550 | error = xfs_alloc_pagf_init(mp, args->tp, ag, | 3599 | ¬init); |
3551 | XFS_ALLOC_FLAG_TRYLOCK); | 3600 | if (error) |
3552 | if (error) { | 3601 | return error; |
3553 | xfs_perag_put(pag); | ||
3554 | return error; | ||
3555 | } | ||
3556 | } | ||
3557 | |||
3558 | /* | ||
3559 | * See xfs_alloc_fix_freelist... | ||
3560 | */ | ||
3561 | if (pag->pagf_init) { | ||
3562 | xfs_extlen_t longest; | ||
3563 | longest = xfs_alloc_longest_free_extent(mp, pag); | ||
3564 | if (*blen < longest) | ||
3565 | *blen = longest; | ||
3566 | } else | ||
3567 | notinit = 1; | ||
3568 | |||
3569 | if (xfs_inode_is_filestream(ap->ip)) { | ||
3570 | if (*blen >= args->maxlen) | ||
3571 | break; | ||
3572 | |||
3573 | if (ap->userdata) { | ||
3574 | /* | ||
3575 | * If startag is an invalid AG, we've | ||
3576 | * come here once before and | ||
3577 | * xfs_filestream_new_ag picked the | ||
3578 | * best currently available. | ||
3579 | * | ||
3580 | * Don't continue looping, since we | ||
3581 | * could loop forever. | ||
3582 | */ | ||
3583 | if (startag == NULLAGNUMBER) | ||
3584 | break; | ||
3585 | |||
3586 | error = xfs_filestream_new_ag(ap, &ag); | ||
3587 | xfs_perag_put(pag); | ||
3588 | if (error) | ||
3589 | return error; | ||
3590 | 3602 | ||
3591 | /* loop again to set 'blen'*/ | ||
3592 | startag = NULLAGNUMBER; | ||
3593 | pag = xfs_perag_get(mp, ag); | ||
3594 | continue; | ||
3595 | } | ||
3596 | } | ||
3597 | if (++ag == mp->m_sb.sb_agcount) | 3603 | if (++ag == mp->m_sb.sb_agcount) |
3598 | ag = 0; | 3604 | ag = 0; |
3599 | if (ag == startag) | 3605 | if (ag == startag) |
3600 | break; | 3606 | break; |
3601 | xfs_perag_put(pag); | ||
3602 | pag = xfs_perag_get(mp, ag); | ||
3603 | } | 3607 | } |
3604 | xfs_perag_put(pag); | ||
3605 | 3608 | ||
3606 | /* | 3609 | xfs_bmap_select_minlen(ap, args, blen, notinit); |
3607 | * Since the above loop did a BUF_TRYLOCK, it is | 3610 | return 0; |
3608 | * possible that there is space for this request. | 3611 | } |
3609 | */ | 3612 | |
3610 | if (notinit || *blen < ap->minlen) | 3613 | STATIC int |
3611 | args->minlen = ap->minlen; | 3614 | xfs_bmap_btalloc_filestreams( |
3612 | /* | 3615 | struct xfs_bmalloca *ap, |
3613 | * If the best seen length is less than the request | 3616 | struct xfs_alloc_arg *args, |
3614 | * length, use the best as the minimum. | 3617 | xfs_extlen_t *blen) |
3615 | */ | 3618 | { |
3616 | else if (*blen < args->maxlen) | 3619 | struct xfs_mount *mp = ap->ip->i_mount; |
3617 | args->minlen = *blen; | 3620 | xfs_agnumber_t ag; |
3618 | /* | 3621 | int notinit = 0; |
3619 | * Otherwise we've seen an extent as big as maxlen, | 3622 | int error; |
3620 | * use that as the minimum. | 3623 | |
3621 | */ | 3624 | args->type = XFS_ALLOCTYPE_NEAR_BNO; |
3622 | else | 3625 | args->total = ap->total; |
3623 | args->minlen = args->maxlen; | 3626 | |
3627 | ag = XFS_FSB_TO_AGNO(mp, args->fsbno); | ||
3628 | if (ag == NULLAGNUMBER) | ||
3629 | ag = 0; | ||
3630 | |||
3631 | error = xfs_bmap_longest_free_extent(args->tp, ag, blen, ¬init); | ||
3632 | if (error) | ||
3633 | return error; | ||
3634 | |||
3635 | if (*blen < args->maxlen) { | ||
3636 | error = xfs_filestream_new_ag(ap, &ag); | ||
3637 | if (error) | ||
3638 | return error; | ||
3639 | |||
3640 | error = xfs_bmap_longest_free_extent(args->tp, ag, blen, | ||
3641 | ¬init); | ||
3642 | if (error) | ||
3643 | return error; | ||
3644 | |||
3645 | } | ||
3646 | |||
3647 | xfs_bmap_select_minlen(ap, args, blen, notinit); | ||
3624 | 3648 | ||
3625 | /* | 3649 | /* |
3626 | * set the failure fallback case to look in the selected | 3650 | * Set the failure fallback case to look in the selected AG as stream |
3627 | * AG as the stream may have moved. | 3651 | * may have moved. |
3628 | */ | 3652 | */ |
3629 | if (xfs_inode_is_filestream(ap->ip)) | 3653 | ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); |
3630 | ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); | ||
3631 | |||
3632 | return 0; | 3654 | return 0; |
3633 | } | 3655 | } |
3634 | 3656 | ||
@@ -3708,7 +3730,15 @@ xfs_bmap_btalloc( | |||
3708 | args.firstblock = *ap->firstblock; | 3730 | args.firstblock = *ap->firstblock; |
3709 | blen = 0; | 3731 | blen = 0; |
3710 | if (nullfb) { | 3732 | if (nullfb) { |
3711 | error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); | 3733 | /* |
3734 | * Search for an allocation group with a single extent large | ||
3735 | * enough for the request. If one isn't found, then adjust | ||
3736 | * the minimum allocation size to the largest space found. | ||
3737 | */ | ||
3738 | if (ap->userdata && xfs_inode_is_filestream(ap->ip)) | ||
3739 | error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); | ||
3740 | else | ||
3741 | error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); | ||
3712 | if (error) | 3742 | if (error) |
3713 | return error; | 3743 | return error; |
3714 | } else if (ap->flist->xbf_low) { | 3744 | } else if (ap->flist->xbf_low) { |
@@ -5413,6 +5443,7 @@ xfs_bmap_shift_extents( | |||
5413 | int whichfork = XFS_DATA_FORK; | 5443 | int whichfork = XFS_DATA_FORK; |
5414 | int logflags; | 5444 | int logflags; |
5415 | xfs_filblks_t blockcount = 0; | 5445 | xfs_filblks_t blockcount = 0; |
5446 | int total_extents; | ||
5416 | 5447 | ||
5417 | if (unlikely(XFS_TEST_ERROR( | 5448 | if (unlikely(XFS_TEST_ERROR( |
5418 | (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && | 5449 | (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && |
@@ -5429,7 +5460,6 @@ xfs_bmap_shift_extents( | |||
5429 | ASSERT(current_ext != NULL); | 5460 | ASSERT(current_ext != NULL); |
5430 | 5461 | ||
5431 | ifp = XFS_IFORK_PTR(ip, whichfork); | 5462 | ifp = XFS_IFORK_PTR(ip, whichfork); |
5432 | |||
5433 | if (!(ifp->if_flags & XFS_IFEXTENTS)) { | 5463 | if (!(ifp->if_flags & XFS_IFEXTENTS)) { |
5434 | /* Read in all the extents */ | 5464 | /* Read in all the extents */ |
5435 | error = xfs_iread_extents(tp, ip, whichfork); | 5465 | error = xfs_iread_extents(tp, ip, whichfork); |
@@ -5456,7 +5486,6 @@ xfs_bmap_shift_extents( | |||
5456 | 5486 | ||
5457 | /* We are going to change core inode */ | 5487 | /* We are going to change core inode */ |
5458 | logflags = XFS_ILOG_CORE; | 5488 | logflags = XFS_ILOG_CORE; |
5459 | |||
5460 | if (ifp->if_flags & XFS_IFBROOT) { | 5489 | if (ifp->if_flags & XFS_IFBROOT) { |
5461 | cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); | 5490 | cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); |
5462 | cur->bc_private.b.firstblock = *firstblock; | 5491 | cur->bc_private.b.firstblock = *firstblock; |
@@ -5467,8 +5496,14 @@ xfs_bmap_shift_extents( | |||
5467 | logflags |= XFS_ILOG_DEXT; | 5496 | logflags |= XFS_ILOG_DEXT; |
5468 | } | 5497 | } |
5469 | 5498 | ||
5470 | while (nexts++ < num_exts && | 5499 | /* |
5471 | *current_ext < XFS_IFORK_NEXTENTS(ip, whichfork)) { | 5500 | * There may be delalloc extents in the data fork before the range we |
5501 | * are collapsing out, so we cannot | ||
5502 | * use the count of real extents here. Instead we have to calculate it | ||
5503 | * from the incore fork. | ||
5504 | */ | ||
5505 | total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); | ||
5506 | while (nexts++ < num_exts && *current_ext < total_extents) { | ||
5472 | 5507 | ||
5473 | gotp = xfs_iext_get_ext(ifp, *current_ext); | 5508 | gotp = xfs_iext_get_ext(ifp, *current_ext); |
5474 | xfs_bmbt_get_all(gotp, &got); | 5509 | xfs_bmbt_get_all(gotp, &got); |
@@ -5556,10 +5591,11 @@ xfs_bmap_shift_extents( | |||
5556 | } | 5591 | } |
5557 | 5592 | ||
5558 | (*current_ext)++; | 5593 | (*current_ext)++; |
5594 | total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); | ||
5559 | } | 5595 | } |
5560 | 5596 | ||
5561 | /* Check if we are done */ | 5597 | /* Check if we are done */ |
5562 | if (*current_ext == XFS_IFORK_NEXTENTS(ip, whichfork)) | 5598 | if (*current_ext == total_extents) |
5563 | *done = 1; | 5599 | *done = 1; |
5564 | 5600 | ||
5565 | del_cursor: | 5601 | del_cursor: |
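The collapse loop above now sizes its iteration off the incore extent count rather than XFS_IFORK_NEXTENTS, so delalloc extents are included, and the count is refreshed each pass since a shift can change it. A trivial stand-alone restatement of that count (the record size mirrors the two 64-bit words of an on-disk bmbt record; the byte count is hypothetical):

#include <stdio.h>

struct bmbt_rec_sketch { unsigned long long l0, l1; };	/* 16 bytes, like xfs_bmbt_rec_t */

int main(void)
{
	unsigned long if_bytes = 1600;	/* hypothetical ifp->if_bytes */
	int total_extents = if_bytes / sizeof(struct bmbt_rec_sketch);

	printf("%d\n", total_extents);	/* 100 incore extents, delalloc included */
	return 0;
}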
@@ -5568,6 +5604,5 @@ del_cursor: | |||
5568 | error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); | 5604 | error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); |
5569 | 5605 | ||
5570 | xfs_trans_log_inode(tp, ip, logflags); | 5606 | xfs_trans_log_inode(tp, ip, logflags); |
5571 | |||
5572 | return error; | 5607 | return error; |
5573 | } | 5608 | } |
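The nullfb/filestreams split above factors the old open-coded search into xfs_bmap_longest_free_extent() and xfs_bmap_select_minlen(). The minlen choice reduces to three cases; a stand-alone restatement with made-up numbers:

#include <stdio.h>

static unsigned int select_minlen(int notinit, unsigned int blen,
				  unsigned int minlen, unsigned int maxlen)
{
	if (notinit || blen < minlen)
		return minlen;	/* a trylocked AG may still have space: ask for the minimum */
	if (blen < maxlen)
		return blen;	/* best free extent seen is smaller than the request */
	return maxlen;		/* some AG has an extent at least maxlen long */
}

int main(void)
{
	printf("%u\n", select_minlen(0, 64, 16, 128));	/* 64: best extent caps the request */
	printf("%u\n", select_minlen(1,  0, 16, 128));	/* 16: headers unread, fall back to minlen */
	return 0;
}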
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index f84bd7af43be..38ba36e9b2f0 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -156,8 +156,8 @@ int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, | |||
156 | xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); | 156 | xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); |
157 | int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, | 157 | int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, |
158 | xfs_fileoff_t *last_block, int whichfork); | 158 | xfs_fileoff_t *last_block, int whichfork); |
159 | int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip, | 159 | int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, |
160 | xfs_fileoff_t *unused, int whichfork); | 160 | int whichfork); |
161 | int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); | 161 | int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); |
162 | int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, | 162 | int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, |
163 | int whichfork); | 163 | int whichfork); |
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 818d546664e7..948836c4fd90 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -84,7 +84,7 @@ xfs_bmdr_to_bmbt( | |||
84 | rblock->bb_level = dblock->bb_level; | 84 | rblock->bb_level = dblock->bb_level; |
85 | ASSERT(be16_to_cpu(rblock->bb_level) > 0); | 85 | ASSERT(be16_to_cpu(rblock->bb_level) > 0); |
86 | rblock->bb_numrecs = dblock->bb_numrecs; | 86 | rblock->bb_numrecs = dblock->bb_numrecs; |
87 | dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); | 87 | dmxr = xfs_bmdr_maxrecs(dblocklen, 0); |
88 | fkp = XFS_BMDR_KEY_ADDR(dblock, 1); | 88 | fkp = XFS_BMDR_KEY_ADDR(dblock, 1); |
89 | tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); | 89 | tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); |
90 | fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); | 90 | fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); |
@@ -443,7 +443,7 @@ xfs_bmbt_to_bmdr( | |||
443 | ASSERT(rblock->bb_level != 0); | 443 | ASSERT(rblock->bb_level != 0); |
444 | dblock->bb_level = rblock->bb_level; | 444 | dblock->bb_level = rblock->bb_level; |
445 | dblock->bb_numrecs = rblock->bb_numrecs; | 445 | dblock->bb_numrecs = rblock->bb_numrecs; |
446 | dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); | 446 | dmxr = xfs_bmdr_maxrecs(dblocklen, 0); |
447 | fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); | 447 | fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); |
448 | tkp = XFS_BMDR_KEY_ADDR(dblock, 1); | 448 | tkp = XFS_BMDR_KEY_ADDR(dblock, 1); |
449 | fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); | 449 | fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); |
@@ -519,7 +519,6 @@ xfs_bmbt_alloc_block( | |||
519 | struct xfs_btree_cur *cur, | 519 | struct xfs_btree_cur *cur, |
520 | union xfs_btree_ptr *start, | 520 | union xfs_btree_ptr *start, |
521 | union xfs_btree_ptr *new, | 521 | union xfs_btree_ptr *new, |
522 | int length, | ||
523 | int *stat) | 522 | int *stat) |
524 | { | 523 | { |
525 | xfs_alloc_arg_t args; /* block allocation args */ | 524 | xfs_alloc_arg_t args; /* block allocation args */ |
@@ -672,8 +671,7 @@ xfs_bmbt_get_dmaxrecs( | |||
672 | { | 671 | { |
673 | if (level != cur->bc_nlevels - 1) | 672 | if (level != cur->bc_nlevels - 1) |
674 | return cur->bc_mp->m_bmap_dmxr[level != 0]; | 673 | return cur->bc_mp->m_bmap_dmxr[level != 0]; |
675 | return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize, | 674 | return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0); |
676 | level == 0); | ||
677 | } | 675 | } |
678 | 676 | ||
679 | STATIC void | 677 | STATIC void |
@@ -914,7 +912,6 @@ xfs_bmbt_maxrecs( | |||
914 | */ | 912 | */ |
915 | int | 913 | int |
916 | xfs_bmdr_maxrecs( | 914 | xfs_bmdr_maxrecs( |
917 | struct xfs_mount *mp, | ||
918 | int blocklen, | 915 | int blocklen, |
919 | int leaf) | 916 | int leaf) |
920 | { | 917 | { |
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 6e42e1e50b89..819a8a4dee95 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -130,7 +130,7 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, | |||
130 | xfs_bmdr_block_t *, int); | 130 | xfs_bmdr_block_t *, int); |
131 | 131 | ||
132 | extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); | 132 | extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); |
133 | extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); | 133 | extern int xfs_bmdr_maxrecs(int blocklen, int leaf); |
134 | extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); | 134 | extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); |
135 | 135 | ||
136 | extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, | 136 | extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, |
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 01f6a646caa1..296160b8e78c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1418,6 +1418,8 @@ xfs_zero_file_space( | |||
1418 | xfs_off_t end_boundary; | 1418 | xfs_off_t end_boundary; |
1419 | int error; | 1419 | int error; |
1420 | 1420 | ||
1421 | trace_xfs_zero_file_space(ip); | ||
1422 | |||
1421 | granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); | 1423 | granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); |
1422 | 1424 | ||
1423 | /* | 1425 | /* |
@@ -1432,9 +1434,18 @@ xfs_zero_file_space( | |||
1432 | ASSERT(end_boundary <= offset + len); | 1434 | ASSERT(end_boundary <= offset + len); |
1433 | 1435 | ||
1434 | if (start_boundary < end_boundary - 1) { | 1436 | if (start_boundary < end_boundary - 1) { |
1435 | /* punch out the page cache over the conversion range */ | 1437 | /* |
1438 | * punch out delayed allocation blocks and the page cache over | ||
1439 | * the conversion range | ||
1440 | */ | ||
1441 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
1442 | error = xfs_bmap_punch_delalloc_range(ip, | ||
1443 | XFS_B_TO_FSBT(mp, start_boundary), | ||
1444 | XFS_B_TO_FSB(mp, end_boundary - start_boundary)); | ||
1445 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
1436 | truncate_pagecache_range(VFS_I(ip), start_boundary, | 1446 | truncate_pagecache_range(VFS_I(ip), start_boundary, |
1437 | end_boundary - 1); | 1447 | end_boundary - 1); |
1448 | |||
1438 | /* convert the blocks */ | 1449 | /* convert the blocks */ |
1439 | error = xfs_alloc_file_space(ip, start_boundary, | 1450 | error = xfs_alloc_file_space(ip, start_boundary, |
1440 | end_boundary - start_boundary - 1, | 1451 | end_boundary - start_boundary - 1, |
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e80d59fdf89a..182bac2bb276 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -43,9 +43,10 @@ kmem_zone_t *xfs_btree_cur_zone; | |||
43 | * Btree magic numbers. | 43 | * Btree magic numbers. |
44 | */ | 44 | */ |
45 | static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { | 45 | static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { |
46 | { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC }, | 46 | { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, |
47 | XFS_FIBT_MAGIC }, | ||
47 | { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, | 48 | { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, |
48 | XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC } | 49 | XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC } |
49 | }; | 50 | }; |
50 | #define xfs_btree_magic(cur) \ | 51 | #define xfs_btree_magic(cur) \ |
51 | xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] | 52 | xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] |
@@ -1115,6 +1116,7 @@ xfs_btree_set_refs( | |||
1115 | xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); | 1116 | xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); |
1116 | break; | 1117 | break; |
1117 | case XFS_BTNUM_INO: | 1118 | case XFS_BTNUM_INO: |
1119 | case XFS_BTNUM_FINO: | ||
1118 | xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); | 1120 | xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); |
1119 | break; | 1121 | break; |
1120 | case XFS_BTNUM_BMAP: | 1122 | case XFS_BTNUM_BMAP: |
@@ -1159,7 +1161,6 @@ STATIC int | |||
1159 | xfs_btree_read_buf_block( | 1161 | xfs_btree_read_buf_block( |
1160 | struct xfs_btree_cur *cur, | 1162 | struct xfs_btree_cur *cur, |
1161 | union xfs_btree_ptr *ptr, | 1163 | union xfs_btree_ptr *ptr, |
1162 | int level, | ||
1163 | int flags, | 1164 | int flags, |
1164 | struct xfs_btree_block **block, | 1165 | struct xfs_btree_block **block, |
1165 | struct xfs_buf **bpp) | 1166 | struct xfs_buf **bpp) |
@@ -1517,8 +1518,8 @@ xfs_btree_increment( | |||
1517 | union xfs_btree_ptr *ptrp; | 1518 | union xfs_btree_ptr *ptrp; |
1518 | 1519 | ||
1519 | ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); | 1520 | ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); |
1520 | error = xfs_btree_read_buf_block(cur, ptrp, --lev, | 1521 | --lev; |
1521 | 0, &block, &bp); | 1522 | error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); |
1522 | if (error) | 1523 | if (error) |
1523 | goto error0; | 1524 | goto error0; |
1524 | 1525 | ||
@@ -1616,8 +1617,8 @@ xfs_btree_decrement( | |||
1616 | union xfs_btree_ptr *ptrp; | 1617 | union xfs_btree_ptr *ptrp; |
1617 | 1618 | ||
1618 | ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); | 1619 | ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); |
1619 | error = xfs_btree_read_buf_block(cur, ptrp, --lev, | 1620 | --lev; |
1620 | 0, &block, &bp); | 1621 | error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); |
1621 | if (error) | 1622 | if (error) |
1622 | goto error0; | 1623 | goto error0; |
1623 | xfs_btree_setbuf(cur, lev, bp); | 1624 | xfs_btree_setbuf(cur, lev, bp); |
@@ -1667,7 +1668,7 @@ xfs_btree_lookup_get_block( | |||
1667 | return 0; | 1668 | return 0; |
1668 | } | 1669 | } |
1669 | 1670 | ||
1670 | error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp); | 1671 | error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp); |
1671 | if (error) | 1672 | if (error) |
1672 | return error; | 1673 | return error; |
1673 | 1674 | ||
@@ -2018,7 +2019,7 @@ xfs_btree_lshift( | |||
2018 | goto out0; | 2019 | goto out0; |
2019 | 2020 | ||
2020 | /* Set up the left neighbor as "left". */ | 2021 | /* Set up the left neighbor as "left". */ |
2021 | error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp); | 2022 | error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); |
2022 | if (error) | 2023 | if (error) |
2023 | goto error0; | 2024 | goto error0; |
2024 | 2025 | ||
@@ -2202,7 +2203,7 @@ xfs_btree_rshift( | |||
2202 | goto out0; | 2203 | goto out0; |
2203 | 2204 | ||
2204 | /* Set up the right neighbor as "right". */ | 2205 | /* Set up the right neighbor as "right". */ |
2205 | error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp); | 2206 | error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); |
2206 | if (error) | 2207 | if (error) |
2207 | goto error0; | 2208 | goto error0; |
2208 | 2209 | ||
@@ -2372,7 +2373,7 @@ xfs_btree_split( | |||
2372 | xfs_btree_buf_to_ptr(cur, lbp, &lptr); | 2373 | xfs_btree_buf_to_ptr(cur, lbp, &lptr); |
2373 | 2374 | ||
2374 | /* Allocate the new block. If we can't do it, we're toast. Give up. */ | 2375 | /* Allocate the new block. If we can't do it, we're toast. Give up. */ |
2375 | error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat); | 2376 | error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat); |
2376 | if (error) | 2377 | if (error) |
2377 | goto error0; | 2378 | goto error0; |
2378 | if (*stat == 0) | 2379 | if (*stat == 0) |
@@ -2470,7 +2471,7 @@ xfs_btree_split( | |||
2470 | * point back to right instead of to left. | 2471 | * point back to right instead of to left. |
2471 | */ | 2472 | */ |
2472 | if (!xfs_btree_ptr_is_null(cur, &rrptr)) { | 2473 | if (!xfs_btree_ptr_is_null(cur, &rrptr)) { |
2473 | error = xfs_btree_read_buf_block(cur, &rrptr, level, | 2474 | error = xfs_btree_read_buf_block(cur, &rrptr, |
2474 | 0, &rrblock, &rrbp); | 2475 | 0, &rrblock, &rrbp); |
2475 | if (error) | 2476 | if (error) |
2476 | goto error0; | 2477 | goto error0; |
@@ -2545,7 +2546,7 @@ xfs_btree_new_iroot( | |||
2545 | pp = xfs_btree_ptr_addr(cur, 1, block); | 2546 | pp = xfs_btree_ptr_addr(cur, 1, block); |
2546 | 2547 | ||
2547 | /* Allocate the new block. If we can't do it, we're toast. Give up. */ | 2548 | /* Allocate the new block. If we can't do it, we're toast. Give up. */ |
2548 | error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat); | 2549 | error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat); |
2549 | if (error) | 2550 | if (error) |
2550 | goto error0; | 2551 | goto error0; |
2551 | if (*stat == 0) { | 2552 | if (*stat == 0) { |
@@ -2649,7 +2650,7 @@ xfs_btree_new_root( | |||
2649 | cur->bc_ops->init_ptr_from_cur(cur, &rptr); | 2650 | cur->bc_ops->init_ptr_from_cur(cur, &rptr); |
2650 | 2651 | ||
2651 | /* Allocate the new block. If we can't do it, we're toast. Give up. */ | 2652 | /* Allocate the new block. If we can't do it, we're toast. Give up. */ |
2652 | error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat); | 2653 | error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat); |
2653 | if (error) | 2654 | if (error) |
2654 | goto error0; | 2655 | goto error0; |
2655 | if (*stat == 0) | 2656 | if (*stat == 0) |
@@ -2684,8 +2685,7 @@ xfs_btree_new_root( | |||
2684 | lbp = bp; | 2685 | lbp = bp; |
2685 | xfs_btree_buf_to_ptr(cur, lbp, &lptr); | 2686 | xfs_btree_buf_to_ptr(cur, lbp, &lptr); |
2686 | left = block; | 2687 | left = block; |
2687 | error = xfs_btree_read_buf_block(cur, &rptr, | 2688 | error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); |
2688 | cur->bc_nlevels - 1, 0, &right, &rbp); | ||
2689 | if (error) | 2689 | if (error) |
2690 | goto error0; | 2690 | goto error0; |
2691 | bp = rbp; | 2691 | bp = rbp; |
@@ -2696,8 +2696,7 @@ xfs_btree_new_root( | |||
2696 | xfs_btree_buf_to_ptr(cur, rbp, &rptr); | 2696 | xfs_btree_buf_to_ptr(cur, rbp, &rptr); |
2697 | right = block; | 2697 | right = block; |
2698 | xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); | 2698 | xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); |
2699 | error = xfs_btree_read_buf_block(cur, &lptr, | 2699 | error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); |
2700 | cur->bc_nlevels - 1, 0, &left, &lbp); | ||
2701 | if (error) | 2700 | if (error) |
2702 | goto error0; | 2701 | goto error0; |
2703 | bp = lbp; | 2702 | bp = lbp; |
@@ -3649,8 +3648,7 @@ xfs_btree_delrec( | |||
3649 | rptr = cptr; | 3648 | rptr = cptr; |
3650 | right = block; | 3649 | right = block; |
3651 | rbp = bp; | 3650 | rbp = bp; |
3652 | error = xfs_btree_read_buf_block(cur, &lptr, level, | 3651 | error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); |
3653 | 0, &left, &lbp); | ||
3654 | if (error) | 3652 | if (error) |
3655 | goto error0; | 3653 | goto error0; |
3656 | 3654 | ||
@@ -3667,8 +3665,7 @@ xfs_btree_delrec( | |||
3667 | lptr = cptr; | 3665 | lptr = cptr; |
3668 | left = block; | 3666 | left = block; |
3669 | lbp = bp; | 3667 | lbp = bp; |
3670 | error = xfs_btree_read_buf_block(cur, &rptr, level, | 3668 | error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); |
3671 | 0, &right, &rbp); | ||
3672 | if (error) | 3669 | if (error) |
3673 | goto error0; | 3670 | goto error0; |
3674 | 3671 | ||
@@ -3740,8 +3737,7 @@ xfs_btree_delrec( | |||
3740 | /* If there is a right sibling, point it to the remaining block. */ | 3737 | /* If there is a right sibling, point it to the remaining block. */ |
3741 | xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); | 3738 | xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); |
3742 | if (!xfs_btree_ptr_is_null(cur, &cptr)) { | 3739 | if (!xfs_btree_ptr_is_null(cur, &cptr)) { |
3743 | error = xfs_btree_read_buf_block(cur, &cptr, level, | 3740 | error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); |
3744 | 0, &rrblock, &rrbp); | ||
3745 | if (error) | 3741 | if (error) |
3746 | goto error0; | 3742 | goto error0; |
3747 | xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); | 3743 | xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); |
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 91e34f21bace..a04b69422f67 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -62,6 +62,7 @@ union xfs_btree_rec { | |||
62 | #define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) | 62 | #define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) |
63 | #define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) | 63 | #define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) |
64 | #define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) | 64 | #define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) |
65 | #define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) | ||
65 | 66 | ||
66 | /* | 67 | /* |
67 | * For logging record fields. | 68 | * For logging record fields. |
@@ -92,6 +93,7 @@ do { \ | |||
92 | case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \ | 93 | case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \ |
93 | case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \ | 94 | case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \ |
94 | case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \ | 95 | case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \ |
96 | case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \ | ||
95 | case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ | 97 | case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ |
96 | } \ | 98 | } \ |
97 | } while (0) | 99 | } while (0) |
@@ -105,6 +107,7 @@ do { \ | |||
105 | case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \ | 107 | case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \ |
106 | case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \ | 108 | case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \ |
107 | case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \ | 109 | case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \ |
110 | case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \ | ||
108 | case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ | 111 | case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ |
109 | } \ | 112 | } \ |
110 | } while (0) | 113 | } while (0) |
@@ -129,7 +132,7 @@ struct xfs_btree_ops { | |||
129 | int (*alloc_block)(struct xfs_btree_cur *cur, | 132 | int (*alloc_block)(struct xfs_btree_cur *cur, |
130 | union xfs_btree_ptr *start_bno, | 133 | union xfs_btree_ptr *start_bno, |
131 | union xfs_btree_ptr *new_bno, | 134 | union xfs_btree_ptr *new_bno, |
132 | int length, int *stat); | 135 | int *stat); |
133 | int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); | 136 | int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); |
134 | 137 | ||
135 | /* update last record information */ | 138 | /* update last record information */ |
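With the unused length argument gone (the call sites shown earlier always passed 1), the alloc_block method has the narrower shape above. A compile-only sketch of that callback signature (forward declarations stand in for the kernel types; this is not the kernel header):

struct xfs_btree_cur;
struct xfs_buf;
union xfs_btree_ptr;

struct btree_ops_sketch {
	/* allocate exactly one block; *stat is 1 on success, 0 if no space */
	int (*alloc_block)(struct xfs_btree_cur *cur,
			   union xfs_btree_ptr *start_bno,
			   union xfs_btree_ptr *new_bno,
			   int *stat);
	int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
};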
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 107f2fdfe41f..7a34a1ae6552 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -216,8 +216,7 @@ _xfs_buf_alloc( | |||
216 | STATIC int | 216 | STATIC int |
217 | _xfs_buf_get_pages( | 217 | _xfs_buf_get_pages( |
218 | xfs_buf_t *bp, | 218 | xfs_buf_t *bp, |
219 | int page_count, | 219 | int page_count) |
220 | xfs_buf_flags_t flags) | ||
221 | { | 220 | { |
222 | /* Make sure that we have a page list */ | 221 | /* Make sure that we have a page list */ |
223 | if (bp->b_pages == NULL) { | 222 | if (bp->b_pages == NULL) { |
@@ -330,7 +329,7 @@ use_alloc_page: | |||
330 | end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) | 329 | end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) |
331 | >> PAGE_SHIFT; | 330 | >> PAGE_SHIFT; |
332 | page_count = end - start; | 331 | page_count = end - start; |
333 | error = _xfs_buf_get_pages(bp, page_count, flags); | 332 | error = _xfs_buf_get_pages(bp, page_count); |
334 | if (unlikely(error)) | 333 | if (unlikely(error)) |
335 | return error; | 334 | return error; |
336 | 335 | ||
@@ -778,7 +777,7 @@ xfs_buf_associate_memory( | |||
778 | bp->b_pages = NULL; | 777 | bp->b_pages = NULL; |
779 | bp->b_addr = mem; | 778 | bp->b_addr = mem; |
780 | 779 | ||
781 | rval = _xfs_buf_get_pages(bp, page_count, 0); | 780 | rval = _xfs_buf_get_pages(bp, page_count); |
782 | if (rval) | 781 | if (rval) |
783 | return rval; | 782 | return rval; |
784 | 783 | ||
@@ -811,7 +810,7 @@ xfs_buf_get_uncached( | |||
811 | goto fail; | 810 | goto fail; |
812 | 811 | ||
813 | page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; | 812 | page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; |
814 | error = _xfs_buf_get_pages(bp, page_count, 0); | 813 | error = _xfs_buf_get_pages(bp, page_count); |
815 | if (error) | 814 | if (error) |
816 | goto fail_free_buf; | 815 | goto fail_free_buf; |
817 | 816 | ||
@@ -1372,21 +1371,29 @@ xfs_buf_iorequest( | |||
1372 | xfs_buf_wait_unpin(bp); | 1371 | xfs_buf_wait_unpin(bp); |
1373 | xfs_buf_hold(bp); | 1372 | xfs_buf_hold(bp); |
1374 | 1373 | ||
1375 | /* Set the count to 1 initially, this will stop an I/O | 1374 | /* |
1375 | * Set the count to 1 initially, this will stop an I/O | ||
1376 | * completion callout which happens before we have started | 1376 | * completion callout which happens before we have started |
1377 | * all the I/O from calling xfs_buf_ioend too early. | 1377 | * all the I/O from calling xfs_buf_ioend too early. |
1378 | */ | 1378 | */ |
1379 | atomic_set(&bp->b_io_remaining, 1); | 1379 | atomic_set(&bp->b_io_remaining, 1); |
1380 | _xfs_buf_ioapply(bp); | 1380 | _xfs_buf_ioapply(bp); |
1381 | _xfs_buf_ioend(bp, 1); | 1381 | /* |
1382 | * If _xfs_buf_ioapply failed, we'll get back here with | ||
1383 | * only the reference we took above. _xfs_buf_ioend will | ||
1384 | * drop it to zero, so we'd better not queue it for later, | ||
1385 | * or we'll free it before it's done. | ||
1386 | */ | ||
1387 | _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); | ||
1382 | 1388 | ||
1383 | xfs_buf_rele(bp); | 1389 | xfs_buf_rele(bp); |
1384 | } | 1390 | } |
1385 | 1391 | ||
1386 | /* | 1392 | /* |
1387 | * Waits for I/O to complete on the buffer supplied. It returns immediately if | 1393 | * Waits for I/O to complete on the buffer supplied. It returns immediately if |
1388 | * no I/O is pending or there is already a pending error on the buffer. It | 1394 | * no I/O is pending or there is already a pending error on the buffer, in which |
1389 | * returns the I/O error code, if any, or 0 if there was no error. | 1395 | * case nothing will ever complete. It returns the I/O error code, if any, or |
1396 | * 0 if there was no error. | ||
1390 | */ | 1397 | */ |
1391 | int | 1398 | int |
1392 | xfs_buf_iowait( | 1399 | xfs_buf_iowait( |
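The rewritten comments in xfs_buf_iorequest() describe a bias-the-counter completion scheme: b_io_remaining starts at 1 so per-bio completions cannot trigger final I/O-done processing before submission has finished, and that initial reference is dropped only afterwards; when submission itself fails, the final drop must run immediately rather than being queued. The standalone sketch below models just that counting pattern with a plain C11 atomic; the fake submit/complete helpers are illustrative, not the kernel's buffer code.

/* Minimal model of the "bias b_io_remaining by 1" completion pattern. */
#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct buf {
        atomic_int io_remaining;
        int        error;
};

static void buf_ioend(struct buf *bp)
{
        printf("I/O complete, error=%d\n", bp->error);
}

/* One call per finished sub-I/O, plus one final call to drop the bias. */
static void buf_iodone(struct buf *bp)
{
        if (atomic_fetch_sub(&bp->io_remaining, 1) == 1)
                buf_ioend(bp);
}

static void buf_submit(struct buf *bp, int nr_bios, bool fail_submit)
{
        atomic_store(&bp->io_remaining, 1);     /* bias: hold off completion */

        if (fail_submit) {
                bp->error = -EIO;               /* nothing was ever submitted */
        } else {
                for (int i = 0; i < nr_bios; i++) {
                        atomic_fetch_add(&bp->io_remaining, 1);
                        buf_iodone(bp);         /* pretend the sub-I/O finished */
                }
        }

        /* Drop the bias; on submission failure this is the only reference. */
        buf_iodone(bp);
}

int main(void)
{
        struct buf ok = { .error = 0 }, bad = { .error = 0 };

        buf_submit(&ok, 2, false);      /* two sub-I/Os, then the bias drop */
        buf_submit(&bad, 0, true);      /* failed submission: bias drop completes it */
        return 0;
}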
@@ -1607,7 +1614,6 @@ xfs_free_buftarg( | |||
1607 | int | 1614 | int |
1608 | xfs_setsize_buftarg( | 1615 | xfs_setsize_buftarg( |
1609 | xfs_buftarg_t *btp, | 1616 | xfs_buftarg_t *btp, |
1610 | unsigned int blocksize, | ||
1611 | unsigned int sectorsize) | 1617 | unsigned int sectorsize) |
1612 | { | 1618 | { |
1613 | /* Set up metadata sector size info */ | 1619 | /* Set up metadata sector size info */ |
@@ -1642,16 +1648,13 @@ xfs_setsize_buftarg_early( | |||
1642 | xfs_buftarg_t *btp, | 1648 | xfs_buftarg_t *btp, |
1643 | struct block_device *bdev) | 1649 | struct block_device *bdev) |
1644 | { | 1650 | { |
1645 | return xfs_setsize_buftarg(btp, PAGE_SIZE, | 1651 | return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); |
1646 | bdev_logical_block_size(bdev)); | ||
1647 | } | 1652 | } |
1648 | 1653 | ||
1649 | xfs_buftarg_t * | 1654 | xfs_buftarg_t * |
1650 | xfs_alloc_buftarg( | 1655 | xfs_alloc_buftarg( |
1651 | struct xfs_mount *mp, | 1656 | struct xfs_mount *mp, |
1652 | struct block_device *bdev, | 1657 | struct block_device *bdev) |
1653 | int external, | ||
1654 | const char *fsname) | ||
1655 | { | 1658 | { |
1656 | xfs_buftarg_t *btp; | 1659 | xfs_buftarg_t *btp; |
1657 | 1660 | ||
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b8a3abf6cf47..0e47fd1fedba 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h | |||
@@ -387,10 +387,10 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) | |||
387 | * Handling of buftargs. | 387 | * Handling of buftargs. |
388 | */ | 388 | */ |
389 | extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, | 389 | extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, |
390 | struct block_device *, int, const char *); | 390 | struct block_device *); |
391 | extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); | 391 | extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); |
392 | extern void xfs_wait_buftarg(xfs_buftarg_t *); | 392 | extern void xfs_wait_buftarg(xfs_buftarg_t *); |
393 | extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); | 393 | extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); |
394 | 394 | ||
395 | #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) | 395 | #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) |
396 | #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) | 396 | #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) |
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 8752821443be..64b17f5bed9a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c | |||
@@ -812,7 +812,6 @@ xfs_buf_item_init( | |||
812 | */ | 812 | */ |
813 | static void | 813 | static void |
814 | xfs_buf_item_log_segment( | 814 | xfs_buf_item_log_segment( |
815 | struct xfs_buf_log_item *bip, | ||
816 | uint first, | 815 | uint first, |
817 | uint last, | 816 | uint last, |
818 | uint *map) | 817 | uint *map) |
@@ -920,7 +919,7 @@ xfs_buf_item_log( | |||
920 | if (end > last) | 919 | if (end > last) |
921 | end = last; | 920 | end = last; |
922 | 921 | ||
923 | xfs_buf_item_log_segment(bip, first, end, | 922 | xfs_buf_item_log_segment(first, end, |
924 | &bip->bli_formats[i].blf_data_map[0]); | 923 | &bip->bli_formats[i].blf_data_map[0]); |
925 | 924 | ||
926 | start += bp->b_maps[i].bm_len; | 925 | start += bp->b_maps[i].bm_len; |
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 6cc5f6785a77..9eec594cc25a 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c | |||
@@ -2462,7 +2462,6 @@ xfs_buf_map_from_irec( | |||
2462 | */ | 2462 | */ |
2463 | static int | 2463 | static int |
2464 | xfs_dabuf_map( | 2464 | xfs_dabuf_map( |
2465 | struct xfs_trans *trans, | ||
2466 | struct xfs_inode *dp, | 2465 | struct xfs_inode *dp, |
2467 | xfs_dablk_t bno, | 2466 | xfs_dablk_t bno, |
2468 | xfs_daddr_t mappedbno, | 2467 | xfs_daddr_t mappedbno, |
@@ -2558,7 +2557,7 @@ xfs_da_get_buf( | |||
2558 | *bpp = NULL; | 2557 | *bpp = NULL; |
2559 | mapp = &map; | 2558 | mapp = &map; |
2560 | nmap = 1; | 2559 | nmap = 1; |
2561 | error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, | 2560 | error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, |
2562 | &mapp, &nmap); | 2561 | &mapp, &nmap); |
2563 | if (error) { | 2562 | if (error) { |
2564 | /* mapping a hole is not an error, but we don't continue */ | 2563 | /* mapping a hole is not an error, but we don't continue */ |
@@ -2606,7 +2605,7 @@ xfs_da_read_buf( | |||
2606 | *bpp = NULL; | 2605 | *bpp = NULL; |
2607 | mapp = &map; | 2606 | mapp = &map; |
2608 | nmap = 1; | 2607 | nmap = 1; |
2609 | error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, | 2608 | error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, |
2610 | &mapp, &nmap); | 2609 | &mapp, &nmap); |
2611 | if (error) { | 2610 | if (error) { |
2612 | /* mapping a hole is not an error, but we don't continue */ | 2611 | /* mapping a hole is not an error, but we don't continue */ |
@@ -2679,7 +2678,6 @@ out_free: | |||
2679 | */ | 2678 | */ |
2680 | xfs_daddr_t | 2679 | xfs_daddr_t |
2681 | xfs_da_reada_buf( | 2680 | xfs_da_reada_buf( |
2682 | struct xfs_trans *trans, | ||
2683 | struct xfs_inode *dp, | 2681 | struct xfs_inode *dp, |
2684 | xfs_dablk_t bno, | 2682 | xfs_dablk_t bno, |
2685 | xfs_daddr_t mappedbno, | 2683 | xfs_daddr_t mappedbno, |
@@ -2693,7 +2691,7 @@ xfs_da_reada_buf( | |||
2693 | 2691 | ||
2694 | mapp = &map; | 2692 | mapp = &map; |
2695 | nmap = 1; | 2693 | nmap = 1; |
2696 | error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, | 2694 | error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, |
2697 | &mapp, &nmap); | 2695 | &mapp, &nmap); |
2698 | if (error) { | 2696 | if (error) { |
2699 | /* mapping a hole is not an error, but we don't continue */ | 2697 | /* mapping a hole is not an error, but we don't continue */ |
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 6e95ea79f5d7..c824a0aa039f 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h | |||
@@ -60,10 +60,12 @@ typedef struct xfs_da_args { | |||
60 | int index; /* index of attr of interest in blk */ | 60 | int index; /* index of attr of interest in blk */ |
61 | xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ | 61 | xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ |
62 | int rmtblkcnt; /* remote attr value block count */ | 62 | int rmtblkcnt; /* remote attr value block count */ |
63 | int rmtvaluelen; /* remote attr value length in bytes */ | ||
63 | xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ | 64 | xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ |
64 | int index2; /* index of 2nd attr in blk */ | 65 | int index2; /* index of 2nd attr in blk */ |
65 | xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ | 66 | xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ |
66 | int rmtblkcnt2; /* remote attr value block count */ | 67 | int rmtblkcnt2; /* remote attr value block count */ |
68 | int rmtvaluelen2; /* remote attr value length in bytes */ | ||
67 | int op_flags; /* operation flags */ | 69 | int op_flags; /* operation flags */ |
68 | enum xfs_dacmp cmpresult; /* name compare result for lookups */ | 70 | enum xfs_dacmp cmpresult; /* name compare result for lookups */ |
69 | } xfs_da_args_t; | 71 | } xfs_da_args_t; |
@@ -183,9 +185,9 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, | |||
183 | xfs_dablk_t bno, xfs_daddr_t mappedbno, | 185 | xfs_dablk_t bno, xfs_daddr_t mappedbno, |
184 | struct xfs_buf **bpp, int whichfork, | 186 | struct xfs_buf **bpp, int whichfork, |
185 | const struct xfs_buf_ops *ops); | 187 | const struct xfs_buf_ops *ops); |
186 | xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, | 188 | xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, |
187 | xfs_dablk_t bno, xfs_daddr_t mapped_bno, | 189 | xfs_daddr_t mapped_bno, int whichfork, |
188 | int whichfork, const struct xfs_buf_ops *ops); | 190 | const struct xfs_buf_ops *ops); |
189 | int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, | 191 | int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, |
190 | struct xfs_buf *dead_buf); | 192 | struct xfs_buf *dead_buf); |
191 | 193 | ||
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h index a19d3f8f639c..1432b576b4a7 100644 --- a/fs/xfs/xfs_da_format.h +++ b/fs/xfs/xfs_da_format.h | |||
@@ -541,7 +541,7 @@ xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp) | |||
541 | * Convert dataptr to byte in file space | 541 | * Convert dataptr to byte in file space |
542 | */ | 542 | */ |
543 | static inline xfs_dir2_off_t | 543 | static inline xfs_dir2_off_t |
544 | xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) | 544 | xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp) |
545 | { | 545 | { |
546 | return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; | 546 | return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; |
547 | } | 547 | } |
@@ -550,7 +550,7 @@ xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) | |||
550 | * Convert byte in file space to dataptr. It had better be aligned. | 550 | * Convert byte in file space to dataptr. It had better be aligned. |
551 | */ | 551 | */ |
552 | static inline xfs_dir2_dataptr_t | 552 | static inline xfs_dir2_dataptr_t |
553 | xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by) | 553 | xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by) |
554 | { | 554 | { |
555 | return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); | 555 | return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); |
556 | } | 556 | } |
@@ -571,7 +571,7 @@ xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by) | |||
571 | static inline xfs_dir2_db_t | 571 | static inline xfs_dir2_db_t |
572 | xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) | 572 | xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) |
573 | { | 573 | { |
574 | return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp)); | 574 | return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(dp)); |
575 | } | 575 | } |
576 | 576 | ||
577 | /* | 577 | /* |
@@ -590,7 +590,7 @@ xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by) | |||
590 | static inline xfs_dir2_data_aoff_t | 590 | static inline xfs_dir2_data_aoff_t |
591 | xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) | 591 | xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) |
592 | { | 592 | { |
593 | return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp)); | 593 | return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(dp)); |
594 | } | 594 | } |
595 | 595 | ||
596 | /* | 596 | /* |
@@ -629,7 +629,7 @@ static inline xfs_dir2_dataptr_t | |||
629 | xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db, | 629 | xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db, |
630 | xfs_dir2_data_aoff_t o) | 630 | xfs_dir2_data_aoff_t o) |
631 | { | 631 | { |
632 | return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o)); | 632 | return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(mp, db, o)); |
633 | } | 633 | } |
634 | 634 | ||
635 | /* | 635 | /* |
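Dropping the unused xfs_mount argument works in these helpers because the dataptr/byte conversions are pure shifts by the directory data alignment. A small self-contained sketch of the round trip, assuming the usual 8-byte (2^3) directory entry alignment; the constant name is illustrative rather than the kernel's.

/* Round-trip sketch of the dataptr <-> byte conversions (pure shifts). */
#include <stdint.h>
#include <stdio.h>

#define DIR2_DATA_ALIGN_LOG 3   /* assumed: 8-byte directory entry alignment */

static uint64_t dataptr_to_byte(uint32_t dp)
{
        return (uint64_t)dp << DIR2_DATA_ALIGN_LOG;
}

static uint32_t byte_to_dataptr(uint64_t by)
{
        return (uint32_t)(by >> DIR2_DATA_ALIGN_LOG);
}

int main(void)
{
        uint32_t dp = 0x1234;
        uint64_t by = dataptr_to_byte(dp);

        printf("dataptr 0x%x -> byte 0x%llx -> dataptr 0x%x\n",
               dp, (unsigned long long)by, byte_to_dataptr(by));
        return 0;
}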
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index fda46253966a..e365c98c0f1e 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c | |||
@@ -244,7 +244,7 @@ xfs_dir_createname( | |||
244 | goto out_free; | 244 | goto out_free; |
245 | } | 245 | } |
246 | 246 | ||
247 | rval = xfs_dir2_isblock(tp, dp, &v); | 247 | rval = xfs_dir2_isblock(dp, &v); |
248 | if (rval) | 248 | if (rval) |
249 | goto out_free; | 249 | goto out_free; |
250 | if (v) { | 250 | if (v) { |
@@ -252,7 +252,7 @@ xfs_dir_createname( | |||
252 | goto out_free; | 252 | goto out_free; |
253 | } | 253 | } |
254 | 254 | ||
255 | rval = xfs_dir2_isleaf(tp, dp, &v); | 255 | rval = xfs_dir2_isleaf(dp, &v); |
256 | if (rval) | 256 | if (rval) |
257 | goto out_free; | 257 | goto out_free; |
258 | if (v) | 258 | if (v) |
@@ -336,7 +336,7 @@ xfs_dir_lookup( | |||
336 | goto out_check_rval; | 336 | goto out_check_rval; |
337 | } | 337 | } |
338 | 338 | ||
339 | rval = xfs_dir2_isblock(tp, dp, &v); | 339 | rval = xfs_dir2_isblock(dp, &v); |
340 | if (rval) | 340 | if (rval) |
341 | goto out_free; | 341 | goto out_free; |
342 | if (v) { | 342 | if (v) { |
@@ -344,7 +344,7 @@ xfs_dir_lookup( | |||
344 | goto out_check_rval; | 344 | goto out_check_rval; |
345 | } | 345 | } |
346 | 346 | ||
347 | rval = xfs_dir2_isleaf(tp, dp, &v); | 347 | rval = xfs_dir2_isleaf(dp, &v); |
348 | if (rval) | 348 | if (rval) |
349 | goto out_free; | 349 | goto out_free; |
350 | if (v) | 350 | if (v) |
@@ -408,7 +408,7 @@ xfs_dir_removename( | |||
408 | goto out_free; | 408 | goto out_free; |
409 | } | 409 | } |
410 | 410 | ||
411 | rval = xfs_dir2_isblock(tp, dp, &v); | 411 | rval = xfs_dir2_isblock(dp, &v); |
412 | if (rval) | 412 | if (rval) |
413 | goto out_free; | 413 | goto out_free; |
414 | if (v) { | 414 | if (v) { |
@@ -416,7 +416,7 @@ xfs_dir_removename( | |||
416 | goto out_free; | 416 | goto out_free; |
417 | } | 417 | } |
418 | 418 | ||
419 | rval = xfs_dir2_isleaf(tp, dp, &v); | 419 | rval = xfs_dir2_isleaf(dp, &v); |
420 | if (rval) | 420 | if (rval) |
421 | goto out_free; | 421 | goto out_free; |
422 | if (v) | 422 | if (v) |
@@ -472,7 +472,7 @@ xfs_dir_replace( | |||
472 | goto out_free; | 472 | goto out_free; |
473 | } | 473 | } |
474 | 474 | ||
475 | rval = xfs_dir2_isblock(tp, dp, &v); | 475 | rval = xfs_dir2_isblock(dp, &v); |
476 | if (rval) | 476 | if (rval) |
477 | goto out_free; | 477 | goto out_free; |
478 | if (v) { | 478 | if (v) { |
@@ -480,7 +480,7 @@ xfs_dir_replace( | |||
480 | goto out_free; | 480 | goto out_free; |
481 | } | 481 | } |
482 | 482 | ||
483 | rval = xfs_dir2_isleaf(tp, dp, &v); | 483 | rval = xfs_dir2_isleaf(dp, &v); |
484 | if (rval) | 484 | if (rval) |
485 | goto out_free; | 485 | goto out_free; |
486 | if (v) | 486 | if (v) |
@@ -531,7 +531,7 @@ xfs_dir_canenter( | |||
531 | goto out_free; | 531 | goto out_free; |
532 | } | 532 | } |
533 | 533 | ||
534 | rval = xfs_dir2_isblock(tp, dp, &v); | 534 | rval = xfs_dir2_isblock(dp, &v); |
535 | if (rval) | 535 | if (rval) |
536 | goto out_free; | 536 | goto out_free; |
537 | if (v) { | 537 | if (v) { |
@@ -539,7 +539,7 @@ xfs_dir_canenter( | |||
539 | goto out_free; | 539 | goto out_free; |
540 | } | 540 | } |
541 | 541 | ||
542 | rval = xfs_dir2_isleaf(tp, dp, &v); | 542 | rval = xfs_dir2_isleaf(dp, &v); |
543 | if (rval) | 543 | if (rval) |
544 | goto out_free; | 544 | goto out_free; |
545 | if (v) | 545 | if (v) |
@@ -607,7 +607,6 @@ xfs_dir2_grow_inode( | |||
607 | */ | 607 | */ |
608 | int | 608 | int |
609 | xfs_dir2_isblock( | 609 | xfs_dir2_isblock( |
610 | xfs_trans_t *tp, | ||
611 | xfs_inode_t *dp, | 610 | xfs_inode_t *dp, |
612 | int *vp) /* out: 1 is block, 0 is not block */ | 611 | int *vp) /* out: 1 is block, 0 is not block */ |
613 | { | 612 | { |
@@ -616,7 +615,7 @@ xfs_dir2_isblock( | |||
616 | int rval; | 615 | int rval; |
617 | 616 | ||
618 | mp = dp->i_mount; | 617 | mp = dp->i_mount; |
619 | if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) | 618 | if ((rval = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK))) |
620 | return rval; | 619 | return rval; |
621 | rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize; | 620 | rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize; |
622 | ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize); | 621 | ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize); |
@@ -629,7 +628,6 @@ xfs_dir2_isblock( | |||
629 | */ | 628 | */ |
630 | int | 629 | int |
631 | xfs_dir2_isleaf( | 630 | xfs_dir2_isleaf( |
632 | xfs_trans_t *tp, | ||
633 | xfs_inode_t *dp, | 631 | xfs_inode_t *dp, |
634 | int *vp) /* out: 1 is leaf, 0 is not leaf */ | 632 | int *vp) /* out: 1 is leaf, 0 is not leaf */ |
635 | { | 633 | { |
@@ -638,7 +636,7 @@ xfs_dir2_isleaf( | |||
638 | int rval; | 636 | int rval; |
639 | 637 | ||
640 | mp = dp->i_mount; | 638 | mp = dp->i_mount; |
641 | if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) | 639 | if ((rval = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK))) |
642 | return rval; | 640 | return rval; |
643 | *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog); | 641 | *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog); |
644 | return 0; | 642 | return 0; |
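xfs_dir2_isblock() and xfs_dir2_isleaf() only inspect the last mapped offset of the data fork, which is why the transaction argument can be dropped here and from xfs_bmap_last_offset(). A hedged sketch of the block-form test, using made-up geometry (4 KiB blocks, one filesystem block per directory block):

/* Sketch of the "is this a block-form directory?" test (illustrative geometry). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FSB_SIZE        4096u   /* filesystem block size */
#define DIRBLKSIZE      4096u   /* directory block size (one fs block here) */

static bool dir_is_block_form(uint64_t last_offset_fsb)
{
        /* Block form: the data fork ends exactly one directory block in. */
        return last_offset_fsb * FSB_SIZE == DIRBLKSIZE;
}

int main(void)
{
        printf("last offset 1 fsb  -> block form? %d\n", dir_is_block_form(1));
        printf("last offset 4 fsbs -> block form? %d\n", dir_is_block_form(4));
        return 0;
}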
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index cec70e0781ab..64a6b19c2fd0 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h | |||
@@ -142,8 +142,8 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); | |||
142 | /* | 142 | /* |
143 | * Interface routines used by userspace utilities | 143 | * Interface routines used by userspace utilities |
144 | */ | 144 | */ |
145 | extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r); | 145 | extern int xfs_dir2_isblock(struct xfs_inode *dp, int *r); |
146 | extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r); | 146 | extern int xfs_dir2_isleaf(struct xfs_inode *dp, int *r); |
147 | extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, | 147 | extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, |
148 | struct xfs_buf *bp); | 148 | struct xfs_buf *bp); |
149 | 149 | ||
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 4f6a38cb83a4..dd9d00515582 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c | |||
@@ -319,7 +319,6 @@ xfs_dir2_block_compact( | |||
319 | (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), | 319 | (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), |
320 | (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), | 320 | (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), |
321 | needlog, &needscan); | 321 | needlog, &needscan); |
322 | blp += be32_to_cpu(btp->stale) - 1; | ||
323 | btp->stale = cpu_to_be32(1); | 322 | btp->stale = cpu_to_be32(1); |
324 | /* | 323 | /* |
325 | * If we now need to rebuild the bestfree map, do so. | 324 | * If we now need to rebuild the bestfree map, do so. |
@@ -537,7 +536,7 @@ xfs_dir2_block_addname( | |||
537 | * Fill in the leaf entry. | 536 | * Fill in the leaf entry. |
538 | */ | 537 | */ |
539 | blp[mid].hashval = cpu_to_be32(args->hashval); | 538 | blp[mid].hashval = cpu_to_be32(args->hashval); |
540 | blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, | 539 | blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( |
541 | (char *)dep - (char *)hdr)); | 540 | (char *)dep - (char *)hdr)); |
542 | xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); | 541 | xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); |
543 | /* | 542 | /* |
@@ -1170,7 +1169,7 @@ xfs_dir2_sf_to_block( | |||
1170 | *tagp = cpu_to_be16((char *)dep - (char *)hdr); | 1169 | *tagp = cpu_to_be16((char *)dep - (char *)hdr); |
1171 | xfs_dir2_data_log_entry(tp, dp, bp, dep); | 1170 | xfs_dir2_data_log_entry(tp, dp, bp, dep); |
1172 | blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); | 1171 | blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); |
1173 | blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, | 1172 | blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( |
1174 | (char *)dep - (char *)hdr)); | 1173 | (char *)dep - (char *)hdr)); |
1175 | /* | 1174 | /* |
1176 | * Create entry for .. | 1175 | * Create entry for .. |
@@ -1184,7 +1183,7 @@ xfs_dir2_sf_to_block( | |||
1184 | *tagp = cpu_to_be16((char *)dep - (char *)hdr); | 1183 | *tagp = cpu_to_be16((char *)dep - (char *)hdr); |
1185 | xfs_dir2_data_log_entry(tp, dp, bp, dep); | 1184 | xfs_dir2_data_log_entry(tp, dp, bp, dep); |
1186 | blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); | 1185 | blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); |
1187 | blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, | 1186 | blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( |
1188 | (char *)dep - (char *)hdr)); | 1187 | (char *)dep - (char *)hdr)); |
1189 | offset = dp->d_ops->data_first_offset; | 1188 | offset = dp->d_ops->data_first_offset; |
1190 | /* | 1189 | /* |
@@ -1238,7 +1237,7 @@ xfs_dir2_sf_to_block( | |||
1238 | name.len = sfep->namelen; | 1237 | name.len = sfep->namelen; |
1239 | blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> | 1238 | blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> |
1240 | hashname(&name)); | 1239 | hashname(&name)); |
1241 | blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, | 1240 | blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( |
1242 | (char *)dep - (char *)hdr)); | 1241 | (char *)dep - (char *)hdr)); |
1243 | offset = (int)((char *)(tagp + 1) - (char *)hdr); | 1242 | offset = (int)((char *)(tagp + 1) - (char *)hdr); |
1244 | if (++i == sfp->count) | 1243 | if (++i == sfp->count) |
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index afa4ad523f3f..bae8b5b8d1c2 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c | |||
@@ -329,12 +329,11 @@ xfs_dir3_data_read( | |||
329 | 329 | ||
330 | int | 330 | int |
331 | xfs_dir3_data_readahead( | 331 | xfs_dir3_data_readahead( |
332 | struct xfs_trans *tp, | ||
333 | struct xfs_inode *dp, | 332 | struct xfs_inode *dp, |
334 | xfs_dablk_t bno, | 333 | xfs_dablk_t bno, |
335 | xfs_daddr_t mapped_bno) | 334 | xfs_daddr_t mapped_bno) |
336 | { | 335 | { |
337 | return xfs_da_reada_buf(tp, dp, bno, mapped_bno, | 336 | return xfs_da_reada_buf(dp, bno, mapped_bno, |
338 | XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops); | 337 | XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops); |
339 | } | 338 | } |
340 | 339 | ||
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index d36e97df1187..f571723e2378 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c | |||
@@ -1708,7 +1708,7 @@ xfs_dir2_node_to_leaf( | |||
1708 | /* | 1708 | /* |
1709 | * Get the last offset in the file. | 1709 | * Get the last offset in the file. |
1710 | */ | 1710 | */ |
1711 | if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) { | 1711 | if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) { |
1712 | return error; | 1712 | return error; |
1713 | } | 1713 | } |
1714 | fo -= mp->m_dirblkfsbs; | 1714 | fo -= mp->m_dirblkfsbs; |
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index cb434d732681..9cb91ee0914b 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c | |||
@@ -1727,7 +1727,7 @@ xfs_dir2_node_addname_int( | |||
1727 | if (dbno == -1) { | 1727 | if (dbno == -1) { |
1728 | xfs_fileoff_t fo; /* freespace block number */ | 1728 | xfs_fileoff_t fo; /* freespace block number */ |
1729 | 1729 | ||
1730 | if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) | 1730 | if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) |
1731 | return error; | 1731 | return error; |
1732 | lastfbno = xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo); | 1732 | lastfbno = xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo); |
1733 | fbno = ifbno; | 1733 | fbno = ifbno; |
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 8b9d2281f85b..2429960739e9 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h | |||
@@ -54,8 +54,8 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, | |||
54 | extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); | 54 | extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); |
55 | extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, | 55 | extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, |
56 | xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); | 56 | xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); |
57 | extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, | 57 | extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, |
58 | xfs_dablk_t bno, xfs_daddr_t mapped_bno); | 58 | xfs_daddr_t mapped_bno); |
59 | 59 | ||
60 | extern struct xfs_dir2_data_free * | 60 | extern struct xfs_dir2_data_free * |
61 | xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, | 61 | xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, |
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index aead369e1c30..bf7a5cee7adc 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c | |||
@@ -434,7 +434,7 @@ xfs_dir2_leaf_readbuf( | |||
434 | */ | 434 | */ |
435 | if (i > mip->ra_current && | 435 | if (i > mip->ra_current && |
436 | map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { | 436 | map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { |
437 | xfs_dir3_data_readahead(NULL, dp, | 437 | xfs_dir3_data_readahead(dp, |
438 | map[mip->ra_index].br_startoff + mip->ra_offset, | 438 | map[mip->ra_index].br_startoff + mip->ra_offset, |
439 | XFS_FSB_TO_DADDR(mp, | 439 | XFS_FSB_TO_DADDR(mp, |
440 | map[mip->ra_index].br_startblock + | 440 | map[mip->ra_index].br_startblock + |
@@ -447,7 +447,7 @@ xfs_dir2_leaf_readbuf( | |||
447 | * use our mapping, but this is a very rare case. | 447 | * use our mapping, but this is a very rare case. |
448 | */ | 448 | */ |
449 | else if (i > mip->ra_current) { | 449 | else if (i > mip->ra_current) { |
450 | xfs_dir3_data_readahead(NULL, dp, | 450 | xfs_dir3_data_readahead(dp, |
451 | map[mip->ra_index].br_startoff + | 451 | map[mip->ra_index].br_startoff + |
452 | mip->ra_offset, -1); | 452 | mip->ra_offset, -1); |
453 | mip->ra_current = i; | 453 | mip->ra_current = i; |
@@ -456,7 +456,7 @@ xfs_dir2_leaf_readbuf( | |||
456 | /* | 456 | /* |
457 | * Advance offset through the mapping table. | 457 | * Advance offset through the mapping table. |
458 | */ | 458 | */ |
459 | for (j = 0; j < mp->m_dirblkfsbs; j++) { | 459 | for (j = 0; j < mp->m_dirblkfsbs; j += length ) { |
460 | /* | 460 | /* |
461 | * The rest of this extent but not more than a dir | 461 | * The rest of this extent but not more than a dir |
462 | * block. | 462 | * block. |
@@ -464,7 +464,6 @@ xfs_dir2_leaf_readbuf( | |||
464 | length = min_t(int, mp->m_dirblkfsbs, | 464 | length = min_t(int, mp->m_dirblkfsbs, |
465 | map[mip->ra_index].br_blockcount - | 465 | map[mip->ra_index].br_blockcount - |
466 | mip->ra_offset); | 466 | mip->ra_offset); |
467 | j += length; | ||
468 | mip->ra_offset += length; | 467 | mip->ra_offset += length; |
469 | 468 | ||
470 | /* | 469 | /* |
@@ -531,7 +530,7 @@ xfs_dir2_leaf_getdents( | |||
531 | * Inside the loop we keep the main offset value as a byte offset | 530 | * Inside the loop we keep the main offset value as a byte offset |
532 | * in the directory file. | 531 | * in the directory file. |
533 | */ | 532 | */ |
534 | curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos); | 533 | curoff = xfs_dir2_dataptr_to_byte(ctx->pos); |
535 | 534 | ||
536 | /* | 535 | /* |
537 | * Force this conversion through db so we truncate the offset | 536 | * Force this conversion through db so we truncate the offset |
@@ -635,7 +634,7 @@ xfs_dir2_leaf_getdents( | |||
635 | length = dp->d_ops->data_entsize(dep->namelen); | 634 | length = dp->d_ops->data_entsize(dep->namelen); |
636 | filetype = dp->d_ops->data_get_ftype(dep); | 635 | filetype = dp->d_ops->data_get_ftype(dep); |
637 | 636 | ||
638 | ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; | 637 | ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; |
639 | if (!dir_emit(ctx, (char *)dep->name, dep->namelen, | 638 | if (!dir_emit(ctx, (char *)dep->name, dep->namelen, |
640 | be64_to_cpu(dep->inumber), | 639 | be64_to_cpu(dep->inumber), |
641 | xfs_dir3_get_dtype(mp, filetype))) | 640 | xfs_dir3_get_dtype(mp, filetype))) |
@@ -653,10 +652,10 @@ xfs_dir2_leaf_getdents( | |||
653 | /* | 652 | /* |
654 | * All done. Set output offset value to current offset. | 653 | * All done. Set output offset value to current offset. |
655 | */ | 654 | */ |
656 | if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) | 655 | if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR)) |
657 | ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; | 656 | ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; |
658 | else | 657 | else |
659 | ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; | 658 | ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; |
660 | kmem_free(map_info); | 659 | kmem_free(map_info); |
661 | if (bp) | 660 | if (bp) |
662 | xfs_trans_brelse(NULL, bp); | 661 | xfs_trans_brelse(NULL, bp); |
@@ -687,7 +686,7 @@ xfs_readdir( | |||
687 | lock_mode = xfs_ilock_data_map_shared(dp); | 686 | lock_mode = xfs_ilock_data_map_shared(dp); |
688 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) | 687 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) |
689 | rval = xfs_dir2_sf_getdents(dp, ctx); | 688 | rval = xfs_dir2_sf_getdents(dp, ctx); |
690 | else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) | 689 | else if ((rval = xfs_dir2_isblock(dp, &v))) |
691 | ; | 690 | ; |
692 | else if (v) | 691 | else if (v) |
693 | rval = xfs_dir2_block_getdents(dp, ctx); | 692 | rval = xfs_dir2_block_getdents(dp, ctx); |
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 3725fb1b902b..7aab8ec117ad 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c | |||
@@ -285,14 +285,12 @@ int /* error */ | |||
285 | xfs_dir2_sf_addname( | 285 | xfs_dir2_sf_addname( |
286 | xfs_da_args_t *args) /* operation arguments */ | 286 | xfs_da_args_t *args) /* operation arguments */ |
287 | { | 287 | { |
288 | int add_entsize; /* size of the new entry */ | ||
289 | xfs_inode_t *dp; /* incore directory inode */ | 288 | xfs_inode_t *dp; /* incore directory inode */ |
290 | int error; /* error return value */ | 289 | int error; /* error return value */ |
291 | int incr_isize; /* total change in size */ | 290 | int incr_isize; /* total change in size */ |
292 | int new_isize; /* di_size after adding name */ | 291 | int new_isize; /* di_size after adding name */ |
293 | int objchange; /* changing to 8-byte inodes */ | 292 | int objchange; /* changing to 8-byte inodes */ |
294 | xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ | 293 | xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ |
295 | int old_isize; /* di_size before adding name */ | ||
296 | int pick; /* which algorithm to use */ | 294 | int pick; /* which algorithm to use */ |
297 | xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ | 295 | xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ |
298 | xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ | 296 | xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ |
@@ -316,8 +314,7 @@ xfs_dir2_sf_addname( | |||
316 | /* | 314 | /* |
317 | * Compute entry (and change in) size. | 315 | * Compute entry (and change in) size. |
318 | */ | 316 | */ |
319 | add_entsize = dp->d_ops->sf_entsize(sfp, args->namelen); | 317 | incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen); |
320 | incr_isize = add_entsize; | ||
321 | objchange = 0; | 318 | objchange = 0; |
322 | #if XFS_BIG_INUMS | 319 | #if XFS_BIG_INUMS |
323 | /* | 320 | /* |
@@ -325,11 +322,8 @@ xfs_dir2_sf_addname( | |||
325 | */ | 322 | */ |
326 | if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { | 323 | if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { |
327 | /* | 324 | /* |
328 | * Yes, adjust the entry size and the total size. | 325 | * Yes, adjust the inode size. old count + (parent + new) |
329 | */ | 326 | */ |
330 | add_entsize += | ||
331 | (uint)sizeof(xfs_dir2_ino8_t) - | ||
332 | (uint)sizeof(xfs_dir2_ino4_t); | ||
333 | incr_isize += | 327 | incr_isize += |
334 | (sfp->count + 2) * | 328 | (sfp->count + 2) * |
335 | ((uint)sizeof(xfs_dir2_ino8_t) - | 329 | ((uint)sizeof(xfs_dir2_ino8_t) - |
@@ -337,8 +331,7 @@ xfs_dir2_sf_addname( | |||
337 | objchange = 1; | 331 | objchange = 1; |
338 | } | 332 | } |
339 | #endif | 333 | #endif |
340 | old_isize = (int)dp->i_d.di_size; | 334 | new_isize = (int)dp->i_d.di_size + incr_isize; |
341 | new_isize = old_isize + incr_isize; | ||
342 | /* | 335 | /* |
343 | * Won't fit as shortform any more (due to size), | 336 | * Won't fit as shortform any more (due to size), |
344 | * or the pick routine says it won't (due to offset values). | 337 | * or the pick routine says it won't (due to offset values). |
@@ -1110,9 +1103,9 @@ xfs_dir2_sf_toino4( | |||
1110 | } | 1103 | } |
1111 | 1104 | ||
1112 | /* | 1105 | /* |
1113 | * Convert from 4-byte inode numbers to 8-byte inode numbers. | 1106 | * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers. |
1114 | * The new 8-byte inode number is not there yet, we leave with the | 1107 | * The new entry w/ an 8-byte inode number is not there yet; we leave with |
1115 | * count 1 but no corresponding entry. | 1108 | * i8count set to 1, but no corresponding 8-byte entry. |
1116 | */ | 1109 | */ |
1117 | static void | 1110 | static void |
1118 | xfs_dir2_sf_toino8( | 1111 | xfs_dir2_sf_toino8( |
@@ -1145,7 +1138,7 @@ xfs_dir2_sf_toino8( | |||
1145 | ASSERT(oldsfp->i8count == 0); | 1138 | ASSERT(oldsfp->i8count == 0); |
1146 | memcpy(buf, oldsfp, oldsize); | 1139 | memcpy(buf, oldsfp, oldsize); |
1147 | /* | 1140 | /* |
1148 | * Compute the new inode size. | 1141 | * Compute the new inode size (nb: entry count + 1 for parent) |
1149 | */ | 1142 | */ |
1150 | newsize = | 1143 | newsize = |
1151 | oldsize + | 1144 | oldsize + |
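The shortform-addname hunks fold the old add_entsize/old_isize locals into incr_isize, and the "old count + (parent + new)" comment summarizes the arithmetic: when the new name forces 8-byte inode numbers, every existing entry, the parent pointer in the header, and the new entry each grow by sizeof(ino8) - sizeof(ino4) bytes. A worked example with illustrative numbers:

/* Worked example of the shortform size bump when switching to 8-byte inums. */
#include <stdio.h>

int main(void)
{
        int count = 5;          /* existing shortform entries (illustrative) */
        int new_entsize = 20;   /* size of the new entry at 4-byte inum width */
        int ino8 = 8, ino4 = 4;

        /*
         * The new entry itself, plus every existing entry, the parent in the
         * header, and the new entry each widening by (8 - 4) bytes:
         * "old count + (parent + new)" == count + 2.
         */
        int incr_isize = new_entsize + (count + 2) * (ino8 - ino4);

        printf("di_size grows by %d bytes\n", incr_isize);     /* 20 + 7*4 = 48 */
        return 0;
}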
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 868b19f096bf..5fec738f1f2e 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c | |||
@@ -832,47 +832,6 @@ restart: | |||
832 | return (0); | 832 | return (0); |
833 | } | 833 | } |
834 | 834 | ||
835 | |||
836 | STATIC void | ||
837 | xfs_qm_dqput_final( | ||
838 | struct xfs_dquot *dqp) | ||
839 | { | ||
840 | struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; | ||
841 | struct xfs_dquot *gdqp; | ||
842 | struct xfs_dquot *pdqp; | ||
843 | |||
844 | trace_xfs_dqput_free(dqp); | ||
845 | |||
846 | if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) | ||
847 | XFS_STATS_INC(xs_qm_dquot_unused); | ||
848 | |||
849 | /* | ||
850 | * If we just added a udquot to the freelist, then we want to release | ||
851 | * the gdquot/pdquot reference that it (probably) has. Otherwise it'll | ||
852 | * keep the gdquot/pdquot from getting reclaimed. | ||
853 | */ | ||
854 | gdqp = dqp->q_gdquot; | ||
855 | if (gdqp) { | ||
856 | xfs_dqlock(gdqp); | ||
857 | dqp->q_gdquot = NULL; | ||
858 | } | ||
859 | |||
860 | pdqp = dqp->q_pdquot; | ||
861 | if (pdqp) { | ||
862 | xfs_dqlock(pdqp); | ||
863 | dqp->q_pdquot = NULL; | ||
864 | } | ||
865 | xfs_dqunlock(dqp); | ||
866 | |||
867 | /* | ||
868 | * If we had a group/project quota hint, release it now. | ||
869 | */ | ||
870 | if (gdqp) | ||
871 | xfs_qm_dqput(gdqp); | ||
872 | if (pdqp) | ||
873 | xfs_qm_dqput(pdqp); | ||
874 | } | ||
875 | |||
876 | /* | 835 | /* |
877 | * Release a reference to the dquot (decrement ref-count) and unlock it. | 836 | * Release a reference to the dquot (decrement ref-count) and unlock it. |
878 | * | 837 | * |
@@ -888,10 +847,14 @@ xfs_qm_dqput( | |||
888 | 847 | ||
889 | trace_xfs_dqput(dqp); | 848 | trace_xfs_dqput(dqp); |
890 | 849 | ||
891 | if (--dqp->q_nrefs > 0) | 850 | if (--dqp->q_nrefs == 0) { |
892 | xfs_dqunlock(dqp); | 851 | struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; |
893 | else | 852 | trace_xfs_dqput_free(dqp); |
894 | xfs_qm_dqput_final(dqp); | 853 | |
854 | if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) | ||
855 | XFS_STATS_INC(xs_qm_dquot_unused); | ||
856 | } | ||
857 | xfs_dqunlock(dqp); | ||
895 | } | 858 | } |
896 | 859 | ||
897 | /* | 860 | /* |
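With the group/project dquot hints removed, dropping the last reference is now just "park the dquot on the quota LRU and unlock"; there is no longer a chain of secondary puts. A minimal userspace sketch of that last-reference-to-LRU pattern (locking omitted, names illustrative):

/* Sketch of "last reference parks the dquot on an LRU" (no hint chains). */
#include <stdio.h>

struct dquot {
        int nrefs;
        int on_lru;
};

static void lru_add(struct dquot *dq)
{
        dq->on_lru = 1;
        printf("dquot parked on the LRU for later reclaim\n");
}

static void dqput(struct dquot *dq)
{
        /* the real code holds the dquot lock around this */
        if (--dq->nrefs == 0)
                lru_add(dq);
        /* the unlock now happens unconditionally, in both paths */
}

int main(void)
{
        struct dquot dq = { .nrefs = 2, .on_lru = 0 };

        dqput(&dq);     /* still referenced elsewhere */
        dqput(&dq);     /* last put: straight to the LRU */
        return 0;
}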
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index d22ed0053c32..68a68f704837 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h | |||
@@ -52,8 +52,6 @@ typedef struct xfs_dquot { | |||
52 | int q_bufoffset; /* off of dq in buffer (# dquots) */ | 52 | int q_bufoffset; /* off of dq in buffer (# dquots) */ |
53 | xfs_fileoff_t q_fileoffset; /* offset in quotas file */ | 53 | xfs_fileoff_t q_fileoffset; /* offset in quotas file */ |
54 | 54 | ||
55 | struct xfs_dquot*q_gdquot; /* group dquot, hint only */ | ||
56 | struct xfs_dquot*q_pdquot; /* project dquot, hint only */ | ||
57 | xfs_disk_dquot_t q_core; /* actual usage & quotas */ | 55 | xfs_disk_dquot_t q_core; /* actual usage & quotas */ |
58 | xfs_dq_logitem_t q_logitem; /* dquot log item */ | 56 | xfs_dq_logitem_t q_logitem; /* dquot log item */ |
59 | xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ | 57 | xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ |
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c index 610da8177737..c2ac0c611ad8 100644 --- a/fs/xfs/xfs_dquot_buf.c +++ b/fs/xfs/xfs_dquot_buf.c | |||
@@ -35,7 +35,6 @@ | |||
35 | 35 | ||
36 | int | 36 | int |
37 | xfs_calc_dquots_per_chunk( | 37 | xfs_calc_dquots_per_chunk( |
38 | struct xfs_mount *mp, | ||
39 | unsigned int nbblks) /* basic block units */ | 38 | unsigned int nbblks) /* basic block units */ |
40 | { | 39 | { |
41 | unsigned int ndquots; | 40 | unsigned int ndquots; |
@@ -194,7 +193,7 @@ xfs_dquot_buf_verify_crc( | |||
194 | if (mp->m_quotainfo) | 193 | if (mp->m_quotainfo) |
195 | ndquots = mp->m_quotainfo->qi_dqperchunk; | 194 | ndquots = mp->m_quotainfo->qi_dqperchunk; |
196 | else | 195 | else |
197 | ndquots = xfs_calc_dquots_per_chunk(mp, | 196 | ndquots = xfs_calc_dquots_per_chunk( |
198 | XFS_BB_TO_FSB(mp, bp->b_length)); | 197 | XFS_BB_TO_FSB(mp, bp->b_length)); |
199 | 198 | ||
200 | for (i = 0; i < ndquots; i++, d++) { | 199 | for (i = 0; i < ndquots; i++, d++) { |
@@ -225,7 +224,7 @@ xfs_dquot_buf_verify( | |||
225 | if (mp->m_quotainfo) | 224 | if (mp->m_quotainfo) |
226 | ndquots = mp->m_quotainfo->qi_dqperchunk; | 225 | ndquots = mp->m_quotainfo->qi_dqperchunk; |
227 | else | 226 | else |
228 | ndquots = xfs_calc_dquots_per_chunk(mp, bp->b_length); | 227 | ndquots = xfs_calc_dquots_per_chunk(bp->b_length); |
229 | 228 | ||
230 | /* | 229 | /* |
231 | * On the first read of the buffer, verify that each dquot is valid. | 230 | * On the first read of the buffer, verify that each dquot is valid. |
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 1399e187d425..753e467aa1a5 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c | |||
@@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata( | |||
237 | 237 | ||
238 | if (!lsn) | 238 | if (!lsn) |
239 | return 0; | 239 | return 0; |
240 | return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); | 240 | return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); |
241 | } | 241 | } |
242 | 242 | ||
243 | const struct export_operations xfs_export_operations = { | 243 | const struct export_operations xfs_export_operations = { |
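The sign changes around _xfs_log_force_lsn() and filemap_write_and_wait_range() in this and the following file are about error conventions at the XFS/VFS boundary: XFS internals of this vintage pass positive errno values around, while the VFS and generic helpers use negative ones, so boundary code negates the positive internal value on the way out and must not negate a value that is already negative. A tiny sketch of the two conventions side by side; the helpers are stand-ins, not the real APIs.

/* Sketch of positive-errno (XFS-internal) vs negative-errno (VFS) returns. */
#include <errno.h>
#include <stdio.h>

static int internal_log_force(int fail)        /* stand-in: positive errno */
{
        return fail ? EIO : 0;
}

static int generic_write_and_wait(int fail)    /* stand-in: negative errno */
{
        return fail ? -EIO : 0;
}

/* VFS-facing path: negate the positive internal value on the way out. */
static int fsync_like(int fail)
{
        return -internal_log_force(fail);
}

/* Already-negative helpers must not be negated again before returning. */
static int read_like(int fail)
{
        return generic_write_and_wait(fail);
}

int main(void)
{
        printf("fsync_like: %d, read_like: %d\n", fsync_like(1), read_like(1));
        return 0;
}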
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 79e96ce98733..1b8160dc04d1 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c | |||
@@ -155,7 +155,7 @@ xfs_dir_fsync( | |||
155 | 155 | ||
156 | if (!lsn) | 156 | if (!lsn) |
157 | return 0; | 157 | return 0; |
158 | return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); | 158 | return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); |
159 | } | 159 | } |
160 | 160 | ||
161 | STATIC int | 161 | STATIC int |
@@ -295,7 +295,7 @@ xfs_file_aio_read( | |||
295 | xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); | 295 | xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); |
296 | 296 | ||
297 | if (inode->i_mapping->nrpages) { | 297 | if (inode->i_mapping->nrpages) { |
298 | ret = -filemap_write_and_wait_range( | 298 | ret = filemap_write_and_wait_range( |
299 | VFS_I(ip)->i_mapping, | 299 | VFS_I(ip)->i_mapping, |
300 | pos, -1); | 300 | pos, -1); |
301 | if (ret) { | 301 | if (ret) { |
@@ -679,7 +679,7 @@ xfs_file_dio_aio_write( | |||
679 | goto out; | 679 | goto out; |
680 | 680 | ||
681 | if (mapping->nrpages) { | 681 | if (mapping->nrpages) { |
682 | ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, | 682 | ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, |
683 | pos, -1); | 683 | pos, -1); |
684 | if (ret) | 684 | if (ret) |
685 | goto out; | 685 | goto out; |
@@ -837,11 +837,19 @@ xfs_file_fallocate( | |||
837 | unsigned blksize_mask = (1 << inode->i_blkbits) - 1; | 837 | unsigned blksize_mask = (1 << inode->i_blkbits) - 1; |
838 | 838 | ||
839 | if (offset & blksize_mask || len & blksize_mask) { | 839 | if (offset & blksize_mask || len & blksize_mask) { |
840 | error = -EINVAL; | 840 | error = EINVAL; |
841 | goto out_unlock; | ||
842 | } | ||
843 | |||
844 | /* | ||
845 | * There is no need to overlap collapse range with EOF, | ||
846 | * in which case it is effectively a truncate operation | ||
847 | */ | ||
848 | if (offset + len >= i_size_read(inode)) { | ||
849 | error = EINVAL; | ||
841 | goto out_unlock; | 850 | goto out_unlock; |
842 | } | 851 | } |
843 | 852 | ||
844 | ASSERT(offset + len < i_size_read(inode)); | ||
845 | new_size = i_size_read(inode) - len; | 853 | new_size = i_size_read(inode) - len; |
846 | 854 | ||
847 | error = xfs_collapse_file_space(ip, offset, len); | 855 | error = xfs_collapse_file_space(ip, offset, len); |
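The fallocate hunk above turns a debug-only ASSERT into a real check: a collapse range must be block aligned and must end strictly before EOF, since collapsing up to or past EOF is effectively a truncate and is rejected with EINVAL (positive, per the internal convention of this code). A standalone sketch of those two checks, with illustrative sizes:

/* Sketch of the collapse-range validation added here (illustrative values). */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

static int check_collapse(uint64_t offset, uint64_t len,
                          uint64_t isize, unsigned int blksize)
{
        uint64_t mask = blksize - 1;

        if ((offset & mask) || (len & mask))
                return EINVAL;          /* not block aligned */
        if (offset + len >= isize)
                return EINVAL;          /* reaches/overlaps EOF: use truncate */
        return 0;
}

int main(void)
{
        printf("%d\n", check_collapse(4096, 4096, 1048576, 4096));     /* 0 */
        printf("%d\n", check_collapse(4096, 4096, 8192, 4096));        /* EINVAL */
        return 0;
}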
@@ -936,7 +944,7 @@ xfs_dir_open( | |||
936 | */ | 944 | */ |
937 | mode = xfs_ilock_data_map_shared(ip); | 945 | mode = xfs_ilock_data_map_shared(ip); |
938 | if (ip->i_d.di_nextents > 0) | 946 | if (ip->i_d.di_nextents > 0) |
939 | xfs_dir3_data_readahead(NULL, ip, 0, -1); | 947 | xfs_dir3_data_readahead(ip, 0, -1); |
940 | xfs_iunlock(ip, mode); | 948 | xfs_iunlock(ip, mode); |
941 | return 0; | 949 | return 0; |
942 | } | 950 | } |
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 12b6e7701985..8ec81bed7992 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c | |||
@@ -1,5 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (c) 2006-2007 Silicon Graphics, Inc. | 2 | * Copyright (c) 2006-2007 Silicon Graphics, Inc. |
3 | * Copyright (c) 2014 Christoph Hellwig. | ||
3 | * All Rights Reserved. | 4 | * All Rights Reserved. |
4 | * | 5 | * |
5 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
@@ -32,100 +33,20 @@ | |||
32 | #include "xfs_filestream.h" | 33 | #include "xfs_filestream.h" |
33 | #include "xfs_trace.h" | 34 | #include "xfs_trace.h" |
34 | 35 | ||
35 | #ifdef XFS_FILESTREAMS_TRACE | 36 | struct xfs_fstrm_item { |
36 | 37 | struct xfs_mru_cache_elem mru; | |
37 | ktrace_t *xfs_filestreams_trace_buf; | 38 | struct xfs_inode *ip; |
38 | 39 | xfs_agnumber_t ag; /* AG in use for this directory */ | |
39 | STATIC void | 40 | }; |
40 | xfs_filestreams_trace( | ||
41 | xfs_mount_t *mp, /* mount point */ | ||
42 | int type, /* type of trace */ | ||
43 | const char *func, /* source function */ | ||
44 | int line, /* source line number */ | ||
45 | __psunsigned_t arg0, | ||
46 | __psunsigned_t arg1, | ||
47 | __psunsigned_t arg2, | ||
48 | __psunsigned_t arg3, | ||
49 | __psunsigned_t arg4, | ||
50 | __psunsigned_t arg5) | ||
51 | { | ||
52 | ktrace_enter(xfs_filestreams_trace_buf, | ||
53 | (void *)(__psint_t)(type | (line << 16)), | ||
54 | (void *)func, | ||
55 | (void *)(__psunsigned_t)current_pid(), | ||
56 | (void *)mp, | ||
57 | (void *)(__psunsigned_t)arg0, | ||
58 | (void *)(__psunsigned_t)arg1, | ||
59 | (void *)(__psunsigned_t)arg2, | ||
60 | (void *)(__psunsigned_t)arg3, | ||
61 | (void *)(__psunsigned_t)arg4, | ||
62 | (void *)(__psunsigned_t)arg5, | ||
63 | NULL, NULL, NULL, NULL, NULL, NULL); | ||
64 | } | ||
65 | |||
66 | #define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0) | ||
67 | #define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0) | ||
68 | #define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0) | ||
69 | #define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0) | ||
70 | #define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0) | ||
71 | #define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0) | ||
72 | #define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \ | ||
73 | xfs_filestreams_trace(mp, t, __func__, __LINE__, \ | ||
74 | (__psunsigned_t)a0, (__psunsigned_t)a1, \ | ||
75 | (__psunsigned_t)a2, (__psunsigned_t)a3, \ | ||
76 | (__psunsigned_t)a4, (__psunsigned_t)a5) | ||
77 | |||
78 | #define TRACE_AG_SCAN(mp, ag, ag2) \ | ||
79 | TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2); | ||
80 | #define TRACE_AG_PICK1(mp, max_ag, maxfree) \ | ||
81 | TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree); | ||
82 | #define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \ | ||
83 | TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \ | ||
84 | cnt, free, scan, flag) | ||
85 | #define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \ | ||
86 | TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2) | ||
87 | #define TRACE_FREE(mp, ip, pip, ag, cnt) \ | ||
88 | TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt) | ||
89 | #define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \ | ||
90 | TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt) | ||
91 | #define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \ | ||
92 | TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt) | ||
93 | #define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \ | ||
94 | TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt) | ||
95 | #define TRACE_ORPHAN(mp, ip, ag) \ | ||
96 | TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag); | ||
97 | |||
98 | |||
99 | #else | ||
100 | #define TRACE_AG_SCAN(mp, ag, ag2) | ||
101 | #define TRACE_AG_PICK1(mp, max_ag, maxfree) | ||
102 | #define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) | ||
103 | #define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) | ||
104 | #define TRACE_FREE(mp, ip, pip, ag, cnt) | ||
105 | #define TRACE_LOOKUP(mp, ip, pip, ag, cnt) | ||
106 | #define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) | ||
107 | #define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) | ||
108 | #define TRACE_ORPHAN(mp, ip, ag) | ||
109 | #endif | ||
110 | |||
111 | static kmem_zone_t *item_zone; | ||
112 | 41 | ||
113 | /* | 42 | enum xfs_fstrm_alloc { |
114 | * Structure for associating a file or a directory with an allocation group. | 43 | XFS_PICK_USERDATA = 1, |
115 | * The parent directory pointer is only needed for files, but since there will | 44 | XFS_PICK_LOWSPACE = 2, |
116 | * generally be vastly more files than directories in the cache, using the same | 45 | }; |
117 | * data structure simplifies the code with very little memory overhead. | ||
118 | */ | ||
119 | typedef struct fstrm_item | ||
120 | { | ||
121 | xfs_agnumber_t ag; /* AG currently in use for the file/directory. */ | ||
122 | xfs_inode_t *ip; /* inode self-pointer. */ | ||
123 | xfs_inode_t *pip; /* Parent directory inode pointer. */ | ||
124 | } fstrm_item_t; | ||
125 | 46 | ||
126 | /* | 47 | /* |
127 | * Allocation group filestream associations are tracked with per-ag atomic | 48 | * Allocation group filestream associations are tracked with per-ag atomic |
128 | * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a | 49 | * counters. These counters allow xfs_filestream_pick_ag() to tell whether a |
129 | * particular AG already has active filestreams associated with it. The mount | 50 | * particular AG already has active filestreams associated with it. The mount |
130 | * point's m_peraglock is used to protect these counters from per-ag array | 51 | * point's m_peraglock is used to protect these counters from per-ag array |
131 | * re-allocation during a growfs operation. When xfs_growfs_data_private() is | 52 | * re-allocation during a growfs operation. When xfs_growfs_data_private() is |
@@ -160,7 +81,7 @@ typedef struct fstrm_item | |||
160 | * the cache that reference per-ag array elements that have since been | 81 | * the cache that reference per-ag array elements that have since been |
161 | * reallocated. | 82 | * reallocated. |
162 | */ | 83 | */ |
163 | static int | 84 | int |
164 | xfs_filestream_peek_ag( | 85 | xfs_filestream_peek_ag( |
165 | xfs_mount_t *mp, | 86 | xfs_mount_t *mp, |
166 | xfs_agnumber_t agno) | 87 | xfs_agnumber_t agno) |
@@ -200,23 +121,40 @@ xfs_filestream_put_ag( | |||
200 | xfs_perag_put(pag); | 121 | xfs_perag_put(pag); |
201 | } | 122 | } |
202 | 123 | ||
124 | static void | ||
125 | xfs_fstrm_free_func( | ||
126 | struct xfs_mru_cache_elem *mru) | ||
127 | { | ||
128 | struct xfs_fstrm_item *item = | ||
129 | container_of(mru, struct xfs_fstrm_item, mru); | ||
130 | |||
131 | xfs_filestream_put_ag(item->ip->i_mount, item->ag); | ||
132 | |||
133 | trace_xfs_filestream_free(item->ip, item->ag); | ||
134 | |||
135 | kmem_free(item); | ||
136 | } | ||
137 | |||
203 | /* | 138 | /* |
204 | * Scan the AGs starting at startag looking for an AG that isn't in use and has | 139 | * Scan the AGs starting at startag looking for an AG that isn't in use and has |
205 | * at least minlen blocks free. | 140 | * at least minlen blocks free. |
206 | */ | 141 | */ |
207 | static int | 142 | static int |
208 | _xfs_filestream_pick_ag( | 143 | xfs_filestream_pick_ag( |
209 | xfs_mount_t *mp, | 144 | struct xfs_inode *ip, |
210 | xfs_agnumber_t startag, | 145 | xfs_agnumber_t startag, |
211 | xfs_agnumber_t *agp, | 146 | xfs_agnumber_t *agp, |
212 | int flags, | 147 | int flags, |
213 | xfs_extlen_t minlen) | 148 | xfs_extlen_t minlen) |
214 | { | 149 | { |
215 | int streams, max_streams; | 150 | struct xfs_mount *mp = ip->i_mount; |
216 | int err, trylock, nscan; | 151 | struct xfs_fstrm_item *item; |
217 | xfs_extlen_t longest, free, minfree, maxfree = 0; | 152 | struct xfs_perag *pag; |
218 | xfs_agnumber_t ag, max_ag = NULLAGNUMBER; | 153 | xfs_extlen_t longest, free = 0, minfree, maxfree = 0; |
219 | struct xfs_perag *pag; | 154 | xfs_agnumber_t ag, max_ag = NULLAGNUMBER; |
155 | int err, trylock, nscan; | ||
156 | |||
157 | ASSERT(S_ISDIR(ip->i_d.di_mode)); | ||
220 | 158 | ||
221 | /* 2% of an AG's blocks must be free for it to be chosen. */ | 159 | /* 2% of an AG's blocks must be free for it to be chosen. */ |
222 | minfree = mp->m_sb.sb_agblocks / 50; | 160 | minfree = mp->m_sb.sb_agblocks / 50; |
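The new xfs_fstrm_item embeds the generic struct xfs_mru_cache_elem, and xfs_fstrm_free_func() recovers its item from that element with container_of(), so the MRU cache itself never needs to know the item layout. A self-contained sketch of that embed-and-recover pattern (simplified container_of, made-up fields):

/* Sketch of the embedded-element + container_of() pattern used by the cache. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct mru_cache_elem {
        unsigned long key;              /* e.g. the inode number */
};

struct fstrm_item {
        struct mru_cache_elem mru;      /* embedded; this is what the cache sees */
        unsigned int ag;                /* payload: AG in use for this directory */
};

/* The free callback only gets the embedded element back. */
static void fstrm_free(struct mru_cache_elem *mru)
{
        struct fstrm_item *item = container_of(mru, struct fstrm_item, mru);

        printf("freeing filestream item: key %lu, AG %u\n", mru->key, item->ag);
}

int main(void)
{
        struct fstrm_item item = { .mru = { .key = 42 }, .ag = 3 };

        fstrm_free(&item.mru);
        return 0;
}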
@@ -228,8 +166,9 @@ _xfs_filestream_pick_ag( | |||
228 | trylock = XFS_ALLOC_FLAG_TRYLOCK; | 166 | trylock = XFS_ALLOC_FLAG_TRYLOCK; |
229 | 167 | ||
230 | for (nscan = 0; 1; nscan++) { | 168 | for (nscan = 0; 1; nscan++) { |
169 | trace_xfs_filestream_scan(ip, ag); | ||
170 | |||
231 | pag = xfs_perag_get(mp, ag); | 171 | pag = xfs_perag_get(mp, ag); |
232 | TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms)); | ||
233 | 172 | ||
234 | if (!pag->pagf_init) { | 173 | if (!pag->pagf_init) { |
235 | err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); | 174 | err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); |
@@ -246,7 +185,6 @@ _xfs_filestream_pick_ag( | |||
246 | /* Keep track of the AG with the most free blocks. */ | 185 | /* Keep track of the AG with the most free blocks. */ |
247 | if (pag->pagf_freeblks > maxfree) { | 186 | if (pag->pagf_freeblks > maxfree) { |
248 | maxfree = pag->pagf_freeblks; | 187 | maxfree = pag->pagf_freeblks; |
249 | max_streams = atomic_read(&pag->pagf_fstrms); | ||
250 | max_ag = ag; | 188 | max_ag = ag; |
251 | } | 189 | } |
252 | 190 | ||
@@ -269,7 +207,6 @@ _xfs_filestream_pick_ag( | |||
269 | 207 | ||
270 | /* Break out, retaining the reference on the AG. */ | 208 | /* Break out, retaining the reference on the AG. */ |
271 | free = pag->pagf_freeblks; | 209 | free = pag->pagf_freeblks; |
272 | streams = atomic_read(&pag->pagf_fstrms); | ||
273 | xfs_perag_put(pag); | 210 | xfs_perag_put(pag); |
274 | *agp = ag; | 211 | *agp = ag; |
275 | break; | 212 | break; |
@@ -305,317 +242,98 @@ next_ag: | |||
305 | */ | 242 | */ |
306 | if (max_ag != NULLAGNUMBER) { | 243 | if (max_ag != NULLAGNUMBER) { |
307 | xfs_filestream_get_ag(mp, max_ag); | 244 | xfs_filestream_get_ag(mp, max_ag); |
308 | TRACE_AG_PICK1(mp, max_ag, maxfree); | ||
309 | streams = max_streams; | ||
310 | free = maxfree; | 245 | free = maxfree; |
311 | *agp = max_ag; | 246 | *agp = max_ag; |
312 | break; | 247 | break; |
313 | } | 248 | } |
314 | 249 | ||
315 | /* take AG 0 if none matched */ | 250 | /* take AG 0 if none matched */ |
316 | TRACE_AG_PICK1(mp, max_ag, maxfree); | 251 | trace_xfs_filestream_pick(ip, *agp, free, nscan); |
317 | *agp = 0; | 252 | *agp = 0; |
318 | return 0; | 253 | return 0; |
319 | } | 254 | } |
320 | 255 | ||
321 | TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags); | 256 | trace_xfs_filestream_pick(ip, *agp, free, nscan); |
322 | |||
323 | return 0; | ||
324 | } | ||
325 | 257 | ||
326 | /* | 258 | if (*agp == NULLAGNUMBER) |
327 | * Set the allocation group number for a file or a directory, updating inode | ||
328 | * references and per-AG references as appropriate. | ||
329 | */ | ||
330 | static int | ||
331 | _xfs_filestream_update_ag( | ||
332 | xfs_inode_t *ip, | ||
333 | xfs_inode_t *pip, | ||
334 | xfs_agnumber_t ag) | ||
335 | { | ||
336 | int err = 0; | ||
337 | xfs_mount_t *mp; | ||
338 | xfs_mru_cache_t *cache; | ||
339 | fstrm_item_t *item; | ||
340 | xfs_agnumber_t old_ag; | ||
341 | xfs_inode_t *old_pip; | ||
342 | |||
343 | /* | ||
344 | * Either ip is a regular file and pip is a directory, or ip is a | ||
345 | * directory and pip is NULL. | ||
346 | */ | ||
347 | ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip && | ||
348 | S_ISDIR(pip->i_d.di_mode)) || | ||
349 | (S_ISDIR(ip->i_d.di_mode) && !pip))); | ||
350 | |||
351 | mp = ip->i_mount; | ||
352 | cache = mp->m_filestream; | ||
353 | |||
354 | item = xfs_mru_cache_lookup(cache, ip->i_ino); | ||
355 | if (item) { | ||
356 | ASSERT(item->ip == ip); | ||
357 | old_ag = item->ag; | ||
358 | item->ag = ag; | ||
359 | old_pip = item->pip; | ||
360 | item->pip = pip; | ||
361 | xfs_mru_cache_done(cache); | ||
362 | |||
363 | /* | ||
364 | * If the AG has changed, drop the old ref and take a new one, | ||
365 | * effectively transferring the reference from old to new AG. | ||
366 | */ | ||
367 | if (ag != old_ag) { | ||
368 | xfs_filestream_put_ag(mp, old_ag); | ||
369 | xfs_filestream_get_ag(mp, ag); | ||
370 | } | ||
371 | |||
372 | /* | ||
373 | * If ip is a file and its pip has changed, drop the old ref and | ||
374 | * take a new one. | ||
375 | */ | ||
376 | if (pip && pip != old_pip) { | ||
377 | IRELE(old_pip); | ||
378 | IHOLD(pip); | ||
379 | } | ||
380 | |||
381 | TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag), | ||
382 | ag, xfs_filestream_peek_ag(mp, ag)); | ||
383 | return 0; | 259 | return 0; |
384 | } | ||
385 | 260 | ||
386 | item = kmem_zone_zalloc(item_zone, KM_MAYFAIL); | 261 | err = ENOMEM; |
262 | item = kmem_alloc(sizeof(*item), KM_MAYFAIL); | ||
387 | if (!item) | 263 | if (!item) |
388 | return ENOMEM; | 264 | goto out_put_ag; |
389 | 265 | ||
390 | item->ag = ag; | 266 | item->ag = *agp; |
391 | item->ip = ip; | 267 | item->ip = ip; |
392 | item->pip = pip; | ||
393 | 268 | ||
394 | err = xfs_mru_cache_insert(cache, ip->i_ino, item); | 269 | err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru); |
395 | if (err) { | 270 | if (err) { |
396 | kmem_zone_free(item_zone, item); | 271 | if (err == EEXIST) |
397 | return err; | 272 | err = 0; |
273 | goto out_free_item; | ||
398 | } | 274 | } |
399 | 275 | ||
400 | /* Take a reference on the AG. */ | ||
401 | xfs_filestream_get_ag(mp, ag); | ||
402 | |||
403 | /* | ||
404 | * Take a reference on the inode itself regardless of whether it's a | ||
405 | * regular file or a directory. | ||
406 | */ | ||
407 | IHOLD(ip); | ||
408 | |||
409 | /* | ||
410 | * In the case of a regular file, take a reference on the parent inode | ||
411 | * as well to ensure it remains in-core. | ||
412 | */ | ||
413 | if (pip) | ||
414 | IHOLD(pip); | ||
415 | |||
416 | TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag), | ||
417 | ag, xfs_filestream_peek_ag(mp, ag)); | ||
418 | |||
419 | return 0; | 276 | return 0; |
420 | } | ||
421 | |||
422 | /* xfs_fstrm_free_func(): callback for freeing cached stream items. */ | ||
423 | STATIC void | ||
424 | xfs_fstrm_free_func( | ||
425 | unsigned long ino, | ||
426 | void *data) | ||
427 | { | ||
428 | fstrm_item_t *item = (fstrm_item_t *)data; | ||
429 | xfs_inode_t *ip = item->ip; | ||
430 | |||
431 | ASSERT(ip->i_ino == ino); | ||
432 | |||
433 | xfs_iflags_clear(ip, XFS_IFILESTREAM); | ||
434 | |||
435 | /* Drop the reference taken on the AG when the item was added. */ | ||
436 | xfs_filestream_put_ag(ip->i_mount, item->ag); | ||
437 | |||
438 | TRACE_FREE(ip->i_mount, ip, item->pip, item->ag, | ||
439 | xfs_filestream_peek_ag(ip->i_mount, item->ag)); | ||
440 | |||
441 | /* | ||
442 | * _xfs_filestream_update_ag() always takes a reference on the inode | ||
443 | * itself, whether it's a file or a directory. Release it here. | ||
444 | * This can result in the inode being freed and so we must | ||
445 | * not hold any inode locks when freeing filestreams objects | ||
446 | * otherwise we can deadlock here. | ||
447 | */ | ||
448 | IRELE(ip); | ||
449 | |||
450 | /* | ||
451 | * In the case of a regular file, _xfs_filestream_update_ag() also | ||
452 | * takes a ref on the parent inode to keep it in-core. Release that | ||
453 | * too. | ||
454 | */ | ||
455 | if (item->pip) | ||
456 | IRELE(item->pip); | ||
457 | |||
458 | /* Finally, free the memory allocated for the item. */ | ||
459 | kmem_zone_free(item_zone, item); | ||
460 | } | ||
461 | |||
462 | /* | ||
463 | * xfs_filestream_init() is called at xfs initialisation time to set up the | ||
464 | * memory zone that will be used for filestream data structure allocation. | ||
465 | */ | ||
466 | int | ||
467 | xfs_filestream_init(void) | ||
468 | { | ||
469 | item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); | ||
470 | if (!item_zone) | ||
471 | return -ENOMEM; | ||
472 | |||
473 | return 0; | ||
474 | } | ||
475 | |||
476 | /* | ||
477 | * xfs_filestream_uninit() is called at xfs termination time to destroy the | ||
478 | * memory zone that was used for filestream data structure allocation. | ||
479 | */ | ||
480 | void | ||
481 | xfs_filestream_uninit(void) | ||
482 | { | ||
483 | kmem_zone_destroy(item_zone); | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * xfs_filestream_mount() is called when a file system is mounted with the | ||
488 | * filestream option. It is responsible for allocating the data structures | ||
489 | * needed to track the new file system's file streams. | ||
490 | */ | ||
491 | int | ||
492 | xfs_filestream_mount( | ||
493 | xfs_mount_t *mp) | ||
494 | { | ||
495 | int err; | ||
496 | unsigned int lifetime, grp_count; | ||
497 | |||
498 | /* | ||
499 | * The filestream timer tunable is currently fixed within the range of | ||
500 | * one second to four minutes, with five seconds being the default. The | ||
501 | * group count is somewhat arbitrary, but it'd be nice to adhere to the | ||
502 | * timer tunable to within about 10 percent. This requires at least 10 | ||
503 | * groups. | ||
504 | */ | ||
505 | lifetime = xfs_fstrm_centisecs * 10; | ||
506 | grp_count = 10; | ||
507 | |||
508 | err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count, | ||
509 | xfs_fstrm_free_func); | ||
510 | 277 | ||
278 | out_free_item: | ||
279 | kmem_free(item); | ||
280 | out_put_ag: | ||
281 | xfs_filestream_put_ag(mp, *agp); | ||
511 | return err; | 282 | return err; |
512 | } | 283 | } |
513 | 284 | ||
514 | /* | 285 | static struct xfs_inode * |
515 | * xfs_filestream_unmount() is called when a file system that was mounted with | 286 | xfs_filestream_get_parent( |
516 | * the filestream option is unmounted. It drains the data structures created | 287 | struct xfs_inode *ip) |
517 | * to track the file system's file streams and frees all the memory that was | ||
518 | * allocated. | ||
519 | */ | ||
520 | void | ||
521 | xfs_filestream_unmount( | ||
522 | xfs_mount_t *mp) | ||
523 | { | 288 | { |
524 | xfs_mru_cache_destroy(mp->m_filestream); | 289 | struct inode *inode = VFS_I(ip), *dir = NULL; |
525 | } | 290 | struct dentry *dentry, *parent; |
526 | 291 | ||
527 | /* | 292 | dentry = d_find_alias(inode); |
528 | * Return the AG of the filestream the file or directory belongs to, or | 293 | if (!dentry) |
529 | * NULLAGNUMBER otherwise. | 294 | goto out; |
530 | */ | ||
531 | xfs_agnumber_t | ||
532 | xfs_filestream_lookup_ag( | ||
533 | xfs_inode_t *ip) | ||
534 | { | ||
535 | xfs_mru_cache_t *cache; | ||
536 | fstrm_item_t *item; | ||
537 | xfs_agnumber_t ag; | ||
538 | int ref; | ||
539 | |||
540 | if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) { | ||
541 | ASSERT(0); | ||
542 | return NULLAGNUMBER; | ||
543 | } | ||
544 | 295 | ||
545 | cache = ip->i_mount->m_filestream; | 296 | parent = dget_parent(dentry); |
546 | item = xfs_mru_cache_lookup(cache, ip->i_ino); | 297 | if (!parent) |
547 | if (!item) { | 298 | goto out_dput; |
548 | TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0); | ||
549 | return NULLAGNUMBER; | ||
550 | } | ||
551 | 299 | ||
552 | ASSERT(ip == item->ip); | 300 | dir = igrab(parent->d_inode); |
553 | ag = item->ag; | 301 | dput(parent); |
554 | ref = xfs_filestream_peek_ag(ip->i_mount, ag); | ||
555 | xfs_mru_cache_done(cache); | ||
556 | 302 | ||
557 | TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref); | 303 | out_dput: |
558 | return ag; | 304 | dput(dentry); |
305 | out: | ||
306 | return dir ? XFS_I(dir) : NULL; | ||
559 | } | 307 | } |
560 | 308 | ||
561 | /* | 309 | /* |
562 | * xfs_filestream_associate() should only be called to associate a regular file | 310 | * Find the right allocation group for a file, either by finding an |
563 | * with its parent directory. Calling it with a child directory isn't | 311 | * existing file stream or creating a new one. |
564 | * appropriate because filestreams don't apply to entire directory hierarchies. | ||
565 | * Creating a file in a child directory of an existing filestream directory | ||
566 | * starts a new filestream with its own allocation group association. | ||
567 | * | 312 | * |
568 | * Returns < 0 on error, 0 if successful association occurred, > 0 if | 313 | * Returns NULLAGNUMBER in case of an error. |
569 | * we failed to get an association because of locking issues. | ||
570 | */ | 314 | */ |
571 | int | 315 | xfs_agnumber_t |
572 | xfs_filestream_associate( | 316 | xfs_filestream_lookup_ag( |
573 | xfs_inode_t *pip, | 317 | struct xfs_inode *ip) |
574 | xfs_inode_t *ip) | ||
575 | { | 318 | { |
576 | xfs_mount_t *mp; | 319 | struct xfs_mount *mp = ip->i_mount; |
577 | xfs_mru_cache_t *cache; | 320 | struct xfs_inode *pip = NULL; |
578 | fstrm_item_t *item; | 321 | xfs_agnumber_t startag, ag = NULLAGNUMBER; |
579 | xfs_agnumber_t ag, rotorstep, startag; | 322 | struct xfs_mru_cache_elem *mru; |
580 | int err = 0; | ||
581 | 323 | ||
582 | ASSERT(S_ISDIR(pip->i_d.di_mode)); | ||
583 | ASSERT(S_ISREG(ip->i_d.di_mode)); | 324 | ASSERT(S_ISREG(ip->i_d.di_mode)); |
584 | if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode)) | ||
585 | return -EINVAL; | ||
586 | 325 | ||
587 | mp = pip->i_mount; | 326 | pip = xfs_filestream_get_parent(ip); |
588 | cache = mp->m_filestream; | 327 | if (!pip) |
328 | goto out; | ||
589 | 329 | ||
590 | /* | 330 | mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); |
591 | * We have a problem, Houston. | 331 | if (mru) { |
592 | * | 332 | ag = container_of(mru, struct xfs_fstrm_item, mru)->ag; |
593 | * Taking the iolock here violates inode locking order - we already | 333 | xfs_mru_cache_done(mp->m_filestream); |
594 | * hold the ilock. Hence if we block getting this lock we may never | ||
595 | * wake. Unfortunately, that means if we can't get the lock, we're | ||
596 | * screwed in terms of getting a stream association - we can't spin | ||
597 | * waiting for the lock because someone else is waiting on the lock we | ||
598 | * hold and we cannot drop that as we are in a transaction here. | ||
599 | * | ||
600 | * Lucky for us, this inversion is not a problem because it's a | ||
601 | * directory inode that we are trying to lock here. | ||
602 | * | ||
603 | * So, if we can't get the iolock without sleeping then just give up | ||
604 | */ | ||
605 | if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) | ||
606 | return 1; | ||
607 | |||
608 | /* If the parent directory is already in the cache, use its AG. */ | ||
609 | item = xfs_mru_cache_lookup(cache, pip->i_ino); | ||
610 | if (item) { | ||
611 | ASSERT(item->ip == pip); | ||
612 | ag = item->ag; | ||
613 | xfs_mru_cache_done(cache); | ||
614 | |||
615 | TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag)); | ||
616 | err = _xfs_filestream_update_ag(ip, pip, ag); | ||
617 | 334 | ||
618 | goto exit; | 335 | trace_xfs_filestream_lookup(ip, ag); |
336 | goto out; | ||
619 | } | 337 | } |
620 | 338 | ||
621 | /* | 339 | /* |
@@ -623,202 +341,94 @@ xfs_filestream_associate( | |||
623 | * use the directory inode's AG. | 341 | * use the directory inode's AG. |
624 | */ | 342 | */ |
625 | if (mp->m_flags & XFS_MOUNT_32BITINODES) { | 343 | if (mp->m_flags & XFS_MOUNT_32BITINODES) { |
626 | rotorstep = xfs_rotorstep; | 344 | xfs_agnumber_t rotorstep = xfs_rotorstep; |
627 | startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; | 345 | startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; |
628 | mp->m_agfrotor = (mp->m_agfrotor + 1) % | 346 | mp->m_agfrotor = (mp->m_agfrotor + 1) % |
629 | (mp->m_sb.sb_agcount * rotorstep); | 347 | (mp->m_sb.sb_agcount * rotorstep); |
630 | } else | 348 | } else |
631 | startag = XFS_INO_TO_AGNO(mp, pip->i_ino); | 349 | startag = XFS_INO_TO_AGNO(mp, pip->i_ino); |
632 | 350 | ||
633 | /* Pick a new AG for the parent inode starting at startag. */ | 351 | if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0)) |
634 | err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0); | 352 | ag = NULLAGNUMBER; |
635 | if (err || ag == NULLAGNUMBER) | 353 | out: |
636 | goto exit_did_pick; | 354 | IRELE(pip); |
637 | 355 | return ag; | |
638 | /* Associate the parent inode with the AG. */ | ||
639 | err = _xfs_filestream_update_ag(pip, NULL, ag); | ||
640 | if (err) | ||
641 | goto exit_did_pick; | ||
642 | |||
643 | /* Associate the file inode with the AG. */ | ||
644 | err = _xfs_filestream_update_ag(ip, pip, ag); | ||
645 | if (err) | ||
646 | goto exit_did_pick; | ||
647 | |||
648 | TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag)); | ||
649 | |||
650 | exit_did_pick: | ||
651 | /* | ||
652 | * If _xfs_filestream_pick_ag() returned a valid AG, remove the | ||
653 | * reference it took on it, since the file and directory will have taken | ||
654 | * their own now if they were successfully cached. | ||
655 | */ | ||
656 | if (ag != NULLAGNUMBER) | ||
657 | xfs_filestream_put_ag(mp, ag); | ||
658 | |||
659 | exit: | ||
660 | xfs_iunlock(pip, XFS_IOLOCK_EXCL); | ||
661 | return -err; | ||
662 | } | 356 | } |
663 | 357 | ||
664 | /* | 358 | /* |
665 | * Pick a new allocation group for the current file and its file stream. This | 359 | * Pick a new allocation group for the current file and its file stream. |
666 | * function is called by xfs_bmap_filestreams() with the mount point's per-ag | 360 | * |
667 | * lock held. | 361 | * This is called when the allocator can't find a suitable extent in the |
362 | * current AG, and we have to move the stream into a new AG with more space. | ||
668 | */ | 363 | */ |
669 | int | 364 | int |
670 | xfs_filestream_new_ag( | 365 | xfs_filestream_new_ag( |
671 | struct xfs_bmalloca *ap, | 366 | struct xfs_bmalloca *ap, |
672 | xfs_agnumber_t *agp) | 367 | xfs_agnumber_t *agp) |
673 | { | 368 | { |
674 | int flags, err; | 369 | struct xfs_inode *ip = ap->ip, *pip; |
675 | xfs_inode_t *ip, *pip = NULL; | 370 | struct xfs_mount *mp = ip->i_mount; |
676 | xfs_mount_t *mp; | 371 | xfs_extlen_t minlen = ap->length; |
677 | xfs_mru_cache_t *cache; | 372 | xfs_agnumber_t startag = 0; |
678 | xfs_extlen_t minlen; | 373 | int flags, err = 0; |
679 | fstrm_item_t *dir, *file; | 374 | struct xfs_mru_cache_elem *mru; |
680 | xfs_agnumber_t ag = NULLAGNUMBER; | ||
681 | |||
682 | ip = ap->ip; | ||
683 | mp = ip->i_mount; | ||
684 | cache = mp->m_filestream; | ||
685 | minlen = ap->length; | ||
686 | *agp = NULLAGNUMBER; | ||
687 | 375 | ||
688 | /* | 376 | *agp = NULLAGNUMBER; |
689 | * Look for the file in the cache, removing it if it's found. Doing | ||
690 | * this allows it to be held across the dir lookup that follows. | ||
691 | */ | ||
692 | file = xfs_mru_cache_remove(cache, ip->i_ino); | ||
693 | if (file) { | ||
694 | ASSERT(ip == file->ip); | ||
695 | |||
696 | /* Save the file's parent inode and old AG number for later. */ | ||
697 | pip = file->pip; | ||
698 | ag = file->ag; | ||
699 | |||
700 | /* Look for the file's directory in the cache. */ | ||
701 | dir = xfs_mru_cache_lookup(cache, pip->i_ino); | ||
702 | if (dir) { | ||
703 | ASSERT(pip == dir->ip); | ||
704 | |||
705 | /* | ||
706 | * If the directory has already moved on to a new AG, | ||
707 | * use that AG as the new AG for the file. Don't | ||
708 | * forget to twiddle the AG refcounts to match the | ||
709 | * movement. | ||
710 | */ | ||
711 | if (dir->ag != file->ag) { | ||
712 | xfs_filestream_put_ag(mp, file->ag); | ||
713 | xfs_filestream_get_ag(mp, dir->ag); | ||
714 | *agp = file->ag = dir->ag; | ||
715 | } | ||
716 | |||
717 | xfs_mru_cache_done(cache); | ||
718 | } | ||
719 | 377 | ||
720 | /* | 378 | pip = xfs_filestream_get_parent(ip); |
721 | * Put the file back in the cache. If this fails, the free | 379 | if (!pip) |
722 | * function needs to be called to tidy up in the same way as if | 380 | goto exit; |
723 | * the item had simply expired from the cache. | ||
724 | */ | ||
725 | err = xfs_mru_cache_insert(cache, ip->i_ino, file); | ||
726 | if (err) { | ||
727 | xfs_fstrm_free_func(ip->i_ino, file); | ||
728 | return err; | ||
729 | } | ||
730 | 381 | ||
731 | /* | 382 | mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino); |
732 | * If the file's AG was moved to the directory's new AG, there's | 383 | if (mru) { |
733 | * nothing more to be done. | 384 | struct xfs_fstrm_item *item = |
734 | */ | 385 | container_of(mru, struct xfs_fstrm_item, mru); |
735 | if (*agp != NULLAGNUMBER) { | 386 | startag = (item->ag + 1) % mp->m_sb.sb_agcount; |
736 | TRACE_MOVEAG(mp, ip, pip, | ||
737 | ag, xfs_filestream_peek_ag(mp, ag), | ||
738 | *agp, xfs_filestream_peek_ag(mp, *agp)); | ||
739 | return 0; | ||
740 | } | ||
741 | } | 387 | } |
742 | 388 | ||
743 | /* | ||
744 | * If the file's parent directory is known, take its iolock in exclusive | ||
745 | * mode to prevent two sibling files from racing each other to migrate | ||
746 | * themselves and their parent to different AGs. | ||
747 | * | ||
748 | * Note that we lock the parent directory iolock inside the child | ||
749 | * iolock here. That's fine as we never hold both parent and child | ||
750 | * iolock in any other place. This is different from the ilock, | ||
751 | * which requires locking of the child after the parent for namespace | ||
752 | * operations. | ||
753 | */ | ||
754 | if (pip) | ||
755 | xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); | ||
756 | |||
757 | /* | ||
758 | * A new AG needs to be found for the file. If the file's parent | ||
759 | * directory is also known, it will be moved to the new AG as well to | ||
760 | * ensure that files created inside it in future use the new AG. | ||
761 | */ | ||
762 | ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; | ||
763 | flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | | 389 | flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | |
764 | (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); | 390 | (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); |
765 | 391 | ||
766 | err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); | 392 | err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); |
767 | if (err || *agp == NULLAGNUMBER) | ||
768 | goto exit; | ||
769 | 393 | ||
770 | /* | 394 | /* |
771 | * If the file wasn't found in the file cache, then its parent directory | 395 | * Only free the item here so we skip over the old AG earlier. |
772 | * inode isn't known. For this to have happened, the file must either | ||
773 | * be pre-existing, or it was created long enough ago that its cache | ||
774 | * entry has expired. This isn't the sort of usage that the filestreams | ||
775 | * allocator is trying to optimise, so there's no point trying to track | ||
776 | * its new AG somehow in the filestream data structures. | ||
777 | */ | 396 | */ |
778 | if (!pip) { | 397 | if (mru) |
779 | TRACE_ORPHAN(mp, ip, *agp); | 398 | xfs_fstrm_free_func(mru); |
780 | goto exit; | ||
781 | } | ||
782 | |||
783 | /* Associate the parent inode with the AG. */ | ||
784 | err = _xfs_filestream_update_ag(pip, NULL, *agp); | ||
785 | if (err) | ||
786 | goto exit; | ||
787 | |||
788 | /* Associate the file inode with the AG. */ | ||
789 | err = _xfs_filestream_update_ag(ip, pip, *agp); | ||
790 | if (err) | ||
791 | goto exit; | ||
792 | |||
793 | TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0, | ||
794 | *agp, xfs_filestream_peek_ag(mp, *agp)); | ||
795 | 399 | ||
400 | IRELE(pip); | ||
796 | exit: | 401 | exit: |
797 | /* | 402 | if (*agp == NULLAGNUMBER) |
798 | * If _xfs_filestream_pick_ag() returned a valid AG, remove the | ||
799 | * reference it took on it, since the file and directory will have taken | ||
800 | * their own now if they were successfully cached. | ||
801 | */ | ||
802 | if (*agp != NULLAGNUMBER) | ||
803 | xfs_filestream_put_ag(mp, *agp); | ||
804 | else | ||
805 | *agp = 0; | 403 | *agp = 0; |
806 | |||
807 | if (pip) | ||
808 | xfs_iunlock(pip, XFS_IOLOCK_EXCL); | ||
809 | |||
810 | return err; | 404 | return err; |
811 | } | 405 | } |
812 | 406 | ||
813 | /* | ||
814 | * Remove an association between an inode and a filestream object. | ||
815 | * Typically this is done on last close of an unlinked file. | ||
816 | */ | ||
817 | void | 407 | void |
818 | xfs_filestream_deassociate( | 408 | xfs_filestream_deassociate( |
819 | xfs_inode_t *ip) | 409 | struct xfs_inode *ip) |
820 | { | 410 | { |
821 | xfs_mru_cache_t *cache = ip->i_mount->m_filestream; | 411 | xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino); |
412 | } | ||
413 | |||
414 | int | ||
415 | xfs_filestream_mount( | ||
416 | xfs_mount_t *mp) | ||
417 | { | ||
418 | /* | ||
419 | * The filestream timer tunable is currently fixed within the range of | ||
420 | * one second to four minutes, with five seconds being the default. The | ||
421 | * group count is somewhat arbitrary, but it'd be nice to adhere to the | ||
422 | * timer tunable to within about 10 percent. This requires at least 10 | ||
423 | * groups. | ||
424 | */ | ||
425 | return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10, | ||
426 | 10, xfs_fstrm_free_func); | ||
427 | } | ||
822 | 428 | ||
823 | xfs_mru_cache_delete(cache, ip->i_ino); | 429 | void |
430 | xfs_filestream_unmount( | ||
431 | xfs_mount_t *mp) | ||
432 | { | ||
433 | xfs_mru_cache_destroy(mp->m_filestream); | ||
824 | } | 434 | } |
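The reworked cache above hinges on one pattern: the per-stream item embeds a struct xfs_mru_cache_elem, the item is inserted via &item->mru, and lookups hand back only the element, which the caller turns back into the item with container_of() (see the container_of(mru, struct xfs_fstrm_item, mru) call in xfs_filestream_lookup_ag). The point is that the MRU cache no longer has to carry an opaque data pointer; each user owns its own allocation. A minimal, runnable illustration of that embedded-element pattern, using stand-in types rather than the real xfs_fstrm_item layout (which is not shown in this hunk):

	/* stand-in userspace sketch; the real types live in fs/xfs */
	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct mru_cache_elem {			/* stand-in for struct xfs_mru_cache_elem */
		unsigned long		key;	/* e.g. the inode number */
	};

	struct fstrm_item {			/* stand-in for struct xfs_fstrm_item */
		struct mru_cache_elem	mru;	/* embedded cache element */
		unsigned int		ag;	/* cached allocation group */
	};

	int main(void)
	{
		struct fstrm_item item = { .mru = { .key = 42 }, .ag = 7 };

		/* a cache lookup returns only the embedded element ... */
		struct mru_cache_elem *mru = &item.mru;

		/* ... and the caller recovers the containing item by address
		 * arithmetic, as the patch does on every lookup/remove */
		struct fstrm_item *found = container_of(mru, struct fstrm_item, mru);

		printf("ag = %u\n", found->ag);	/* prints: ag = 7 */
		return 0;
	}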
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 6d61dbee8564..2ef43406e53b 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h | |||
@@ -20,50 +20,20 @@ | |||
20 | 20 | ||
21 | struct xfs_mount; | 21 | struct xfs_mount; |
22 | struct xfs_inode; | 22 | struct xfs_inode; |
23 | struct xfs_perag; | ||
24 | struct xfs_bmalloca; | 23 | struct xfs_bmalloca; |
25 | 24 | ||
26 | #ifdef XFS_FILESTREAMS_TRACE | ||
27 | #define XFS_FSTRM_KTRACE_INFO 1 | ||
28 | #define XFS_FSTRM_KTRACE_AGSCAN 2 | ||
29 | #define XFS_FSTRM_KTRACE_AGPICK1 3 | ||
30 | #define XFS_FSTRM_KTRACE_AGPICK2 4 | ||
31 | #define XFS_FSTRM_KTRACE_UPDATE 5 | ||
32 | #define XFS_FSTRM_KTRACE_FREE 6 | ||
33 | #define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7 | ||
34 | #define XFS_FSTRM_KTRACE_ASSOCIATE 8 | ||
35 | #define XFS_FSTRM_KTRACE_MOVEAG 9 | ||
36 | #define XFS_FSTRM_KTRACE_ORPHAN 10 | ||
37 | |||
38 | #define XFS_FSTRM_KTRACE_SIZE 16384 | ||
39 | extern ktrace_t *xfs_filestreams_trace_buf; | ||
40 | |||
41 | #endif | ||
42 | |||
43 | /* allocation selection flags */ | ||
44 | typedef enum xfs_fstrm_alloc { | ||
45 | XFS_PICK_USERDATA = 1, | ||
46 | XFS_PICK_LOWSPACE = 2, | ||
47 | } xfs_fstrm_alloc_t; | ||
48 | |||
49 | /* prototypes for filestream.c */ | ||
50 | int xfs_filestream_init(void); | ||
51 | void xfs_filestream_uninit(void); | ||
52 | int xfs_filestream_mount(struct xfs_mount *mp); | 25 | int xfs_filestream_mount(struct xfs_mount *mp); |
53 | void xfs_filestream_unmount(struct xfs_mount *mp); | 26 | void xfs_filestream_unmount(struct xfs_mount *mp); |
54 | xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); | ||
55 | int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); | ||
56 | void xfs_filestream_deassociate(struct xfs_inode *ip); | 27 | void xfs_filestream_deassociate(struct xfs_inode *ip); |
28 | xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); | ||
57 | int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); | 29 | int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); |
30 | int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno); | ||
58 | 31 | ||
59 | |||
60 | /* filestreams for the inode? */ | ||
61 | static inline int | 32 | static inline int |
62 | xfs_inode_is_filestream( | 33 | xfs_inode_is_filestream( |
63 | struct xfs_inode *ip) | 34 | struct xfs_inode *ip) |
64 | { | 35 | { |
65 | return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || | 36 | return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || |
66 | xfs_iflags_test(ip, XFS_IFILESTREAM) || | ||
67 | (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); | 37 | (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); |
68 | } | 38 | } |
69 | 39 | ||
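With xfs_filestream_associate() gone from the header, the remaining entry points are the lookup/new-AG pair above, and callers gate them on xfs_inode_is_filestream(). A call site in the block allocation path therefore looks roughly like the fragment below; this is a schematic sketch following the xfs_bmalloca naming used elsewhere in this diff, not a quote of the actual allocator code:

	if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
		ag = xfs_filestream_lookup_ag(ap->ip);
		if (ag == NULLAGNUMBER)
			ag = 0;		/* no stream cached yet, fall back to AG 0 */
	}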
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h index 9898f31d05d8..34d85aca3058 100644 --- a/fs/xfs/xfs_format.h +++ b/fs/xfs/xfs_format.h | |||
@@ -202,6 +202,8 @@ typedef __be32 xfs_alloc_ptr_t; | |||
202 | */ | 202 | */ |
203 | #define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ | 203 | #define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ |
204 | #define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */ | 204 | #define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */ |
205 | #define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */ | ||
206 | #define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */ | ||
205 | 207 | ||
206 | typedef __uint64_t xfs_inofree_t; | 208 | typedef __uint64_t xfs_inofree_t; |
207 | #define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) | 209 | #define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) |
@@ -244,7 +246,17 @@ typedef __be32 xfs_inobt_ptr_t; | |||
244 | * block numbers in the AG. | 246 | * block numbers in the AG. |
245 | */ | 247 | */ |
246 | #define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) | 248 | #define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) |
247 | #define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) | 249 | #define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) |
250 | |||
251 | /* | ||
252 | * The first data block of an AG depends on whether the filesystem was formatted | ||
253 | * with the finobt feature. If so, account for the finobt reserved root btree | ||
254 | * block. | ||
255 | */ | ||
256 | #define XFS_PREALLOC_BLOCKS(mp) \ | ||
257 | (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \ | ||
258 | XFS_FIBT_BLOCK(mp) + 1 : \ | ||
259 | XFS_IBT_BLOCK(mp) + 1) | ||
248 | 260 | ||
249 | 261 | ||
250 | 262 | ||
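Working the new macro through: XFS_FIBT_BLOCK(mp) is defined as XFS_IBT_BLOCK(mp) + 1, so a finobt filesystem reserves exactly one extra block per AG and XFS_PREALLOC_BLOCKS(mp) moves from XFS_IBT_BLOCK(mp) + 1 to XFS_FIBT_BLOCK(mp) + 1, i.e. XFS_IBT_BLOCK(mp) + 2. That extra block is the free inode btree root that xfs_growfs_data_private() initialises further down in this diff.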
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index c5fc116dfaa3..d34703dbcb42 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h | |||
@@ -238,6 +238,7 @@ typedef struct xfs_fsop_resblks { | |||
238 | #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ | 238 | #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ |
239 | #define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ | 239 | #define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ |
240 | #define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ | 240 | #define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ |
241 | #define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ | ||
241 | 242 | ||
242 | /* | 243 | /* |
243 | * Minimum and maximum sizes need for growth checks. | 244 | * Minimum and maximum sizes need for growth checks. |
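The new geometry flag gives userspace a cheap way to probe for the feature. A minimal sketch, assuming the xfsprogs <xfs/xfs.h> header provides struct xfs_fsop_geom and XFS_IOC_FSGEOMETRY; the flag value is defined locally from the hunk above in case the installed header predates this change (this program is illustrative and not part of the patch):

	#include <xfs/xfs.h>
	#include <sys/ioctl.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	#ifndef XFS_FSOP_GEOM_FLAGS_FINOBT
	#define XFS_FSOP_GEOM_FLAGS_FINOBT	0x20000	/* value from the hunk above */
	#endif

	int main(int argc, char **argv)
	{
		struct xfs_fsop_geom	geo = { 0 };
		int			fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

		if (fd < 0 || ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0) {
			perror("XFS_IOC_FSGEOMETRY");
			return 1;
		}
		printf("free inode btree: %s\n",
		       (geo.flags & XFS_FSOP_GEOM_FLAGS_FINOBT) ? "yes" : "no");
		close(fd);
		return 0;
	}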
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 02fb943cbf22..3445ead7c1fc 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c | |||
@@ -104,7 +104,9 @@ xfs_fs_geometry( | |||
104 | (xfs_sb_version_hascrc(&mp->m_sb) ? | 104 | (xfs_sb_version_hascrc(&mp->m_sb) ? |
105 | XFS_FSOP_GEOM_FLAGS_V5SB : 0) | | 105 | XFS_FSOP_GEOM_FLAGS_V5SB : 0) | |
106 | (xfs_sb_version_hasftype(&mp->m_sb) ? | 106 | (xfs_sb_version_hasftype(&mp->m_sb) ? |
107 | XFS_FSOP_GEOM_FLAGS_FTYPE : 0); | 107 | XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | |
108 | (xfs_sb_version_hasfinobt(&mp->m_sb) ? | ||
109 | XFS_FSOP_GEOM_FLAGS_FINOBT : 0); | ||
108 | geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? | 110 | geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? |
109 | mp->m_sb.sb_logsectsize : BBSIZE; | 111 | mp->m_sb.sb_logsectsize : BBSIZE; |
110 | geo->rtsectsize = mp->m_sb.sb_blocksize; | 112 | geo->rtsectsize = mp->m_sb.sb_blocksize; |
@@ -316,6 +318,10 @@ xfs_growfs_data_private( | |||
316 | agi->agi_dirino = cpu_to_be32(NULLAGINO); | 318 | agi->agi_dirino = cpu_to_be32(NULLAGINO); |
317 | if (xfs_sb_version_hascrc(&mp->m_sb)) | 319 | if (xfs_sb_version_hascrc(&mp->m_sb)) |
318 | uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid); | 320 | uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid); |
321 | if (xfs_sb_version_hasfinobt(&mp->m_sb)) { | ||
322 | agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); | ||
323 | agi->agi_free_level = cpu_to_be32(1); | ||
324 | } | ||
319 | for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) | 325 | for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) |
320 | agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); | 326 | agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); |
321 | 327 | ||
@@ -407,6 +413,34 @@ xfs_growfs_data_private( | |||
407 | xfs_buf_relse(bp); | 413 | xfs_buf_relse(bp); |
408 | if (error) | 414 | if (error) |
409 | goto error0; | 415 | goto error0; |
416 | |||
417 | /* | ||
418 | * FINO btree root block | ||
419 | */ | ||
420 | if (xfs_sb_version_hasfinobt(&mp->m_sb)) { | ||
421 | bp = xfs_growfs_get_hdr_buf(mp, | ||
422 | XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)), | ||
423 | BTOBB(mp->m_sb.sb_blocksize), 0, | ||
424 | &xfs_inobt_buf_ops); | ||
425 | if (!bp) { | ||
426 | error = ENOMEM; | ||
427 | goto error0; | ||
428 | } | ||
429 | |||
430 | if (xfs_sb_version_hascrc(&mp->m_sb)) | ||
431 | xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC, | ||
432 | 0, 0, agno, | ||
433 | XFS_BTREE_CRC_BLOCKS); | ||
434 | else | ||
435 | xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0, | ||
436 | 0, agno, 0); | ||
437 | |||
438 | error = xfs_bwrite(bp); | ||
439 | xfs_buf_relse(bp); | ||
440 | if (error) | ||
441 | goto error0; | ||
442 | } | ||
443 | |||
410 | } | 444 | } |
411 | xfs_trans_agblocks_delta(tp, nfree); | 445 | xfs_trans_agblocks_delta(tp, nfree); |
412 | /* | 446 | /* |
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 8f711db61a0c..6ac0c2986c32 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c | |||
@@ -112,6 +112,66 @@ xfs_inobt_get_rec( | |||
112 | } | 112 | } |
113 | 113 | ||
114 | /* | 114 | /* |
115 | * Insert a single inobt record. Cursor must already point to desired location. | ||
116 | */ | ||
117 | STATIC int | ||
118 | xfs_inobt_insert_rec( | ||
119 | struct xfs_btree_cur *cur, | ||
120 | __int32_t freecount, | ||
121 | xfs_inofree_t free, | ||
122 | int *stat) | ||
123 | { | ||
124 | cur->bc_rec.i.ir_freecount = freecount; | ||
125 | cur->bc_rec.i.ir_free = free; | ||
126 | return xfs_btree_insert(cur, stat); | ||
127 | } | ||
128 | |||
129 | /* | ||
130 | * Insert records describing a newly allocated inode chunk into the inobt. | ||
131 | */ | ||
132 | STATIC int | ||
133 | xfs_inobt_insert( | ||
134 | struct xfs_mount *mp, | ||
135 | struct xfs_trans *tp, | ||
136 | struct xfs_buf *agbp, | ||
137 | xfs_agino_t newino, | ||
138 | xfs_agino_t newlen, | ||
139 | xfs_btnum_t btnum) | ||
140 | { | ||
141 | struct xfs_btree_cur *cur; | ||
142 | struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); | ||
143 | xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); | ||
144 | xfs_agino_t thisino; | ||
145 | int i; | ||
146 | int error; | ||
147 | |||
148 | cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); | ||
149 | |||
150 | for (thisino = newino; | ||
151 | thisino < newino + newlen; | ||
152 | thisino += XFS_INODES_PER_CHUNK) { | ||
153 | error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i); | ||
154 | if (error) { | ||
155 | xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); | ||
156 | return error; | ||
157 | } | ||
158 | ASSERT(i == 0); | ||
159 | |||
160 | error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, | ||
161 | XFS_INOBT_ALL_FREE, &i); | ||
162 | if (error) { | ||
163 | xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); | ||
164 | return error; | ||
165 | } | ||
166 | ASSERT(i == 1); | ||
167 | } | ||
168 | |||
169 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | ||
170 | |||
171 | return 0; | ||
172 | } | ||
173 | |||
174 | /* | ||
115 | * Verify that the number of free inodes in the AGI is correct. | 175 | * Verify that the number of free inodes in the AGI is correct. |
116 | */ | 176 | */ |
117 | #ifdef DEBUG | 177 | #ifdef DEBUG |
@@ -303,13 +363,10 @@ xfs_ialloc_ag_alloc( | |||
303 | { | 363 | { |
304 | xfs_agi_t *agi; /* allocation group header */ | 364 | xfs_agi_t *agi; /* allocation group header */ |
305 | xfs_alloc_arg_t args; /* allocation argument structure */ | 365 | xfs_alloc_arg_t args; /* allocation argument structure */ |
306 | xfs_btree_cur_t *cur; /* inode btree cursor */ | ||
307 | xfs_agnumber_t agno; | 366 | xfs_agnumber_t agno; |
308 | int error; | 367 | int error; |
309 | int i; | ||
310 | xfs_agino_t newino; /* new first inode's number */ | 368 | xfs_agino_t newino; /* new first inode's number */ |
311 | xfs_agino_t newlen; /* new number of inodes */ | 369 | xfs_agino_t newlen; /* new number of inodes */ |
312 | xfs_agino_t thisino; /* current inode number, for loop */ | ||
313 | int isaligned = 0; /* inode allocation at stripe unit */ | 370 | int isaligned = 0; /* inode allocation at stripe unit */ |
314 | /* boundary */ | 371 | /* boundary */ |
315 | struct xfs_perag *pag; | 372 | struct xfs_perag *pag; |
@@ -459,29 +516,19 @@ xfs_ialloc_ag_alloc( | |||
459 | agi->agi_newino = cpu_to_be32(newino); | 516 | agi->agi_newino = cpu_to_be32(newino); |
460 | 517 | ||
461 | /* | 518 | /* |
462 | * Insert records describing the new inode chunk into the btree. | 519 | * Insert records describing the new inode chunk into the btrees. |
463 | */ | 520 | */ |
464 | cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno); | 521 | error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, |
465 | for (thisino = newino; | 522 | XFS_BTNUM_INO); |
466 | thisino < newino + newlen; | 523 | if (error) |
467 | thisino += XFS_INODES_PER_CHUNK) { | 524 | return error; |
468 | cur->bc_rec.i.ir_startino = thisino; | 525 | |
469 | cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK; | 526 | if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { |
470 | cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE; | 527 | error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, |
471 | error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i); | 528 | XFS_BTNUM_FINO); |
472 | if (error) { | 529 | if (error) |
473 | xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); | ||
474 | return error; | ||
475 | } | ||
476 | ASSERT(i == 0); | ||
477 | error = xfs_btree_insert(cur, &i); | ||
478 | if (error) { | ||
479 | xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); | ||
480 | return error; | 530 | return error; |
481 | } | ||
482 | ASSERT(i == 1); | ||
483 | } | 531 | } |
484 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | ||
485 | /* | 532 | /* |
486 | * Log allocation group header fields | 533 | * Log allocation group header fields |
487 | */ | 534 | */ |
@@ -675,13 +722,10 @@ xfs_ialloc_get_rec( | |||
675 | } | 722 | } |
676 | 723 | ||
677 | /* | 724 | /* |
678 | * Allocate an inode. | 725 | * Allocate an inode using the inobt-only algorithm. |
679 | * | ||
680 | * The caller selected an AG for us, and made sure that free inodes are | ||
681 | * available. | ||
682 | */ | 726 | */ |
683 | STATIC int | 727 | STATIC int |
684 | xfs_dialloc_ag( | 728 | xfs_dialloc_ag_inobt( |
685 | struct xfs_trans *tp, | 729 | struct xfs_trans *tp, |
686 | struct xfs_buf *agbp, | 730 | struct xfs_buf *agbp, |
687 | xfs_ino_t parent, | 731 | xfs_ino_t parent, |
@@ -707,7 +751,7 @@ xfs_dialloc_ag( | |||
707 | ASSERT(pag->pagi_freecount > 0); | 751 | ASSERT(pag->pagi_freecount > 0); |
708 | 752 | ||
709 | restart_pagno: | 753 | restart_pagno: |
710 | cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); | 754 | cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); |
711 | /* | 755 | /* |
712 | * If pagino is 0 (this is the root inode allocation) use newino. | 756 | * If pagino is 0 (this is the root inode allocation) use newino. |
713 | * This must work because we've just allocated some. | 757 | * This must work because we've just allocated some. |
@@ -940,6 +984,294 @@ error0: | |||
940 | } | 984 | } |
941 | 985 | ||
942 | /* | 986 | /* |
987 | * Use the free inode btree to allocate an inode based on distance from the | ||
988 | * parent. Note that the provided cursor may be deleted and replaced. | ||
989 | */ | ||
990 | STATIC int | ||
991 | xfs_dialloc_ag_finobt_near( | ||
992 | xfs_agino_t pagino, | ||
993 | struct xfs_btree_cur **ocur, | ||
994 | struct xfs_inobt_rec_incore *rec) | ||
995 | { | ||
996 | struct xfs_btree_cur *lcur = *ocur; /* left search cursor */ | ||
997 | struct xfs_btree_cur *rcur; /* right search cursor */ | ||
998 | struct xfs_inobt_rec_incore rrec; | ||
999 | int error; | ||
1000 | int i, j; | ||
1001 | |||
1002 | error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i); | ||
1003 | if (error) | ||
1004 | return error; | ||
1005 | |||
1006 | if (i == 1) { | ||
1007 | error = xfs_inobt_get_rec(lcur, rec, &i); | ||
1008 | if (error) | ||
1009 | return error; | ||
1010 | XFS_WANT_CORRUPTED_RETURN(i == 1); | ||
1011 | |||
1012 | /* | ||
1013 | * See if we've landed in the parent inode record. The finobt | ||
1014 | * only tracks chunks with at least one free inode, so record | ||
1015 | * existence is enough. | ||
1016 | */ | ||
1017 | if (pagino >= rec->ir_startino && | ||
1018 | pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK)) | ||
1019 | return 0; | ||
1020 | } | ||
1021 | |||
1022 | error = xfs_btree_dup_cursor(lcur, &rcur); | ||
1023 | if (error) | ||
1024 | return error; | ||
1025 | |||
1026 | error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j); | ||
1027 | if (error) | ||
1028 | goto error_rcur; | ||
1029 | if (j == 1) { | ||
1030 | error = xfs_inobt_get_rec(rcur, &rrec, &j); | ||
1031 | if (error) | ||
1032 | goto error_rcur; | ||
1033 | XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur); | ||
1034 | } | ||
1035 | |||
1036 | XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur); | ||
1037 | if (i == 1 && j == 1) { | ||
1038 | /* | ||
1039 | * Both the left and right records are valid. Choose the closer | ||
1040 | * inode chunk to the target. | ||
1041 | */ | ||
1042 | if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) > | ||
1043 | (rrec.ir_startino - pagino)) { | ||
1044 | *rec = rrec; | ||
1045 | xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); | ||
1046 | *ocur = rcur; | ||
1047 | } else { | ||
1048 | xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); | ||
1049 | } | ||
1050 | } else if (j == 1) { | ||
1051 | /* only the right record is valid */ | ||
1052 | *rec = rrec; | ||
1053 | xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); | ||
1054 | *ocur = rcur; | ||
1055 | } else if (i == 1) { | ||
1056 | /* only the left record is valid */ | ||
1057 | xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); | ||
1058 | } | ||
1059 | |||
1060 | return 0; | ||
1061 | |||
1062 | error_rcur: | ||
1063 | xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR); | ||
1064 | return error; | ||
1065 | } | ||
1066 | |||
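To put numbers on the comparison above (XFS_INODES_PER_CHUNK is 64, per the xfs_format.h hunk earlier in this diff): suppose the parent sits at pagino = 200, the chunk to its left starts at ir_startino = 128 and the chunk to its right at rrec.ir_startino = 256. The left-hand measure is 200 - 128 + 64 - 1 = 135 and the right-hand measure is 256 - 200 = 56; 135 > 56, so the search keeps the right chunk and swaps the cursor to rcur. Had the right record instead started at 512, the right-hand measure would be 312 and the left chunk would be kept.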
1067 | /* | ||
1068 | * Use the free inode btree to find a free inode based on a newino hint. If | ||
1069 | * the hint is NULL, find the first free inode in the AG. | ||
1070 | */ | ||
1071 | STATIC int | ||
1072 | xfs_dialloc_ag_finobt_newino( | ||
1073 | struct xfs_agi *agi, | ||
1074 | struct xfs_btree_cur *cur, | ||
1075 | struct xfs_inobt_rec_incore *rec) | ||
1076 | { | ||
1077 | int error; | ||
1078 | int i; | ||
1079 | |||
1080 | if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { | ||
1081 | error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ, | ||
1082 | &i); | ||
1083 | if (error) | ||
1084 | return error; | ||
1085 | if (i == 1) { | ||
1086 | error = xfs_inobt_get_rec(cur, rec, &i); | ||
1087 | if (error) | ||
1088 | return error; | ||
1089 | XFS_WANT_CORRUPTED_RETURN(i == 1); | ||
1090 | |||
1091 | return 0; | ||
1092 | } | ||
1093 | } | ||
1094 | |||
1095 | /* | ||
1096 | * Find the first inode available in the AG. | ||
1097 | */ | ||
1098 | error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); | ||
1099 | if (error) | ||
1100 | return error; | ||
1101 | XFS_WANT_CORRUPTED_RETURN(i == 1); | ||
1102 | |||
1103 | error = xfs_inobt_get_rec(cur, rec, &i); | ||
1104 | if (error) | ||
1105 | return error; | ||
1106 | XFS_WANT_CORRUPTED_RETURN(i == 1); | ||
1107 | |||
1108 | return 0; | ||
1109 | } | ||
1110 | |||
1111 | /* | ||
1112 | * Update the inobt based on a modification made to the finobt. Also ensure that | ||
1113 | * the records from both trees are equivalent post-modification. | ||
1114 | */ | ||
1115 | STATIC int | ||
1116 | xfs_dialloc_ag_update_inobt( | ||
1117 | struct xfs_btree_cur *cur, /* inobt cursor */ | ||
1118 | struct xfs_inobt_rec_incore *frec, /* finobt record */ | ||
1119 | int offset) /* inode offset */ | ||
1120 | { | ||
1121 | struct xfs_inobt_rec_incore rec; | ||
1122 | int error; | ||
1123 | int i; | ||
1124 | |||
1125 | error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); | ||
1126 | if (error) | ||
1127 | return error; | ||
1128 | XFS_WANT_CORRUPTED_RETURN(i == 1); | ||
1129 | |||
1130 | error = xfs_inobt_get_rec(cur, &rec, &i); | ||
1131 | if (error) | ||
1132 | return error; | ||
1133 | XFS_WANT_CORRUPTED_RETURN(i == 1); | ||
1134 | ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % | ||
1135 | XFS_INODES_PER_CHUNK) == 0); | ||
1136 | |||
1137 | rec.ir_free &= ~XFS_INOBT_MASK(offset); | ||
1138 | rec.ir_freecount--; | ||
1139 | |||
1140 | XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && | ||
1141 | (rec.ir_freecount == frec->ir_freecount)); | ||
1142 | |||
1143 | error = xfs_inobt_update(cur, &rec); | ||
1144 | if (error) | ||
1145 | return error; | ||
1146 | |||
1147 | return 0; | ||
1148 | } | ||
1149 | |||
1150 | /* | ||
1151 | * Allocate an inode using the free inode btree, if available. Otherwise, fall | ||
1152 | * back to the inobt search algorithm. | ||
1153 | * | ||
1154 | * The caller selected an AG for us, and made sure that free inodes are | ||
1155 | * available. | ||
1156 | */ | ||
1157 | STATIC int | ||
1158 | xfs_dialloc_ag( | ||
1159 | struct xfs_trans *tp, | ||
1160 | struct xfs_buf *agbp, | ||
1161 | xfs_ino_t parent, | ||
1162 | xfs_ino_t *inop) | ||
1163 | { | ||
1164 | struct xfs_mount *mp = tp->t_mountp; | ||
1165 | struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); | ||
1166 | xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); | ||
1167 | xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); | ||
1168 | xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); | ||
1169 | struct xfs_perag *pag; | ||
1170 | struct xfs_btree_cur *cur; /* finobt cursor */ | ||
1171 | struct xfs_btree_cur *icur; /* inobt cursor */ | ||
1172 | struct xfs_inobt_rec_incore rec; | ||
1173 | xfs_ino_t ino; | ||
1174 | int error; | ||
1175 | int offset; | ||
1176 | int i; | ||
1177 | |||
1178 | if (!xfs_sb_version_hasfinobt(&mp->m_sb)) | ||
1179 | return xfs_dialloc_ag_inobt(tp, agbp, parent, inop); | ||
1180 | |||
1181 | pag = xfs_perag_get(mp, agno); | ||
1182 | |||
1183 | /* | ||
1184 | * If pagino is 0 (this is the root inode allocation) use newino. | ||
1185 | * This must work because we've just allocated some. | ||
1186 | */ | ||
1187 | if (!pagino) | ||
1188 | pagino = be32_to_cpu(agi->agi_newino); | ||
1189 | |||
1190 | cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); | ||
1191 | |||
1192 | error = xfs_check_agi_freecount(cur, agi); | ||
1193 | if (error) | ||
1194 | goto error_cur; | ||
1195 | |||
1196 | /* | ||
1197 | * The search algorithm depends on whether we're in the same AG as the | ||
1198 | * parent. If so, find the closest available inode to the parent. If | ||
1199 | * not, consider the agi hint or find the first free inode in the AG. | ||
1200 | */ | ||
1201 | if (agno == pagno) | ||
1202 | error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); | ||
1203 | else | ||
1204 | error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); | ||
1205 | if (error) | ||
1206 | goto error_cur; | ||
1207 | |||
1208 | offset = xfs_lowbit64(rec.ir_free); | ||
1209 | ASSERT(offset >= 0); | ||
1210 | ASSERT(offset < XFS_INODES_PER_CHUNK); | ||
1211 | ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % | ||
1212 | XFS_INODES_PER_CHUNK) == 0); | ||
1213 | ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); | ||
1214 | |||
1215 | /* | ||
1216 | * Modify or remove the finobt record. | ||
1217 | */ | ||
1218 | rec.ir_free &= ~XFS_INOBT_MASK(offset); | ||
1219 | rec.ir_freecount--; | ||
1220 | if (rec.ir_freecount) | ||
1221 | error = xfs_inobt_update(cur, &rec); | ||
1222 | else | ||
1223 | error = xfs_btree_delete(cur, &i); | ||
1224 | if (error) | ||
1225 | goto error_cur; | ||
1226 | |||
1227 | /* | ||
1228 | * The finobt has now been updated appropriately. We haven't updated the | ||
1229 | * agi and superblock yet, so we can create an inobt cursor and validate | ||
1230 | * the original freecount. If all is well, make the equivalent update to | ||
1231 | * the inobt using the finobt record and offset information. | ||
1232 | */ | ||
1233 | icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); | ||
1234 | |||
1235 | error = xfs_check_agi_freecount(icur, agi); | ||
1236 | if (error) | ||
1237 | goto error_icur; | ||
1238 | |||
1239 | error = xfs_dialloc_ag_update_inobt(icur, &rec, offset); | ||
1240 | if (error) | ||
1241 | goto error_icur; | ||
1242 | |||
1243 | /* | ||
1244 | * Both trees have now been updated. We must update the perag and | ||
1245 | * superblock before we can check the freecount for each btree. | ||
1246 | */ | ||
1247 | be32_add_cpu(&agi->agi_freecount, -1); | ||
1248 | xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); | ||
1249 | pag->pagi_freecount--; | ||
1250 | |||
1251 | xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); | ||
1252 | |||
1253 | error = xfs_check_agi_freecount(icur, agi); | ||
1254 | if (error) | ||
1255 | goto error_icur; | ||
1256 | error = xfs_check_agi_freecount(cur, agi); | ||
1257 | if (error) | ||
1258 | goto error_icur; | ||
1259 | |||
1260 | xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR); | ||
1261 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | ||
1262 | xfs_perag_put(pag); | ||
1263 | *inop = ino; | ||
1264 | return 0; | ||
1265 | |||
1266 | error_icur: | ||
1267 | xfs_btree_del_cursor(icur, XFS_BTREE_ERROR); | ||
1268 | error_cur: | ||
1269 | xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); | ||
1270 | xfs_perag_put(pag); | ||
1271 | return error; | ||
1272 | } | ||
1273 | |||
1274 | /* | ||
943 | * Allocate an inode on disk. | 1275 | * Allocate an inode on disk. |
944 | * | 1276 | * |
945 | * Mode is used to tell whether the new inode will need space, and whether it | 1277 | * Mode is used to tell whether the new inode will need space, and whether it |
@@ -1098,78 +1430,34 @@ out_error: | |||
1098 | return XFS_ERROR(error); | 1430 | return XFS_ERROR(error); |
1099 | } | 1431 | } |
1100 | 1432 | ||
1101 | /* | 1433 | STATIC int |
1102 | * Free disk inode. Carefully avoids touching the incore inode, all | 1434 | xfs_difree_inobt( |
1103 | * manipulations incore are the caller's responsibility. | 1435 | struct xfs_mount *mp, |
1104 | * The on-disk inode is not changed by this operation, only the | 1436 | struct xfs_trans *tp, |
1105 | * btree (free inode mask) is changed. | 1437 | struct xfs_buf *agbp, |
1106 | */ | 1438 | xfs_agino_t agino, |
1107 | int | 1439 | struct xfs_bmap_free *flist, |
1108 | xfs_difree( | 1440 | int *delete, |
1109 | xfs_trans_t *tp, /* transaction pointer */ | 1441 | xfs_ino_t *first_ino, |
1110 | xfs_ino_t inode, /* inode to be freed */ | 1442 | struct xfs_inobt_rec_incore *orec) |
1111 | xfs_bmap_free_t *flist, /* extents to free */ | ||
1112 | int *delete, /* set if inode cluster was deleted */ | ||
1113 | xfs_ino_t *first_ino) /* first inode in deleted cluster */ | ||
1114 | { | 1443 | { |
1115 | /* REFERENCED */ | 1444 | struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); |
1116 | xfs_agblock_t agbno; /* block number containing inode */ | 1445 | xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); |
1117 | xfs_buf_t *agbp; /* buffer containing allocation group header */ | 1446 | struct xfs_perag *pag; |
1118 | xfs_agino_t agino; /* inode number relative to allocation group */ | 1447 | struct xfs_btree_cur *cur; |
1119 | xfs_agnumber_t agno; /* allocation group number */ | 1448 | struct xfs_inobt_rec_incore rec; |
1120 | xfs_agi_t *agi; /* allocation group header */ | 1449 | int ilen; |
1121 | xfs_btree_cur_t *cur; /* inode btree cursor */ | 1450 | int error; |
1122 | int error; /* error return value */ | 1451 | int i; |
1123 | int i; /* result code */ | 1452 | int off; |
1124 | int ilen; /* inodes in an inode cluster */ | ||
1125 | xfs_mount_t *mp; /* mount structure for filesystem */ | ||
1126 | int off; /* offset of inode in inode chunk */ | ||
1127 | xfs_inobt_rec_incore_t rec; /* btree record */ | ||
1128 | struct xfs_perag *pag; | ||
1129 | |||
1130 | mp = tp->t_mountp; | ||
1131 | 1453 | ||
1132 | /* | ||
1133 | * Break up inode number into its components. | ||
1134 | */ | ||
1135 | agno = XFS_INO_TO_AGNO(mp, inode); | ||
1136 | if (agno >= mp->m_sb.sb_agcount) { | ||
1137 | xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", | ||
1138 | __func__, agno, mp->m_sb.sb_agcount); | ||
1139 | ASSERT(0); | ||
1140 | return XFS_ERROR(EINVAL); | ||
1141 | } | ||
1142 | agino = XFS_INO_TO_AGINO(mp, inode); | ||
1143 | if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { | ||
1144 | xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", | ||
1145 | __func__, (unsigned long long)inode, | ||
1146 | (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); | ||
1147 | ASSERT(0); | ||
1148 | return XFS_ERROR(EINVAL); | ||
1149 | } | ||
1150 | agbno = XFS_AGINO_TO_AGBNO(mp, agino); | ||
1151 | if (agbno >= mp->m_sb.sb_agblocks) { | ||
1152 | xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", | ||
1153 | __func__, agbno, mp->m_sb.sb_agblocks); | ||
1154 | ASSERT(0); | ||
1155 | return XFS_ERROR(EINVAL); | ||
1156 | } | ||
1157 | /* | ||
1158 | * Get the allocation group header. | ||
1159 | */ | ||
1160 | error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); | ||
1161 | if (error) { | ||
1162 | xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", | ||
1163 | __func__, error); | ||
1164 | return error; | ||
1165 | } | ||
1166 | agi = XFS_BUF_TO_AGI(agbp); | ||
1167 | ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); | 1454 | ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); |
1168 | ASSERT(agbno < be32_to_cpu(agi->agi_length)); | 1455 | ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length)); |
1456 | |||
1169 | /* | 1457 | /* |
1170 | * Initialize the cursor. | 1458 | * Initialize the cursor. |
1171 | */ | 1459 | */ |
1172 | cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); | 1460 | cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); |
1173 | 1461 | ||
1174 | error = xfs_check_agi_freecount(cur, agi); | 1462 | error = xfs_check_agi_freecount(cur, agi); |
1175 | if (error) | 1463 | if (error) |
@@ -1261,6 +1549,7 @@ xfs_difree( | |||
1261 | if (error) | 1549 | if (error) |
1262 | goto error0; | 1550 | goto error0; |
1263 | 1551 | ||
1552 | *orec = rec; | ||
1264 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | 1553 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); |
1265 | return 0; | 1554 | return 0; |
1266 | 1555 | ||
@@ -1269,6 +1558,182 @@ error0: | |||
1269 | return error; | 1558 | return error; |
1270 | } | 1559 | } |
1271 | 1560 | ||
1561 | /* | ||
1562 | * Free an inode in the free inode btree. | ||
1563 | */ | ||
1564 | STATIC int | ||
1565 | xfs_difree_finobt( | ||
1566 | struct xfs_mount *mp, | ||
1567 | struct xfs_trans *tp, | ||
1568 | struct xfs_buf *agbp, | ||
1569 | xfs_agino_t agino, | ||
1570 | struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ | ||
1571 | { | ||
1572 | struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); | ||
1573 | xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); | ||
1574 | struct xfs_btree_cur *cur; | ||
1575 | struct xfs_inobt_rec_incore rec; | ||
1576 | int offset = agino - ibtrec->ir_startino; | ||
1577 | int error; | ||
1578 | int i; | ||
1579 | |||
1580 | cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); | ||
1581 | |||
1582 | error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); | ||
1583 | if (error) | ||
1584 | goto error; | ||
1585 | if (i == 0) { | ||
1586 | /* | ||
1587 | * If the record does not exist in the finobt, we must have just | ||
1588 | * freed an inode in a previously fully allocated chunk. If not, | ||
1589 | * something is out of sync. | ||
1590 | */ | ||
1591 | XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error); | ||
1592 | |||
1593 | error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, | ||
1594 | ibtrec->ir_free, &i); | ||
1595 | if (error) | ||
1596 | goto error; | ||
1597 | ASSERT(i == 1); | ||
1598 | |||
1599 | goto out; | ||
1600 | } | ||
1601 | |||
1602 | /* | ||
1603 | * Read and update the existing record. We could just copy the ibtrec | ||
1604 | * across here, but that would defeat the purpose of having redundant | ||
1605 | * metadata. By making the modifications independently, we can catch | ||
1606 | * corruptions that we wouldn't see if we just copied from one record | ||
1607 | * to another. | ||
1608 | */ | ||
1609 | error = xfs_inobt_get_rec(cur, &rec, &i); | ||
1610 | if (error) | ||
1611 | goto error; | ||
1612 | XFS_WANT_CORRUPTED_GOTO(i == 1, error); | ||
1613 | |||
1614 | rec.ir_free |= XFS_INOBT_MASK(offset); | ||
1615 | rec.ir_freecount++; | ||
1616 | |||
1617 | XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) && | ||
1618 | (rec.ir_freecount == ibtrec->ir_freecount), | ||
1619 | error); | ||
1620 | |||
1621 | /* | ||
1622 | * The content of inobt records should always match between the inobt | ||
1623 | * and finobt. The lifecycle of records in the finobt is different from | ||
1624 | * the inobt in that the finobt only tracks records with at least one | ||
1625 | * free inode. Hence, if all of the inodes are free and we aren't | ||
1626 | * keeping inode chunks permanently on disk, remove the record. | ||
1627 | * Otherwise, update the record with the new information. | ||
1628 | */ | ||
1629 | if (rec.ir_freecount == mp->m_ialloc_inos && | ||
1630 | !(mp->m_flags & XFS_MOUNT_IKEEP)) { | ||
1631 | error = xfs_btree_delete(cur, &i); | ||
1632 | if (error) | ||
1633 | goto error; | ||
1634 | ASSERT(i == 1); | ||
1635 | } else { | ||
1636 | error = xfs_inobt_update(cur, &rec); | ||
1637 | if (error) | ||
1638 | goto error; | ||
1639 | } | ||
1640 | |||
1641 | out: | ||
1642 | error = xfs_check_agi_freecount(cur, agi); | ||
1643 | if (error) | ||
1644 | goto error; | ||
1645 | |||
1646 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | ||
1647 | return 0; | ||
1648 | |||
1649 | error: | ||
1650 | xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); | ||
1651 | return error; | ||
1652 | } | ||
1653 | |||
1654 | /* | ||
1655 | * Free disk inode. Carefully avoids touching the incore inode, all | ||
1656 | * manipulations incore are the caller's responsibility. | ||
1657 | * The on-disk inode is not changed by this operation, only the | ||
1658 | * btree (free inode mask) is changed. | ||
1659 | */ | ||
1660 | int | ||
1661 | xfs_difree( | ||
1662 | struct xfs_trans *tp, /* transaction pointer */ | ||
1663 | xfs_ino_t inode, /* inode to be freed */ | ||
1664 | struct xfs_bmap_free *flist, /* extents to free */ | ||
1665 | int *delete,/* set if inode cluster was deleted */ | ||
1666 | xfs_ino_t *first_ino)/* first inode in deleted cluster */ | ||
1667 | { | ||
1668 | /* REFERENCED */ | ||
1669 | xfs_agblock_t agbno; /* block number containing inode */ | ||
1670 | struct xfs_buf *agbp; /* buffer for allocation group header */ | ||
1671 | xfs_agino_t agino; /* allocation group inode number */ | ||
1672 | xfs_agnumber_t agno; /* allocation group number */ | ||
1673 | int error; /* error return value */ | ||
1674 | struct xfs_mount *mp; /* mount structure for filesystem */ | ||
1675 | struct xfs_inobt_rec_incore rec;/* btree record */ | ||
1676 | |||
1677 | mp = tp->t_mountp; | ||
1678 | |||
1679 | /* | ||
1680 | * Break up inode number into its components. | ||
1681 | */ | ||
1682 | agno = XFS_INO_TO_AGNO(mp, inode); | ||
1683 | if (agno >= mp->m_sb.sb_agcount) { | ||
1684 | xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", | ||
1685 | __func__, agno, mp->m_sb.sb_agcount); | ||
1686 | ASSERT(0); | ||
1687 | return XFS_ERROR(EINVAL); | ||
1688 | } | ||
1689 | agino = XFS_INO_TO_AGINO(mp, inode); | ||
1690 | if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { | ||
1691 | xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", | ||
1692 | __func__, (unsigned long long)inode, | ||
1693 | (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); | ||
1694 | ASSERT(0); | ||
1695 | return XFS_ERROR(EINVAL); | ||
1696 | } | ||
1697 | agbno = XFS_AGINO_TO_AGBNO(mp, agino); | ||
1698 | if (agbno >= mp->m_sb.sb_agblocks) { | ||
1699 | xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", | ||
1700 | __func__, agbno, mp->m_sb.sb_agblocks); | ||
1701 | ASSERT(0); | ||
1702 | return XFS_ERROR(EINVAL); | ||
1703 | } | ||
1704 | /* | ||
1705 | * Get the allocation group header. | ||
1706 | */ | ||
1707 | error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); | ||
1708 | if (error) { | ||
1709 | xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", | ||
1710 | __func__, error); | ||
1711 | return error; | ||
1712 | } | ||
1713 | |||
1714 | /* | ||
1715 | * Fix up the inode allocation btree. | ||
1716 | */ | ||
1717 | error = xfs_difree_inobt(mp, tp, agbp, agino, flist, delete, first_ino, | ||
1718 | &rec); | ||
1719 | if (error) | ||
1720 | goto error0; | ||
1721 | |||
1722 | /* | ||
1723 | * Fix up the free inode btree. | ||
1724 | */ | ||
1725 | if (xfs_sb_version_hasfinobt(&mp->m_sb)) { | ||
1726 | error = xfs_difree_finobt(mp, tp, agbp, agino, &rec); | ||
1727 | if (error) | ||
1728 | goto error0; | ||
1729 | } | ||
1730 | |||
1731 | return 0; | ||
1732 | |||
1733 | error0: | ||
1734 | return error; | ||
1735 | } | ||
1736 | |||
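For context, a minimal caller-side sketch (assuming tp, ip and flist are the caller's transaction, incore inode and free list, as in xfs_ifree()); it shows that the split between the inobt and finobt updates stays entirely inside xfs_difree():

	int		delete;		/* set if the inode cluster was freed */
	xfs_ino_t	first_ino;	/* first inode of that cluster */
	int		error;

	/*
	 * Updates the inobt and, when the feature is enabled, the finobt;
	 * the incore inode is untouched, as the comment above notes.
	 */
	error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
	if (error)
		return error;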
1272 | STATIC int | 1737 | STATIC int |
1273 | xfs_imap_lookup( | 1738 | xfs_imap_lookup( |
1274 | struct xfs_mount *mp, | 1739 | struct xfs_mount *mp, |
@@ -1300,7 +1765,7 @@ xfs_imap_lookup( | |||
1300 | * we have a record, we need to ensure it contains the inode number | 1765 | * we have a record, we need to ensure it contains the inode number |
1301 | * we are looking up. | 1766 | * we are looking up. |
1302 | */ | 1767 | */ |
1303 | cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); | 1768 | cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); |
1304 | error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); | 1769 | error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); |
1305 | if (!error) { | 1770 | if (!error) { |
1306 | if (i) | 1771 | if (i) |
@@ -1488,7 +1953,16 @@ xfs_ialloc_compute_maxlevels( | |||
1488 | } | 1953 | } |
1489 | 1954 | ||
1490 | /* | 1955 | /* |
1491 | * Log specified fields for the ag hdr (inode section) | 1956 | * Log specified fields for the ag hdr (inode section). The growth of the agi |
1957 | * structure over time requires that we interpret the buffer as two logical | ||
1958 | * regions delineated by the end of the unlinked list. This is due to the size | ||
1959 | * of the hash table and its location in the middle of the agi. | ||
1960 | * | ||
1961 | * For example, a request to log a field before agi_unlinked and a field after | ||
1962 | * agi_unlinked could cause us to log the entire hash table and use an excessive | ||
1963 | * amount of log space. To avoid this behavior, log the region up through | ||
1964 | * agi_unlinked in one call and the region after agi_unlinked through the end of | ||
1965 | * the structure in another. | ||
1492 | */ | 1966 | */ |
1493 | void | 1967 | void |
1494 | xfs_ialloc_log_agi( | 1968 | xfs_ialloc_log_agi( |
@@ -1511,6 +1985,8 @@ xfs_ialloc_log_agi( | |||
1511 | offsetof(xfs_agi_t, agi_newino), | 1985 | offsetof(xfs_agi_t, agi_newino), |
1512 | offsetof(xfs_agi_t, agi_dirino), | 1986 | offsetof(xfs_agi_t, agi_dirino), |
1513 | offsetof(xfs_agi_t, agi_unlinked), | 1987 | offsetof(xfs_agi_t, agi_unlinked), |
1988 | offsetof(xfs_agi_t, agi_free_root), | ||
1989 | offsetof(xfs_agi_t, agi_free_level), | ||
1514 | sizeof(xfs_agi_t) | 1990 | sizeof(xfs_agi_t) |
1515 | }; | 1991 | }; |
1516 | #ifdef DEBUG | 1992 | #ifdef DEBUG |
@@ -1519,15 +1995,30 @@ xfs_ialloc_log_agi( | |||
1519 | agi = XFS_BUF_TO_AGI(bp); | 1995 | agi = XFS_BUF_TO_AGI(bp); |
1520 | ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); | 1996 | ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); |
1521 | #endif | 1997 | #endif |
1998 | |||
1999 | xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); | ||
2000 | |||
1522 | /* | 2001 | /* |
1523 | * Compute byte offsets for the first and last fields. | 2002 | * Compute byte offsets for the first and last fields in the first |
2003 | * region and log the agi buffer. This only logs up through | ||
2004 | * agi_unlinked. | ||
1524 | */ | 2005 | */ |
1525 | xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last); | 2006 | if (fields & XFS_AGI_ALL_BITS_R1) { |
2007 | xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1, | ||
2008 | &first, &last); | ||
2009 | xfs_trans_log_buf(tp, bp, first, last); | ||
2010 | } | ||
2011 | |||
1526 | /* | 2012 | /* |
1527 | * Log the allocation group inode header buffer. | 2013 | * Mask off the bits in the first region and calculate the first and |
2014 | * last field offsets for any bits in the second region. | ||
1528 | */ | 2015 | */ |
1529 | xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); | 2016 | fields &= ~XFS_AGI_ALL_BITS_R1; |
1530 | xfs_trans_log_buf(tp, bp, first, last); | 2017 | if (fields) { |
2018 | xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2, | ||
2019 | &first, &last); | ||
2020 | xfs_trans_log_buf(tp, bp, first, last); | ||
2021 | } | ||
1531 | } | 2022 | } |
1532 | 2023 | ||
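For illustration, a hedged sketch of a caller that touches fields on either side of agi_unlinked; new_root is a made-up value, and tp/agbp are assumed to come from the usual xfs_ialloc_read_agi() path:

	struct xfs_agi	*agi = XFS_BUF_TO_AGI(agbp);

	be32_add_cpu(&agi->agi_freecount, 1);		/* logging region 1 */
	agi->agi_free_root = cpu_to_be32(new_root);	/* logging region 2 */

	/* One call; xfs_ialloc_log_agi() issues two xfs_trans_log_buf() ranges. */
	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT | XFS_AGI_FREE_ROOT);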
1533 | #ifdef DEBUG | 2024 | #ifdef DEBUG |
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 7e309b11e87d..726f83a681a5 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c | |||
@@ -49,7 +49,8 @@ xfs_inobt_dup_cursor( | |||
49 | struct xfs_btree_cur *cur) | 49 | struct xfs_btree_cur *cur) |
50 | { | 50 | { |
51 | return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, | 51 | return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, |
52 | cur->bc_private.a.agbp, cur->bc_private.a.agno); | 52 | cur->bc_private.a.agbp, cur->bc_private.a.agno, |
53 | cur->bc_btnum); | ||
53 | } | 54 | } |
54 | 55 | ||
55 | STATIC void | 56 | STATIC void |
@@ -66,12 +67,26 @@ xfs_inobt_set_root( | |||
66 | xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); | 67 | xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); |
67 | } | 68 | } |
68 | 69 | ||
70 | STATIC void | ||
71 | xfs_finobt_set_root( | ||
72 | struct xfs_btree_cur *cur, | ||
73 | union xfs_btree_ptr *nptr, | ||
74 | int inc) /* level change */ | ||
75 | { | ||
76 | struct xfs_buf *agbp = cur->bc_private.a.agbp; | ||
77 | struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); | ||
78 | |||
79 | agi->agi_free_root = nptr->s; | ||
80 | be32_add_cpu(&agi->agi_free_level, inc); | ||
81 | xfs_ialloc_log_agi(cur->bc_tp, agbp, | ||
82 | XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL); | ||
83 | } | ||
84 | |||
69 | STATIC int | 85 | STATIC int |
70 | xfs_inobt_alloc_block( | 86 | xfs_inobt_alloc_block( |
71 | struct xfs_btree_cur *cur, | 87 | struct xfs_btree_cur *cur, |
72 | union xfs_btree_ptr *start, | 88 | union xfs_btree_ptr *start, |
73 | union xfs_btree_ptr *new, | 89 | union xfs_btree_ptr *new, |
74 | int length, | ||
75 | int *stat) | 90 | int *stat) |
76 | { | 91 | { |
77 | xfs_alloc_arg_t args; /* block allocation args */ | 92 | xfs_alloc_arg_t args; /* block allocation args */ |
@@ -173,6 +188,17 @@ xfs_inobt_init_ptr_from_cur( | |||
173 | ptr->s = agi->agi_root; | 188 | ptr->s = agi->agi_root; |
174 | } | 189 | } |
175 | 190 | ||
191 | STATIC void | ||
192 | xfs_finobt_init_ptr_from_cur( | ||
193 | struct xfs_btree_cur *cur, | ||
194 | union xfs_btree_ptr *ptr) | ||
195 | { | ||
196 | struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); | ||
197 | |||
198 | ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); | ||
199 | ptr->s = agi->agi_free_root; | ||
200 | } | ||
201 | |||
176 | STATIC __int64_t | 202 | STATIC __int64_t |
177 | xfs_inobt_key_diff( | 203 | xfs_inobt_key_diff( |
178 | struct xfs_btree_cur *cur, | 204 | struct xfs_btree_cur *cur, |
@@ -203,6 +229,7 @@ xfs_inobt_verify( | |||
203 | */ | 229 | */ |
204 | switch (block->bb_magic) { | 230 | switch (block->bb_magic) { |
205 | case cpu_to_be32(XFS_IBT_CRC_MAGIC): | 231 | case cpu_to_be32(XFS_IBT_CRC_MAGIC): |
232 | case cpu_to_be32(XFS_FIBT_CRC_MAGIC): | ||
206 | if (!xfs_sb_version_hascrc(&mp->m_sb)) | 233 | if (!xfs_sb_version_hascrc(&mp->m_sb)) |
207 | return false; | 234 | return false; |
208 | if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) | 235 | if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) |
@@ -214,6 +241,7 @@ xfs_inobt_verify( | |||
214 | return false; | 241 | return false; |
215 | /* fall through */ | 242 | /* fall through */ |
216 | case cpu_to_be32(XFS_IBT_MAGIC): | 243 | case cpu_to_be32(XFS_IBT_MAGIC): |
244 | case cpu_to_be32(XFS_FIBT_MAGIC): | ||
217 | break; | 245 | break; |
218 | default: | 246 | default: |
219 | return 0; | 247 | return 0; |
@@ -317,6 +345,28 @@ static const struct xfs_btree_ops xfs_inobt_ops = { | |||
317 | #endif | 345 | #endif |
318 | }; | 346 | }; |
319 | 347 | ||
348 | static const struct xfs_btree_ops xfs_finobt_ops = { | ||
349 | .rec_len = sizeof(xfs_inobt_rec_t), | ||
350 | .key_len = sizeof(xfs_inobt_key_t), | ||
351 | |||
352 | .dup_cursor = xfs_inobt_dup_cursor, | ||
353 | .set_root = xfs_finobt_set_root, | ||
354 | .alloc_block = xfs_inobt_alloc_block, | ||
355 | .free_block = xfs_inobt_free_block, | ||
356 | .get_minrecs = xfs_inobt_get_minrecs, | ||
357 | .get_maxrecs = xfs_inobt_get_maxrecs, | ||
358 | .init_key_from_rec = xfs_inobt_init_key_from_rec, | ||
359 | .init_rec_from_key = xfs_inobt_init_rec_from_key, | ||
360 | .init_rec_from_cur = xfs_inobt_init_rec_from_cur, | ||
361 | .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, | ||
362 | .key_diff = xfs_inobt_key_diff, | ||
363 | .buf_ops = &xfs_inobt_buf_ops, | ||
364 | #if defined(DEBUG) || defined(XFS_WARN) | ||
365 | .keys_inorder = xfs_inobt_keys_inorder, | ||
366 | .recs_inorder = xfs_inobt_recs_inorder, | ||
367 | #endif | ||
368 | }; | ||
369 | |||
320 | /* | 370 | /* |
321 | * Allocate a new inode btree cursor. | 371 | * Allocate a new inode btree cursor. |
322 | */ | 372 | */ |
@@ -325,7 +375,8 @@ xfs_inobt_init_cursor( | |||
325 | struct xfs_mount *mp, /* file system mount point */ | 375 | struct xfs_mount *mp, /* file system mount point */ |
326 | struct xfs_trans *tp, /* transaction pointer */ | 376 | struct xfs_trans *tp, /* transaction pointer */ |
327 | struct xfs_buf *agbp, /* buffer for agi structure */ | 377 | struct xfs_buf *agbp, /* buffer for agi structure */ |
328 | xfs_agnumber_t agno) /* allocation group number */ | 378 | xfs_agnumber_t agno, /* allocation group number */ |
379 | xfs_btnum_t btnum) /* ialloc or free ino btree */ | ||
329 | { | 380 | { |
330 | struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); | 381 | struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); |
331 | struct xfs_btree_cur *cur; | 382 | struct xfs_btree_cur *cur; |
@@ -334,11 +385,17 @@ xfs_inobt_init_cursor( | |||
334 | 385 | ||
335 | cur->bc_tp = tp; | 386 | cur->bc_tp = tp; |
336 | cur->bc_mp = mp; | 387 | cur->bc_mp = mp; |
337 | cur->bc_nlevels = be32_to_cpu(agi->agi_level); | 388 | cur->bc_btnum = btnum; |
338 | cur->bc_btnum = XFS_BTNUM_INO; | 389 | if (btnum == XFS_BTNUM_INO) { |
390 | cur->bc_nlevels = be32_to_cpu(agi->agi_level); | ||
391 | cur->bc_ops = &xfs_inobt_ops; | ||
392 | } else { | ||
393 | cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); | ||
394 | cur->bc_ops = &xfs_finobt_ops; | ||
395 | } | ||
396 | |||
339 | cur->bc_blocklog = mp->m_sb.sb_blocklog; | 397 | cur->bc_blocklog = mp->m_sb.sb_blocklog; |
340 | 398 | ||
341 | cur->bc_ops = &xfs_inobt_ops; | ||
342 | if (xfs_sb_version_hascrc(&mp->m_sb)) | 399 | if (xfs_sb_version_hascrc(&mp->m_sb)) |
343 | cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; | 400 | cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; |
344 | 401 | ||
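A short usage sketch of the extended cursor API: the same init function now serves both trees, selected by the btree number. XFS_BTNUM_FINO is the free inode btree number added elsewhere in this series; agbp, agno and agino are assumed to come from the usual AGI read path:

	struct xfs_btree_cur	*cur;
	int			stat;
	int			error;

	/* A finobt cursor; pass XFS_BTNUM_INO for the classic inobt instead. */
	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &stat);
	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);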
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index f38b22011c4e..d7ebea72c2d0 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h | |||
@@ -58,7 +58,8 @@ struct xfs_mount; | |||
58 | ((index) - 1) * sizeof(xfs_inobt_ptr_t))) | 58 | ((index) - 1) * sizeof(xfs_inobt_ptr_t))) |
59 | 59 | ||
60 | extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, | 60 | extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, |
61 | struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); | 61 | struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, |
62 | xfs_btnum_t); | ||
62 | extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); | 63 | extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); |
63 | 64 | ||
64 | #endif /* __XFS_IALLOC_BTREE_H__ */ | 65 | #endif /* __XFS_IALLOC_BTREE_H__ */ |
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 98d35244eecc..c48df5f25b9f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c | |||
@@ -507,8 +507,7 @@ STATIC int | |||
507 | xfs_inode_ag_walk( | 507 | xfs_inode_ag_walk( |
508 | struct xfs_mount *mp, | 508 | struct xfs_mount *mp, |
509 | struct xfs_perag *pag, | 509 | struct xfs_perag *pag, |
510 | int (*execute)(struct xfs_inode *ip, | 510 | int (*execute)(struct xfs_inode *ip, int flags, |
511 | struct xfs_perag *pag, int flags, | ||
512 | void *args), | 511 | void *args), |
513 | int flags, | 512 | int flags, |
514 | void *args, | 513 | void *args, |
@@ -582,7 +581,7 @@ restart: | |||
582 | for (i = 0; i < nr_found; i++) { | 581 | for (i = 0; i < nr_found; i++) { |
583 | if (!batch[i]) | 582 | if (!batch[i]) |
584 | continue; | 583 | continue; |
585 | error = execute(batch[i], pag, flags, args); | 584 | error = execute(batch[i], flags, args); |
586 | IRELE(batch[i]); | 585 | IRELE(batch[i]); |
587 | if (error == EAGAIN) { | 586 | if (error == EAGAIN) { |
588 | skipped++; | 587 | skipped++; |
@@ -636,8 +635,7 @@ xfs_eofblocks_worker( | |||
636 | int | 635 | int |
637 | xfs_inode_ag_iterator( | 636 | xfs_inode_ag_iterator( |
638 | struct xfs_mount *mp, | 637 | struct xfs_mount *mp, |
639 | int (*execute)(struct xfs_inode *ip, | 638 | int (*execute)(struct xfs_inode *ip, int flags, |
640 | struct xfs_perag *pag, int flags, | ||
641 | void *args), | 639 | void *args), |
642 | int flags, | 640 | int flags, |
643 | void *args) | 641 | void *args) |
@@ -664,8 +662,7 @@ xfs_inode_ag_iterator( | |||
664 | int | 662 | int |
665 | xfs_inode_ag_iterator_tag( | 663 | xfs_inode_ag_iterator_tag( |
666 | struct xfs_mount *mp, | 664 | struct xfs_mount *mp, |
667 | int (*execute)(struct xfs_inode *ip, | 665 | int (*execute)(struct xfs_inode *ip, int flags, |
668 | struct xfs_perag *pag, int flags, | ||
669 | void *args), | 666 | void *args), |
670 | int flags, | 667 | int flags, |
671 | void *args, | 668 | void *args, |
@@ -1209,7 +1206,6 @@ xfs_inode_match_id( | |||
1209 | STATIC int | 1206 | STATIC int |
1210 | xfs_inode_free_eofblocks( | 1207 | xfs_inode_free_eofblocks( |
1211 | struct xfs_inode *ip, | 1208 | struct xfs_inode *ip, |
1212 | struct xfs_perag *pag, | ||
1213 | int flags, | 1209 | int flags, |
1214 | void *args) | 1210 | void *args) |
1215 | { | 1211 | { |
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 9ed68bb750f5..9cf017b899be 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h | |||
@@ -60,12 +60,10 @@ int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); | |||
60 | void xfs_eofblocks_worker(struct work_struct *); | 60 | void xfs_eofblocks_worker(struct work_struct *); |
61 | 61 | ||
62 | int xfs_inode_ag_iterator(struct xfs_mount *mp, | 62 | int xfs_inode_ag_iterator(struct xfs_mount *mp, |
63 | int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, | 63 | int (*execute)(struct xfs_inode *ip, int flags, void *args), |
64 | int flags, void *args), | ||
65 | int flags, void *args); | 64 | int flags, void *args); |
66 | int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, | 65 | int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, |
67 | int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, | 66 | int (*execute)(struct xfs_inode *ip, int flags, void *args), |
68 | int flags, void *args), | ||
69 | int flags, void *args, int tag); | 67 | int flags, void *args, int tag); |
70 | 68 | ||
71 | static inline int | 69 | static inline int |
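With the perag argument dropped, walker callbacks shrink to three parameters. A minimal sketch against the new prototype; the callback name is hypothetical:

STATIC int
xfs_example_inode_walk(		/* hypothetical walker */
	struct xfs_inode	*ip,
	int			flags,
	void			*args)
{
	/* per-inode work goes here; flags and args are passed through */
	return 0;
}

/* Typical call site: xfs_inode_ag_iterator(mp, xfs_example_inode_walk, 0, NULL); */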
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5e7a38fa6ee6..6d6b44a508f9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
@@ -655,7 +655,6 @@ xfs_ialloc( | |||
655 | uint flags; | 655 | uint flags; |
656 | int error; | 656 | int error; |
657 | timespec_t tv; | 657 | timespec_t tv; |
658 | int filestreams = 0; | ||
659 | 658 | ||
660 | /* | 659 | /* |
661 | * Call the space management code to pick | 660 | * Call the space management code to pick |
@@ -772,13 +771,6 @@ xfs_ialloc( | |||
772 | flags |= XFS_ILOG_DEV; | 771 | flags |= XFS_ILOG_DEV; |
773 | break; | 772 | break; |
774 | case S_IFREG: | 773 | case S_IFREG: |
775 | /* | ||
776 | * we can't set up filestreams until after the VFS inode | ||
777 | * is set up properly. | ||
778 | */ | ||
779 | if (pip && xfs_inode_is_filestream(pip)) | ||
780 | filestreams = 1; | ||
781 | /* fall through */ | ||
782 | case S_IFDIR: | 774 | case S_IFDIR: |
783 | if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { | 775 | if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { |
784 | uint di_flags = 0; | 776 | uint di_flags = 0; |
@@ -844,15 +836,6 @@ xfs_ialloc( | |||
844 | /* now that we have an i_mode we can setup inode ops and unlock */ | 836 | /* now that we have an i_mode we can setup inode ops and unlock */ |
845 | xfs_setup_inode(ip); | 837 | xfs_setup_inode(ip); |
846 | 838 | ||
847 | /* now we have set up the vfs inode we can associate the filestream */ | ||
848 | if (filestreams) { | ||
849 | error = xfs_filestream_associate(pip, ip); | ||
850 | if (error < 0) | ||
851 | return -error; | ||
852 | if (!error) | ||
853 | xfs_iflags_set(ip, XFS_IFILESTREAM); | ||
854 | } | ||
855 | |||
856 | *ipp = ip; | 839 | *ipp = ip; |
857 | return 0; | 840 | return 0; |
858 | } | 841 | } |
@@ -1334,7 +1317,8 @@ int | |||
1334 | xfs_create_tmpfile( | 1317 | xfs_create_tmpfile( |
1335 | struct xfs_inode *dp, | 1318 | struct xfs_inode *dp, |
1336 | struct dentry *dentry, | 1319 | struct dentry *dentry, |
1337 | umode_t mode) | 1320 | umode_t mode, |
1321 | struct xfs_inode **ipp) | ||
1338 | { | 1322 | { |
1339 | struct xfs_mount *mp = dp->i_mount; | 1323 | struct xfs_mount *mp = dp->i_mount; |
1340 | struct xfs_inode *ip = NULL; | 1324 | struct xfs_inode *ip = NULL; |
@@ -1402,7 +1386,6 @@ xfs_create_tmpfile( | |||
1402 | xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); | 1386 | xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); |
1403 | 1387 | ||
1404 | ip->i_d.di_nlink--; | 1388 | ip->i_d.di_nlink--; |
1405 | d_tmpfile(dentry, VFS_I(ip)); | ||
1406 | error = xfs_iunlink(tp, ip); | 1389 | error = xfs_iunlink(tp, ip); |
1407 | if (error) | 1390 | if (error) |
1408 | goto out_trans_abort; | 1391 | goto out_trans_abort; |
@@ -1415,6 +1398,7 @@ xfs_create_tmpfile( | |||
1415 | xfs_qm_dqrele(gdqp); | 1398 | xfs_qm_dqrele(gdqp); |
1416 | xfs_qm_dqrele(pdqp); | 1399 | xfs_qm_dqrele(pdqp); |
1417 | 1400 | ||
1401 | *ipp = ip; | ||
1418 | return 0; | 1402 | return 0; |
1419 | 1403 | ||
1420 | out_trans_abort: | 1404 | out_trans_abort: |
@@ -1698,16 +1682,6 @@ xfs_release( | |||
1698 | int truncated; | 1682 | int truncated; |
1699 | 1683 | ||
1700 | /* | 1684 | /* |
1701 | * If we are using filestreams, and we have an unlinked | ||
1702 | * file that we are processing the last close on, then nothing | ||
1703 | * will be able to reopen and write to this file. Purge this | ||
1704 | * inode from the filestreams cache so that it doesn't delay | ||
1705 | * teardown of the inode. | ||
1706 | */ | ||
1707 | if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip)) | ||
1708 | xfs_filestream_deassociate(ip); | ||
1709 | |||
1710 | /* | ||
1711 | * If we previously truncated this file and removed old data | 1685 | * If we previously truncated this file and removed old data |
1712 | * in the process, we want to initiate "early" writeout on | 1686 | * in the process, we want to initiate "early" writeout on |
1713 | * the last close. This is an attempt to combat the notorious | 1687 | * the last close. This is an attempt to combat the notorious |
@@ -1837,9 +1811,33 @@ xfs_inactive_ifree( | |||
1837 | int error; | 1811 | int error; |
1838 | 1812 | ||
1839 | tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); | 1813 | tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); |
1840 | error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0); | 1814 | |
1815 | /* | ||
1816 | * The ifree transaction might need to allocate blocks for record | ||
1817 | * insertion to the finobt. We don't want to fail here at ENOSPC, so | ||
1818 | * allow ifree to dip into the reserved block pool if necessary. | ||
1819 | * | ||
1820 | * Freeing large sets of inodes generally means freeing inode chunks, | ||
1821 | * directory and file data blocks, so this should be relatively safe. | ||
1822 | * Only under severe circumstances should it be possible to free enough | ||
1823 | * inodes to exhaust the reserve block pool via finobt expansion while | ||
1824 | * at the same time not creating free space in the filesystem. | ||
1825 | * | ||
1826 | * Send a warning if the reservation does happen to fail, as the inode | ||
1827 | * now remains allocated and sits on the unlinked list until the fs is | ||
1828 | * repaired. | ||
1829 | */ | ||
1830 | tp->t_flags |= XFS_TRANS_RESERVE; | ||
1831 | error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, | ||
1832 | XFS_IFREE_SPACE_RES(mp), 0); | ||
1841 | if (error) { | 1833 | if (error) { |
1842 | ASSERT(XFS_FORCED_SHUTDOWN(mp)); | 1834 | if (error == ENOSPC) { |
1835 | xfs_warn_ratelimited(mp, | ||
1836 | "Failed to remove inode(s) from unlinked list. " | ||
1837 | "Please free space, unmount and run xfs_repair."); | ||
1838 | } else { | ||
1839 | ASSERT(XFS_FORCED_SHUTDOWN(mp)); | ||
1840 | } | ||
1843 | xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); | 1841 | xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); |
1844 | return error; | 1842 | return error; |
1845 | } | 1843 | } |
@@ -2663,13 +2661,7 @@ xfs_remove( | |||
2663 | if (error) | 2661 | if (error) |
2664 | goto std_return; | 2662 | goto std_return; |
2665 | 2663 | ||
2666 | /* | 2664 | if (is_dir && xfs_inode_is_filestream(ip)) |
2667 | * If we are using filestreams, kill the stream association. | ||
2668 | * If the file is still open it may get a new one but that | ||
2669 | * will get killed on last close in xfs_close() so we don't | ||
2670 | * have to worry about that. | ||
2671 | */ | ||
2672 | if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) | ||
2673 | xfs_filestream_deassociate(ip); | 2665 | xfs_filestream_deassociate(ip); |
2674 | 2666 | ||
2675 | return 0; | 2667 | return 0; |
@@ -3371,9 +3363,9 @@ xfs_iflush_int( | |||
3371 | } | 3363 | } |
3372 | } | 3364 | } |
3373 | 3365 | ||
3374 | xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); | 3366 | xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); |
3375 | if (XFS_IFORK_Q(ip)) | 3367 | if (XFS_IFORK_Q(ip)) |
3376 | xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); | 3368 | xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); |
3377 | xfs_inobp_check(mp, bp); | 3369 | xfs_inobp_check(mp, bp); |
3378 | 3370 | ||
3379 | /* | 3371 | /* |
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 396cc1fafd0d..13aea548206c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h | |||
@@ -209,7 +209,6 @@ xfs_get_initial_prid(struct xfs_inode *dp) | |||
209 | #define XFS_ISTALE (1 << 1) /* inode has been staled */ | 209 | #define XFS_ISTALE (1 << 1) /* inode has been staled */ |
210 | #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ | 210 | #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ |
211 | #define XFS_INEW (1 << 3) /* inode has just been allocated */ | 211 | #define XFS_INEW (1 << 3) /* inode has just been allocated */ |
212 | #define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */ | ||
213 | #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ | 212 | #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ |
214 | #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ | 213 | #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ |
215 | #define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ | 214 | #define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ |
@@ -225,8 +224,7 @@ xfs_get_initial_prid(struct xfs_inode *dp) | |||
225 | */ | 224 | */ |
226 | #define XFS_IRECLAIM_RESET_FLAGS \ | 225 | #define XFS_IRECLAIM_RESET_FLAGS \ |
227 | (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ | 226 | (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ |
228 | XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \ | 227 | XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) |
229 | XFS_IFILESTREAM); | ||
230 | 228 | ||
231 | /* | 229 | /* |
232 | * Synchronize processes attempting to flush the in-core inode back to disk. | 230 | * Synchronize processes attempting to flush the in-core inode back to disk. |
@@ -334,7 +332,7 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, | |||
334 | int xfs_create(struct xfs_inode *dp, struct xfs_name *name, | 332 | int xfs_create(struct xfs_inode *dp, struct xfs_name *name, |
335 | umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); | 333 | umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); |
336 | int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, | 334 | int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, |
337 | umode_t mode); | 335 | umode_t mode, struct xfs_inode **ipp); |
338 | int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, | 336 | int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, |
339 | struct xfs_inode *ip); | 337 | struct xfs_inode *ip); |
340 | int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, | 338 | int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, |
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c index 73514c0486b7..b031e8d0d928 100644 --- a/fs/xfs/xfs_inode_fork.c +++ b/fs/xfs/xfs_inode_fork.c | |||
@@ -798,8 +798,7 @@ xfs_iflush_fork( | |||
798 | xfs_inode_t *ip, | 798 | xfs_inode_t *ip, |
799 | xfs_dinode_t *dip, | 799 | xfs_dinode_t *dip, |
800 | xfs_inode_log_item_t *iip, | 800 | xfs_inode_log_item_t *iip, |
801 | int whichfork, | 801 | int whichfork) |
802 | xfs_buf_t *bp) | ||
803 | { | 802 | { |
804 | char *cp; | 803 | char *cp; |
805 | xfs_ifork_t *ifp; | 804 | xfs_ifork_t *ifp; |
diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h index eb329a1ea888..7d3b1ed6dcbe 100644 --- a/fs/xfs/xfs_inode_fork.h +++ b/fs/xfs/xfs_inode_fork.h | |||
@@ -127,8 +127,7 @@ typedef struct xfs_ifork { | |||
127 | 127 | ||
128 | int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); | 128 | int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); |
129 | void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, | 129 | void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, |
130 | struct xfs_inode_log_item *, int, | 130 | struct xfs_inode_log_item *, int); |
131 | struct xfs_buf *); | ||
132 | void xfs_idestroy_fork(struct xfs_inode *, int); | 131 | void xfs_idestroy_fork(struct xfs_inode *, int); |
133 | void xfs_idata_realloc(struct xfs_inode *, int, int); | 132 | void xfs_idata_realloc(struct xfs_inode *, int, int); |
134 | void xfs_iroot_realloc(struct xfs_inode *, int, int); | 133 | void xfs_iroot_realloc(struct xfs_inode *, int, int); |
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0b18776b075e..2d8f4fdf07f9 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c | |||
@@ -543,10 +543,11 @@ xfs_attrmulti_by_handle( | |||
543 | 543 | ||
544 | ops = memdup_user(am_hreq.ops, size); | 544 | ops = memdup_user(am_hreq.ops, size); |
545 | if (IS_ERR(ops)) { | 545 | if (IS_ERR(ops)) { |
546 | error = PTR_ERR(ops); | 546 | error = -PTR_ERR(ops); |
547 | goto out_dput; | 547 | goto out_dput; |
548 | } | 548 | } |
549 | 549 | ||
550 | error = ENOMEM; | ||
550 | attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); | 551 | attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); |
551 | if (!attr_name) | 552 | if (!attr_name) |
552 | goto out_kfree_ops; | 553 | goto out_kfree_ops; |
@@ -556,7 +557,7 @@ xfs_attrmulti_by_handle( | |||
556 | ops[i].am_error = strncpy_from_user((char *)attr_name, | 557 | ops[i].am_error = strncpy_from_user((char *)attr_name, |
557 | ops[i].am_attrname, MAXNAMELEN); | 558 | ops[i].am_attrname, MAXNAMELEN); |
558 | if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) | 559 | if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) |
559 | error = -ERANGE; | 560 | error = ERANGE; |
560 | if (ops[i].am_error < 0) | 561 | if (ops[i].am_error < 0) |
561 | break; | 562 | break; |
562 | 563 | ||
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index a7992f8de9d3..944d5baa710a 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c | |||
@@ -424,10 +424,11 @@ xfs_compat_attrmulti_by_handle( | |||
424 | 424 | ||
425 | ops = memdup_user(compat_ptr(am_hreq.ops), size); | 425 | ops = memdup_user(compat_ptr(am_hreq.ops), size); |
426 | if (IS_ERR(ops)) { | 426 | if (IS_ERR(ops)) { |
427 | error = PTR_ERR(ops); | 427 | error = -PTR_ERR(ops); |
428 | goto out_dput; | 428 | goto out_dput; |
429 | } | 429 | } |
430 | 430 | ||
431 | error = ENOMEM; | ||
431 | attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); | 432 | attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); |
432 | if (!attr_name) | 433 | if (!attr_name) |
433 | goto out_kfree_ops; | 434 | goto out_kfree_ops; |
@@ -438,7 +439,7 @@ xfs_compat_attrmulti_by_handle( | |||
438 | compat_ptr(ops[i].am_attrname), | 439 | compat_ptr(ops[i].am_attrname), |
439 | MAXNAMELEN); | 440 | MAXNAMELEN); |
440 | if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) | 441 | if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) |
441 | error = -ERANGE; | 442 | error = ERANGE; |
442 | if (ops[i].am_error < 0) | 443 | if (ops[i].am_error < 0) |
443 | break; | 444 | break; |
444 | 445 | ||
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 3b80ebae05f5..6c5eb4c551e3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c | |||
@@ -730,7 +730,7 @@ xfs_iomap_write_allocate( | |||
730 | */ | 730 | */ |
731 | nimaps = 1; | 731 | nimaps = 1; |
732 | end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); | 732 | end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); |
733 | error = xfs_bmap_last_offset(NULL, ip, &last_block, | 733 | error = xfs_bmap_last_offset(ip, &last_block, |
734 | XFS_DATA_FORK); | 734 | XFS_DATA_FORK); |
735 | if (error) | 735 | if (error) |
736 | goto trans_cancel; | 736 | goto trans_cancel; |
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 89b07e43ca28..205613a06068 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c | |||
@@ -72,8 +72,8 @@ xfs_initxattrs( | |||
72 | int error = 0; | 72 | int error = 0; |
73 | 73 | ||
74 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { | 74 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { |
75 | error = xfs_attr_set(ip, xattr->name, xattr->value, | 75 | error = -xfs_attr_set(ip, xattr->name, xattr->value, |
76 | xattr->value_len, ATTR_SECURE); | 76 | xattr->value_len, ATTR_SECURE); |
77 | if (error < 0) | 77 | if (error < 0) |
78 | break; | 78 | break; |
79 | } | 79 | } |
@@ -93,8 +93,8 @@ xfs_init_security( | |||
93 | struct inode *dir, | 93 | struct inode *dir, |
94 | const struct qstr *qstr) | 94 | const struct qstr *qstr) |
95 | { | 95 | { |
96 | return security_inode_init_security(inode, dir, qstr, | 96 | return -security_inode_init_security(inode, dir, qstr, |
97 | &xfs_initxattrs, NULL); | 97 | &xfs_initxattrs, NULL); |
98 | } | 98 | } |
99 | 99 | ||
100 | static void | 100 | static void |
@@ -124,15 +124,15 @@ xfs_cleanup_inode( | |||
124 | xfs_dentry_to_name(&teardown, dentry, 0); | 124 | xfs_dentry_to_name(&teardown, dentry, 0); |
125 | 125 | ||
126 | xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); | 126 | xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); |
127 | iput(inode); | ||
128 | } | 127 | } |
129 | 128 | ||
130 | STATIC int | 129 | STATIC int |
131 | xfs_vn_mknod( | 130 | xfs_generic_create( |
132 | struct inode *dir, | 131 | struct inode *dir, |
133 | struct dentry *dentry, | 132 | struct dentry *dentry, |
134 | umode_t mode, | 133 | umode_t mode, |
135 | dev_t rdev) | 134 | dev_t rdev, |
135 | bool tmpfile) /* unnamed file */ | ||
136 | { | 136 | { |
137 | struct inode *inode; | 137 | struct inode *inode; |
138 | struct xfs_inode *ip = NULL; | 138 | struct xfs_inode *ip = NULL; |
@@ -156,8 +156,12 @@ xfs_vn_mknod( | |||
156 | if (error) | 156 | if (error) |
157 | return error; | 157 | return error; |
158 | 158 | ||
159 | xfs_dentry_to_name(&name, dentry, mode); | 159 | if (!tmpfile) { |
160 | error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); | 160 | xfs_dentry_to_name(&name, dentry, mode); |
161 | error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); | ||
162 | } else { | ||
163 | error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); | ||
164 | } | ||
161 | if (unlikely(error)) | 165 | if (unlikely(error)) |
162 | goto out_free_acl; | 166 | goto out_free_acl; |
163 | 167 | ||
@@ -169,18 +173,22 @@ xfs_vn_mknod( | |||
169 | 173 | ||
170 | #ifdef CONFIG_XFS_POSIX_ACL | 174 | #ifdef CONFIG_XFS_POSIX_ACL |
171 | if (default_acl) { | 175 | if (default_acl) { |
172 | error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); | 176 | error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); |
173 | if (error) | 177 | if (error) |
174 | goto out_cleanup_inode; | 178 | goto out_cleanup_inode; |
175 | } | 179 | } |
176 | if (acl) { | 180 | if (acl) { |
177 | error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); | 181 | error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); |
178 | if (error) | 182 | if (error) |
179 | goto out_cleanup_inode; | 183 | goto out_cleanup_inode; |
180 | } | 184 | } |
181 | #endif | 185 | #endif |
182 | 186 | ||
183 | d_instantiate(dentry, inode); | 187 | if (tmpfile) |
188 | d_tmpfile(dentry, inode); | ||
189 | else | ||
190 | d_instantiate(dentry, inode); | ||
191 | |||
184 | out_free_acl: | 192 | out_free_acl: |
185 | if (default_acl) | 193 | if (default_acl) |
186 | posix_acl_release(default_acl); | 194 | posix_acl_release(default_acl); |
@@ -189,11 +197,23 @@ xfs_vn_mknod( | |||
189 | return -error; | 197 | return -error; |
190 | 198 | ||
191 | out_cleanup_inode: | 199 | out_cleanup_inode: |
192 | xfs_cleanup_inode(dir, inode, dentry); | 200 | if (!tmpfile) |
201 | xfs_cleanup_inode(dir, inode, dentry); | ||
202 | iput(inode); | ||
193 | goto out_free_acl; | 203 | goto out_free_acl; |
194 | } | 204 | } |
195 | 205 | ||
196 | STATIC int | 206 | STATIC int |
207 | xfs_vn_mknod( | ||
208 | struct inode *dir, | ||
209 | struct dentry *dentry, | ||
210 | umode_t mode, | ||
211 | dev_t rdev) | ||
212 | { | ||
213 | return xfs_generic_create(dir, dentry, mode, rdev, false); | ||
214 | } | ||
215 | |||
216 | STATIC int | ||
197 | xfs_vn_create( | 217 | xfs_vn_create( |
198 | struct inode *dir, | 218 | struct inode *dir, |
199 | struct dentry *dentry, | 219 | struct dentry *dentry, |
@@ -353,6 +373,7 @@ xfs_vn_symlink( | |||
353 | 373 | ||
354 | out_cleanup_inode: | 374 | out_cleanup_inode: |
355 | xfs_cleanup_inode(dir, inode, dentry); | 375 | xfs_cleanup_inode(dir, inode, dentry); |
376 | iput(inode); | ||
356 | out: | 377 | out: |
357 | return -error; | 378 | return -error; |
358 | } | 379 | } |
@@ -808,22 +829,34 @@ xfs_setattr_size( | |||
808 | */ | 829 | */ |
809 | inode_dio_wait(inode); | 830 | inode_dio_wait(inode); |
810 | 831 | ||
832 | /* | ||
833 | * Do all the page cache truncate work outside the transaction context | ||
834 | * as the "lock" order is page lock->log space reservation, i.e. | ||
835 | * locking pages inside the transaction can ABBA deadlock with | ||
836 | * writeback. We have to do the VFS inode size update before we truncate | ||
837 | * the pagecache, however, to avoid racing with page faults beyond the | ||
838 | * new EOF they are not serialised against truncate operations except by | ||
839 | * page locks and size updates. | ||
840 | * | ||
841 | * Hence we are in a situation where a truncate can fail with ENOMEM | ||
842 | * from xfs_trans_reserve(), but having already truncated the in-memory | ||
843 | * version of the file (i.e. made user visible changes). There's not | ||
844 | * much we can do about this, except to hope that the caller sees ENOMEM | ||
845 | * and retries the truncate operation. | ||
846 | */ | ||
811 | error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); | 847 | error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); |
812 | if (error) | 848 | if (error) |
813 | return error; | 849 | return error; |
850 | truncate_setsize(inode, newsize); | ||
814 | 851 | ||
815 | tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); | 852 | tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); |
816 | error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); | 853 | error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); |
817 | if (error) | 854 | if (error) |
818 | goto out_trans_cancel; | 855 | goto out_trans_cancel; |
819 | 856 | ||
820 | truncate_setsize(inode, newsize); | ||
821 | |||
822 | commit_flags = XFS_TRANS_RELEASE_LOG_RES; | 857 | commit_flags = XFS_TRANS_RELEASE_LOG_RES; |
823 | lock_flags |= XFS_ILOCK_EXCL; | 858 | lock_flags |= XFS_ILOCK_EXCL; |
824 | |||
825 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 859 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
826 | |||
827 | xfs_trans_ijoin(tp, ip, 0); | 860 | xfs_trans_ijoin(tp, ip, 0); |
828 | 861 | ||
829 | /* | 862 | /* |
@@ -1053,11 +1086,7 @@ xfs_vn_tmpfile( | |||
1053 | struct dentry *dentry, | 1086 | struct dentry *dentry, |
1054 | umode_t mode) | 1087 | umode_t mode) |
1055 | { | 1088 | { |
1056 | int error; | 1089 | return xfs_generic_create(dir, dentry, mode, 0, true); |
1057 | |||
1058 | error = xfs_create_tmpfile(XFS_I(dir), dentry, mode); | ||
1059 | |||
1060 | return -error; | ||
1061 | } | 1090 | } |
1062 | 1091 | ||
1063 | static const struct inode_operations xfs_inode_operations = { | 1092 | static const struct inode_operations xfs_inode_operations = { |
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f46338285152..cb64f222d607 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c | |||
@@ -270,7 +270,8 @@ xfs_bulkstat( | |||
270 | /* | 270 | /* |
271 | * Allocate and initialize a btree cursor for ialloc btree. | 271 | * Allocate and initialize a btree cursor for ialloc btree. |
272 | */ | 272 | */ |
273 | cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); | 273 | cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, |
274 | XFS_BTNUM_INO); | ||
274 | irbp = irbuf; | 275 | irbp = irbuf; |
275 | irbufend = irbuf + nirbuf; | 276 | irbufend = irbuf + nirbuf; |
276 | end_of_ag = 0; | 277 | end_of_ag = 0; |
@@ -621,7 +622,8 @@ xfs_inumbers( | |||
621 | agino = 0; | 622 | agino = 0; |
622 | continue; | 623 | continue; |
623 | } | 624 | } |
624 | cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); | 625 | cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, |
626 | XFS_BTNUM_INO); | ||
625 | error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, | 627 | error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, |
626 | &tmp); | 628 | &tmp); |
627 | if (error) { | 629 | if (error) { |
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 8497a00e399d..3554098692d8 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c | |||
@@ -616,11 +616,13 @@ xfs_log_mount( | |||
616 | int error = 0; | 616 | int error = 0; |
617 | int min_logfsbs; | 617 | int min_logfsbs; |
618 | 618 | ||
619 | if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) | 619 | if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { |
620 | xfs_notice(mp, "Mounting Filesystem"); | 620 | xfs_notice(mp, "Mounting V%d Filesystem", |
621 | else { | 621 | XFS_SB_VERSION_NUM(&mp->m_sb)); |
622 | } else { | ||
622 | xfs_notice(mp, | 623 | xfs_notice(mp, |
623 | "Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); | 624 | "Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", |
625 | XFS_SB_VERSION_NUM(&mp->m_sb)); | ||
624 | ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); | 626 | ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); |
625 | } | 627 | } |
626 | 628 | ||
@@ -1181,11 +1183,14 @@ xlog_iodone(xfs_buf_t *bp) | |||
1181 | /* log I/O is always issued ASYNC */ | 1183 | /* log I/O is always issued ASYNC */ |
1182 | ASSERT(XFS_BUF_ISASYNC(bp)); | 1184 | ASSERT(XFS_BUF_ISASYNC(bp)); |
1183 | xlog_state_done_syncing(iclog, aborted); | 1185 | xlog_state_done_syncing(iclog, aborted); |
1186 | |||
1184 | /* | 1187 | /* |
1185 | * do not reference the buffer (bp) here as we could race | 1188 | * drop the buffer lock now that we are done. Nothing references |
1186 | * with it being freed after writing the unmount record to the | 1189 | * the buffer after this, so an unmount waiting on this lock can now |
1187 | * log. | 1190 | * tear it down safely. As such, it is unsafe to reference the buffer |
1191 | * (bp) after the unlock as we could race with it being freed. | ||
1188 | */ | 1192 | */ |
1193 | xfs_buf_unlock(bp); | ||
1189 | } | 1194 | } |
1190 | 1195 | ||
1191 | /* | 1196 | /* |
@@ -1368,8 +1373,16 @@ xlog_alloc_log( | |||
1368 | bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); | 1373 | bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); |
1369 | if (!bp) | 1374 | if (!bp) |
1370 | goto out_free_log; | 1375 | goto out_free_log; |
1371 | bp->b_iodone = xlog_iodone; | 1376 | |
1377 | /* | ||
1378 | * The iclogbuf buffer locks are held over IO but we are not going to do | ||
1379 | * IO yet. Hence unlock the buffer so that the log IO path can grab it | ||
1380 | * when appropriately. | ||
1381 | */ | ||
1372 | ASSERT(xfs_buf_islocked(bp)); | 1382 | ASSERT(xfs_buf_islocked(bp)); |
1383 | xfs_buf_unlock(bp); | ||
1384 | |||
1385 | bp->b_iodone = xlog_iodone; | ||
1373 | log->l_xbuf = bp; | 1386 | log->l_xbuf = bp; |
1374 | 1387 | ||
1375 | spin_lock_init(&log->l_icloglock); | 1388 | spin_lock_init(&log->l_icloglock); |
@@ -1398,6 +1411,9 @@ xlog_alloc_log( | |||
1398 | if (!bp) | 1411 | if (!bp) |
1399 | goto out_free_iclog; | 1412 | goto out_free_iclog; |
1400 | 1413 | ||
1414 | ASSERT(xfs_buf_islocked(bp)); | ||
1415 | xfs_buf_unlock(bp); | ||
1416 | |||
1401 | bp->b_iodone = xlog_iodone; | 1417 | bp->b_iodone = xlog_iodone; |
1402 | iclog->ic_bp = bp; | 1418 | iclog->ic_bp = bp; |
1403 | iclog->ic_data = bp->b_addr; | 1419 | iclog->ic_data = bp->b_addr; |
@@ -1422,7 +1438,6 @@ xlog_alloc_log( | |||
1422 | iclog->ic_callback_tail = &(iclog->ic_callback); | 1438 | iclog->ic_callback_tail = &(iclog->ic_callback); |
1423 | iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; | 1439 | iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; |
1424 | 1440 | ||
1425 | ASSERT(xfs_buf_islocked(iclog->ic_bp)); | ||
1426 | init_waitqueue_head(&iclog->ic_force_wait); | 1441 | init_waitqueue_head(&iclog->ic_force_wait); |
1427 | init_waitqueue_head(&iclog->ic_write_wait); | 1442 | init_waitqueue_head(&iclog->ic_write_wait); |
1428 | 1443 | ||
@@ -1631,6 +1646,12 @@ xlog_cksum( | |||
1631 | * we transition the iclogs to IOERROR state *after* flushing all existing | 1646 | * we transition the iclogs to IOERROR state *after* flushing all existing |
1632 | * iclogs to disk. This is because we don't want any more new transactions to be | 1647 | * iclogs to disk. This is because we don't want any more new transactions to be |
1633 | * started or completed afterwards. | 1648 | * started or completed afterwards. |
1649 | * | ||
1650 | * We lock the iclogbufs here so that we can serialise against IO completion | ||
1651 | * during unmount. We might be processing a shutdown triggered during unmount, | ||
1652 | * and that can occur asynchronously to the unmount thread, so we need to | ||
1653 | * ensure it completes before tearing down the iclogbufs. Hence we need to | ||
1654 | * hold the buffer lock across the log IO to achieve that. | ||
1634 | */ | 1655 | */ |
1635 | STATIC int | 1656 | STATIC int |
1636 | xlog_bdstrat( | 1657 | xlog_bdstrat( |
@@ -1638,6 +1659,7 @@ xlog_bdstrat( | |||
1638 | { | 1659 | { |
1639 | struct xlog_in_core *iclog = bp->b_fspriv; | 1660 | struct xlog_in_core *iclog = bp->b_fspriv; |
1640 | 1661 | ||
1662 | xfs_buf_lock(bp); | ||
1641 | if (iclog->ic_state & XLOG_STATE_IOERROR) { | 1663 | if (iclog->ic_state & XLOG_STATE_IOERROR) { |
1642 | xfs_buf_ioerror(bp, EIO); | 1664 | xfs_buf_ioerror(bp, EIO); |
1643 | xfs_buf_stale(bp); | 1665 | xfs_buf_stale(bp); |
@@ -1645,7 +1667,8 @@ xlog_bdstrat( | |||
1645 | /* | 1667 | /* |
1646 | * It would seem logical to return EIO here, but we rely on | 1668 | * It would seem logical to return EIO here, but we rely on |
1647 | * the log state machine to propagate I/O errors instead of | 1669 | * the log state machine to propagate I/O errors instead of |
1648 | * doing it here. | 1670 | * doing it here. Similarly, IO completion will unlock the |
1671 | * buffer, so we don't do it here. | ||
1649 | */ | 1672 | */ |
1650 | return 0; | 1673 | return 0; |
1651 | } | 1674 | } |
@@ -1847,14 +1870,28 @@ xlog_dealloc_log( | |||
1847 | xlog_cil_destroy(log); | 1870 | xlog_cil_destroy(log); |
1848 | 1871 | ||
1849 | /* | 1872 | /* |
1850 | * always need to ensure that the extra buffer does not point to memory | 1873 | * Cycle all the iclogbuf locks to make sure all log IO completion |
1851 | * owned by another log buffer before we free it. | 1874 | * is done before we tear down these buffers. |
1875 | */ | ||
1876 | iclog = log->l_iclog; | ||
1877 | for (i = 0; i < log->l_iclog_bufs; i++) { | ||
1878 | xfs_buf_lock(iclog->ic_bp); | ||
1879 | xfs_buf_unlock(iclog->ic_bp); | ||
1880 | iclog = iclog->ic_next; | ||
1881 | } | ||
1882 | |||
1883 | /* | ||
1884 | * Always need to ensure that the extra buffer does not point to memory | ||
1885 | * owned by another log buffer before we free it. Also, cycle the lock | ||
1886 | * first to ensure we've completed IO on it. | ||
1852 | */ | 1887 | */ |
1888 | xfs_buf_lock(log->l_xbuf); | ||
1889 | xfs_buf_unlock(log->l_xbuf); | ||
1853 | xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); | 1890 | xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); |
1854 | xfs_buf_free(log->l_xbuf); | 1891 | xfs_buf_free(log->l_xbuf); |
1855 | 1892 | ||
1856 | iclog = log->l_iclog; | 1893 | iclog = log->l_iclog; |
1857 | for (i=0; i<log->l_iclog_bufs; i++) { | 1894 | for (i = 0; i < log->l_iclog_bufs; i++) { |
1858 | xfs_buf_free(iclog->ic_bp); | 1895 | xfs_buf_free(iclog->ic_bp); |
1859 | next_iclog = iclog->ic_next; | 1896 | next_iclog = iclog->ic_next; |
1860 | kmem_free(iclog); | 1897 | kmem_free(iclog); |
@@ -3915,11 +3952,14 @@ xfs_log_force_umount( | |||
3915 | retval = xlog_state_ioerror(log); | 3952 | retval = xlog_state_ioerror(log); |
3916 | spin_unlock(&log->l_icloglock); | 3953 | spin_unlock(&log->l_icloglock); |
3917 | } | 3954 | } |
3955 | |||
3918 | /* | 3956 | /* |
3919 | * Wake up everybody waiting on xfs_log_force. | 3957 | * Wake up everybody waiting on xfs_log_force. Wake the CIL push first |
3920 | * Callback all log item committed functions as if the | 3958 | * as if the log writes were completed. The abort handling in the log |
3921 | * log writes were completed. | 3959 | * item committed callback functions will do this again under lock to |
3960 | * avoid races. | ||
3922 | */ | 3961 | */ |
3962 | wake_up_all(&log->l_cilp->xc_commit_wait); | ||
3923 | xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); | 3963 | xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); |
3924 | 3964 | ||
3925 | #ifdef XFSERRORDEBUG | 3965 | #ifdef XFSERRORDEBUG |
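Condensed, the iclogbuf changes above amount to a lock-over-IO protocol. A sketch of one buffer's lifecycle, with the pieces that live in xlog_alloc_log(), xlog_bdstrat(), xlog_iodone() and xlog_dealloc_log() collapsed into one sequence for illustration:

	/* allocation: xfs_buf_alloc() returns the buffer locked, so drop it */
	bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
	ASSERT(xfs_buf_islocked(bp));
	xfs_buf_unlock(bp);
	bp->b_iodone = xlog_iodone;

	/* submission: xlog_bdstrat() retakes the lock before issuing the IO */
	xfs_buf_lock(bp);
	/* the IO is issued here; xlog_iodone() drops the lock on completion */

	/* teardown: cycling the lock waits out any completion still running */
	xfs_buf_lock(bp);
	xfs_buf_unlock(bp);
	xfs_buf_free(bp);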
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 7e5455391176..039c873e6fb2 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c | |||
@@ -385,7 +385,15 @@ xlog_cil_committed( | |||
385 | xfs_extent_busy_clear(mp, &ctx->busy_extents, | 385 | xfs_extent_busy_clear(mp, &ctx->busy_extents, |
386 | (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); | 386 | (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); |
387 | 387 | ||
388 | /* | ||
389 | * If we are aborting the commit, wake up anyone waiting on the | ||
390 | * committing list. If we don't, then during a shutdown we can leave processes | ||
391 | * waiting in xlog_cil_force_lsn() on a sequence commit that | ||
392 | * will never happen because we aborted it. | ||
393 | */ | ||
388 | spin_lock(&ctx->cil->xc_push_lock); | 394 | spin_lock(&ctx->cil->xc_push_lock); |
395 | if (abort) | ||
396 | wake_up_all(&ctx->cil->xc_commit_wait); | ||
389 | list_del(&ctx->committing); | 397 | list_del(&ctx->committing); |
390 | spin_unlock(&ctx->cil->xc_push_lock); | 398 | spin_unlock(&ctx->cil->xc_push_lock); |
391 | 399 | ||
@@ -564,8 +572,18 @@ restart: | |||
564 | spin_lock(&cil->xc_push_lock); | 572 | spin_lock(&cil->xc_push_lock); |
565 | list_for_each_entry(new_ctx, &cil->xc_committing, committing) { | 573 | list_for_each_entry(new_ctx, &cil->xc_committing, committing) { |
566 | /* | 574 | /* |
575 | * Avoid getting stuck in this loop because we were woken by the | ||
576 | * shutdown, but then went back to sleep once already in the | ||
577 | * shutdown state. | ||
578 | */ | ||
579 | if (XLOG_FORCED_SHUTDOWN(log)) { | ||
580 | spin_unlock(&cil->xc_push_lock); | ||
581 | goto out_abort_free_ticket; | ||
582 | } | ||
583 | |||
584 | /* | ||
567 | * Higher sequences will wait for this one so skip them. | 585 | * Higher sequences will wait for this one so skip them. |
568 | * Don't wait for own own sequence, either. | 586 | * Don't wait for our own sequence, either. |
569 | */ | 587 | */ |
570 | if (new_ctx->sequence >= ctx->sequence) | 588 | if (new_ctx->sequence >= ctx->sequence) |
571 | continue; | 589 | continue; |
@@ -810,6 +828,13 @@ restart: | |||
810 | */ | 828 | */ |
811 | spin_lock(&cil->xc_push_lock); | 829 | spin_lock(&cil->xc_push_lock); |
812 | list_for_each_entry(ctx, &cil->xc_committing, committing) { | 830 | list_for_each_entry(ctx, &cil->xc_committing, committing) { |
831 | /* | ||
832 | * Avoid getting stuck in this loop because we were woken by the | ||
833 | * shutdown, but then went back to sleep once already in the | ||
834 | * shutdown state. | ||
835 | */ | ||
836 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
837 | goto out_shutdown; | ||
813 | if (ctx->sequence > sequence) | 838 | if (ctx->sequence > sequence) |
814 | continue; | 839 | continue; |
815 | if (!ctx->commit_lsn) { | 840 | if (!ctx->commit_lsn) { |
@@ -833,14 +858,12 @@ restart: | |||
833 | * push sequence after the above wait loop and the CIL still contains | 858 | * push sequence after the above wait loop and the CIL still contains |
834 | * dirty objects. | 859 | * dirty objects. |
835 | * | 860 | * |
836 | * When the push occurs, it will empty the CIL and | 861 | * When the push occurs, it will empty the CIL and atomically increment |
837 | * atomically increment the currect sequence past the push sequence and | 862 | * the currect sequence past the push sequence and move it into the |
838 | * move it into the committing list. Of course, if the CIL is clean at | 863 | * committing list. Of course, if the CIL is clean at the time of the |
839 | * the time of the push, it won't have pushed the CIL at all, so in that | 864 | * push, it won't have pushed the CIL at all, so in that case we should |
840 | * case we should try the push for this sequence again from the start | 865 | * try the push for this sequence again from the start just in case. |
841 | * just in case. | ||
842 | */ | 866 | */ |
843 | |||
844 | if (sequence == cil->xc_current_sequence && | 867 | if (sequence == cil->xc_current_sequence && |
845 | !list_empty(&cil->xc_cil)) { | 868 | !list_empty(&cil->xc_cil)) { |
846 | spin_unlock(&cil->xc_push_lock); | 869 | spin_unlock(&cil->xc_push_lock); |
@@ -849,6 +872,17 @@ restart: | |||
849 | 872 | ||
850 | spin_unlock(&cil->xc_push_lock); | 873 | spin_unlock(&cil->xc_push_lock); |
851 | return commit_lsn; | 874 | return commit_lsn; |
875 | |||
876 | /* | ||
877 | * We detected a shutdown in progress. We need to trigger the log force | ||
878 | * to pass through its iclog state machine error handling, even though | ||
879 | * we are already in a shutdown state. Hence we can't return | ||
880 | * NULLCOMMITLSN here as that has special meaning to log forces (i.e. | ||
881 | * LSN is already stable), so we return a zero LSN instead. | ||
882 | */ | ||
883 | out_shutdown: | ||
884 | spin_unlock(&cil->xc_push_lock); | ||
885 | return 0; | ||
852 | } | 886 | } |
853 | 887 | ||
854 | /* | 888 | /* |
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index bce53ac81096..981af0f6504b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c | |||
@@ -2138,7 +2138,9 @@ xlog_recover_validate_buf_type( | |||
2138 | bp->b_ops = &xfs_allocbt_buf_ops; | 2138 | bp->b_ops = &xfs_allocbt_buf_ops; |
2139 | break; | 2139 | break; |
2140 | case XFS_IBT_CRC_MAGIC: | 2140 | case XFS_IBT_CRC_MAGIC: |
2141 | case XFS_FIBT_CRC_MAGIC: | ||
2141 | case XFS_IBT_MAGIC: | 2142 | case XFS_IBT_MAGIC: |
2143 | case XFS_FIBT_MAGIC: | ||
2142 | bp->b_ops = &xfs_inobt_buf_ops; | 2144 | bp->b_ops = &xfs_inobt_buf_ops; |
2143 | break; | 2145 | break; |
2144 | case XFS_BMAP_CRC_MAGIC: | 2146 | case XFS_BMAP_CRC_MAGIC: |
@@ -3145,7 +3147,7 @@ xlog_recover_efd_pass2( | |||
3145 | } | 3147 | } |
3146 | lip = xfs_trans_ail_cursor_next(ailp, &cur); | 3148 | lip = xfs_trans_ail_cursor_next(ailp, &cur); |
3147 | } | 3149 | } |
3148 | xfs_trans_ail_cursor_done(ailp, &cur); | 3150 | xfs_trans_ail_cursor_done(&cur); |
3149 | spin_unlock(&ailp->xa_lock); | 3151 | spin_unlock(&ailp->xa_lock); |
3150 | 3152 | ||
3151 | return 0; | 3153 | return 0; |
@@ -3520,8 +3522,7 @@ out: | |||
3520 | 3522 | ||
3521 | STATIC int | 3523 | STATIC int |
3522 | xlog_recover_unmount_trans( | 3524 | xlog_recover_unmount_trans( |
3523 | struct xlog *log, | 3525 | struct xlog *log) |
3524 | struct xlog_recover *trans) | ||
3525 | { | 3526 | { |
3526 | /* Do nothing now */ | 3527 | /* Do nothing now */ |
3527 | xfs_warn(log->l_mp, "%s: Unmount LR", __func__); | 3528 | xfs_warn(log->l_mp, "%s: Unmount LR", __func__); |
@@ -3595,7 +3596,7 @@ xlog_recover_process_data( | |||
3595 | trans, pass); | 3596 | trans, pass); |
3596 | break; | 3597 | break; |
3597 | case XLOG_UNMOUNT_TRANS: | 3598 | case XLOG_UNMOUNT_TRANS: |
3598 | error = xlog_recover_unmount_trans(log, trans); | 3599 | error = xlog_recover_unmount_trans(log); |
3599 | break; | 3600 | break; |
3600 | case XLOG_WAS_CONT_TRANS: | 3601 | case XLOG_WAS_CONT_TRANS: |
3601 | error = xlog_recover_add_to_cont_trans(log, | 3602 | error = xlog_recover_add_to_cont_trans(log, |
@@ -3757,7 +3758,7 @@ xlog_recover_process_efis( | |||
3757 | lip = xfs_trans_ail_cursor_next(ailp, &cur); | 3758 | lip = xfs_trans_ail_cursor_next(ailp, &cur); |
3758 | } | 3759 | } |
3759 | out: | 3760 | out: |
3760 | xfs_trans_ail_cursor_done(ailp, &cur); | 3761 | xfs_trans_ail_cursor_done(&cur); |
3761 | spin_unlock(&ailp->xa_lock); | 3762 | spin_unlock(&ailp->xa_lock); |
3762 | return error; | 3763 | return error; |
3763 | } | 3764 | } |
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 993cb19e7d39..944f3d9456a8 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c | |||
@@ -743,8 +743,6 @@ xfs_mountfs( | |||
743 | new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; | 743 | new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; |
744 | if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) | 744 | if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) |
745 | mp->m_inode_cluster_size = new_size; | 745 | mp->m_inode_cluster_size = new_size; |
746 | xfs_info(mp, "Using inode cluster size of %d bytes", | ||
747 | mp->m_inode_cluster_size); | ||
748 | } | 746 | } |
749 | 747 | ||
750 | /* | 748 | /* |
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 4aff56395732..f99b4933dc22 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c | |||
@@ -100,14 +100,20 @@ | |||
100 | * likely result in a loop in one of the lists. That's a sure-fire recipe for | 100 | * likely result in a loop in one of the lists. That's a sure-fire recipe for |
101 | * an infinite loop in the code. | 101 | * an infinite loop in the code. |
102 | */ | 102 | */ |
103 | typedef struct xfs_mru_cache_elem | 103 | struct xfs_mru_cache { |
104 | { | 104 | struct radix_tree_root store; /* Core storage data structure. */ |
105 | struct list_head list_node; | 105 | struct list_head *lists; /* Array of lists, one per grp. */ |
106 | unsigned long key; | 106 | struct list_head reap_list; /* Elements overdue for reaping. */ |
107 | void *value; | 107 | spinlock_t lock; /* Lock to protect this struct. */ |
108 | } xfs_mru_cache_elem_t; | 108 | unsigned int grp_count; /* Number of discrete groups. */ |
109 | unsigned int grp_time; /* Time period spanned by grps. */ | ||
110 | unsigned int lru_grp; /* Group containing time zero. */ | ||
111 | unsigned long time_zero; /* Time first element was added. */ | ||
112 | xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ | ||
113 | struct delayed_work work; /* Workqueue data for reaping. */ | ||
114 | unsigned int queued; /* work has been queued */ | ||
115 | }; | ||
109 | 116 | ||
110 | static kmem_zone_t *xfs_mru_elem_zone; | ||
111 | static struct workqueue_struct *xfs_mru_reap_wq; | 117 | static struct workqueue_struct *xfs_mru_reap_wq; |
112 | 118 | ||
113 | /* | 119 | /* |
@@ -129,12 +135,12 @@ static struct workqueue_struct *xfs_mru_reap_wq; | |||
129 | */ | 135 | */ |
130 | STATIC unsigned long | 136 | STATIC unsigned long |
131 | _xfs_mru_cache_migrate( | 137 | _xfs_mru_cache_migrate( |
132 | xfs_mru_cache_t *mru, | 138 | struct xfs_mru_cache *mru, |
133 | unsigned long now) | 139 | unsigned long now) |
134 | { | 140 | { |
135 | unsigned int grp; | 141 | unsigned int grp; |
136 | unsigned int migrated = 0; | 142 | unsigned int migrated = 0; |
137 | struct list_head *lru_list; | 143 | struct list_head *lru_list; |
138 | 144 | ||
139 | /* Nothing to do if the data store is empty. */ | 145 | /* Nothing to do if the data store is empty. */ |
140 | if (!mru->time_zero) | 146 | if (!mru->time_zero) |
@@ -193,11 +199,11 @@ _xfs_mru_cache_migrate( | |||
193 | */ | 199 | */ |
194 | STATIC void | 200 | STATIC void |
195 | _xfs_mru_cache_list_insert( | 201 | _xfs_mru_cache_list_insert( |
196 | xfs_mru_cache_t *mru, | 202 | struct xfs_mru_cache *mru, |
197 | xfs_mru_cache_elem_t *elem) | 203 | struct xfs_mru_cache_elem *elem) |
198 | { | 204 | { |
199 | unsigned int grp = 0; | 205 | unsigned int grp = 0; |
200 | unsigned long now = jiffies; | 206 | unsigned long now = jiffies; |
201 | 207 | ||
202 | /* | 208 | /* |
203 | * If the data store is empty, initialise time zero, leave grp set to | 209 | * If the data store is empty, initialise time zero, leave grp set to |
@@ -231,10 +237,10 @@ _xfs_mru_cache_list_insert( | |||
231 | */ | 237 | */ |
232 | STATIC void | 238 | STATIC void |
233 | _xfs_mru_cache_clear_reap_list( | 239 | _xfs_mru_cache_clear_reap_list( |
234 | xfs_mru_cache_t *mru) __releases(mru->lock) __acquires(mru->lock) | 240 | struct xfs_mru_cache *mru) |
235 | 241 | __releases(mru->lock) __acquires(mru->lock) | |
236 | { | 242 | { |
237 | xfs_mru_cache_elem_t *elem, *next; | 243 | struct xfs_mru_cache_elem *elem, *next; |
238 | struct list_head tmp; | 244 | struct list_head tmp; |
239 | 245 | ||
240 | INIT_LIST_HEAD(&tmp); | 246 | INIT_LIST_HEAD(&tmp); |
@@ -252,15 +258,8 @@ _xfs_mru_cache_clear_reap_list( | |||
252 | spin_unlock(&mru->lock); | 258 | spin_unlock(&mru->lock); |
253 | 259 | ||
254 | list_for_each_entry_safe(elem, next, &tmp, list_node) { | 260 | list_for_each_entry_safe(elem, next, &tmp, list_node) { |
255 | |||
256 | /* Remove the element from the reap list. */ | ||
257 | list_del_init(&elem->list_node); | 261 | list_del_init(&elem->list_node); |
258 | 262 | mru->free_func(elem); | |
259 | /* Call the client's free function with the key and value pointer. */ | ||
260 | mru->free_func(elem->key, elem->value); | ||
261 | |||
262 | /* Free the element structure. */ | ||
263 | kmem_zone_free(xfs_mru_elem_zone, elem); | ||
264 | } | 263 | } |
265 | 264 | ||
266 | spin_lock(&mru->lock); | 265 | spin_lock(&mru->lock); |
@@ -277,7 +276,8 @@ STATIC void | |||
277 | _xfs_mru_cache_reap( | 276 | _xfs_mru_cache_reap( |
278 | struct work_struct *work) | 277 | struct work_struct *work) |
279 | { | 278 | { |
280 | xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work); | 279 | struct xfs_mru_cache *mru = |
280 | container_of(work, struct xfs_mru_cache, work.work); | ||
281 | unsigned long now, next; | 281 | unsigned long now, next; |
282 | 282 | ||
283 | ASSERT(mru && mru->lists); | 283 | ASSERT(mru && mru->lists); |
@@ -304,28 +304,16 @@ _xfs_mru_cache_reap( | |||
304 | int | 304 | int |
305 | xfs_mru_cache_init(void) | 305 | xfs_mru_cache_init(void) |
306 | { | 306 | { |
307 | xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t), | ||
308 | "xfs_mru_cache_elem"); | ||
309 | if (!xfs_mru_elem_zone) | ||
310 | goto out; | ||
311 | |||
312 | xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1); | 307 | xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1); |
313 | if (!xfs_mru_reap_wq) | 308 | if (!xfs_mru_reap_wq) |
314 | goto out_destroy_mru_elem_zone; | 309 | return -ENOMEM; |
315 | |||
316 | return 0; | 310 | return 0; |
317 | |||
318 | out_destroy_mru_elem_zone: | ||
319 | kmem_zone_destroy(xfs_mru_elem_zone); | ||
320 | out: | ||
321 | return -ENOMEM; | ||
322 | } | 311 | } |
323 | 312 | ||
324 | void | 313 | void |
325 | xfs_mru_cache_uninit(void) | 314 | xfs_mru_cache_uninit(void) |
326 | { | 315 | { |
327 | destroy_workqueue(xfs_mru_reap_wq); | 316 | destroy_workqueue(xfs_mru_reap_wq); |
328 | kmem_zone_destroy(xfs_mru_elem_zone); | ||
329 | } | 317 | } |
330 | 318 | ||
331 | /* | 319 | /* |
@@ -336,14 +324,14 @@ xfs_mru_cache_uninit(void) | |||
336 | */ | 324 | */ |
337 | int | 325 | int |
338 | xfs_mru_cache_create( | 326 | xfs_mru_cache_create( |
339 | xfs_mru_cache_t **mrup, | 327 | struct xfs_mru_cache **mrup, |
340 | unsigned int lifetime_ms, | 328 | unsigned int lifetime_ms, |
341 | unsigned int grp_count, | 329 | unsigned int grp_count, |
342 | xfs_mru_cache_free_func_t free_func) | 330 | xfs_mru_cache_free_func_t free_func) |
343 | { | 331 | { |
344 | xfs_mru_cache_t *mru = NULL; | 332 | struct xfs_mru_cache *mru = NULL; |
345 | int err = 0, grp; | 333 | int err = 0, grp; |
346 | unsigned int grp_time; | 334 | unsigned int grp_time; |
347 | 335 | ||
348 | if (mrup) | 336 | if (mrup) |
349 | *mrup = NULL; | 337 | *mrup = NULL; |
@@ -400,7 +388,7 @@ exit: | |||
400 | */ | 388 | */ |
401 | static void | 389 | static void |
402 | xfs_mru_cache_flush( | 390 | xfs_mru_cache_flush( |
403 | xfs_mru_cache_t *mru) | 391 | struct xfs_mru_cache *mru) |
404 | { | 392 | { |
405 | if (!mru || !mru->lists) | 393 | if (!mru || !mru->lists) |
406 | return; | 394 | return; |
@@ -420,7 +408,7 @@ xfs_mru_cache_flush( | |||
420 | 408 | ||
421 | void | 409 | void |
422 | xfs_mru_cache_destroy( | 410 | xfs_mru_cache_destroy( |
423 | xfs_mru_cache_t *mru) | 411 | struct xfs_mru_cache *mru) |
424 | { | 412 | { |
425 | if (!mru || !mru->lists) | 413 | if (!mru || !mru->lists) |
426 | return; | 414 | return; |
@@ -438,38 +426,30 @@ xfs_mru_cache_destroy( | |||
438 | */ | 426 | */ |
439 | int | 427 | int |
440 | xfs_mru_cache_insert( | 428 | xfs_mru_cache_insert( |
441 | xfs_mru_cache_t *mru, | 429 | struct xfs_mru_cache *mru, |
442 | unsigned long key, | 430 | unsigned long key, |
443 | void *value) | 431 | struct xfs_mru_cache_elem *elem) |
444 | { | 432 | { |
445 | xfs_mru_cache_elem_t *elem; | 433 | int error; |
446 | 434 | ||
447 | ASSERT(mru && mru->lists); | 435 | ASSERT(mru && mru->lists); |
448 | if (!mru || !mru->lists) | 436 | if (!mru || !mru->lists) |
449 | return EINVAL; | 437 | return EINVAL; |
450 | 438 | ||
451 | elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP); | 439 | if (radix_tree_preload(GFP_KERNEL)) |
452 | if (!elem) | ||
453 | return ENOMEM; | 440 | return ENOMEM; |
454 | 441 | ||
455 | if (radix_tree_preload(GFP_KERNEL)) { | ||
456 | kmem_zone_free(xfs_mru_elem_zone, elem); | ||
457 | return ENOMEM; | ||
458 | } | ||
459 | |||
460 | INIT_LIST_HEAD(&elem->list_node); | 442 | INIT_LIST_HEAD(&elem->list_node); |
461 | elem->key = key; | 443 | elem->key = key; |
462 | elem->value = value; | ||
463 | 444 | ||
464 | spin_lock(&mru->lock); | 445 | spin_lock(&mru->lock); |
465 | 446 | error = -radix_tree_insert(&mru->store, key, elem); | |
466 | radix_tree_insert(&mru->store, key, elem); | ||
467 | radix_tree_preload_end(); | 447 | radix_tree_preload_end(); |
468 | _xfs_mru_cache_list_insert(mru, elem); | 448 | if (!error) |
469 | 449 | _xfs_mru_cache_list_insert(mru, elem); | |
470 | spin_unlock(&mru->lock); | 450 | spin_unlock(&mru->lock); |
471 | 451 | ||
472 | return 0; | 452 | return error; |
473 | } | 453 | } |
474 | 454 | ||
475 | /* | 455 | /* |
@@ -478,13 +458,12 @@ xfs_mru_cache_insert( | |||
478 | * the client data pointer for the removed element is returned, otherwise this | 458 | * the client data pointer for the removed element is returned, otherwise this |
479 | * function will return a NULL pointer. | 459 | * function will return a NULL pointer. |
480 | */ | 460 | */ |
481 | void * | 461 | struct xfs_mru_cache_elem * |
482 | xfs_mru_cache_remove( | 462 | xfs_mru_cache_remove( |
483 | xfs_mru_cache_t *mru, | 463 | struct xfs_mru_cache *mru, |
484 | unsigned long key) | 464 | unsigned long key) |
485 | { | 465 | { |
486 | xfs_mru_cache_elem_t *elem; | 466 | struct xfs_mru_cache_elem *elem; |
487 | void *value = NULL; | ||
488 | 467 | ||
489 | ASSERT(mru && mru->lists); | 468 | ASSERT(mru && mru->lists); |
490 | if (!mru || !mru->lists) | 469 | if (!mru || !mru->lists) |
@@ -492,17 +471,11 @@ xfs_mru_cache_remove( | |||
492 | 471 | ||
493 | spin_lock(&mru->lock); | 472 | spin_lock(&mru->lock); |
494 | elem = radix_tree_delete(&mru->store, key); | 473 | elem = radix_tree_delete(&mru->store, key); |
495 | if (elem) { | 474 | if (elem) |
496 | value = elem->value; | ||
497 | list_del(&elem->list_node); | 475 | list_del(&elem->list_node); |
498 | } | ||
499 | |||
500 | spin_unlock(&mru->lock); | 476 | spin_unlock(&mru->lock); |
501 | 477 | ||
502 | if (elem) | 478 | return elem; |
503 | kmem_zone_free(xfs_mru_elem_zone, elem); | ||
504 | |||
505 | return value; | ||
506 | } | 479 | } |
507 | 480 | ||
508 | /* | 481 | /* |
@@ -511,13 +484,14 @@ xfs_mru_cache_remove( | |||
511 | */ | 484 | */ |
512 | void | 485 | void |
513 | xfs_mru_cache_delete( | 486 | xfs_mru_cache_delete( |
514 | xfs_mru_cache_t *mru, | 487 | struct xfs_mru_cache *mru, |
515 | unsigned long key) | 488 | unsigned long key) |
516 | { | 489 | { |
517 | void *value = xfs_mru_cache_remove(mru, key); | 490 | struct xfs_mru_cache_elem *elem; |
518 | 491 | ||
519 | if (value) | 492 | elem = xfs_mru_cache_remove(mru, key); |
520 | mru->free_func(key, value); | 493 | if (elem) |
494 | mru->free_func(elem); | ||
521 | } | 495 | } |
522 | 496 | ||
523 | /* | 497 | /* |
@@ -540,12 +514,12 @@ xfs_mru_cache_delete( | |||
540 | * status, we need to help it get it right by annotating the path that does | 514 | * status, we need to help it get it right by annotating the path that does |
541 | * not release the lock. | 515 | * not release the lock. |
542 | */ | 516 | */ |
543 | void * | 517 | struct xfs_mru_cache_elem * |
544 | xfs_mru_cache_lookup( | 518 | xfs_mru_cache_lookup( |
545 | xfs_mru_cache_t *mru, | 519 | struct xfs_mru_cache *mru, |
546 | unsigned long key) | 520 | unsigned long key) |
547 | { | 521 | { |
548 | xfs_mru_cache_elem_t *elem; | 522 | struct xfs_mru_cache_elem *elem; |
549 | 523 | ||
550 | ASSERT(mru && mru->lists); | 524 | ASSERT(mru && mru->lists); |
551 | if (!mru || !mru->lists) | 525 | if (!mru || !mru->lists) |
@@ -560,7 +534,7 @@ xfs_mru_cache_lookup( | |||
560 | } else | 534 | } else |
561 | spin_unlock(&mru->lock); | 535 | spin_unlock(&mru->lock); |
562 | 536 | ||
563 | return elem ? elem->value : NULL; | 537 | return elem; |
564 | } | 538 | } |
565 | 539 | ||
566 | /* | 540 | /* |
@@ -570,7 +544,8 @@ xfs_mru_cache_lookup( | |||
570 | */ | 544 | */ |
571 | void | 545 | void |
572 | xfs_mru_cache_done( | 546 | xfs_mru_cache_done( |
573 | xfs_mru_cache_t *mru) __releases(mru->lock) | 547 | struct xfs_mru_cache *mru) |
548 | __releases(mru->lock) | ||
574 | { | 549 | { |
575 | spin_unlock(&mru->lock); | 550 | spin_unlock(&mru->lock); |
576 | } | 551 | } |
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h index 36dd3ec8b4eb..fb5245ba5ff7 100644 --- a/fs/xfs/xfs_mru_cache.h +++ b/fs/xfs/xfs_mru_cache.h | |||
@@ -18,24 +18,15 @@ | |||
18 | #ifndef __XFS_MRU_CACHE_H__ | 18 | #ifndef __XFS_MRU_CACHE_H__ |
19 | #define __XFS_MRU_CACHE_H__ | 19 | #define __XFS_MRU_CACHE_H__ |
20 | 20 | ||
21 | struct xfs_mru_cache; | ||
21 | 22 | ||
22 | /* Function pointer type for callback to free a client's data pointer. */ | 23 | struct xfs_mru_cache_elem { |
23 | typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*); | 24 | struct list_head list_node; |
25 | unsigned long key; | ||
26 | }; | ||
24 | 27 | ||
25 | typedef struct xfs_mru_cache | 28 | /* Function pointer type for callback to free a client's data pointer. */ |
26 | { | 29 | typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem); |
27 | struct radix_tree_root store; /* Core storage data structure. */ | ||
28 | struct list_head *lists; /* Array of lists, one per grp. */ | ||
29 | struct list_head reap_list; /* Elements overdue for reaping. */ | ||
30 | spinlock_t lock; /* Lock to protect this struct. */ | ||
31 | unsigned int grp_count; /* Number of discrete groups. */ | ||
32 | unsigned int grp_time; /* Time period spanned by grps. */ | ||
33 | unsigned int lru_grp; /* Group containing time zero. */ | ||
34 | unsigned long time_zero; /* Time first element was added. */ | ||
35 | xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ | ||
36 | struct delayed_work work; /* Workqueue data for reaping. */ | ||
37 | unsigned int queued; /* work has been queued */ | ||
38 | } xfs_mru_cache_t; | ||
39 | 30 | ||
40 | int xfs_mru_cache_init(void); | 31 | int xfs_mru_cache_init(void); |
41 | void xfs_mru_cache_uninit(void); | 32 | void xfs_mru_cache_uninit(void); |
@@ -44,10 +35,12 @@ int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, | |||
44 | xfs_mru_cache_free_func_t free_func); | 35 | xfs_mru_cache_free_func_t free_func); |
45 | void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); | 36 | void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); |
46 | int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, | 37 | int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, |
47 | void *value); | 38 | struct xfs_mru_cache_elem *elem); |
48 | void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); | 39 | struct xfs_mru_cache_elem * |
40 | xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); | ||
49 | void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); | 41 | void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); |
50 | void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); | 42 | struct xfs_mru_cache_elem * |
43 | xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); | ||
51 | void xfs_mru_cache_done(struct xfs_mru_cache *mru); | 44 | void xfs_mru_cache_done(struct xfs_mru_cache *mru); |
52 | 45 | ||
53 | #endif /* __XFS_MRU_CACHE_H__ */ | 46 | #endif /* __XFS_MRU_CACHE_H__ */ |
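With the element embedded in the caller's own structure, the MRU cache no longer allocates per-element storage or carries a separate value pointer: insert takes a struct xfs_mru_cache_elem, lookup and remove hand the element back, and the client recovers its object with container_of(). A minimal sketch of a hypothetical client follows; demo_item and demo_item_free are invented for illustration (the real in-tree user is the filestream tracking code), and this is not part of the patch itself.

	/*
	 * Hypothetical client of the reworked MRU cache API.  All "demo"
	 * names are assumptions made for this sketch.
	 */
	struct demo_item {
		struct xfs_mru_cache_elem	mru;	/* embedded, no separate allocation */
		int				payload;
	};

	static void
	demo_item_free(
		struct xfs_mru_cache_elem	*elem)
	{
		struct demo_item	*item =
			container_of(elem, struct demo_item, mru);

		kmem_free(item);
	}

	static int
	demo_cache_use(
		struct xfs_mru_cache	*mru,
		unsigned long		key)
	{
		struct xfs_mru_cache_elem	*elem;
		struct demo_item		*item;
		int				error;

		item = kmem_zalloc(sizeof(*item), KM_SLEEP);
		item->payload = 42;

		/* on success the cache owns the embedded element until reap/delete */
		error = xfs_mru_cache_insert(mru, key, &item->mru);
		if (error) {
			kmem_free(item);
			return error;
		}

		/* lookup returns the element with mru->lock held ... */
		elem = xfs_mru_cache_lookup(mru, key);
		if (elem) {
			item = container_of(elem, struct demo_item, mru);
			/* ... use item->payload ... */
			xfs_mru_cache_done(mru);	/* ... and done drops the lock */
		}
		return 0;
	}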
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 348e4d2ed6e6..6d26759c779a 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c | |||
@@ -193,47 +193,6 @@ xfs_qm_dqpurge( | |||
193 | } | 193 | } |
194 | 194 | ||
195 | /* | 195 | /* |
196 | * Release the group or project dquot pointers the user dquots maybe carrying | ||
197 | * around as a hint, and proceed to purge the user dquot cache if requested. | ||
198 | */ | ||
199 | STATIC int | ||
200 | xfs_qm_dqpurge_hints( | ||
201 | struct xfs_dquot *dqp, | ||
202 | void *data) | ||
203 | { | ||
204 | struct xfs_dquot *gdqp = NULL; | ||
205 | struct xfs_dquot *pdqp = NULL; | ||
206 | uint flags = *((uint *)data); | ||
207 | |||
208 | xfs_dqlock(dqp); | ||
209 | if (dqp->dq_flags & XFS_DQ_FREEING) { | ||
210 | xfs_dqunlock(dqp); | ||
211 | return EAGAIN; | ||
212 | } | ||
213 | |||
214 | /* If this quota has a hint attached, prepare for releasing it now */ | ||
215 | gdqp = dqp->q_gdquot; | ||
216 | if (gdqp) | ||
217 | dqp->q_gdquot = NULL; | ||
218 | |||
219 | pdqp = dqp->q_pdquot; | ||
220 | if (pdqp) | ||
221 | dqp->q_pdquot = NULL; | ||
222 | |||
223 | xfs_dqunlock(dqp); | ||
224 | |||
225 | if (gdqp) | ||
226 | xfs_qm_dqrele(gdqp); | ||
227 | if (pdqp) | ||
228 | xfs_qm_dqrele(pdqp); | ||
229 | |||
230 | if (flags & XFS_QMOPT_UQUOTA) | ||
231 | return xfs_qm_dqpurge(dqp, NULL); | ||
232 | |||
233 | return 0; | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * Purge the dquot cache. | 196 | * Purge the dquot cache. |
238 | */ | 197 | */ |
239 | void | 198 | void |
@@ -241,18 +200,8 @@ xfs_qm_dqpurge_all( | |||
241 | struct xfs_mount *mp, | 200 | struct xfs_mount *mp, |
242 | uint flags) | 201 | uint flags) |
243 | { | 202 | { |
244 | /* | 203 | if (flags & XFS_QMOPT_UQUOTA) |
245 | * We have to release group/project dquot hint(s) from the user dquot | 204 | xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); |
246 | * at first if they are there, otherwise we would run into an infinite | ||
247 | * loop while walking through radix tree to purge other type of dquots | ||
248 | * since their refcount is not zero if the user dquot refers to them | ||
249 | * as hint. | ||
250 | * | ||
251 | * Call the special xfs_qm_dqpurge_hints() will end up go through the | ||
252 | * general xfs_qm_dqpurge() against user dquot cache if requested. | ||
253 | */ | ||
254 | xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge_hints, &flags); | ||
255 | |||
256 | if (flags & XFS_QMOPT_GQUOTA) | 205 | if (flags & XFS_QMOPT_GQUOTA) |
257 | xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); | 206 | xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); |
258 | if (flags & XFS_QMOPT_PQUOTA) | 207 | if (flags & XFS_QMOPT_PQUOTA) |
@@ -409,7 +358,6 @@ xfs_qm_dqattach_one( | |||
409 | xfs_dqid_t id, | 358 | xfs_dqid_t id, |
410 | uint type, | 359 | uint type, |
411 | uint doalloc, | 360 | uint doalloc, |
412 | xfs_dquot_t *udqhint, /* hint */ | ||
413 | xfs_dquot_t **IO_idqpp) | 361 | xfs_dquot_t **IO_idqpp) |
414 | { | 362 | { |
415 | xfs_dquot_t *dqp; | 363 | xfs_dquot_t *dqp; |
@@ -419,9 +367,9 @@ xfs_qm_dqattach_one( | |||
419 | error = 0; | 367 | error = 0; |
420 | 368 | ||
421 | /* | 369 | /* |
422 | * See if we already have it in the inode itself. IO_idqpp is | 370 | * See if we already have it in the inode itself. IO_idqpp is &i_udquot |
423 | * &i_udquot or &i_gdquot. This made the code look weird, but | 371 | * or &i_gdquot. This made the code look weird, but made the logic a lot |
424 | * made the logic a lot simpler. | 372 | * simpler. |
425 | */ | 373 | */ |
426 | dqp = *IO_idqpp; | 374 | dqp = *IO_idqpp; |
427 | if (dqp) { | 375 | if (dqp) { |
@@ -430,49 +378,10 @@ xfs_qm_dqattach_one( | |||
430 | } | 378 | } |
431 | 379 | ||
432 | /* | 380 | /* |
433 | * udqhint is the i_udquot field in inode, and is non-NULL only | 381 | * Find the dquot from somewhere. This bumps the reference count of |
434 | * when the type arg is group/project. Its purpose is to save a | 382 | * dquot and returns it locked. This can return ENOENT if dquot didn't |
435 | * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside | 383 | * exist on disk and we didn't ask it to allocate; ESRCH if quotas got |
436 | * the user dquot. | 384 | * turned off suddenly. |
437 | */ | ||
438 | if (udqhint) { | ||
439 | ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); | ||
440 | xfs_dqlock(udqhint); | ||
441 | |||
442 | /* | ||
443 | * No need to take dqlock to look at the id. | ||
444 | * | ||
445 | * The ID can't change until it gets reclaimed, and it won't | ||
446 | * be reclaimed as long as we have a ref from inode and we | ||
447 | * hold the ilock. | ||
448 | */ | ||
449 | if (type == XFS_DQ_GROUP) | ||
450 | dqp = udqhint->q_gdquot; | ||
451 | else | ||
452 | dqp = udqhint->q_pdquot; | ||
453 | if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { | ||
454 | ASSERT(*IO_idqpp == NULL); | ||
455 | |||
456 | *IO_idqpp = xfs_qm_dqhold(dqp); | ||
457 | xfs_dqunlock(udqhint); | ||
458 | return 0; | ||
459 | } | ||
460 | |||
461 | /* | ||
462 | * We can't hold a dquot lock when we call the dqget code. | ||
463 | * We'll deadlock in no time, because of (not conforming to) | ||
464 | * lock ordering - the inodelock comes before any dquot lock, | ||
465 | * and we may drop and reacquire the ilock in xfs_qm_dqget(). | ||
466 | */ | ||
467 | xfs_dqunlock(udqhint); | ||
468 | } | ||
469 | |||
470 | /* | ||
471 | * Find the dquot from somewhere. This bumps the | ||
472 | * reference count of dquot and returns it locked. | ||
473 | * This can return ENOENT if dquot didn't exist on | ||
474 | * disk and we didn't ask it to allocate; | ||
475 | * ESRCH if quotas got turned off suddenly. | ||
476 | */ | 385 | */ |
477 | error = xfs_qm_dqget(ip->i_mount, ip, id, type, | 386 | error = xfs_qm_dqget(ip->i_mount, ip, id, type, |
478 | doalloc | XFS_QMOPT_DOWARN, &dqp); | 387 | doalloc | XFS_QMOPT_DOWARN, &dqp); |
@@ -490,48 +399,6 @@ xfs_qm_dqattach_one( | |||
490 | return 0; | 399 | return 0; |
491 | } | 400 | } |
492 | 401 | ||
493 | |||
494 | /* | ||
495 | * Given a udquot and group/project type, attach the group/project | ||
496 | * dquot pointer to the udquot as a hint for future lookups. | ||
497 | */ | ||
498 | STATIC void | ||
499 | xfs_qm_dqattach_hint( | ||
500 | struct xfs_inode *ip, | ||
501 | int type) | ||
502 | { | ||
503 | struct xfs_dquot **dqhintp; | ||
504 | struct xfs_dquot *dqp; | ||
505 | struct xfs_dquot *udq = ip->i_udquot; | ||
506 | |||
507 | ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); | ||
508 | |||
509 | xfs_dqlock(udq); | ||
510 | |||
511 | if (type == XFS_DQ_GROUP) { | ||
512 | dqp = ip->i_gdquot; | ||
513 | dqhintp = &udq->q_gdquot; | ||
514 | } else { | ||
515 | dqp = ip->i_pdquot; | ||
516 | dqhintp = &udq->q_pdquot; | ||
517 | } | ||
518 | |||
519 | if (*dqhintp) { | ||
520 | struct xfs_dquot *tmp; | ||
521 | |||
522 | if (*dqhintp == dqp) | ||
523 | goto done; | ||
524 | |||
525 | tmp = *dqhintp; | ||
526 | *dqhintp = NULL; | ||
527 | xfs_qm_dqrele(tmp); | ||
528 | } | ||
529 | |||
530 | *dqhintp = xfs_qm_dqhold(dqp); | ||
531 | done: | ||
532 | xfs_dqunlock(udq); | ||
533 | } | ||
534 | |||
535 | static bool | 402 | static bool |
536 | xfs_qm_need_dqattach( | 403 | xfs_qm_need_dqattach( |
537 | struct xfs_inode *ip) | 404 | struct xfs_inode *ip) |
@@ -562,7 +429,6 @@ xfs_qm_dqattach_locked( | |||
562 | uint flags) | 429 | uint flags) |
563 | { | 430 | { |
564 | xfs_mount_t *mp = ip->i_mount; | 431 | xfs_mount_t *mp = ip->i_mount; |
565 | uint nquotas = 0; | ||
566 | int error = 0; | 432 | int error = 0; |
567 | 433 | ||
568 | if (!xfs_qm_need_dqattach(ip)) | 434 | if (!xfs_qm_need_dqattach(ip)) |
@@ -570,77 +436,39 @@ xfs_qm_dqattach_locked( | |||
570 | 436 | ||
571 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | 437 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); |
572 | 438 | ||
573 | if (XFS_IS_UQUOTA_ON(mp)) { | 439 | if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { |
574 | error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, | 440 | error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, |
575 | flags & XFS_QMOPT_DQALLOC, | 441 | flags & XFS_QMOPT_DQALLOC, |
576 | NULL, &ip->i_udquot); | 442 | &ip->i_udquot); |
577 | if (error) | 443 | if (error) |
578 | goto done; | 444 | goto done; |
579 | nquotas++; | 445 | ASSERT(ip->i_udquot); |
580 | } | 446 | } |
581 | 447 | ||
582 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | 448 | if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { |
583 | if (XFS_IS_GQUOTA_ON(mp)) { | ||
584 | error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, | 449 | error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, |
585 | flags & XFS_QMOPT_DQALLOC, | 450 | flags & XFS_QMOPT_DQALLOC, |
586 | ip->i_udquot, &ip->i_gdquot); | 451 | &ip->i_gdquot); |
587 | /* | ||
588 | * Don't worry about the udquot that we may have | ||
589 | * attached above. It'll get detached, if not already. | ||
590 | */ | ||
591 | if (error) | 452 | if (error) |
592 | goto done; | 453 | goto done; |
593 | nquotas++; | 454 | ASSERT(ip->i_gdquot); |
594 | } | 455 | } |
595 | 456 | ||
596 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | 457 | if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { |
597 | if (XFS_IS_PQUOTA_ON(mp)) { | ||
598 | error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, | 458 | error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, |
599 | flags & XFS_QMOPT_DQALLOC, | 459 | flags & XFS_QMOPT_DQALLOC, |
600 | ip->i_udquot, &ip->i_pdquot); | 460 | &ip->i_pdquot); |
601 | /* | ||
602 | * Don't worry about the udquot that we may have | ||
603 | * attached above. It'll get detached, if not already. | ||
604 | */ | ||
605 | if (error) | 461 | if (error) |
606 | goto done; | 462 | goto done; |
607 | nquotas++; | 463 | ASSERT(ip->i_pdquot); |
608 | } | 464 | } |
609 | 465 | ||
466 | done: | ||
610 | /* | 467 | /* |
611 | * Attach this group/project quota to the user quota as a hint. | 468 | * Don't worry about the dquots that we may have attached before any |
612 | * This WON'T, in general, result in a thrash. | 469 | * error - they'll get detached later if it has not already been done. |
613 | */ | 470 | */ |
614 | if (nquotas > 1 && ip->i_udquot) { | ||
615 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | ||
616 | ASSERT(ip->i_gdquot || !XFS_IS_GQUOTA_ON(mp)); | ||
617 | ASSERT(ip->i_pdquot || !XFS_IS_PQUOTA_ON(mp)); | ||
618 | |||
619 | /* | ||
620 | * We do not have i_udquot locked at this point, but this check | ||
621 | * is OK since we don't depend on the i_gdquot to be accurate | ||
622 | * 100% all the time. It is just a hint, and this will | ||
623 | * succeed in general. | ||
624 | */ | ||
625 | if (ip->i_udquot->q_gdquot != ip->i_gdquot) | ||
626 | xfs_qm_dqattach_hint(ip, XFS_DQ_GROUP); | ||
627 | |||
628 | if (ip->i_udquot->q_pdquot != ip->i_pdquot) | ||
629 | xfs_qm_dqattach_hint(ip, XFS_DQ_PROJ); | ||
630 | } | ||
631 | |||
632 | done: | ||
633 | #ifdef DEBUG | ||
634 | if (!error) { | ||
635 | if (XFS_IS_UQUOTA_ON(mp)) | ||
636 | ASSERT(ip->i_udquot); | ||
637 | if (XFS_IS_GQUOTA_ON(mp)) | ||
638 | ASSERT(ip->i_gdquot); | ||
639 | if (XFS_IS_PQUOTA_ON(mp)) | ||
640 | ASSERT(ip->i_pdquot); | ||
641 | } | ||
642 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | 471 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); |
643 | #endif | ||
644 | return error; | 472 | return error; |
645 | } | 473 | } |
646 | 474 | ||
@@ -843,22 +671,17 @@ xfs_qm_init_quotainfo( | |||
843 | 671 | ||
844 | qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); | 672 | qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); |
845 | 673 | ||
846 | if ((error = list_lru_init(&qinf->qi_lru))) { | 674 | error = -list_lru_init(&qinf->qi_lru); |
847 | kmem_free(qinf); | 675 | if (error) |
848 | mp->m_quotainfo = NULL; | 676 | goto out_free_qinf; |
849 | return error; | ||
850 | } | ||
851 | 677 | ||
852 | /* | 678 | /* |
853 | * See if quotainodes are setup, and if not, allocate them, | 679 | * See if quotainodes are setup, and if not, allocate them, |
854 | * and change the superblock accordingly. | 680 | * and change the superblock accordingly. |
855 | */ | 681 | */ |
856 | if ((error = xfs_qm_init_quotainos(mp))) { | 682 | error = xfs_qm_init_quotainos(mp); |
857 | list_lru_destroy(&qinf->qi_lru); | 683 | if (error) |
858 | kmem_free(qinf); | 684 | goto out_free_lru; |
859 | mp->m_quotainfo = NULL; | ||
860 | return error; | ||
861 | } | ||
862 | 685 | ||
863 | INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); | 686 | INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); |
864 | INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); | 687 | INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); |
@@ -870,8 +693,7 @@ xfs_qm_init_quotainfo( | |||
870 | 693 | ||
871 | /* Precalc some constants */ | 694 | /* Precalc some constants */ |
872 | qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); | 695 | qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); |
873 | qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(mp, | 696 | qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); |
874 | qinf->qi_dqchunklen); | ||
875 | 697 | ||
876 | mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); | 698 | mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); |
877 | 699 | ||
@@ -918,7 +740,7 @@ xfs_qm_init_quotainfo( | |||
918 | qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); | 740 | qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); |
919 | qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); | 741 | qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); |
920 | qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); | 742 | qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); |
921 | 743 | ||
922 | xfs_qm_dqdestroy(dqp); | 744 | xfs_qm_dqdestroy(dqp); |
923 | } else { | 745 | } else { |
924 | qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; | 746 | qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; |
@@ -935,6 +757,13 @@ xfs_qm_init_quotainfo( | |||
935 | qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; | 757 | qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; |
936 | register_shrinker(&qinf->qi_shrinker); | 758 | register_shrinker(&qinf->qi_shrinker); |
937 | return 0; | 759 | return 0; |
760 | |||
761 | out_free_lru: | ||
762 | list_lru_destroy(&qinf->qi_lru); | ||
763 | out_free_qinf: | ||
764 | kmem_free(qinf); | ||
765 | mp->m_quotainfo = NULL; | ||
766 | return error; | ||
938 | } | 767 | } |
939 | 768 | ||
940 | 769 | ||
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 3daf5ea1eb8d..bbc813caba4c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c | |||
@@ -278,9 +278,10 @@ xfs_qm_scall_trunc_qfiles( | |||
278 | xfs_mount_t *mp, | 278 | xfs_mount_t *mp, |
279 | uint flags) | 279 | uint flags) |
280 | { | 280 | { |
281 | int error; | 281 | int error = EINVAL; |
282 | 282 | ||
283 | if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { | 283 | if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 || |
284 | (flags & ~XFS_DQ_ALLTYPES)) { | ||
284 | xfs_debug(mp, "%s: flags=%x m_qflags=%x", | 285 | xfs_debug(mp, "%s: flags=%x m_qflags=%x", |
285 | __func__, flags, mp->m_qflags); | 286 | __func__, flags, mp->m_qflags); |
286 | return XFS_ERROR(EINVAL); | 287 | return XFS_ERROR(EINVAL); |
@@ -959,7 +960,6 @@ xfs_qm_export_flags( | |||
959 | STATIC int | 960 | STATIC int |
960 | xfs_dqrele_inode( | 961 | xfs_dqrele_inode( |
961 | struct xfs_inode *ip, | 962 | struct xfs_inode *ip, |
962 | struct xfs_perag *pag, | ||
963 | int flags, | 963 | int flags, |
964 | void *args) | 964 | void *args) |
965 | { | 965 | { |
diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h index b3b2b1065c0f..137e20937077 100644 --- a/fs/xfs/xfs_quota_defs.h +++ b/fs/xfs/xfs_quota_defs.h | |||
@@ -156,6 +156,6 @@ typedef __uint16_t xfs_qwarncnt_t; | |||
156 | 156 | ||
157 | extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, | 157 | extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, |
158 | xfs_dqid_t id, uint type, uint flags, char *str); | 158 | xfs_dqid_t id, uint type, uint flags, char *str); |
159 | extern int xfs_calc_dquots_per_chunk(struct xfs_mount *mp, unsigned int nbblks); | 159 | extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); |
160 | 160 | ||
161 | #endif /* __XFS_QUOTA_H__ */ | 161 | #endif /* __XFS_QUOTA_H__ */ |
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index af33cafe69b6..2ad1b9822e92 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c | |||
@@ -100,16 +100,36 @@ xfs_fs_set_xstate( | |||
100 | if (!XFS_IS_QUOTA_ON(mp)) | 100 | if (!XFS_IS_QUOTA_ON(mp)) |
101 | return -EINVAL; | 101 | return -EINVAL; |
102 | return -xfs_qm_scall_quotaoff(mp, flags); | 102 | return -xfs_qm_scall_quotaoff(mp, flags); |
103 | case Q_XQUOTARM: | ||
104 | if (XFS_IS_QUOTA_ON(mp)) | ||
105 | return -EINVAL; | ||
106 | return -xfs_qm_scall_trunc_qfiles(mp, flags); | ||
107 | } | 103 | } |
108 | 104 | ||
109 | return -EINVAL; | 105 | return -EINVAL; |
110 | } | 106 | } |
111 | 107 | ||
112 | STATIC int | 108 | STATIC int |
109 | xfs_fs_rm_xquota( | ||
110 | struct super_block *sb, | ||
111 | unsigned int uflags) | ||
112 | { | ||
113 | struct xfs_mount *mp = XFS_M(sb); | ||
114 | unsigned int flags = 0; | ||
115 | |||
116 | if (sb->s_flags & MS_RDONLY) | ||
117 | return -EROFS; | ||
118 | |||
119 | if (XFS_IS_QUOTA_ON(mp)) | ||
120 | return -EINVAL; | ||
121 | |||
122 | if (uflags & FS_USER_QUOTA) | ||
123 | flags |= XFS_DQ_USER; | ||
124 | if (uflags & FS_GROUP_QUOTA) | ||
125 | flags |= XFS_DQ_GROUP; | ||
126 | if (uflags & FS_PROJ_QUOTA) | ||
127 | flags |= XFS_DQ_PROJ; | ||
128 | |||
129 | return -xfs_qm_scall_trunc_qfiles(mp, flags); | ||
130 | } | ||
131 | |||
132 | STATIC int | ||
113 | xfs_fs_get_dqblk( | 133 | xfs_fs_get_dqblk( |
114 | struct super_block *sb, | 134 | struct super_block *sb, |
115 | struct kqid qid, | 135 | struct kqid qid, |
@@ -149,6 +169,7 @@ const struct quotactl_ops xfs_quotactl_operations = { | |||
149 | .get_xstatev = xfs_fs_get_xstatev, | 169 | .get_xstatev = xfs_fs_get_xstatev, |
150 | .get_xstate = xfs_fs_get_xstate, | 170 | .get_xstate = xfs_fs_get_xstate, |
151 | .set_xstate = xfs_fs_set_xstate, | 171 | .set_xstate = xfs_fs_set_xstate, |
172 | .rm_xquota = xfs_fs_rm_xquota, | ||
152 | .get_dqblk = xfs_fs_get_dqblk, | 173 | .get_dqblk = xfs_fs_get_dqblk, |
153 | .set_dqblk = xfs_fs_set_dqblk, | 174 | .set_dqblk = xfs_fs_set_dqblk, |
154 | }; | 175 | }; |
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c index 0c0e41bbe4e3..8baf61afae1d 100644 --- a/fs/xfs/xfs_sb.c +++ b/fs/xfs/xfs_sb.c | |||
@@ -201,10 +201,6 @@ xfs_mount_validate_sb( | |||
201 | * write validation, we don't need to check feature masks. | 201 | * write validation, we don't need to check feature masks. |
202 | */ | 202 | */ |
203 | if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { | 203 | if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { |
204 | xfs_alert(mp, | ||
205 | "Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" | ||
206 | "Use of these features in this kernel is at your own risk!"); | ||
207 | |||
208 | if (xfs_sb_has_compat_feature(sbp, | 204 | if (xfs_sb_has_compat_feature(sbp, |
209 | XFS_SB_FEAT_COMPAT_UNKNOWN)) { | 205 | XFS_SB_FEAT_COMPAT_UNKNOWN)) { |
210 | xfs_warn(mp, | 206 | xfs_warn(mp, |
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index f7b2fe77c5a5..950d1ea058b2 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h | |||
@@ -587,7 +587,9 @@ xfs_sb_has_compat_feature( | |||
587 | return (sbp->sb_features_compat & feature) != 0; | 587 | return (sbp->sb_features_compat & feature) != 0; |
588 | } | 588 | } |
589 | 589 | ||
590 | #define XFS_SB_FEAT_RO_COMPAT_ALL 0 | 590 | #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ |
591 | #define XFS_SB_FEAT_RO_COMPAT_ALL \ | ||
592 | (XFS_SB_FEAT_RO_COMPAT_FINOBT) | ||
591 | #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL | 593 | #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL |
592 | static inline bool | 594 | static inline bool |
593 | xfs_sb_has_ro_compat_feature( | 595 | xfs_sb_has_ro_compat_feature( |
@@ -641,6 +643,12 @@ static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) | |||
641 | (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); | 643 | (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); |
642 | } | 644 | } |
643 | 645 | ||
646 | static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) | ||
647 | { | ||
648 | return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && | ||
649 | (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); | ||
650 | } | ||
651 | |||
644 | /* | 652 | /* |
645 | * end of superblock version macros | 653 | * end of superblock version macros |
646 | */ | 654 | */ |
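XFS_SB_FEAT_RO_COMPAT_FINOBT relies on the generic ro-compat gating performed during superblock validation: a kernel that does not recognise the bit sees it through XFS_SB_FEAT_RO_COMPAT_UNKNOWN, may still mount read-only, but must refuse a read-write mount. A condensed sketch of that pattern (not the exact in-tree code) is:

	if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN) &&
	    !(mp->m_flags & XFS_MOUNT_RDONLY)) {
		xfs_warn(mp,
	"Superblock has unknown read-only compatible features enabled.");
		xfs_warn(mp,
	"Filesystem can only be safely mounted read only.");
		return XFS_ERROR(EINVAL);	/* positive errors in this era */
	}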
diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h index 4484e5151395..82404da2ca67 100644 --- a/fs/xfs/xfs_shared.h +++ b/fs/xfs/xfs_shared.h | |||
@@ -238,7 +238,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); | |||
238 | int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); | 238 | int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); |
239 | int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, | 239 | int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, |
240 | uint32_t size, struct xfs_buf *bp); | 240 | uint32_t size, struct xfs_buf *bp); |
241 | bool xfs_symlink_hdr_ok(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, | 241 | bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, |
242 | uint32_t size, struct xfs_buf *bp); | 242 | uint32_t size, struct xfs_buf *bp); |
243 | void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, | 243 | void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, |
244 | struct xfs_inode *ip, struct xfs_ifork *ifp); | 244 | struct xfs_inode *ip, struct xfs_ifork *ifp); |
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index ce372b7d5644..f2240383d4bb 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c | |||
@@ -59,6 +59,7 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v) | |||
59 | { "abtc2", XFSSTAT_END_ABTC_V2 }, | 59 | { "abtc2", XFSSTAT_END_ABTC_V2 }, |
60 | { "bmbt2", XFSSTAT_END_BMBT_V2 }, | 60 | { "bmbt2", XFSSTAT_END_BMBT_V2 }, |
61 | { "ibt2", XFSSTAT_END_IBT_V2 }, | 61 | { "ibt2", XFSSTAT_END_IBT_V2 }, |
62 | { "fibt2", XFSSTAT_END_FIBT_V2 }, | ||
62 | /* we print both series of quota information together */ | 63 | /* we print both series of quota information together */ |
63 | { "qm", XFSSTAT_END_QM }, | 64 | { "qm", XFSSTAT_END_QM }, |
64 | }; | 65 | }; |
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index c03ad38ceaeb..c8f238b8299a 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h | |||
@@ -183,7 +183,23 @@ struct xfsstats { | |||
183 | __uint32_t xs_ibt_2_alloc; | 183 | __uint32_t xs_ibt_2_alloc; |
184 | __uint32_t xs_ibt_2_free; | 184 | __uint32_t xs_ibt_2_free; |
185 | __uint32_t xs_ibt_2_moves; | 185 | __uint32_t xs_ibt_2_moves; |
186 | #define XFSSTAT_END_XQMSTAT (XFSSTAT_END_IBT_V2+6) | 186 | #define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15) |
187 | __uint32_t xs_fibt_2_lookup; | ||
188 | __uint32_t xs_fibt_2_compare; | ||
189 | __uint32_t xs_fibt_2_insrec; | ||
190 | __uint32_t xs_fibt_2_delrec; | ||
191 | __uint32_t xs_fibt_2_newroot; | ||
192 | __uint32_t xs_fibt_2_killroot; | ||
193 | __uint32_t xs_fibt_2_increment; | ||
194 | __uint32_t xs_fibt_2_decrement; | ||
195 | __uint32_t xs_fibt_2_lshift; | ||
196 | __uint32_t xs_fibt_2_rshift; | ||
197 | __uint32_t xs_fibt_2_split; | ||
198 | __uint32_t xs_fibt_2_join; | ||
199 | __uint32_t xs_fibt_2_alloc; | ||
200 | __uint32_t xs_fibt_2_free; | ||
201 | __uint32_t xs_fibt_2_moves; | ||
202 | #define XFSSTAT_END_XQMSTAT (XFSSTAT_END_FIBT_V2+6) | ||
187 | __uint32_t xs_qm_dqreclaims; | 203 | __uint32_t xs_qm_dqreclaims; |
188 | __uint32_t xs_qm_dqreclaim_misses; | 204 | __uint32_t xs_qm_dqreclaim_misses; |
189 | __uint32_t xs_qm_dquot_dups; | 205 | __uint32_t xs_qm_dquot_dups; |
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 205376776377..8f0333b3f7a0 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c | |||
@@ -765,20 +765,18 @@ xfs_open_devices( | |||
765 | * Setup xfs_mount buffer target pointers | 765 | * Setup xfs_mount buffer target pointers |
766 | */ | 766 | */ |
767 | error = ENOMEM; | 767 | error = ENOMEM; |
768 | mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname); | 768 | mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); |
769 | if (!mp->m_ddev_targp) | 769 | if (!mp->m_ddev_targp) |
770 | goto out_close_rtdev; | 770 | goto out_close_rtdev; |
771 | 771 | ||
772 | if (rtdev) { | 772 | if (rtdev) { |
773 | mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1, | 773 | mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); |
774 | mp->m_fsname); | ||
775 | if (!mp->m_rtdev_targp) | 774 | if (!mp->m_rtdev_targp) |
776 | goto out_free_ddev_targ; | 775 | goto out_free_ddev_targ; |
777 | } | 776 | } |
778 | 777 | ||
779 | if (logdev && logdev != ddev) { | 778 | if (logdev && logdev != ddev) { |
780 | mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1, | 779 | mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); |
781 | mp->m_fsname); | ||
782 | if (!mp->m_logdev_targp) | 780 | if (!mp->m_logdev_targp) |
783 | goto out_free_rtdev_targ; | 781 | goto out_free_rtdev_targ; |
784 | } else { | 782 | } else { |
@@ -811,8 +809,7 @@ xfs_setup_devices( | |||
811 | { | 809 | { |
812 | int error; | 810 | int error; |
813 | 811 | ||
814 | error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize, | 812 | error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); |
815 | mp->m_sb.sb_sectsize); | ||
816 | if (error) | 813 | if (error) |
817 | return error; | 814 | return error; |
818 | 815 | ||
@@ -822,14 +819,12 @@ xfs_setup_devices( | |||
822 | if (xfs_sb_version_hassector(&mp->m_sb)) | 819 | if (xfs_sb_version_hassector(&mp->m_sb)) |
823 | log_sector_size = mp->m_sb.sb_logsectsize; | 820 | log_sector_size = mp->m_sb.sb_logsectsize; |
824 | error = xfs_setsize_buftarg(mp->m_logdev_targp, | 821 | error = xfs_setsize_buftarg(mp->m_logdev_targp, |
825 | mp->m_sb.sb_blocksize, | ||
826 | log_sector_size); | 822 | log_sector_size); |
827 | if (error) | 823 | if (error) |
828 | return error; | 824 | return error; |
829 | } | 825 | } |
830 | if (mp->m_rtdev_targp) { | 826 | if (mp->m_rtdev_targp) { |
831 | error = xfs_setsize_buftarg(mp->m_rtdev_targp, | 827 | error = xfs_setsize_buftarg(mp->m_rtdev_targp, |
832 | mp->m_sb.sb_blocksize, | ||
833 | mp->m_sb.sb_sectsize); | 828 | mp->m_sb.sb_sectsize); |
834 | if (error) | 829 | if (error) |
835 | return error; | 830 | return error; |
@@ -1433,11 +1428,11 @@ xfs_fs_fill_super( | |||
1433 | if (error) | 1428 | if (error) |
1434 | goto out_free_fsname; | 1429 | goto out_free_fsname; |
1435 | 1430 | ||
1436 | error = xfs_init_mount_workqueues(mp); | 1431 | error = -xfs_init_mount_workqueues(mp); |
1437 | if (error) | 1432 | if (error) |
1438 | goto out_close_devices; | 1433 | goto out_close_devices; |
1439 | 1434 | ||
1440 | error = xfs_icsb_init_counters(mp); | 1435 | error = -xfs_icsb_init_counters(mp); |
1441 | if (error) | 1436 | if (error) |
1442 | goto out_destroy_workqueues; | 1437 | goto out_destroy_workqueues; |
1443 | 1438 | ||
@@ -1754,13 +1749,9 @@ init_xfs_fs(void) | |||
1754 | if (error) | 1749 | if (error) |
1755 | goto out_destroy_wq; | 1750 | goto out_destroy_wq; |
1756 | 1751 | ||
1757 | error = xfs_filestream_init(); | ||
1758 | if (error) | ||
1759 | goto out_mru_cache_uninit; | ||
1760 | |||
1761 | error = xfs_buf_init(); | 1752 | error = xfs_buf_init(); |
1762 | if (error) | 1753 | if (error) |
1763 | goto out_filestream_uninit; | 1754 | goto out_mru_cache_uninit; |
1764 | 1755 | ||
1765 | error = xfs_init_procfs(); | 1756 | error = xfs_init_procfs(); |
1766 | if (error) | 1757 | if (error) |
@@ -1787,8 +1778,6 @@ init_xfs_fs(void) | |||
1787 | xfs_cleanup_procfs(); | 1778 | xfs_cleanup_procfs(); |
1788 | out_buf_terminate: | 1779 | out_buf_terminate: |
1789 | xfs_buf_terminate(); | 1780 | xfs_buf_terminate(); |
1790 | out_filestream_uninit: | ||
1791 | xfs_filestream_uninit(); | ||
1792 | out_mru_cache_uninit: | 1781 | out_mru_cache_uninit: |
1793 | xfs_mru_cache_uninit(); | 1782 | xfs_mru_cache_uninit(); |
1794 | out_destroy_wq: | 1783 | out_destroy_wq: |
@@ -1807,7 +1796,6 @@ exit_xfs_fs(void) | |||
1807 | xfs_sysctl_unregister(); | 1796 | xfs_sysctl_unregister(); |
1808 | xfs_cleanup_procfs(); | 1797 | xfs_cleanup_procfs(); |
1809 | xfs_buf_terminate(); | 1798 | xfs_buf_terminate(); |
1810 | xfs_filestream_uninit(); | ||
1811 | xfs_mru_cache_uninit(); | 1799 | xfs_mru_cache_uninit(); |
1812 | xfs_destroy_workqueues(); | 1800 | xfs_destroy_workqueues(); |
1813 | xfs_destroy_zones(); | 1801 | xfs_destroy_zones(); |
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 52979aa90986..0816b4018dfc 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c | |||
@@ -92,7 +92,7 @@ xfs_readlink_bmap( | |||
92 | 92 | ||
93 | cur_chunk = bp->b_addr; | 93 | cur_chunk = bp->b_addr; |
94 | if (xfs_sb_version_hascrc(&mp->m_sb)) { | 94 | if (xfs_sb_version_hascrc(&mp->m_sb)) { |
95 | if (!xfs_symlink_hdr_ok(mp, ip->i_ino, offset, | 95 | if (!xfs_symlink_hdr_ok(ip->i_ino, offset, |
96 | byte_cnt, bp)) { | 96 | byte_cnt, bp)) { |
97 | error = EFSCORRUPTED; | 97 | error = EFSCORRUPTED; |
98 | xfs_alert(mp, | 98 | xfs_alert(mp, |
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c index 9b32052ff65e..23c2f2577c8d 100644 --- a/fs/xfs/xfs_symlink_remote.c +++ b/fs/xfs/xfs_symlink_remote.c | |||
@@ -80,7 +80,6 @@ xfs_symlink_hdr_set( | |||
80 | */ | 80 | */ |
81 | bool | 81 | bool |
82 | xfs_symlink_hdr_ok( | 82 | xfs_symlink_hdr_ok( |
83 | struct xfs_mount *mp, | ||
84 | xfs_ino_t ino, | 83 | xfs_ino_t ino, |
85 | uint32_t offset, | 84 | uint32_t offset, |
86 | uint32_t size, | 85 | uint32_t size, |
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index dee3279c095e..1e85bcd0e418 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c | |||
@@ -46,6 +46,7 @@ | |||
46 | #include "xfs_log_recover.h" | 46 | #include "xfs_log_recover.h" |
47 | #include "xfs_inode_item.h" | 47 | #include "xfs_inode_item.h" |
48 | #include "xfs_bmap_btree.h" | 48 | #include "xfs_bmap_btree.h" |
49 | #include "xfs_filestream.h" | ||
49 | 50 | ||
50 | /* | 51 | /* |
51 | * We include this last to have the helpers above available for the trace | 52 | * We include this last to have the helpers above available for the trace |
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a4ae41c179a8..6910458915cf 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h | |||
@@ -538,6 +538,64 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); | |||
538 | DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); | 538 | DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); |
539 | DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered); | 539 | DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered); |
540 | 540 | ||
541 | DECLARE_EVENT_CLASS(xfs_filestream_class, | ||
542 | TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), | ||
543 | TP_ARGS(ip, agno), | ||
544 | TP_STRUCT__entry( | ||
545 | __field(dev_t, dev) | ||
546 | __field(xfs_ino_t, ino) | ||
547 | __field(xfs_agnumber_t, agno) | ||
548 | __field(int, streams) | ||
549 | ), | ||
550 | TP_fast_assign( | ||
551 | __entry->dev = VFS_I(ip)->i_sb->s_dev; | ||
552 | __entry->ino = ip->i_ino; | ||
553 | __entry->agno = agno; | ||
554 | __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); | ||
555 | ), | ||
556 | TP_printk("dev %d:%d ino 0x%llx agno %u streams %d", | ||
557 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
558 | __entry->ino, | ||
559 | __entry->agno, | ||
560 | __entry->streams) | ||
561 | ) | ||
562 | #define DEFINE_FILESTREAM_EVENT(name) \ | ||
563 | DEFINE_EVENT(xfs_filestream_class, name, \ | ||
564 | TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \ | ||
565 | TP_ARGS(ip, agno)) | ||
566 | DEFINE_FILESTREAM_EVENT(xfs_filestream_free); | ||
567 | DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); | ||
568 | DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); | ||
569 | |||
570 | TRACE_EVENT(xfs_filestream_pick, | ||
571 | TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno, | ||
572 | xfs_extlen_t free, int nscan), | ||
573 | TP_ARGS(ip, agno, free, nscan), | ||
574 | TP_STRUCT__entry( | ||
575 | __field(dev_t, dev) | ||
576 | __field(xfs_ino_t, ino) | ||
577 | __field(xfs_agnumber_t, agno) | ||
578 | __field(int, streams) | ||
579 | __field(xfs_extlen_t, free) | ||
580 | __field(int, nscan) | ||
581 | ), | ||
582 | TP_fast_assign( | ||
583 | __entry->dev = VFS_I(ip)->i_sb->s_dev; | ||
584 | __entry->ino = ip->i_ino; | ||
585 | __entry->agno = agno; | ||
586 | __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); | ||
587 | __entry->free = free; | ||
588 | __entry->nscan = nscan; | ||
589 | ), | ||
590 | TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d", | ||
591 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
592 | __entry->ino, | ||
593 | __entry->agno, | ||
594 | __entry->streams, | ||
595 | __entry->free, | ||
596 | __entry->nscan) | ||
597 | ); | ||
598 | |||
541 | DECLARE_EVENT_CLASS(xfs_lock_class, | 599 | DECLARE_EVENT_CLASS(xfs_lock_class, |
542 | TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, | 600 | TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, |
543 | unsigned long caller_ip), | 601 | unsigned long caller_ip), |
@@ -603,6 +661,7 @@ DEFINE_INODE_EVENT(xfs_readlink); | |||
603 | DEFINE_INODE_EVENT(xfs_inactive_symlink); | 661 | DEFINE_INODE_EVENT(xfs_inactive_symlink); |
604 | DEFINE_INODE_EVENT(xfs_alloc_file_space); | 662 | DEFINE_INODE_EVENT(xfs_alloc_file_space); |
605 | DEFINE_INODE_EVENT(xfs_free_file_space); | 663 | DEFINE_INODE_EVENT(xfs_free_file_space); |
664 | DEFINE_INODE_EVENT(xfs_zero_file_space); | ||
606 | DEFINE_INODE_EVENT(xfs_collapse_file_space); | 665 | DEFINE_INODE_EVENT(xfs_collapse_file_space); |
607 | DEFINE_INODE_EVENT(xfs_readdir); | 666 | DEFINE_INODE_EVENT(xfs_readdir); |
608 | #ifdef CONFIG_XFS_POSIX_ACL | 667 | #ifdef CONFIG_XFS_POSIX_ACL |
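Each DEFINE_FILESTREAM_EVENT() and the TRACE_EVENT() above generate a trace_<name>() hook. The call sites live in xfs_filestream.c rather than in this hunk; they look roughly like the sketch below, where the variable names are assumptions:

	/* illustrative call sites only */
	trace_xfs_filestream_lookup(ip, agno);
	trace_xfs_filestream_scan(ip, agno);
	trace_xfs_filestream_free(ip, agno);
	/* free = free blocks in the chosen AG, nscan = AGs examined */
	trace_xfs_filestream_pick(ip, agno, free, nscan);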
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 54a57326d85b..d03932564ccb 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c | |||
@@ -827,7 +827,7 @@ xfs_trans_committed_bulk( | |||
827 | xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); | 827 | xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); |
828 | 828 | ||
829 | spin_lock(&ailp->xa_lock); | 829 | spin_lock(&ailp->xa_lock); |
830 | xfs_trans_ail_cursor_done(ailp, &cur); | 830 | xfs_trans_ail_cursor_done(&cur); |
831 | spin_unlock(&ailp->xa_lock); | 831 | spin_unlock(&ailp->xa_lock); |
832 | } | 832 | } |
833 | 833 | ||
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index a7287354e535..cb0f3a84cc68 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c | |||
@@ -173,7 +173,6 @@ xfs_trans_ail_cursor_next( | |||
173 | */ | 173 | */ |
174 | void | 174 | void |
175 | xfs_trans_ail_cursor_done( | 175 | xfs_trans_ail_cursor_done( |
176 | struct xfs_ail *ailp, | ||
177 | struct xfs_ail_cursor *cur) | 176 | struct xfs_ail_cursor *cur) |
178 | { | 177 | { |
179 | cur->item = NULL; | 178 | cur->item = NULL; |
@@ -368,7 +367,7 @@ xfsaild_push( | |||
368 | * If the AIL is empty or our push has reached the end we are | 367 | * If the AIL is empty or our push has reached the end we are |
369 | * done now. | 368 | * done now. |
370 | */ | 369 | */ |
371 | xfs_trans_ail_cursor_done(ailp, &cur); | 370 | xfs_trans_ail_cursor_done(&cur); |
372 | spin_unlock(&ailp->xa_lock); | 371 | spin_unlock(&ailp->xa_lock); |
373 | goto out_done; | 372 | goto out_done; |
374 | } | 373 | } |
@@ -453,7 +452,7 @@ xfsaild_push( | |||
453 | break; | 452 | break; |
454 | lsn = lip->li_lsn; | 453 | lsn = lip->li_lsn; |
455 | } | 454 | } |
456 | xfs_trans_ail_cursor_done(ailp, &cur); | 455 | xfs_trans_ail_cursor_done(&cur); |
457 | spin_unlock(&ailp->xa_lock); | 456 | spin_unlock(&ailp->xa_lock); |
458 | 457 | ||
459 | if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list)) | 458 | if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list)) |
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 12e86af9d9b9..bd1281862ad7 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h | |||
@@ -133,8 +133,7 @@ struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, | |||
133 | xfs_lsn_t lsn); | 133 | xfs_lsn_t lsn); |
134 | struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, | 134 | struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, |
135 | struct xfs_ail_cursor *cur); | 135 | struct xfs_ail_cursor *cur); |
136 | void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, | 136 | void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur); |
137 | struct xfs_ail_cursor *cur); | ||
138 | 137 | ||
139 | #if BITS_PER_LONG != 64 | 138 | #if BITS_PER_LONG != 64 |
140 | static inline void | 139 | static inline void |
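With the struct xfs_ail argument dropped from xfs_trans_ail_cursor_done(), a cursor walk now reads roughly as follows; this is condensed from the callers updated above (xfsaild_push, xlog_recover_process_efis), not copied verbatim:

	struct xfs_ail_cursor	cur;
	struct xfs_log_item	*lip;

	spin_lock(&ailp->xa_lock);
	lip = xfs_trans_ail_cursor_first(ailp, &cur, lsn);
	while (lip != NULL) {
		/* ... examine or push lip ... */
		lip = xfs_trans_ail_cursor_next(ailp, &cur);
	}
	xfs_trans_ail_cursor_done(&cur);	/* cursor teardown no longer needs ailp */
	spin_unlock(&ailp->xa_lock);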
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c index ae368165244d..52b6c3e3203e 100644 --- a/fs/xfs/xfs_trans_resv.c +++ b/fs/xfs/xfs_trans_resv.c | |||
@@ -106,6 +106,47 @@ xfs_calc_inode_res( | |||
106 | } | 106 | } |
107 | 107 | ||
108 | /* | 108 | /* |
109 | * The free inode btree is a conditional feature and the log reservation | ||
110 | * requirements differ slightly from that of the traditional inode allocation | ||
111 | * btree. The finobt tracks records for inode chunks with at least one free | ||
112 | * inode. A record can be removed from the tree for an inode allocation | ||
113 | * or free and thus the finobt reservation is unconditional across: | ||
114 | * | ||
115 | * - inode allocation | ||
116 | * - inode free | ||
117 | * - inode chunk allocation | ||
118 | * | ||
119 | * The 'modify' param indicates to include the record modification scenario. The | ||
120 | * 'alloc' param indicates to include the reservation for free space btree | ||
121 | * modifications on behalf of finobt modifications. This is required only for | ||
122 | * transactions that do not already account for free space btree modifications. | ||
123 | * | ||
124 | * the free inode btree: max depth * block size | ||
125 | * the allocation btrees: 2 trees * (max depth - 1) * block size | ||
126 | * the free inode btree entry: block size | ||
127 | */ | ||
128 | STATIC uint | ||
129 | xfs_calc_finobt_res( | ||
130 | struct xfs_mount *mp, | ||
131 | int alloc, | ||
132 | int modify) | ||
133 | { | ||
134 | uint res; | ||
135 | |||
136 | if (!xfs_sb_version_hasfinobt(&mp->m_sb)) | ||
137 | return 0; | ||
138 | |||
139 | res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)); | ||
140 | if (alloc) | ||
141 | res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), | ||
142 | XFS_FSB_TO_B(mp, 1)); | ||
143 | if (modify) | ||
144 | res += (uint)XFS_FSB_TO_B(mp, 1); | ||
145 | |||
146 | return res; | ||
147 | } | ||
148 | |||
149 | /* | ||
109 | * Various log reservation values. | 150 | * Various log reservation values. |
110 | * | 151 | * |
111 | * These are based on the size of the file system block because that is what | 152 | * These are based on the size of the file system block because that is what |
@@ -302,6 +343,7 @@ xfs_calc_remove_reservation( | |||
302 | * the superblock for the nlink flag: sector size | 343 | * the superblock for the nlink flag: sector size |
303 | * the directory btree: (max depth + v2) * dir block size | 344 | * the directory btree: (max depth + v2) * dir block size |
304 | * the directory inode's bmap btree: (max depth + v2) * block size | 345 | * the directory inode's bmap btree: (max depth + v2) * block size |
346 | * the finobt (record modification and allocation btrees) | ||
305 | */ | 347 | */ |
306 | STATIC uint | 348 | STATIC uint |
307 | xfs_calc_create_resv_modify( | 349 | xfs_calc_create_resv_modify( |
@@ -310,7 +352,8 @@ xfs_calc_create_resv_modify( | |||
310 | return xfs_calc_inode_res(mp, 2) + | 352 | return xfs_calc_inode_res(mp, 2) + |
311 | xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + | 353 | xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + |
312 | (uint)XFS_FSB_TO_B(mp, 1) + | 354 | (uint)XFS_FSB_TO_B(mp, 1) + |
313 | xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); | 355 | xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) + |
356 | xfs_calc_finobt_res(mp, 1, 1); | ||
314 | } | 357 | } |
315 | 358 | ||
316 | /* | 359 | /* |
@@ -348,6 +391,7 @@ __xfs_calc_create_reservation( | |||
348 | * the superblock for the nlink flag: sector size | 391 | * the superblock for the nlink flag: sector size |
349 | * the inode btree: max depth * blocksize | 392 | * the inode btree: max depth * blocksize |
350 | * the allocation btrees: 2 trees * (max depth - 1) * block size | 393 | * the allocation btrees: 2 trees * (max depth - 1) * block size |
394 | * the finobt (record insertion) | ||
351 | */ | 395 | */ |
352 | STATIC uint | 396 | STATIC uint |
353 | xfs_calc_icreate_resv_alloc( | 397 | xfs_calc_icreate_resv_alloc( |
@@ -357,7 +401,8 @@ xfs_calc_icreate_resv_alloc( | |||
357 | mp->m_sb.sb_sectsize + | 401 | mp->m_sb.sb_sectsize + |
358 | xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + | 402 | xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + |
359 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), | 403 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), |
360 | XFS_FSB_TO_B(mp, 1)); | 404 | XFS_FSB_TO_B(mp, 1)) + |
405 | xfs_calc_finobt_res(mp, 0, 0); | ||
361 | } | 406 | } |
362 | 407 | ||
363 | STATIC uint | 408 | STATIC uint |
@@ -425,6 +470,7 @@ xfs_calc_symlink_reservation( | |||
425 | * the on disk inode before ours in the agi hash list: inode cluster size | 470 | * the on disk inode before ours in the agi hash list: inode cluster size |
426 | * the inode btree: max depth * blocksize | 471 | * the inode btree: max depth * blocksize |
427 | * the allocation btrees: 2 trees * (max depth - 1) * block size | 472 | * the allocation btrees: 2 trees * (max depth - 1) * block size |
473 | * the finobt (record insertion, removal or modification) | ||
428 | */ | 474 | */ |
429 | STATIC uint | 475 | STATIC uint |
430 | xfs_calc_ifree_reservation( | 476 | xfs_calc_ifree_reservation( |
@@ -439,7 +485,8 @@ xfs_calc_ifree_reservation( | |||
439 | xfs_calc_buf_res(2 + mp->m_ialloc_blks + | 485 | xfs_calc_buf_res(2 + mp->m_ialloc_blks + |
440 | mp->m_in_maxlevels, 0) + | 486 | mp->m_in_maxlevels, 0) + |
441 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), | 487 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), |
442 | XFS_FSB_TO_B(mp, 1)); | 488 | XFS_FSB_TO_B(mp, 1)) + |
489 | xfs_calc_finobt_res(mp, 0, 1); | ||
443 | } | 490 | } |
444 | 491 | ||
445 | /* | 492 | /* |
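To make the xfs_calc_finobt_res() additions above concrete, a worked example under an assumed geometry; the numbers are illustrative and ignore the small per-buffer log headers that xfs_calc_buf_res() also adds:

	/*
	 * Assumed geometry: 4096-byte blocks, m_in_maxlevels = 3.
	 *
	 *   base (max depth * block size):    3 * 4096 = 12288 bytes
	 *   + modify (one finobt block):          4096 -> 16384 bytes
	 *   + alloc (free space btrees via XFS_ALLOCFREE_LOG_COUNT(mp, 1)):
	 *     added only by callers that do not already reserve for the
	 *     allocation btrees, e.g. xfs_calc_create_resv_modify() above.
	 *
	 * On a filesystem without the finobt feature the function returns 0,
	 * so all of these reservations collapse back to their old values.
	 */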
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h index af5dbe06cb65..df4c1f81884c 100644 --- a/fs/xfs/xfs_trans_space.h +++ b/fs/xfs/xfs_trans_space.h | |||
@@ -47,7 +47,9 @@ | |||
47 | #define XFS_DIRREMOVE_SPACE_RES(mp) \ | 47 | #define XFS_DIRREMOVE_SPACE_RES(mp) \ |
48 | XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) | 48 | XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) |
49 | #define XFS_IALLOC_SPACE_RES(mp) \ | 49 | #define XFS_IALLOC_SPACE_RES(mp) \ |
50 | ((mp)->m_ialloc_blks + (mp)->m_in_maxlevels - 1) | 50 | ((mp)->m_ialloc_blks + \ |
51 | ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \ | ||
52 | ((mp)->m_in_maxlevels - 1))) | ||
51 | 53 | ||
52 | /* | 54 | /* |
53 | * Space reservation values for various transactions. | 55 | * Space reservation values for various transactions. |
@@ -82,5 +84,8 @@ | |||
82 | (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) | 84 | (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) |
83 | #define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ | 85 | #define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ |
84 | (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) | 86 | (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) |
87 | #define XFS_IFREE_SPACE_RES(mp) \ | ||
88 | (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0) | ||
89 | |||
85 | 90 | ||
86 | #endif /* __XFS_TRANS_SPACE_H__ */ | 91 | #endif /* __XFS_TRANS_SPACE_H__ */ |
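The intent of the new XFS_IALLOC_SPACE_RES definition, spelled out with assumed numbers (illustrative only):

	/*
	 * Assume m_ialloc_blks = 16 and m_in_maxlevels = 3.
	 *
	 *   without finobt: 16 + 1 * (3 - 1) = 18 blocks
	 *   with finobt:    16 + 2 * (3 - 1) = 20 blocks
	 *
	 * i.e. enabling the finobt doubles the inode btree portion of the
	 * space reservation, since both the inobt and the finobt may need
	 * to split during an inode chunk allocation.
	 */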
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 82bbc34d54a3..65c6e6650b1a 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h | |||
@@ -134,7 +134,7 @@ typedef enum { | |||
134 | 134 | ||
135 | typedef enum { | 135 | typedef enum { |
136 | XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, | 136 | XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, |
137 | XFS_BTNUM_MAX | 137 | XFS_BTNUM_FINOi, XFS_BTNUM_MAX |
138 | } xfs_btnum_t; | 138 | } xfs_btnum_t; |
139 | 139 | ||
140 | struct xfs_name { | 140 | struct xfs_name { |