Merge branch 'xfs-libxfs-restructure' into for-next

author: Dave Chinner <david@fromorbit.com> 2014-07-14 17:37:18 -0400
committer: Dave Chinner <david@fromorbit.com> 2014-07-14 17:37:18 -0400
commit: 7f8a058f6dc52219117bc2469b1fb816f7fa1a4b (patch)
tree: 43ce8eed4d26beb6f2acff2279c43eae7f79f83a /fs/xfs/libxfs
parent: 03e01349c654fbdea80d3d9b4ab599244eb55bb7 (diff)
parent: 2451337dd043901b5270b7586942abe564443e3d (diff)
55 files changed, 45286 insertions, 0 deletions
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
new file mode 100644
index 000000000000..6e247a99f5db
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_AG_H__
+#define __XFS_AG_H__
+/*
+ * Allocation group header
+ * This is divided into three structures, placed in sequential 512-byte
+ * buffers after a copy of the superblock (also in a 512-byte buffer).
+ */
+struct xfs_buf;
+struct xfs_mount;
+struct xfs_trans;
+#define XFS_AGF_MAGIC   0x58414746      /* 'XAGF' */
+#define XFS_AGI_MAGIC   0x58414749      /* 'XAGI' */
+#define XFS_AGFL_MAGIC  0x5841464c      /* 'XAFL' */
+#define XFS_AGF_VERSION 1
+#define XFS_AGI_VERSION 1
+#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION)
+#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
+/*
+ * Btree number 0 is bno, 1 is cnt.  This value gives the size of the
+ * arrays below.
+ */
+#define XFS_BTNUM_AGF   ((int)XFS_BTNUM_CNTi + 1)
+/*
+ * The second word of agf_levels in the first a.g. overlaps the EFS
+ * superblock's magic number.  Since the magic numbers valid for EFS
+ * are > 64k, our value cannot be confused for an EFS superblock's.
+ *
+ * All fields are declared with the big-endian __be32/__be64 types; code
+ * reading them converts with be32_to_cpu() (see XFS_MIN_FREELIST below).
+ */
+typedef struct xfs_agf {
+        /*
+         * Common allocation group header information
+         */
+        __be32          agf_magicnum;   /* magic number == XFS_AGF_MAGIC */
+        __be32          agf_versionnum; /* header version == XFS_AGF_VERSION */
+        __be32          agf_seqno;      /* sequence # starting from 0 */
+        __be32          agf_length;     /* size in blocks of a.g. */
+        /*
+         * Freespace information
+         */
+        __be32          agf_roots[XFS_BTNUM_AGF];       /* root blocks */
+        __be32          agf_spare0;     /* spare field */
+        __be32          agf_levels[XFS_BTNUM_AGF];      /* btree levels */
+        __be32          agf_spare1;     /* spare field */
+        __be32          agf_flfirst;    /* first freelist block's index */
+        __be32          agf_fllast;     /* last freelist block's index */
+        __be32          agf_flcount;    /* count of blocks in freelist */
+        __be32          agf_freeblks;   /* total free blocks */
+        __be32          agf_longest;    /* longest free space */
+        __be32          agf_btreeblks;  /* # of blocks held in AGF btrees */
+        uuid_t          agf_uuid;       /* uuid of filesystem */
+        /*
+         * reserve some contiguous space for future logged fields before we add
+         * the unlogged fields. This makes the range logging via flags and
+         * structure offsets much simpler.
+         */
+        __be64          agf_spare64[16];
+        /* unlogged fields, written during buffer writeback. */
+        __be64          agf_lsn;        /* last write sequence */
+        __be32          agf_crc;        /* crc of agf sector */
+        __be32          agf_spare2;     /* spare field (presumably the alignment padding noted below) */
+        /* structure must be padded to 64 bit alignment */
+} xfs_agf_t;
+#define XFS_AGF_CRC_OFF         offsetof(struct xfs_agf, agf_crc)
+#define XFS_AGF_MAGICNUM        0x00000001
+#define XFS_AGF_VERSIONNUM      0x00000002
+#define XFS_AGF_SEQNO           0x00000004
+#define XFS_AGF_LENGTH          0x00000008
+#define XFS_AGF_ROOTS           0x00000010
+#define XFS_AGF_LEVELS          0x00000020
+#define XFS_AGF_FLFIRST         0x00000040
+#define XFS_AGF_FLLAST          0x00000080
+#define XFS_AGF_FLCOUNT         0x00000100
+#define XFS_AGF_FREEBLKS        0x00000200
+#define XFS_AGF_LONGEST         0x00000400
+#define XFS_AGF_BTREEBLKS       0x00000800
+#define XFS_AGF_UUID            0x00001000
+#define XFS_AGF_NUM_BITS        13
+#define XFS_AGF_ALL_BITS        ((1 << XFS_AGF_NUM_BITS) - 1)
+#define XFS_AGF_FLAGS \
+        { XFS_AGF_MAGICNUM,     "MAGICNUM" }, \
+        { XFS_AGF_VERSIONNUM,   "VERSIONNUM" }, \
+        { XFS_AGF_SEQNO,        "SEQNO" }, \
+        { XFS_AGF_LENGTH,       "LENGTH" }, \
+        { XFS_AGF_ROOTS,        "ROOTS" }, \
+        { XFS_AGF_LEVELS,       "LEVELS" }, \
+        { XFS_AGF_FLFIRST,      "FLFIRST" }, \
+        { XFS_AGF_FLLAST,       "FLLAST" }, \
+        { XFS_AGF_FLCOUNT,      "FLCOUNT" }, \
+        { XFS_AGF_FREEBLKS,     "FREEBLKS" }, \
+        { XFS_AGF_LONGEST,      "LONGEST" }, \
+        { XFS_AGF_BTREEBLKS,    "BTREEBLKS" }, \
+        { XFS_AGF_UUID,         "UUID" }
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGF_DADDR(mp)       ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
+#define XFS_AGF_BLOCK(mp)       XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
+#define XFS_BUF_TO_AGF(bp)      ((xfs_agf_t *)((bp)->b_addr))
+extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
+                        xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+/*
+ * Size of the unlinked inode hash table in the agi.
+ */
+#define XFS_AGI_UNLINKED_BUCKETS        64
+typedef struct xfs_agi {
+        /*
+         * Common allocation group header information
+         * (same four leading fields as struct xfs_agf)
+         */
+        __be32          agi_magicnum;   /* magic number == XFS_AGI_MAGIC */
+        __be32          agi_versionnum; /* header version == XFS_AGI_VERSION */
+        __be32          agi_seqno;      /* sequence # starting from 0 */
+        __be32          agi_length;     /* size in blocks of a.g. */
+        /*
+         * Inode information
+         * Inodes are mapped by interpreting the inode number, so no
+         * mapping data is needed here.
+         */
+        __be32          agi_count;      /* count of allocated inodes */
+        __be32          agi_root;       /* root of inode btree */
+        __be32          agi_level;      /* levels in inode btree */
+        __be32          agi_freecount;  /* number of free inodes */
+        __be32          agi_newino;     /* new inode just allocated */
+        __be32          agi_dirino;     /* last directory inode chunk */
+        /*
+         * Hash table of inodes which have been unlinked but are
+         * still being referenced.
+         */
+        __be32          agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+        /*
+         * This marks the end of logging region 1 and start of logging region 2.
+         * The flag bits for region 1 are XFS_AGI_MAGICNUM..XFS_AGI_UNLINKED
+         * (XFS_AGI_NUM_BITS_R1 of them); XFS_AGI_FREE_ROOT and
+         * XFS_AGI_FREE_LEVEL follow them in the flag numbering.
+         */
+        uuid_t          agi_uuid;       /* uuid of filesystem */
+        __be32          agi_crc;        /* crc of agi sector */
+        __be32          agi_pad32;      /* presumably keeps agi_lsn 64 bit aligned - confirm */
+        __be64          agi_lsn;        /* last write sequence */
+        __be32          agi_free_root; /* root of the free inode btree */
+        __be32          agi_free_level;/* levels in free inode btree */
+        /* structure must be padded to 64 bit alignment */
+} xfs_agi_t;
+#define XFS_AGI_CRC_OFF         offsetof(struct xfs_agi, agi_crc)
+#define XFS_AGI_MAGICNUM        (1 << 0)
+#define XFS_AGI_VERSIONNUM      (1 << 1)
+#define XFS_AGI_SEQNO           (1 << 2)
+#define XFS_AGI_LENGTH          (1 << 3)
+#define XFS_AGI_COUNT           (1 << 4)
+#define XFS_AGI_ROOT            (1 << 5)
+#define XFS_AGI_LEVEL           (1 << 6)
+#define XFS_AGI_FREECOUNT       (1 << 7)
+#define XFS_AGI_NEWINO          (1 << 8)
+#define XFS_AGI_DIRINO          (1 << 9)
+#define XFS_AGI_UNLINKED        (1 << 10)
+#define XFS_AGI_NUM_BITS_R1     11      /* end of the 1st agi logging region */
+#define XFS_AGI_ALL_BITS_R1     ((1 << XFS_AGI_NUM_BITS_R1) - 1)
+#define XFS_AGI_FREE_ROOT       (1 << 11)
+#define XFS_AGI_FREE_LEVEL      (1 << 12)
+#define XFS_AGI_NUM_BITS_R2     13
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGI_DADDR(mp)       ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
+#define XFS_AGI_BLOCK(mp)       XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
+#define XFS_BUF_TO_AGI(bp)      ((xfs_agi_t *)((bp)->b_addr))
+extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
+                                xfs_agnumber_t agno, struct xfs_buf **bpp);
+/*
+ * The third a.g. block contains the a.g. freelist, an array
+ * of block pointers to blocks owned by the allocation btree code.
+ */
+#define XFS_AGFL_DADDR(mp)      ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
+#define XFS_AGFL_BLOCK(mp)      XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
+#define XFS_BUF_TO_AGFL(bp)     ((xfs_agfl_t *)((bp)->b_addr))
+#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
+        (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+                &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
+                (__be32 *)(bp)->b_addr)
+/*
+ * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of
+ * slots in the beginning of the block for a proper header with the
+ * location information and CRC.
+ */
+#define XFS_AGFL_SIZE(mp) \
+        (((mp)->m_sb.sb_sectsize - \
+         (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+                sizeof(struct xfs_agfl) : 0)) / \
+          sizeof(xfs_agblock_t))
+typedef struct xfs_agfl {
+        __be32          agfl_magicnum;  /* magic number, XFS_AGFL_MAGIC */
+        __be32          agfl_seqno;     /* compared with the perag's pag_agno by the verifier */
+        uuid_t          agfl_uuid;      /* compared with the superblock uuid by the verifier */
+        __be64          agfl_lsn;       /* stamped from the log item lsn at write time */
+        __be32          agfl_crc;       /* checksum field, located via XFS_AGFL_CRC_OFF */
+        __be32          agfl_bno[];     /* actually XFS_AGFL_SIZE(mp) */
+} xfs_agfl_t;
+#define XFS_AGFL_CRC_OFF        offsetof(struct xfs_agfl, agfl_crc)
+/*
+ * tags for inode radix tree
+ */
+#define XFS_ICI_NO_TAG          (-1)    /* special flag for an untagged lookup
+                                           in xfs_inode_ag_iterator */
+#define XFS_ICI_RECLAIM_TAG     0       /* inode is to be reclaimed */
+#define XFS_ICI_EOFBLOCKS_TAG   1       /* inode has blocks beyond EOF */
+#define XFS_AG_MAXLEVELS(mp)            ((mp)->m_ag_maxlevels)
+#define XFS_MIN_FREELIST_RAW(bl,cl,mp)  \
+        (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
+#define XFS_MIN_FREELIST(a,mp)          \
+        (XFS_MIN_FREELIST_RAW(          \
+                be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
+                be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
+#define XFS_MIN_FREELIST_PAG(pag,mp)    \
+        (XFS_MIN_FREELIST_RAW(          \
+                (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
+                (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
+#define XFS_AGB_TO_FSB(mp,agno,agbno)   \
+        (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
+#define XFS_FSB_TO_AGNO(mp,fsbno)       \
+        ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
+#define XFS_FSB_TO_AGBNO(mp,fsbno)      \
+        ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
+#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
+        ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
+                (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
+#define XFS_AG_DADDR(mp,agno,d)         (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
+/*
+ * For checking for bad ranges of xfs_daddr_t's, covering multiple
+ * allocation groups or a single xfs_daddr_t that's a superblock copy.
+ */
+#define XFS_AG_CHECK_DADDR(mp,d,len)    \
+        ((len) == 1 ? \
+            ASSERT((d) == XFS_SB_DADDR || \
+                   xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
+            ASSERT(xfs_daddr_to_agno(mp, d) == \
+                   xfs_daddr_to_agno(mp, (d) + (len) - 1)))
+#endif  /* __XFS_AG_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
new file mode 100644
index 000000000000..4bffffe038a1
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -0,0 +1,2630 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+struct workqueue_struct *xfs_alloc_wq;
+#define XFS_ABSDIFF(a,b)        (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
+#define XFSA_FIXUP_BNO_OK       1
+#define XFSA_FIXUP_CNT_OK       2
+STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
+                xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ *
+ * The search key is loaded into the cursor's bc_rec.a record and the
+ * lookup is delegated to xfs_btree_lookup() with XFS_LOOKUP_EQ.  The return
+ * value is whatever xfs_btree_lookup() returns; the lookup outcome is
+ * reported through *stat (callers in this file test it against 0 and 1).
+ */
+STATIC int                              /* error */
+xfs_alloc_lookup_eq(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_agblock_t           bno,    /* starting block of extent */
+        xfs_extlen_t            len,    /* length of extent */
+        int                     *stat)  /* success/failure */
+{
+        cur->bc_rec.a.ar_startblock = bno;
+        cur->bc_rec.a.ar_blockcount = len;
+        return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ *
+ * Same shape as xfs_alloc_lookup_eq(), but the search key is passed to
+ * xfs_btree_lookup() with XFS_LOOKUP_GE.  Unlike the _eq variant this
+ * function is not declared STATIC.
+ */
+int                             /* error */
+xfs_alloc_lookup_ge(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_agblock_t           bno,    /* starting block of extent */
+        xfs_extlen_t            len,    /* length of extent */
+        int                     *stat)  /* success/failure */
+{
+        cur->bc_rec.a.ar_startblock = bno;
+        cur->bc_rec.a.ar_blockcount = len;
+        return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+/*
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
+ *
+ * Same shape as xfs_alloc_lookup_eq(), but the search key is passed to
+ * xfs_btree_lookup() with XFS_LOOKUP_LE.  Unlike the _eq variant this
+ * function is not declared STATIC.
+ */
+int                                     /* error */
+xfs_alloc_lookup_le(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_agblock_t           bno,    /* starting block of extent */
+        xfs_extlen_t            len,    /* length of extent */
+        int                     *stat)  /* success/failure */
+{
+        cur->bc_rec.a.ar_startblock = bno;
+        cur->bc_rec.a.ar_blockcount = len;
+        return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ *
+ * The new record is built in a local union xfs_btree_rec, with both
+ * values converted by cpu_to_be32(), and handed to xfs_btree_update();
+ * that call's return value is passed straight back to the caller.
+ */
+STATIC int                              /* error */
+xfs_alloc_update(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_agblock_t           bno,    /* starting block of extent */
+        xfs_extlen_t            len)    /* length of extent */
+{
+        union xfs_btree_rec     rec;
+        rec.alloc.ar_startblock = cpu_to_be32(bno);
+        rec.alloc.ar_blockcount = cpu_to_be32(len);
+        return xfs_btree_update(cur, &rec);
+}
+/*
+ * Get the data from the pointed-to record.
+ *
+ * The record is fetched with xfs_btree_get_rec().  *bno and *len are only
+ * written when that call returns no error and sets *stat to 1; in every
+ * other case they are left untouched, so callers must check both the
+ * return value and *stat before using them.  The stored values are
+ * converted with be32_to_cpu().
+ */
+int                                     /* error */
+xfs_alloc_get_rec(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_agblock_t           *bno,   /* output: starting block of extent */
+        xfs_extlen_t            *len,   /* output: length of extent */
+        int                     *stat)  /* output: success/failure */
+{
+        union xfs_btree_rec     *rec;
+        int                     error;
+        error = xfs_btree_get_rec(cur, &rec, stat);
+        if (!error && *stat == 1) {
+                *bno = be32_to_cpu(rec->alloc.ar_startblock);
+                *len = be32_to_cpu(rec->alloc.ar_blockcount);
+        }
+        return error;
+}
+/*
+ * Compute aligned version of the found extent.
+ * Takes alignment and min length into account.
+ *
+ * The found extent is first passed through xfs_extent_busy_trim().  If
+ * args->alignment is greater than 1 and the trimmed length is at least
+ * args->minlen, the start is rounded up to the alignment and the length is
+ * reduced by the blocks skipped (or set to 0 when the skipped blocks
+ * consume the whole trimmed extent).  Otherwise the trimmed extent is
+ * returned as is.  *resbno and *reslen are always written.
+ */
+STATIC void
+xfs_alloc_compute_aligned(
+        xfs_alloc_arg_t *args,          /* allocation argument structure */
+        xfs_agblock_t   foundbno,       /* starting block in found extent */
+        xfs_extlen_t    foundlen,       /* length in found extent */
+        xfs_agblock_t   *resbno,        /* result block number */
+        xfs_extlen_t    *reslen)        /* result length */
+{
+        xfs_agblock_t   bno;
+        xfs_extlen_t    len;
+        /* Trim busy sections out of found extent */
+        xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+        if (args->alignment > 1 && len >= args->minlen) {
+                xfs_agblock_t   aligned_bno = roundup(bno, args->alignment);
+                xfs_extlen_t    diff = aligned_bno - bno;       /* blocks lost to alignment */
+                *resbno = aligned_bno;
+                *reslen = diff >= len ? 0 : len - diff;
+        } else {
+                *resbno = bno;
+                *reslen = len;
+        }
+}
+/*
+ * Compute best start block and diff for "near" allocations.
+ * freelen >= wantlen already checked by caller.
+ *
+ * The chosen start block is stored in *newbnop.  It is NULLAGBLOCK when
+ * none of the candidate start blocks lies inside the free extent, in which
+ * case 0 is returned; otherwise the return value is the absolute distance
+ * between the chosen start block and wantbno.
+ *
+ * Cases, in the order they are tested:
+ *  - the free extent starts at or after wantbno, or userdata is set and
+ *    the free extent ends before the target does: use the aligned start
+ *    of the free extent.
+ *  - the free extent reaches at least to the end of the target and
+ *    alignment > 1: consider the aligned block at or above wantbno
+ *    (newbno1) and the one a full alignment unit below it (newbno2).
+ *    When both lie inside the free extent the one yielding the longer
+ *    usable length wins, ties going to the one closer to wantbno.
+ *  - the free extent reaches the end of the target and alignment is not
+ *    greater than 1: use wantbno itself.
+ *  - the free extent ends before the target does and alignment > 1: round
+ *    up the block that would place wantlen blocks at the tail of the free
+ *    extent, stepping back one alignment unit if rounding moved it and the
+ *    lower block is still inside the free extent.
+ *  - otherwise: place wantlen blocks at the tail of the free extent.
+ */
+STATIC xfs_extlen_t                     /* difference value (absolute) */
+xfs_alloc_compute_diff(
+        xfs_agblock_t   wantbno,        /* target starting block */
+        xfs_extlen_t    wantlen,        /* target length */
+        xfs_extlen_t    alignment,      /* target alignment */
+        char            userdata,       /* are we allocating data? */
+        xfs_agblock_t   freebno,        /* freespace's starting block */
+        xfs_extlen_t    freelen,        /* freespace's length */
+        xfs_agblock_t   *newbnop)       /* result: best start block from free */
+{
+        xfs_agblock_t   freeend;        /* end of freespace extent */
+        xfs_agblock_t   newbno1;        /* return block number */
+        xfs_agblock_t   newbno2;        /* other new block number */
+        xfs_extlen_t    newlen1=0;      /* length with newbno1 */
+        xfs_extlen_t    newlen2=0;      /* length with newbno2 */
+        xfs_agblock_t   wantend;        /* end of target extent */
+        ASSERT(freelen >= wantlen);
+        freeend = freebno + freelen;
+        wantend = wantbno + wantlen;
+        /*
+         * We want to allocate from the start of a free extent if it is past
+         * the desired block or if we are allocating user data and the free
+         * extent is before desired block. The second case is there to allow
+         * for contiguous allocation from the remaining free space if the file
+         * grows in the short term.
+         */
+        if (freebno >= wantbno || (userdata && freeend < wantend)) {
+                if ((newbno1 = roundup(freebno, alignment)) >= freeend)
+                        newbno1 = NULLAGBLOCK;
+        } else if (freeend >= wantend && alignment > 1) {
+                newbno1 = roundup(wantbno, alignment);
+                newbno2 = newbno1 - alignment;
+                if (newbno1 >= freeend)
+                        newbno1 = NULLAGBLOCK;
+                else
+                        newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
+                if (newbno2 < freebno)
+                        newbno2 = NULLAGBLOCK;
+                else
+                        newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
+                if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
+                        if (newlen1 < newlen2 ||
+                            (newlen1 == newlen2 &&
+                             XFS_ABSDIFF(newbno1, wantbno) >
+                             XFS_ABSDIFF(newbno2, wantbno)))
+                                newbno1 = newbno2;
+                } else if (newbno2 != NULLAGBLOCK)
+                        newbno1 = newbno2;
+        } else if (freeend >= wantend) {
+                newbno1 = wantbno;
+        } else if (alignment > 1) {
+                newbno1 = roundup(freeend - wantlen, alignment);
+                if (newbno1 > freeend - wantlen &&
+                    newbno1 - alignment >= freebno)
+                        newbno1 -= alignment;
+                else if (newbno1 >= freeend)
+                        newbno1 = NULLAGBLOCK;
+        } else
+                newbno1 = freeend - wantlen;
+        *newbnop = newbno1;
+        return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
+}
+/*
+ * Fix up the length, based on mod and prod.
+ * len should be k * prod + mod for some k.
+ * If len is too small it is returned unchanged.
+ * If len hits maxlen it is left alone.
+ *
+ * The adjustment only ever shrinks args->len: the remainder of len modulo
+ * prod is compared with mod and len is reduced to the nearest smaller
+ * value whose remainder equals mod.  If that value would drop below
+ * args->minlen the original length is kept.
+ *
+ * NOTE(review): the final minlen test compares through (int) casts,
+ * presumably so that a subtraction that wrapped below zero is treated as
+ * "too small" - confirm against the definition of xfs_extlen_t.
+ */
+STATIC void
+xfs_alloc_fix_len(
+        xfs_alloc_arg_t *args)          /* allocation argument structure */
+{
+        xfs_extlen_t    k;              /* remainder of rlen modulo prod */
+        xfs_extlen_t    rlen;           /* candidate replacement length */
+        ASSERT(args->mod < args->prod);
+        rlen = args->len;
+        ASSERT(rlen >= args->minlen);
+        ASSERT(rlen <= args->maxlen);
+        if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
+            (args->mod == 0 && rlen < args->prod))
+                return;
+        k = rlen % args->prod;
+        if (k == args->mod)
+                return;
+        if (k > args->mod)
+                rlen = rlen - (k - args->mod);
+        else
+                rlen = rlen - args->prod + (args->mod - k);
+        if ((int)rlen < (int)args->minlen)
+                return;
+        ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
+        ASSERT(rlen % args->prod == args->mod);
+        args->len = rlen;
+}
+/*
+ * Fix up length if there is too little space left in the a.g.
+ * Return 1 if ok, 0 if too little, should give up.
+ *
+ * Nothing is done when args->minleft is 0.  Otherwise the AGF free block
+ * count (read from args->agbp) minus args->len must leave at least
+ * args->minleft blocks; when it does not, args->len is reduced by the
+ * shortfall.  If the reduced length is below args->minlen, args->agbno is
+ * set to NULLAGBLOCK and 0 is returned.
+ *
+ * NOTE(review): args->len has already been reduced when 0 is returned.
+ */
+STATIC int
+xfs_alloc_fix_minleft(
+        xfs_alloc_arg_t *args)          /* allocation argument structure */
+{
+        xfs_agf_t       *agf;           /* a.g. freelist header */
+        int             diff;           /* free space difference */
+        if (args->minleft == 0)
+                return 1;
+        agf = XFS_BUF_TO_AGF(args->agbp);
+        diff = be32_to_cpu(agf->agf_freeblks)
+                - args->len - args->minleft;
+        if (diff >= 0)
+                return 1;
+        args->len += diff;              /* shrink the allocated space */
+        if (args->len >= args->minlen)
+                return 1;
+        args->agbno = NULLAGBLOCK;
+        return 0;
+}
+/*
+ * Update the two btrees, logically removing from freespace the extent
+ * starting at rbno, rlen blocks.  The extent is contained within the
+ * actual (current) free extent fbno for flen blocks.
+ * Flags are passed in indicating whether the cursors are set to the
+ * relevant records.
+ *
+ * When XFSA_FIXUP_CNT_OK / XFSA_FIXUP_BNO_OK is set the matching cursor is
+ * taken to already point at [fbno, flen]; this is only cross-checked in
+ * DEBUG builds.  When the flag is clear the record is looked up here.
+ *
+ * The by-size record is deleted and one record is inserted for each piece
+ * of freespace that remains.  The by-block record is deleted when nothing
+ * remains, otherwise it is updated in place to describe the first
+ * remaining piece, and a second record is inserted when freespace remains
+ * on both sides of the allocation.
+ *
+ * Returns 0 on success, or the error from the first btree operation that
+ * fails.  XFS_WANT_CORRUPTED_RETURN() is defined elsewhere; presumably it
+ * returns a corruption error from this function when its condition is
+ * false - confirm against its definition.
+ */
+STATIC int                              /* error code */
+xfs_alloc_fixup_trees(
+        xfs_btree_cur_t *cnt_cur,       /* cursor for by-size btree */
+        xfs_btree_cur_t *bno_cur,       /* cursor for by-block btree */
+        xfs_agblock_t   fbno,           /* starting block of free extent */
+        xfs_extlen_t    flen,           /* length of free extent */
+        xfs_agblock_t   rbno,           /* starting block of returned extent */
+        xfs_extlen_t    rlen,           /* length of returned extent */
+        int             flags)          /* flags, XFSA_FIXUP_... */
+{
+        int             error;          /* error code */
+        int             i;              /* operation results */
+        xfs_agblock_t   nfbno1;         /* first new free startblock */
+        xfs_agblock_t   nfbno2;         /* second new free startblock */
+        xfs_extlen_t    nflen1=0;       /* first new free length */
+        xfs_extlen_t    nflen2=0;       /* second new free length */
+        /*
+         * Look up the record in the by-size tree if necessary.
+         */
+        if (flags & XFSA_FIXUP_CNT_OK) {
+#ifdef DEBUG
+                if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
+                        return error;
+                XFS_WANT_CORRUPTED_RETURN(
+                        i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+        } else {
+                if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
+                        return error;
+                XFS_WANT_CORRUPTED_RETURN(i == 1);
+        }
+        /*
+         * Look up the record in the by-block tree if necessary.
+         */
+        if (flags & XFSA_FIXUP_BNO_OK) {
+#ifdef DEBUG
+                if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
+                        return error;
+                XFS_WANT_CORRUPTED_RETURN(
+                        i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+        } else {
+                if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
+                        return error;
+                XFS_WANT_CORRUPTED_RETURN(i == 1);
+        }
+#ifdef DEBUG
+        /* with single-level trees, both roots must hold the same record count */
+        if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
+                struct xfs_btree_block  *bnoblock;
+                struct xfs_btree_block  *cntblock;
+                bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
+                cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+                XFS_WANT_CORRUPTED_RETURN(
+                        bnoblock->bb_numrecs == cntblock->bb_numrecs);
+        }
+#endif
+        /*
+         * Deal with all four cases: the allocated record is contained
+         * within the freespace record, so we can have new freespace
+         * at either (or both) end, or no freespace remaining.
+         */
+        if (rbno == fbno && rlen == flen)
+                nfbno1 = nfbno2 = NULLAGBLOCK;  /* whole extent consumed */
+        else if (rbno == fbno) {
+                nfbno1 = rbno + rlen;           /* freespace left at the tail */
+                nflen1 = flen - rlen;
+                nfbno2 = NULLAGBLOCK;
+        } else if (rbno + rlen == fbno + flen) {
+                nfbno1 = fbno;                  /* freespace left at the head */
+                nflen1 = flen - rlen;
+                nfbno2 = NULLAGBLOCK;
+        } else {
+                nfbno1 = fbno;                  /* freespace left on both sides */
+                nflen1 = rbno - fbno;
+                nfbno2 = rbno + rlen;
+                nflen2 = (fbno + flen) - nfbno2;
+        }
+        /*
+         * Delete the entry from the by-size btree.
+         */
+        if ((error = xfs_btree_delete(cnt_cur, &i)))
+                return error;
+        XFS_WANT_CORRUPTED_RETURN(i == 1);
+        /*
+         * Add new by-size btree entry(s).
+         * Each insert is preceded by a lookup that must not find the record
+         * (i == 0); the lookup also positions the cursor for the insert.
+         */
+        if (nfbno1 != NULLAGBLOCK) {
+                if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
+                        return error;
+                XFS_WANT_CORRUPTED_RETURN(i == 0);
+                if ((error = xfs_btree_insert(cnt_cur, &i)))
+                        return error;
+                XFS_WANT_CORRUPTED_RETURN(i == 1);
+        }
+        if (nfbno2 != NULLAGBLOCK) {
+                if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
+                        return error;
+                XFS_WANT_CORRUPTED_RETURN(i == 0);
+                if ((error = xfs_btree_insert(cnt_cur, &i)))
+                        return error;
+                XFS_WANT_CORRUPTED_RETURN(i == 1);
+        }
+        /*
+         * Fix up the by-block btree entry(s).
+         */
+        if (nfbno1 == NULLAGBLOCK) {
+                /*
+                 * No remaining freespace, just delete the by-block tree entry.
+                 */
+                if ((error = xfs_btree_delete(bno_cur, &i)))
+                        return error;
+                XFS_WANT_CORRUPTED_RETURN(i == 1);
+        } else {
+                /*
+                 * Update the by-block entry to start later|be shorter.
+                 */
+                if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
+                        return error;
+        }
+        if (nfbno2 != NULLAGBLOCK) {
+                /*
+                 * 2 resulting free entries, need to add one.
+                 */
+                if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
+                        return error;
+                XFS_WANT_CORRUPTED_RETURN(i == 0);
+                if ((error = xfs_btree_insert(bno_cur, &i)))
+                        return error;
+                XFS_WANT_CORRUPTED_RETURN(i == 1);
+        }
+        return 0;
+}
+static bool                             /* true if every AGFL check passes */
+xfs_agfl_verify(
+        struct xfs_buf  *bp)            /* buffer holding the AGFL */
+{
+        struct xfs_mount *mp = bp->b_target->bt_mount;
+        struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+        int             i;
+        if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+                return false;           /* uuid differs from the superblock's */
+        if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+                return false;           /* not an AGFL header */
+        /*
+         * during growfs operations, the perag is not fully initialised,
+         * so we can't use it for any useful checking. growfs ensures we can't
+         * use it by using uncached buffers that don't have the perag attached
+         * so we can detect and avoid this problem.
+         *
+         * Hence the sequence number is only compared with the AG number
+         * when bp->b_pag is set.
+         */
+        if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+                return false;
+        /*
+         * Check every slot up to XFS_AGFL_SIZE(mp): an entry must be either
+         * NULLAGBLOCK or a block number below sb_agblocks.
+         */
+        for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+                if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
+                    be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+                        return false;
+        }
+        return true;
+}
+static void                             /* result is reported through bp->b_error */
+xfs_agfl_read_verify(
+        struct xfs_buf  *bp)            /* AGFL buffer that was just read */
+{
+        struct xfs_mount *mp = bp->b_target->bt_mount;
+        /*
+         * There is no verification of non-crc AGFLs because mkfs does not
+         * initialise the AGFL to zero or NULL. Hence the only valid part of the
+         * AGFL is what the AGF says is active. We can't get to the AGF, so we
+         * can't verify just those entries are valid.
+         */
+        if (!xfs_sb_version_hascrc(&mp->m_sb))
+                return;
+        /*
+         * The checksum is tested first; the structure checks in
+         * xfs_agfl_verify() only run when the checksum is good.
+         */
+        if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
+                xfs_buf_ioerror(bp, -EFSBADCRC);
+        else if (!xfs_agfl_verify(bp))
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        if (bp->b_error)
+                xfs_verifier_error(bp);
+}
+static void                             /* result is reported through the buffer error */
+xfs_agfl_write_verify(
+        struct xfs_buf  *bp)            /* AGFL buffer about to be written */
+{
+        struct xfs_mount *mp = bp->b_target->bt_mount;
+        struct xfs_buf_log_item *bip = bp->b_fspriv;
+        /* no verification of non-crc AGFLs */
+        if (!xfs_sb_version_hascrc(&mp->m_sb))
+                return;
+        /*
+         * A buffer that fails the structure checks is flagged and left
+         * without an updated lsn or checksum.
+         */
+        if (!xfs_agfl_verify(bp)) {
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                xfs_verifier_error(bp);
+                return;
+        }
+        /* the lsn is only stamped when a buf log item is attached */
+        if (bip)
+                XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+        /* checksum last, so it covers the lsn written just above */
+        xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
+}
+/* Buffer operations for AGFL buffers: the read and write verifiers above. */
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+        .verify_read    = xfs_agfl_read_verify,
+        .verify_write   = xfs_agfl_write_verify,
+};
+/*
+ * Read in the allocation group free block array (AGFL) of AG agno.
+ *
+ * The buffer is read through xfs_trans_read_buf() with xfs_agfl_buf_ops
+ * attached.  On success xfs_buf_set_ref() is applied with XFS_AGFL_REF, the
+ * buffer is stored in *bpp and 0 is returned.  On failure the error from
+ * xfs_trans_read_buf() is returned and *bpp is left untouched.
+ */
+STATIC int                              /* error */
+xfs_alloc_read_agfl(
+        xfs_mount_t     *mp,            /* mount point structure */
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_agnumber_t  agno,           /* allocation group number */
+        xfs_buf_t       **bpp)          /* buffer for the ag free block array */
+{
+        struct xfs_buf  *agflbp;        /* AGFL buffer handed back via bpp */
+        int             error;
+        ASSERT(agno != NULLAGNUMBER);
+        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+                        XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+                        XFS_FSS_TO_BB(mp, 1), 0, &agflbp, &xfs_agfl_buf_ops);
+        if (!error) {
+                xfs_buf_set_ref(agflbp, XFS_AGFL_REF);
+                *bpp = agflbp;
+        }
+        return error;
+}
+/*
+ * Apply a free block count change of len blocks to an allocation group.
+ *
+ * The delta is added to the in-core per-AG free block count and to the
+ * on-disk AGF free block count, and passed to xfs_trans_agblocks_delta().
+ * If the AGF then claims more free blocks than the AG is long,
+ * -EFSCORRUPTED is returned without logging the AGF; otherwise the AGF free
+ * block field is logged and 0 is returned.
+ */
+STATIC int
+xfs_alloc_update_counters(
+        struct xfs_trans        *tp,
+        struct xfs_perag        *pag,
+        struct xfs_buf          *agbp,
+        long                    len)
+{
+        struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
+        xfs_extlen_t            nfree;  /* AGF free blocks after the update */
+        xfs_extlen_t            aglen;  /* AG length recorded in the AGF */
+        pag->pagf_freeblks += len;
+        be32_add_cpu(&agf->agf_freeblks, len);
+        xfs_trans_agblocks_delta(tp, len);
+        /* sanity check the updated on-disk count against the AG size */
+        nfree = be32_to_cpu(agf->agf_freeblks);
+        aglen = be32_to_cpu(agf->agf_length);
+        if (unlikely(nfree > aglen))
+                return -EFSCORRUPTED;
+        xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
+        return 0;
+}
+/*
+ * Allocation group level functions.
+ */
+/*
+ * Allocate a variable extent in the allocation group args->agno.
+ * args->type selects the allocator (by size, near a block, or at an exact
+ * block) and, together with args->agbno, where in the allocation group the
+ * extent will start.
+ * Extent's length (returned in args->len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * The starting a.g. block is returned in args->agbno, which is NULLAGBLOCK
+ * if we can't do it.
+ *
+ * After a successful allocation the AG free block counters are updated
+ * (unless the blocks came from the freelist), the superblock free block
+ * count is adjusted in the transaction (unless the allocation is for the
+ * freelist) and the allocation statistics are bumped.
+ */
+STATIC int                      /* error */
+xfs_alloc_ag_vextent(
+        xfs_alloc_arg_t *args)  /* argument structure for allocation */
+{
+        int             error = 0;
+        long            delta;          /* free block change: -args->len */
+        unsigned int    sb_field;       /* superblock counter to adjust */
+        ASSERT(args->minlen > 0);
+        ASSERT(args->maxlen > 0);
+        ASSERT(args->minlen <= args->maxlen);
+        ASSERT(args->mod < args->prod);
+        ASSERT(args->alignment > 0);
+        /*
+         * Dispatch to the allocator that matches the request type.
+         */
+        args->wasfromfl = 0;
+        switch (args->type) {
+        case XFS_ALLOCTYPE_THIS_BNO:
+                error = xfs_alloc_ag_vextent_exact(args);
+                break;
+        case XFS_ALLOCTYPE_NEAR_BNO:
+                error = xfs_alloc_ag_vextent_near(args);
+                break;
+        case XFS_ALLOCTYPE_THIS_AG:
+                error = xfs_alloc_ag_vextent_size(args);
+                break;
+        default:
+                ASSERT(0);
+                /* NOTREACHED */
+        }
+        if (error)
+                return error;
+        /* nothing was allocated, so there is nothing to account for */
+        if (args->agbno == NULLAGBLOCK)
+                return 0;
+        ASSERT(args->len >= args->minlen);
+        ASSERT(args->len <= args->maxlen);
+        ASSERT(!args->wasfromfl || !args->isfl);
+        ASSERT(args->agbno % args->alignment == 0);
+        delta = -((long)(args->len));
+        if (!args->wasfromfl) {
+                error = xfs_alloc_update_counters(args->tp, args->pag,
+                                                  args->agbp, delta);
+                if (error)
+                        return error;
+                ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
+                                              args->agbno, args->len));
+        }
+        if (!args->isfl) {
+                if (args->wasdel)
+                        sb_field = XFS_TRANS_SB_RES_FDBLOCKS;
+                else
+                        sb_field = XFS_TRANS_SB_FDBLOCKS;
+                xfs_trans_mod_sb(args->tp, sb_field, delta);
+        }
+        XFS_STATS_INC(xs_allocx);
+        XFS_STATS_ADD(xs_allocb, args->len);
+        return 0;
+}
+/*
+ * Allocate a variable extent at exactly agno/bno.
+ * Extent's length (returned in args->len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * The starting a.g. block (bno) is left in args->agbno, which is set to
+ * NULLAGBLOCK if we can't do it.
+ */
+STATIC int                      /* error */
+xfs_alloc_ag_vextent_exact(
+        xfs_alloc_arg_t *args)  /* allocation argument structure */
+{
+        struct xfs_btree_cur *bno_cur;          /* by block-number btree cursor */
+        struct xfs_btree_cur *cnt_cur = NULL;   /* by count btree cursor */
+        xfs_agblock_t   rec_bno;        /* start block of found extent */
+        xfs_extlen_t    rec_len;        /* length of found extent */
+        xfs_agblock_t   usable_bno;     /* start block of trimmed extent */
+        xfs_extlen_t    usable_len;     /* length of trimmed extent */
+        xfs_agblock_t   usable_end;     /* end block of trimmed extent */
+        int             found;          /* success/failure of operation */
+        int             error;
+        ASSERT(args->alignment == 1);
+        /*
+         * Set up a cursor on the by-number freespace btree.
+         */
+        bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+                                          args->agno, XFS_BTNUM_BNO);
+        /*
+         * Find the closest free extent starting at or before bno; if any
+         * free extent contains bno it is this one.  minlen is passed to the
+         * lookup but is irrelevant, really.
+         */
+        error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen,
+                                    &found);
+        if (error)
+                goto error0;
+        if (!found)
+                goto not_found;
+        /*
+         * Fetch the freespace record the cursor points at.
+         */
+        error = xfs_alloc_get_rec(bno_cur, &rec_bno, &rec_len, &found);
+        if (error)
+                goto error0;
+        XFS_WANT_CORRUPTED_GOTO(found == 1, error0);
+        ASSERT(rec_bno <= args->agbno);
+        /*
+         * Trim away any part of the record that overlaps busy extents.
+         */
+        xfs_extent_busy_trim(args, rec_bno, rec_len, &usable_bno, &usable_len);
+        /*
+         * Give up if the start of the extent is busy, or if what is left
+         * cannot hold the minimum request starting at bno.
+         */
+        if (usable_bno > args->agbno || usable_len < args->minlen)
+                goto not_found;
+        usable_end = usable_bno + usable_len;
+        if (usable_end < args->agbno + args->minlen)
+                goto not_found;
+        /*
+         * The extent ends at the smaller of the freespace end and the
+         * maximal requested end; then fix the length according to mod and
+         * prod if given.
+         */
+        args->len = XFS_AGBLOCK_MIN(usable_end, args->agbno + args->maxlen)
+                                                - args->agbno;
+        xfs_alloc_fix_len(args);
+        if (!xfs_alloc_fix_minleft(args))
+                goto not_found;
+        ASSERT(args->agbno + args->len <= usable_end);
+        /*
+         * We are allocating agbno for args->len blocks; the by-size btree
+         * needs a cursor too so both trees can be fixed up.
+         */
+        cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+                args->agno, XFS_BTNUM_CNT);
+        ASSERT(args->agbno + args->len <=
+                be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+        error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, rec_bno, rec_len,
+                                      args->agbno, args->len,
+                                      XFSA_FIXUP_BNO_OK);
+        if (error)
+                goto error0;
+        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+        args->wasfromfl = 0;
+        trace_xfs_alloc_exact_done(args);
+        return 0;
+not_found:
+        /* Didn't find it, return null. */
+        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+        args->agbno = NULLAGBLOCK;
+        trace_xfs_alloc_exact_notfound(args);
+        return 0;
+error0:
+        /* cnt_cur only exists once the tree fixup has been attempted */
+        if (cnt_cur != NULL)
+                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+        xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+        trace_xfs_alloc_exact_error(args);
+        return error;
+}
+/*
+ * Walk the btree in one direction with the search cursor and compare the
+ * records found against the good extent we've already found.
+ *
+ * On a successful return exactly one cursor has been deleted and its pointer
+ * set to NULL: *scur if the good extent is kept, *gcur if the search turned
+ * up a closer extent (described by *sbno, *slen, *sbnoa, *slena).  On error
+ * both cursors are left for the caller to tear down.
+ */
+STATIC int
+xfs_alloc_find_best_extent(
+        struct xfs_alloc_arg    *args,  /* allocation argument structure */
+        struct xfs_btree_cur    **gcur, /* good cursor */
+        struct xfs_btree_cur    **scur, /* searching cursor */
+        xfs_agblock_t           gdiff,  /* difference for search comparison */
+        xfs_agblock_t           *sbno,  /* extent found by search */
+        xfs_extlen_t            *slen,  /* extent length */
+        xfs_agblock_t           *sbnoa, /* aligned extent found by search */
+        xfs_extlen_t            *slena, /* aligned extent length */
+        int                     dir)    /* 0 = search right, 1 = search left */
+{
+        xfs_agblock_t           newbno; /* start block from compute_diff, unused */
+        xfs_agblock_t           sdiff;  /* difference of the searched extent */
+        int                     error;
+        int                     more;   /* record/step status from the btree */
+        /* The good extent is perfect, no need to search. */
+        if (gdiff == 0)
+                goto out_use_good;
+        /*
+         * Step through records until one is long enough, the walk moves
+         * further away than the good extent, or the tree runs out.
+         */
+        for (;;) {
+                error = xfs_alloc_get_rec(*scur, sbno, slen, &more);
+                if (error)
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(more == 1, error0);
+                xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
+                /*
+                 * This record starts at least gdiff away from the target,
+                 * so the good extent is closer than this one.
+                 */
+                if (dir) {
+                        if (*sbnoa <= args->agbno - gdiff)
+                                goto out_use_good;
+                } else if (*sbnoa >= args->agbno + gdiff) {
+                        goto out_use_good;
+                }
+                /* long enough: stop walking and compare it below */
+                if (*slena >= args->minlen)
+                        break;
+                if (dir)
+                        error = xfs_btree_decrement(*scur, 0, &more);
+                else
+                        error = xfs_btree_increment(*scur, 0, &more);
+                if (error)
+                        goto error0;
+                if (!more)
+                        goto out_use_good;
+        }
+        /*
+         * Work out how far the searched extent would be from the target and
+         * keep whichever of the two is closer; ties go to the good extent.
+         */
+        args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
+        xfs_alloc_fix_len(args);
+        sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+                                       args->alignment, args->userdata,
+                                       *sbnoa, *slena, &newbno);
+        if (sdiff >= gdiff)
+                goto out_use_good;
+        /* the searched extent wins: invalidate the good cursor */
+        xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
+        *gcur = NULL;
+        return 0;
+out_use_good:
+        xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
+        *scur = NULL;
+        return 0;
+error0:
+        /* caller invalidates cursors */
+        return error;
+}
+/*
+ * Allocate a variable extent near bno in the allocation group agno.
+ * Extent's length (returned in args->len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * The starting a.g. block is returned in args->agbno, or NULLAGBLOCK if we
+ * can't do it.
+ *
+ * Two strategies are used.  When the by-size cursor sits in the last leaf
+ * block of the by-size btree, the records of that block are scanned for the
+ * best candidate.  Otherwise the by-block btree is walked to the left and to
+ * the right of agbno at the same time and the better of the two hits is
+ * taken.  If neither walk finds anything the log is forced once and the whole
+ * search is restarted.
+ *
+ * Returns 0 (also when nothing could be allocated) or a negative error; on
+ * error every cursor still held is deleted with XFS_BTREE_ERROR.
+ */
+STATIC int                              /* error */
+xfs_alloc_ag_vextent_near(
+        xfs_alloc_arg_t *args)          /* allocation argument structure */
+{
+        xfs_btree_cur_t *bno_cur_gt;    /* cursor for bno btree, right side */
+        xfs_btree_cur_t *bno_cur_lt;    /* cursor for bno btree, left side */
+        xfs_btree_cur_t *cnt_cur;       /* cursor for count btree */
+        xfs_agblock_t   gtbno;          /* start bno of right side entry */
+        xfs_agblock_t   gtbnoa;         /* aligned ... */
+        xfs_extlen_t    gtdiff;         /* difference to right side entry */
+        xfs_extlen_t    gtlen;          /* length of right side entry */
+        xfs_extlen_t    gtlena;         /* aligned ... */
+        xfs_agblock_t   gtnew;          /* useful start bno of right side */
+        int             error;          /* error code */
+        int             i;              /* result code, temporary */
+        int             j;              /* result code, temporary */
+        xfs_agblock_t   ltbno;          /* start bno of left side entry */
+        xfs_agblock_t   ltbnoa;         /* aligned ... */
+        xfs_extlen_t    ltdiff;         /* difference to left side entry */
+        xfs_extlen_t    ltlen;          /* length of left side entry */
+        xfs_extlen_t    ltlena;         /* aligned ... */
+        xfs_agblock_t   ltnew;          /* useful start bno of left side */
+        xfs_extlen_t    rlen;           /* length of returned extent */
+        int             forced = 0;     /* set once the log has been forced */
+#ifdef DEBUG
+        /*
+         * Randomly don't execute the first algorithm.
+         */
+        int             dofirst;        /* despite the name: set to SKIP the first algorithm */
+        dofirst = prandom_u32() & 1;
+#endif
+restart:
+        bno_cur_lt = NULL;
+        bno_cur_gt = NULL;
+        ltlen = 0;
+        gtlena = 0;
+        ltlena = 0;
+        /*
+         * Get a cursor for the by-size btree.
+         */
+        cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+                args->agno, XFS_BTNUM_CNT);
+        /*
+         * See if there are any free extents as big as maxlen.
+         */
+        if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
+                goto error0;
+        /*
+         * If none, then pick up the last entry in the tree unless the
+         * tree is empty.
+         */
+        if (!i) {
+                if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
+                                &ltlen, &i)))
+                        goto error0;
+                if (i == 0 || ltlen == 0) {
+                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                        trace_xfs_alloc_near_noentry(args);
+                        return 0;
+                }
+                ASSERT(i == 1);
+        }
+        args->wasfromfl = 0;
+        /*
+         * First algorithm.
+         * If the requested extent is large wrt the freespaces available
+         * in this a.g., then the cursor will be pointing to a btree entry
+         * near the right edge of the tree.  If it's in the last btree leaf
+         * block, then we just examine all the entries in that block
+         * that are big enough, and pick the best one.
+         * This is written as a while loop so we can break out of it,
+         * but we never loop back to the top.
+         */
+        while (xfs_btree_islastblock(cnt_cur, 0)) {
+                xfs_extlen_t    bdiff;
+                int             besti=0;
+                xfs_extlen_t    blen=0;
+                xfs_agblock_t   bnew=0;
+#ifdef DEBUG
+                if (dofirst)
+                        break;
+#endif
+                /*
+                 * Start from the entry that lookup found, sequence through
+                 * all larger free blocks.  If we're actually pointing at a
+                 * record smaller than maxlen, go to the start of this block,
+                 * and skip all those smaller than minlen.
+                 */
+                if (ltlen || args->alignment > 1) {
+                        cnt_cur->bc_ptrs[0] = 1;
+                        do {
+                                if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
+                                                &ltlen, &i)))
+                                        goto error0;
+                                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                                if (ltlen >= args->minlen)
+                                        break;
+                                if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
+                                        goto error0;
+                        } while (i);
+                        ASSERT(ltlen >= args->minlen);
+                        if (!i)
+                                break;
+                }
+                i = cnt_cur->bc_ptrs[0];
+                for (j = 1, blen = 0, bdiff = 0;
+                     !error && j && (blen < args->maxlen || bdiff > 0);
+                     error = xfs_btree_increment(cnt_cur, 0, &j)) {
+                        /*
+                         * For each entry, decide if it's better than
+                         * the previous best entry.
+                         */
+                        if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
+                                goto error0;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                        xfs_alloc_compute_aligned(args, ltbno, ltlen,
+                                                  &ltbnoa, &ltlena);
+                        if (ltlena < args->minlen)
+                                continue;
+                        args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+                        xfs_alloc_fix_len(args);
+                        ASSERT(args->len >= args->minlen);
+                        if (args->len < blen)
+                                continue;
+                        ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+                                args->alignment, args->userdata, ltbnoa,
+                                ltlena, &ltnew);
+                        if (ltnew != NULLAGBLOCK &&
+                            (args->len > blen || ltdiff < bdiff)) {
+                                bdiff = ltdiff;
+                                bnew = ltnew;
+                                blen = args->len;
+                                besti = cnt_cur->bc_ptrs[0];
+                        }
+                }
+                /*
+                 * The loop above also terminates when stepping the cursor
+                 * fails.  Bail out here so that error is not silently
+                 * overwritten by the btree calls that follow.
+                 */
+                if (error)
+                        goto error0;
+                /*
+                 * It didn't work.  We COULD be in a case where
+                 * there's a good record somewhere, so try again.
+                 */
+                if (blen == 0)
+                        break;
+                /*
+                 * Point at the best entry, and retrieve it again.
+                 */
+                cnt_cur->bc_ptrs[0] = besti;
+                if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+                args->len = blen;
+                if (!xfs_alloc_fix_minleft(args)) {
+                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                        trace_xfs_alloc_near_nominleft(args);
+                        return 0;
+                }
+                blen = args->len;
+                /*
+                 * We are allocating starting at bnew for blen blocks.
+                 */
+                args->agbno = bnew;
+                ASSERT(bnew >= ltbno);
+                ASSERT(bnew + blen <= ltbno + ltlen);
+                /*
+                 * Set up a cursor for the by-bno tree.
+                 */
+                bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
+                        args->agbp, args->agno, XFS_BTNUM_BNO);
+                /*
+                 * Fix up the btree entries.
+                 */
+                if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
+                                ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
+                        goto error0;
+                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+                trace_xfs_alloc_near_first(args);
+                return 0;
+        }
+        /*
+         * Second algorithm.
+         * Search in the by-bno tree to the left and to the right
+         * simultaneously, until in each case we find a space big enough,
+         * or run into the edge of the tree.  When we run into the edge,
+         * we deallocate that cursor.
+         * If both searches succeed, we compare the two spaces and pick
+         * the better one.
+         * With alignment, it's possible for both to fail; the upper
+         * level algorithm that picks allocation groups for allocations
+         * is not supposed to do this.
+         */
+        /*
+         * Allocate and initialize the cursor for the leftward search.
+         */
+        bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+                args->agno, XFS_BTNUM_BNO);
+        /*
+         * Lookup <= bno to find the leftward search's starting point.
+         */
+        if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
+                goto error0;
+        if (!i) {
+                /*
+                 * Didn't find anything; use this cursor for the rightward
+                 * search.
+                 */
+                bno_cur_gt = bno_cur_lt;
+                bno_cur_lt = NULL;
+        }
+        /*
+         * Found something.  Duplicate the cursor for the rightward search.
+         */
+        else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
+                goto error0;
+        /*
+         * Increment the cursor, so we will point at the entry just right
+         * of the leftward entry if any, or to the leftmost entry.
+         */
+        if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
+                goto error0;
+        if (!i) {
+                /*
+                 * It failed, there are no rightward entries.
+                 */
+                xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
+                bno_cur_gt = NULL;
+        }
+        /*
+         * Loop going left with the leftward cursor, right with the
+         * rightward cursor, until either both directions give up or
+         * we find an entry at least as big as minlen.
+         */
+        do {
+                if (bno_cur_lt) {
+                        if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
+                                goto error0;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                        xfs_alloc_compute_aligned(args, ltbno, ltlen,
+                                                  &ltbnoa, &ltlena);
+                        if (ltlena >= args->minlen)
+                                break;
+                        if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
+                                goto error0;
+                        if (!i) {
+                                xfs_btree_del_cursor(bno_cur_lt,
+                                                     XFS_BTREE_NOERROR);
+                                bno_cur_lt = NULL;
+                        }
+                }
+                if (bno_cur_gt) {
+                        if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
+                                goto error0;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                        xfs_alloc_compute_aligned(args, gtbno, gtlen,
+                                                  &gtbnoa, &gtlena);
+                        if (gtlena >= args->minlen)
+                                break;
+                        if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
+                                goto error0;
+                        if (!i) {
+                                xfs_btree_del_cursor(bno_cur_gt,
+                                                     XFS_BTREE_NOERROR);
+                                bno_cur_gt = NULL;
+                        }
+                }
+        } while (bno_cur_lt || bno_cur_gt);
+        /*
+         * Got both cursors still active, need to find better entry.
+         * xfs_alloc_find_best_extent() deletes whichever cursor loses and
+         * sets its pointer to NULL.
+         */
+        if (bno_cur_lt && bno_cur_gt) {
+                if (ltlena >= args->minlen) {
+                        /*
+                         * Left side is good, look for a right side entry.
+                         */
+                        args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+                        xfs_alloc_fix_len(args);
+                        ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+                                args->alignment, args->userdata, ltbnoa,
+                                ltlena, &ltnew);
+                        error = xfs_alloc_find_best_extent(args,
+                                                &bno_cur_lt, &bno_cur_gt,
+                                                ltdiff, &gtbno, &gtlen,
+                                                &gtbnoa, &gtlena,
+                                                0 /* search right */);
+                } else {
+                        ASSERT(gtlena >= args->minlen);
+                        /*
+                         * Right side is good, look for a left side entry.
+                         */
+                        args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
+                        xfs_alloc_fix_len(args);
+                        gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+                                args->alignment, args->userdata, gtbnoa,
+                                gtlena, &gtnew);
+                        error = xfs_alloc_find_best_extent(args,
+                                                &bno_cur_gt, &bno_cur_lt,
+                                                gtdiff, &ltbno, &ltlen,
+                                                &ltbnoa, &ltlena,
+                                                1 /* search left */);
+                }
+                if (error)
+                        goto error0;
+        }
+        /*
+         * If we couldn't get anything, give up.  The first time through,
+         * force the log and retry the whole search before giving up.
+         */
+        if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
+                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                if (!forced++) {
+                        trace_xfs_alloc_near_busy(args);
+                        xfs_log_force(args->mp, XFS_LOG_SYNC);
+                        goto restart;
+                }
+                trace_xfs_alloc_size_neither(args);
+                args->agbno = NULLAGBLOCK;
+                return 0;
+        }
+        /*
+         * At this point we have selected a freespace entry, either to the
+         * left or to the right.  If it's on the right, copy all the
+         * useful variables to the "left" set so we only have one
+         * copy of this code.  j remembers which side won for tracing.
+         */
+        if (bno_cur_gt) {
+                bno_cur_lt = bno_cur_gt;
+                bno_cur_gt = NULL;
+                ltbno = gtbno;
+                ltbnoa = gtbnoa;
+                ltlen = gtlen;
+                ltlena = gtlena;
+                j = 1;
+        } else
+                j = 0;
+        /*
+         * Fix up the length and compute the useful address.
+         */
+        args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+        xfs_alloc_fix_len(args);
+        if (!xfs_alloc_fix_minleft(args)) {
+                trace_xfs_alloc_near_nominleft(args);
+                xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                return 0;
+        }
+        rlen = args->len;
+        (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
+                                     args->userdata, ltbnoa, ltlena, &ltnew);
+        ASSERT(ltnew >= ltbno);
+        ASSERT(ltnew + rlen <= ltbnoa + ltlena);
+        ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+        args->agbno = ltnew;
+        if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
+                        ltnew, rlen, XFSA_FIXUP_BNO_OK)))
+                goto error0;
+        if (j)
+                trace_xfs_alloc_near_greater(args);
+        else
+                trace_xfs_alloc_near_lesser(args);
+        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+        xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+        return 0;
+ error0:
+        trace_xfs_alloc_near_error(args);
+        if (cnt_cur != NULL)
+                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+        if (bno_cur_lt != NULL)
+                xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
+        if (bno_cur_gt != NULL)
+                xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
+        return error;
+}
+/*
+ * Allocate a variable extent anywhere in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int                              /* error */
+xfs_alloc_ag_vextent_size(
+        xfs_alloc_arg_t *args)          /* allocation argument structure */
+{
+        xfs_btree_cur_t *bno_cur;       /* cursor for bno btree */
+        xfs_btree_cur_t *cnt_cur;       /* cursor for cnt btree */
+        int             error;          /* error result */
+        xfs_agblock_t   fbno;           /* start of found freespace */
+        xfs_extlen_t    flen;           /* length of found freespace */
+        int             i;              /* temp status variable */
+        xfs_agblock_t   rbno;           /* returned block number */
+        xfs_extlen_t    rlen;           /* length of returned extent */
+        int             forced = 0;
+restart:
+        /*
+         * Allocate and initialize a cursor for the by-size btree.
+         */
+        cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+                args->agno, XFS_BTNUM_CNT);
+        bno_cur = NULL;
+        /*
+         * Look for an entry >= maxlen+alignment-1 blocks.
+         */
+        if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
+                        args->maxlen + args->alignment - 1, &i)))
+                goto error0;
+        /*
+         * If none or we have busy extents that we cannot allocate from, then
+         * we have to settle for a smaller extent. In the case that there are
+         * no large extents, this will return the last entry in the tree unless
+         * the tree is empty. In the case that there are only busy large
+         * extents, this will return the largest small extent unless there
+         * are no smaller extents available.
+         */
+        if (!i || forced > 1) {
+                error = xfs_alloc_ag_vextent_small(args, cnt_cur,
+                                                   &fbno, &flen, &i);
+                if (error)
+                        goto error0;
+                if (i == 0 || flen == 0) {
+                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                        trace_xfs_alloc_size_noentry(args);
+                        return 0;
+                }
+                ASSERT(i == 1);
+                xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+        } else {
+                /*
+                 * Search for a non-busy extent that is large enough.
+                 * If we are at low space, don't check, or if we fall of
+                 * the end of the btree, turn off the busy check and
+                 * restart.
+                 */
+                for (;;) {
+                        error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
+                        if (error)
+                                goto error0;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                        xfs_alloc_compute_aligned(args, fbno, flen,
+                                                  &rbno, &rlen);
+                        if (rlen >= args->maxlen)
+                                break;
+                        error = xfs_btree_increment(cnt_cur, 0, &i);
+                        if (error)
+                                goto error0;
+                        if (i == 0) {
+                                /*
+                                 * Our only valid extents must have been busy.
+                                 * Make it unbusy by forcing the log out and
+                                 * retrying. If we've been here before, forcing
+                                 * the log isn't making the extents available,
+                                 * which means they have probably been freed in
+                                 * this transaction.  In that case, we have to
+                                 * give up on them and we'll attempt a minlen
+                                 * allocation the next time around.
+                                 */
+                                xfs_btree_del_cursor(cnt_cur,
+                                                     XFS_BTREE_NOERROR);
+                                trace_xfs_alloc_size_busy(args);
+                                if (!forced++)
+                                        xfs_log_force(args->mp, XFS_LOG_SYNC);
+                                goto restart;
+                        }
+                }
+        }
+        /*
+         * In the first case above, we got the last entry in the
+         * by-size btree.  Now we check to see if the space hits maxlen
+         * once aligned; if not, we search left for something better.
+         * This can't happen in the second case above.
+         */
+        rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+        XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+                        (rlen <= flen && rbno + rlen <= fbno + flen), error0);
+        if (rlen < args->maxlen) {
+                xfs_agblock_t   bestfbno;
+                xfs_extlen_t    bestflen;
+                xfs_agblock_t   bestrbno;
+                xfs_extlen_t    bestrlen;
+                bestrlen = rlen;
+                bestrbno = rbno;
+                bestflen = flen;
+                bestfbno = fbno;
+                for (;;) {
+                        if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
+                                goto error0;
+                        if (i == 0)
+                                break;
+                        if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
+                                        &i)))
+                                goto error0;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                        if (flen < bestrlen)
+                                break;
+                        xfs_alloc_compute_aligned(args, fbno, flen,
+                                                  &rbno, &rlen);
+                        rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+                        XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+                                (rlen <= flen && rbno + rlen <= fbno + flen),
+                                error0);
+                        if (rlen > bestrlen) {
+                                bestrlen = rlen;
+                                bestrbno = rbno;
+                                bestflen = flen;
+                                bestfbno = fbno;
+                                if (rlen == args->maxlen)
+                                        break;
+                        }
+                }
+                if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
+                                &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                rlen = bestrlen;
+                rbno = bestrbno;
+                flen = bestflen;
+                fbno = bestfbno;
+        }
+        args->wasfromfl = 0;
+        /*
+         * Fix up the length.
+         */
+        args->len = rlen;
+        if (rlen < args->minlen) {
+                if (!forced++) {
+                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+                        trace_xfs_alloc_size_busy(args);
+                        xfs_log_force(args->mp, XFS_LOG_SYNC);
+                        goto restart;
+                }
+                goto out_nominleft;
+        }
+        xfs_alloc_fix_len(args);
+        if (!xfs_alloc_fix_minleft(args))
+                goto out_nominleft;
+        rlen = args->len;
+        XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
+        /*
+         * Allocate and initialize a cursor for the by-block tree.
+         */
+        bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+                args->agno, XFS_BTNUM_BNO);
+        if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
+                        rbno, rlen, XFSA_FIXUP_CNT_OK)))
+                goto error0;
+        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+        cnt_cur = bno_cur = NULL;
+        args->len = rlen;
+        args->agbno = rbno;
+        XFS_WANT_CORRUPTED_GOTO(
+                args->agbno + args->len <=
+                        be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+                error0);
+        trace_xfs_alloc_size_done(args);
+        return 0;
+error0:
+        trace_xfs_alloc_size_error(args);
+        if (cnt_cur)
+                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+        if (bno_cur)
+                xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+        return error;
+out_nominleft:
+        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+        trace_xfs_alloc_size_nominleft(args);
+        args->agbno = NULLAGBLOCK;
+        return 0;
+}
+/*
+ * Fallback for when the by-size btree holds no extent that is big enough.
+ *
+ * Step the by-size cursor back one record and hand that record back to the
+ * caller.  If there is no record to step back to, try to satisfy a
+ * single-block, unaligned, non-freelist request straight from the AG
+ * freelist, provided the freelist holds more than args->minleft blocks.
+ *
+ * On success *stat tells the caller which path was taken:
+ *   0 - a block was taken from the freelist; args->agbno, args->len and
+ *       args->wasfromfl are filled in and *fbnop / *flenp are not written.
+ *   1 - *fbnop / *flenp describe what was found; *flenp is 0 (and
+ *       args->agbno is set to NULLAGBLOCK) when nothing of at least
+ *       args->minlen blocks is available.
+ */
+STATIC int                      /* error */
+xfs_alloc_ag_vextent_small(
+        xfs_alloc_arg_t *args,  /* allocation argument structure */
+        xfs_btree_cur_t *ccur,  /* by-size cursor */
+        xfs_agblock_t   *fbnop, /* result block number */
+        xfs_extlen_t    *flenp, /* result length */
+        int             *stat)  /* status: 0-freelist, 1-normal/none */
+{
+        xfs_agblock_t   bno = NULLAGBLOCK;      /* nothing found yet */
+        xfs_extlen_t    len = 0;
+        int             error;  /* name is used by XFS_WANT_CORRUPTED_GOTO */
+        int             have_rec;
+        error = xfs_btree_decrement(ccur, 0, &have_rec);
+        if (error)
+                goto error0;
+        if (have_rec) {
+                /*
+                 * There is a record to the left of the cursor, return it.
+                 */
+                error = xfs_alloc_get_rec(ccur, &bno, &len, &have_rec);
+                if (error)
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(have_rec == 1, error0);
+        } else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
+                   be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) >
+                                args->minleft) {
+                /*
+                 * The btree gave us nothing; a one-block request may come
+                 * off the freelist as long as minleft is still respected.
+                 */
+                error = xfs_alloc_get_freelist(args->tp, args->agbp, &bno, 0);
+                if (error)
+                        goto error0;
+                if (bno != NULLAGBLOCK) {
+                        xfs_extent_busy_reuse(args->mp, args->agno, bno, 1,
+                                             args->userdata);
+                        if (args->userdata) {
+                                xfs_buf_t       *bp;
+                                bp = xfs_btree_get_bufs(args->mp, args->tp,
+                                        args->agno, bno, 0);
+                                xfs_trans_binval(args->tp, bp);
+                        }
+                        args->len = 1;
+                        args->agbno = bno;
+                        XFS_WANT_CORRUPTED_GOTO(
+                                args->agbno + args->len <=
+                                be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+                                error0);
+                        args->wasfromfl = 1;
+                        trace_xfs_alloc_small_freelist(args);
+                        *stat = 0;
+                        return 0;
+                }
+                /*
+                 * The freelist was empty too: bno is NULLAGBLOCK and len
+                 * is still 0, so fall through to the "give up" handling.
+                 */
+        }
+        /*
+         * Whatever we found is too short for the request, report nothing.
+         */
+        if (len < args->minlen) {
+                args->agbno = NULLAGBLOCK;
+                trace_xfs_alloc_small_notenough(args);
+                len = 0;
+        }
+        *fbnop = bno;
+        *flenp = len;
+        *stat = 1;
+        trace_xfs_alloc_small_done(args);
+        return 0;
+error0:
+        trace_xfs_alloc_small_error(args);
+        return error;
+}
+/*
+ * Free the extent starting at agno/bno for length.
+ *
+ * The extent is merged with a contiguous free neighbour on the left and/or
+ * right when one exists, so that the by-block and by-size btrees end up with
+ * a single record (nbno, nlen) covering the joined space.  Afterwards the AG
+ * counters are updated by len and, unless isfl is set, the superblock free
+ * data block count in the transaction is increased by len as well.
+ *
+ * Returns 0 on success or an error; on error both cursors are torn down
+ * with XFS_BTREE_ERROR.
+ */
+STATIC int                      /* error */
+xfs_free_ag_extent(
+        xfs_trans_t     *tp,    /* transaction pointer */
+        xfs_buf_t       *agbp,  /* buffer for a.g. freelist header */
+        xfs_agnumber_t  agno,   /* allocation group number */
+        xfs_agblock_t   bno,    /* starting block number */
+        xfs_extlen_t    len,    /* length of extent */
+        int             isfl)   /* set if is freelist blocks - no sb acctg */
+{
+        xfs_btree_cur_t *bno_cur;       /* cursor for by-block btree */
+        xfs_btree_cur_t *cnt_cur;       /* cursor for by-size btree */
+        int             error;          /* error return value */
+        xfs_agblock_t   gtbno;          /* start of right neighbor block */
+        xfs_extlen_t    gtlen;          /* length of right neighbor block */
+        int             haveleft;       /* have a left neighbor block */
+        int             haveright;      /* have a right neighbor block */
+        int             i;              /* temp, result code */
+        xfs_agblock_t   ltbno;          /* start of left neighbor block */
+        xfs_extlen_t    ltlen;          /* length of left neighbor block */
+        xfs_mount_t     *mp;            /* mount point struct for filesystem */
+        xfs_agblock_t   nbno;           /* new starting block of freespace */
+        xfs_extlen_t    nlen;           /* new length of freespace */
+        xfs_perag_t     *pag;           /* per allocation group data */
+        mp = tp->t_mountp;
+        /*
+         * Allocate and initialize a cursor for the by-block btree.
+         * cnt_cur stays NULL until it is needed so that the error path
+         * only tears down cursors that actually exist.
+         */
+        bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
+        cnt_cur = NULL;
+        /*
+         * Look for a neighboring block on the left (lower block numbers)
+         * that is contiguous with this space.
+         */
+        if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
+                goto error0;
+        if (haveleft) {
+                /*
+                 * There is a block to our left.
+                 */
+                if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                /*
+                 * It's not contiguous, though.
+                 */
+                if (ltbno + ltlen < bno)
+                        haveleft = 0;
+                else {
+                        /*
+                         * If this failure happens the request to free this
+                         * space was invalid, it's (partly) already free.
+                         * Very bad.
+                         *
+                         * Together with the test above this only passes
+                         * when the left record ends exactly at bno.
+                         */
+                        XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0);
+                }
+        }
+        /*
+         * Look for a neighboring block on the right (higher block numbers)
+         * that is contiguous with this space.
+         */
+        if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
+                goto error0;
+        if (haveright) {
+                /*
+                 * There is a block to our right.
+                 */
+                if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                /*
+                 * It's not contiguous, though.
+                 */
+                if (bno + len < gtbno)
+                        haveright = 0;
+                else {
+                        /*
+                         * If this failure happens the request to free this
+                         * space was invalid, it's (partly) already free.
+                         * Very bad.
+                         *
+                         * Together with the test above this only passes
+                         * when the right record starts exactly at bno + len.
+                         */
+                        XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0);
+                }
+        }
+        /*
+         * Now allocate and initialize a cursor for the by-size tree.
+         */
+        cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
+        /*
+         * Have both left and right contiguous neighbors.
+         * Merge all three into a single free block.
+         */
+        if (haveleft && haveright) {
+                /*
+                 * Delete the old by-size entry on the left.
+                 */
+                if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                if ((error = xfs_btree_delete(cnt_cur, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                /*
+                 * Delete the old by-size entry on the right.
+                 */
+                if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                if ((error = xfs_btree_delete(cnt_cur, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                /*
+                 * Delete the old by-block entry for the right block.
+                 * The by-block cursor was left on the right neighbor by
+                 * the xfs_btree_increment() call above.
+                 */
+                if ((error = xfs_btree_delete(bno_cur, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                /*
+                 * Move the by-block cursor back to the left neighbor.
+                 */
+                if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+#ifdef DEBUG
+                /*
+                 * Check that this is the right record: delete didn't
+                 * mangle the cursor.
+                 */
+                {
+                        xfs_agblock_t   xxbno;
+                        xfs_extlen_t    xxlen;
+                        if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
+                                        &i)))
+                                goto error0;
+                        XFS_WANT_CORRUPTED_GOTO(
+                                i == 1 && xxbno == ltbno && xxlen == ltlen,
+                                error0);
+                }
+#endif
+                /*
+                 * Update remaining by-block entry to the new, joined block.
+                 */
+                nbno = ltbno;
+                nlen = len + ltlen + gtlen;
+                if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+                        goto error0;
+        }
+        /*
+         * Have only a left contiguous neighbor.
+         * Merge it together with the new freespace.
+         */
+        else if (haveleft) {
+                /*
+                 * Delete the old by-size entry on the left.
+                 */
+                if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                if ((error = xfs_btree_delete(cnt_cur, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                /*
+                 * Back up the by-block cursor to the left neighbor, and
+                 * update its length.
+                 */
+                if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                nbno = ltbno;
+                nlen = len + ltlen;
+                if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+                        goto error0;
+        }
+        /*
+         * Have only a right contiguous neighbor.
+         * Merge it together with the new freespace.
+         */
+        else if (haveright) {
+                /*
+                 * Delete the old by-size entry on the right.
+                 */
+                if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                if ((error = xfs_btree_delete(cnt_cur, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                /*
+                 * Update the starting block and length of the right
+                 * neighbor in the by-block tree.
+                 */
+                nbno = bno;
+                nlen = len + gtlen;
+                if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+                        goto error0;
+        }
+        /*
+         * No contiguous neighbors.
+         * Insert the new freespace into the by-block tree.
+         * NOTE(review): the inserted record is presumably the (bno, len)
+         * key left in the cursor by xfs_alloc_lookup_le() - confirm.
+         */
+        else {
+                nbno = bno;
+                nlen = len;
+                if ((error = xfs_btree_insert(bno_cur, &i)))
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+        }
+        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+        bno_cur = NULL;
+        /*
+         * In all cases we need to insert the new freespace in the by-size tree.
+         * The lookup must not find an existing (nbno, nlen) record.
+         */
+        if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
+                goto error0;
+        XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
+        if ((error = xfs_btree_insert(cnt_cur, &i)))
+                goto error0;
+        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+        cnt_cur = NULL;
+        /*
+         * Update the freespace totals in the ag and superblock.
+         * Both cursors are NULL by now, so a failure here skips the
+         * cursor teardown in the error path.
+         */
+        pag = xfs_perag_get(mp, agno);
+        error = xfs_alloc_update_counters(tp, pag, agbp, len);
+        xfs_perag_put(pag);
+        if (error)
+                goto error0;
+        if (!isfl)
+                xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
+        XFS_STATS_INC(xs_freex);
+        XFS_STATS_ADD(xs_freeb, len);
+        trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
+        return 0;
+ error0:
+        /* -1/-1 for haveleft/haveright marks the failed case in the trace */
+        trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
+        if (bno_cur)
+                xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+        if (cnt_cur)
+                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+        return error;
+}
+/*
+ * Visible (exported) allocation/free functions.
+ * Some of these are used just by xfs_alloc_btree.c and this file.
+ */
+/*
+ * Work out the tallest a free space btree can get in one AG and store the
+ * result in mp->m_ag_maxlevels.
+ *
+ * The worst case is (sb_agblocks + 1) / 2 leaf records packed at the
+ * minimum records per block: m_alloc_mnr[0] sizes the leaf level and
+ * m_alloc_mnr[1] every level above it.
+ */
+void
+xfs_alloc_compute_maxlevels(
+        xfs_mount_t     *mp)    /* file system mount structure */
+{
+        uint            nrecs = (mp->m_sb.sb_agblocks + 1) / 2;
+        int             leaf_min = mp->m_alloc_mnr[0];
+        int             node_min = mp->m_alloc_mnr[1];
+        uint            nblocks;
+        int             nlevels = 1;
+        /* blocks needed at the leaf level, rounded up */
+        nblocks = (nrecs + leaf_min - 1) / leaf_min;
+        /* add one level for each pass until a single root block is left */
+        while (nblocks > 1) {
+                nblocks = (nblocks + node_min - 1) / node_min;
+                nlevels++;
+        }
+        mp->m_ag_maxlevels = nlevels;
+}
+/*
+ * Find the length of the longest extent in an AG that is usable for an
+ * allocation.
+ *
+ * The cached longest free extent is reduced by the number of blocks the
+ * freelist is short of XFS_MIN_FREELIST_PAG().  If that leaves nothing,
+ * return 1 when the AG still has any freelist block or free extent at all,
+ * and 0 otherwise.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(
+        struct xfs_mount        *mp,
+        struct xfs_perag        *pag)
+{
+        xfs_extlen_t            min_fl = XFS_MIN_FREELIST_PAG(pag, mp);
+        xfs_extlen_t            shortfall;
+        /* blocks still missing from the freelist, if any */
+        shortfall = (min_fl > pag->pagf_flcount) ?
+                        min_fl - pag->pagf_flcount : 0;
+        if (pag->pagf_longest <= shortfall)
+                return (pag->pagf_flcount > 0 || pag->pagf_longest > 0) ? 1 : 0;
+        return pag->pagf_longest - shortfall;
+}
+/*
+ * Decide whether to use this allocation group for this allocation.
+ * If so, fix up the btree freelist's size.
+ *
+ * On a 0 return args->agbp tells the caller the outcome: it is the AGF
+ * buffer when the AG was accepted, and NULL when the AG was rejected (not
+ * enough space, metadata-preferred AG, or the AGF could not be obtained
+ * for a trylock request).  A non-zero return is an error from one of the
+ * helpers.
+ *
+ * Whenever the AG is rejected with a 0 return, any AGF buffer read here is
+ * released again so that it does not stay held in the transaction while the
+ * caller moves on to another AG.
+ */
+STATIC int                      /* error */
+xfs_alloc_fix_freelist(
+        xfs_alloc_arg_t *args,  /* allocation argument structure */
+        int             flags)  /* XFS_ALLOC_FLAG_... */
+{
+        xfs_buf_t       *agbp;  /* agf buffer pointer */
+        xfs_agf_t       *agf;   /* a.g. freespace structure pointer */
+        xfs_buf_t       *agflbp;/* agfl buffer pointer */
+        xfs_agblock_t   bno;    /* freelist block */
+        xfs_extlen_t    delta;  /* new blocks needed in freelist */
+        int             error;  /* error result code */
+        xfs_extlen_t    longest;/* longest extent in allocation group */
+        xfs_mount_t     *mp;    /* file system mount point structure */
+        xfs_extlen_t    need;   /* total blocks needed in freelist */
+        xfs_perag_t     *pag;   /* per-ag information structure */
+        xfs_alloc_arg_t targs;  /* local allocation arguments */
+        xfs_trans_t     *tp;    /* transaction pointer */
+        mp = args->mp;
+        pag = args->pag;
+        tp = args->tp;
+        /*
+         * The per-ag data is not set up yet; read the AGF to initialize it.
+         * If it is still uninitialized afterwards the read did not happen,
+         * which is only expected for a trylock request.
+         */
+        if (!pag->pagf_init) {
+                if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
+                                &agbp)))
+                        return error;
+                if (!pag->pagf_init) {
+                        ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
+                        ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+                        args->agbp = NULL;
+                        return 0;
+                }
+        } else
+                agbp = NULL;
+        /*
+         * If this is a metadata preferred pag and we are user data
+         * then try somewhere else if we are not being asked to
+         * try harder at this point
+         */
+        if (pag->pagf_metadata && args->userdata &&
+            (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
+                ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+                /*
+                 * Drop the AGF again if it was read above, the same way
+                 * the not-enough-space rejection below does.
+                 */
+                if (agbp)
+                        xfs_trans_brelse(tp, agbp);
+                args->agbp = NULL;
+                return 0;
+        }
+        if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+                /*
+                 * If it looks like there isn't a long enough extent, or enough
+                 * total blocks, reject it.
+                 * This first pass uses the in-core per-ag counters; the
+                 * check is repeated below against the AGF itself.
+                 */
+                need = XFS_MIN_FREELIST_PAG(pag, mp);
+                longest = xfs_alloc_longest_free_extent(mp, pag);
+                if ((args->minlen + args->alignment + args->minalignslop - 1) >
+                                longest ||
+                    ((int)(pag->pagf_freeblks + pag->pagf_flcount -
+                           need - args->total) < (int)args->minleft)) {
+                        if (agbp)
+                                xfs_trans_brelse(tp, agbp);
+                        args->agbp = NULL;
+                        return 0;
+                }
+        }
+        /*
+         * Get the a.g. freespace buffer.
+         * Can fail if we're not blocking on locks, and it's held.
+         */
+        if (agbp == NULL) {
+                if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
+                                &agbp)))
+                        return error;
+                if (agbp == NULL) {
+                        ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
+                        ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+                        args->agbp = NULL;
+                        return 0;
+                }
+        }
+        /*
+         * Figure out how many blocks we should have in the freelist.
+         */
+        agf = XFS_BUF_TO_AGF(agbp);
+        need = XFS_MIN_FREELIST(agf, mp);
+        /*
+         * If there isn't enough total or single-extent, reject it.
+         */
+        if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+                delta = need > be32_to_cpu(agf->agf_flcount) ?
+                        (need - be32_to_cpu(agf->agf_flcount)) : 0;
+                longest = be32_to_cpu(agf->agf_longest);
+                longest = (longest > delta) ? (longest - delta) :
+                        (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
+                if ((args->minlen + args->alignment + args->minalignslop - 1) >
+                                longest ||
+                    ((int)(be32_to_cpu(agf->agf_freeblks) +
+                     be32_to_cpu(agf->agf_flcount) - need - args->total) <
+                                (int)args->minleft)) {
+                        xfs_trans_brelse(tp, agbp);
+                        args->agbp = NULL;
+                        return 0;
+                }
+        }
+        /*
+         * Make the freelist shorter if it's too long.
+         * Each surplus block is freed back into the free space btrees
+         * (isfl = 1) and its buffer is passed to xfs_trans_binval().
+         */
+        while (be32_to_cpu(agf->agf_flcount) > need) {
+                xfs_buf_t       *bp;
+                error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
+                if (error)
+                        return error;
+                if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
+                        return error;
+                bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
+                xfs_trans_binval(tp, bp);
+        }
+        /*
+         * Initialize the args structure.
+         */
+        memset(&targs, 0, sizeof(targs));
+        targs.tp = tp;
+        targs.mp = mp;
+        targs.agbp = agbp;
+        targs.agno = args->agno;
+        targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
+        targs.type = XFS_ALLOCTYPE_THIS_AG;
+        targs.pag = pag;
+        if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
+                return error;
+        /*
+         * Make the freelist longer if it's too short.
+         */
+        while (be32_to_cpu(agf->agf_flcount) < need) {
+                targs.agbno = 0;
+                targs.maxlen = need - be32_to_cpu(agf->agf_flcount);
+                /*
+                 * Allocate as many blocks as possible at once.
+                 */
+                if ((error = xfs_alloc_ag_vextent(&targs))) {
+                        xfs_trans_brelse(tp, agflbp);
+                        return error;
+                }
+                /*
+                 * Stop if we run out.  Won't happen if callers are obeying
+                 * the restrictions correctly.  Can happen for free calls
+                 * on a completely full ag.
+                 */
+                if (targs.agbno == NULLAGBLOCK) {
+                        if (flags & XFS_ALLOC_FLAG_FREEING)
+                                break;
+                        xfs_trans_brelse(tp, agflbp);
+                        args->agbp = NULL;
+                        return 0;
+                }
+                /*
+                 * Put each allocated block on the list.
+                 */
+                for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
+                        error = xfs_alloc_put_freelist(tp, agbp,
+                                                        agflbp, bno, 0);
+                        if (error) {
+                                /*
+                                 * Release the AGFL on this error path as
+                                 * well, matching the allocation failure
+                                 * handling above.
+                                 */
+                                xfs_trans_brelse(tp, agflbp);
+                                return error;
+                        }
+                }
+        }
+        xfs_trans_brelse(tp, agflbp);
+        args->agbp = agbp;
+        return 0;
+}
+/*
+ * Get a block from the freelist.
+ *
+ * The block number at the head of the AGFL is returned through *bnop, or
+ * NULLAGBLOCK if the freelist is empty.  The AGF freelist head and count
+ * (and the btree block count when btreeblk is set) are updated in the AGF
+ * and in the per-ag structure, and the changed AGF fields are logged.
+ *
+ * Returns 0 on success or the error from reading the AGFL.
+ */
+int                             /* error */
+xfs_alloc_get_freelist(
+        xfs_trans_t     *tp,    /* transaction pointer */
+        xfs_buf_t       *agbp,  /* buffer containing the agf structure */
+        xfs_agblock_t   *bnop,  /* block address retrieved from freelist */
+        int             btreeblk) /* destination is a AGF btree */
+{
+        xfs_agf_t       *agf;   /* a.g. freespace structure */
+        xfs_buf_t       *agflbp;/* buffer for a.g. freelist structure */
+        xfs_agblock_t   bno;    /* block number returned */
+        __be32          *agfl_bno;
+        int             error;
+        int             logflags;
+        xfs_mount_t     *mp = tp->t_mountp;
+        xfs_perag_t     *pag;   /* per allocation group data */
+        /*
+         * Freelist is empty, give up.
+         */
+        agf = XFS_BUF_TO_AGF(agbp);
+        if (!agf->agf_flcount) {
+                *bnop = NULLAGBLOCK;
+                return 0;
+        }
+        /*
+         * Read the array of free blocks.
+         */
+        error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
+                                    &agflbp);
+        if (error)
+                return error;
+        /*
+         * Get the block number and update the data structures.
+         * The AGFL buffer is only read from, so it can be released as soon
+         * as the block number has been picked up.
+         */
+        agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+        bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+        be32_add_cpu(&agf->agf_flfirst, 1);
+        xfs_trans_brelse(tp, agflbp);
+        /* the freelist is circular: wrap the head index at the end */
+        if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
+                agf->agf_flfirst = 0;
+        pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+        be32_add_cpu(&agf->agf_flcount, -1);
+        xfs_trans_agflist_delta(tp, -1);
+        pag->pagf_flcount--;
+        logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
+        if (btreeblk) {
+                be32_add_cpu(&agf->agf_btreeblks, 1);
+                pag->pagf_btreeblks++;
+                logflags |= XFS_AGF_BTREEBLKS;
+        }
+        /*
+         * Only drop the per-ag reference once every per-ag counter has
+         * been updated; pag must not be touched after the put.
+         */
+        xfs_perag_put(pag);
+        xfs_alloc_log_agf(tp, agbp, logflags);
+        *bnop = bno;
+        return 0;
+}
+/*
+ * Log the given fields from the agf structure.
+ *
+ * The buffer is tagged as an AGF buffer in the transaction, the field mask
+ * is turned into a first/last byte offset pair by xfs_btree_offsets(), and
+ * that byte range of the buffer is logged.
+ */
+void
+xfs_alloc_log_agf(
+        xfs_trans_t     *tp,    /* transaction pointer */
+        xfs_buf_t       *bp,    /* buffer for a.g. freelist header */
+        int             fields) /* mask of fields to be logged (XFS_AGF_...) */
+{
+        /*
+         * Start offset of each loggable AGF field, closed off by the size
+         * of the structure.
+         * NOTE(review): entries presumably follow the bit order of the
+         * XFS_AGF_... flags - confirm against their definitions.
+         */
+        static const short      agf_offsets[] = {
+                offsetof(xfs_agf_t, agf_magicnum),
+                offsetof(xfs_agf_t, agf_versionnum),
+                offsetof(xfs_agf_t, agf_seqno),
+                offsetof(xfs_agf_t, agf_length),
+                offsetof(xfs_agf_t, agf_roots[0]),
+                offsetof(xfs_agf_t, agf_levels[0]),
+                offsetof(xfs_agf_t, agf_flfirst),
+                offsetof(xfs_agf_t, agf_fllast),
+                offsetof(xfs_agf_t, agf_flcount),
+                offsetof(xfs_agf_t, agf_freeblks),
+                offsetof(xfs_agf_t, agf_longest),
+                offsetof(xfs_agf_t, agf_btreeblks),
+                offsetof(xfs_agf_t, agf_uuid),
+                sizeof(xfs_agf_t)
+        };
+        int     range_start;    /* first byte offset to log */
+        int     range_end;      /* last byte offset to log */
+        trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
+        xfs_btree_offsets(fields, agf_offsets, XFS_AGF_NUM_BITS,
+                          &range_start, &range_end);
+        xfs_trans_log_buf(tp, bp, (uint)range_start, (uint)range_end);
+}
+/*
+ * Interface for inode allocation to force the pag data to be initialized.
+ *
+ * The AGF for agno is read through xfs_alloc_read_agf() and, if a buffer was
+ * handed back, released again immediately.  Returns 0, or the error from
+ * the read.
+ */
+int                                     /* error */
+xfs_alloc_pagf_init(
+        xfs_mount_t             *mp,    /* file system mount structure */
+        xfs_trans_t             *tp,    /* transaction pointer */
+        xfs_agnumber_t          agno,   /* allocation group number */
+        int                     flags)  /* XFS_ALLOC_FLAG_... */
+{
+        xfs_buf_t               *agbp;
+        int                     error;
+        error = xfs_alloc_read_agf(mp, tp, agno, flags, &agbp);
+        if (error)
+                return error;
+        /* a successful read may still come back without a buffer */
+        if (agbp)
+                xfs_trans_brelse(tp, agbp);
+        return 0;
+}
+/*
+ * Put the block on the freelist for the allocation group.
+ *
+ * If agflbp is NULL the AGFL buffer is read here with xfs_alloc_read_agfl().
+ * agf_fllast is advanced by one and wrapped to 0 when it reaches
+ * XFS_AGFL_SIZE(mp); agf_flcount and the in-core pagf_flcount are both
+ * incremented.  When btreeblk is set, agf_btreeblks and pagf_btreeblks are
+ * decremented as well.  bno is then stored, big-endian, in the AGFL slot
+ * indexed by the new agf_fllast, and the changed AGF fields plus that one
+ * AGFL entry are logged in the transaction.
+ *
+ * Returns 0, or the error from reading the AGFL.
+ *
+ * NOTE(review): xfs_alloc_log_agf() is called twice with the same logflags
+ * and no AGF modification in between; the second call looks redundant -
+ * confirm before removing.
+ */
+int                                     /* error */
+xfs_alloc_put_freelist(
+        xfs_trans_t             *tp,    /* transaction pointer */
+        xfs_buf_t               *agbp,  /* buffer for a.g. freelist header */
+        xfs_buf_t               *agflbp,/* buffer for a.g. free block array */
+        xfs_agblock_t           bno,    /* block being freed */
+        int                     btreeblk) /* block came from a AGF btree */
+{
+        xfs_agf_t               *agf;   /* a.g. freespace structure */
+        __be32                  *blockp;/* pointer to array entry */
+        int                     error;
+        int                     logflags;
+        xfs_mount_t             *mp;    /* mount structure */
+        xfs_perag_t             *pag;   /* per allocation group data */
+        __be32                  *agfl_bno;      /* start of the AGFL block number array */
+        int                     startoff;       /* byte offset of the entry in the AGFL buffer */
+        agf = XFS_BUF_TO_AGF(agbp);
+        mp = tp->t_mountp;
+        if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
+                        be32_to_cpu(agf->agf_seqno), &agflbp)))
+                return error;
+        be32_add_cpu(&agf->agf_fllast, 1);
+        if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
+                agf->agf_fllast = 0;    /* wrap around to the first slot */
+        pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+        be32_add_cpu(&agf->agf_flcount, 1);
+        xfs_trans_agflist_delta(tp, 1);
+        pag->pagf_flcount++;
+        logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
+        if (btreeblk) {
+                be32_add_cpu(&agf->agf_btreeblks, -1);
+                pag->pagf_btreeblks--;
+                logflags |= XFS_AGF_BTREEBLKS;
+        }
+        xfs_perag_put(pag);
+        xfs_alloc_log_agf(tp, agbp, logflags);
+        ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
+        agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+        blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
+        *blockp = cpu_to_be32(bno);
+        startoff = (char *)blockp - (char *)agflbp->b_addr;
+        xfs_alloc_log_agf(tp, agbp, logflags);  /* second call, same flags as above */
+        xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
+        xfs_trans_log_buf(tp, agflbp, startoff,
+                          startoff + sizeof(xfs_agblock_t) - 1);
+        return 0;
+}
+/*
+ * Check the contents of an AGF buffer for internal consistency.
+ *
+ * Returns false if any of the following fails, true otherwise:
+ *  - on CRC-enabled filesystems, agf_uuid matches the superblock UUID
+ *  - magic number and version are the expected AGF values
+ *  - agf_freeblks does not exceed agf_length
+ *  - agf_flfirst and agf_fllast are below XFS_AGFL_SIZE(mp) and
+ *    agf_flcount does not exceed it
+ *  - if the buffer has a perag attached, agf_seqno matches its AG number
+ *  - with lazy superblock counters, agf_btreeblks does not exceed agf_length
+ */
+static bool
+xfs_agf_verify(
+        struct xfs_mount *mp,
+        struct xfs_buf  *bp)
+{
+        struct xfs_agf  *agf = XFS_BUF_TO_AGF(bp);
+        if (xfs_sb_version_hascrc(&mp->m_sb) &&
+            !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
+                return false;
+        if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+              XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+              be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+              be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+              be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+              be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
+                return false;
+        /*
+         * during growfs operations, the perag is not fully initialised,
+         * so we can't use it for any useful checking. growfs ensures we can't
+         * use it by using uncached buffers that don't have the perag attached
+         * so we can detect and avoid this problem.
+         */
+        if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
+                return false;
+        if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
+            be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
+                return false;
+        return true;
+}
+/*
+ * Read-side verifier for AGF buffers.
+ *
+ * On CRC-enabled filesystems a checksum mismatch marks the buffer with
+ * -EFSBADCRC.  Otherwise the contents are checked with xfs_agf_verify()
+ * (wrapped in XFS_TEST_ERROR - presumably so the failure can be injected;
+ * confirm) and a failure marks the buffer with -EFSCORRUPTED.  Any error
+ * left on the buffer is then reported through xfs_verifier_error().
+ */
+static void
+xfs_agf_read_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount *mp = bp->b_target->bt_mount;
+        if (xfs_sb_version_hascrc(&mp->m_sb) &&
+            !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) {
+                xfs_buf_ioerror(bp, -EFSBADCRC);
+        } else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
+                                  XFS_ERRTAG_ALLOC_READ_AGF,
+                                  XFS_RANDOM_ALLOC_READ_AGF)) {
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        }
+        if (bp->b_error)
+                xfs_verifier_error(bp);
+}
+/*
+ * Write-side verifier for AGF buffers.
+ *
+ * A buffer that fails xfs_agf_verify() is marked with -EFSCORRUPTED and
+ * reported.  On CRC-enabled filesystems a good buffer gets agf_lsn stamped
+ * from the attached buffer log item (when there is one) and its checksum
+ * recomputed; without CRCs nothing further is done.
+ */
+static void
+xfs_agf_write_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount *mp = bp->b_target->bt_mount;
+        struct xfs_buf_log_item *log_item = bp->b_fspriv;
+        if (!xfs_agf_verify(mp, bp)) {
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                xfs_verifier_error(bp);
+                return;
+        }
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                if (log_item) {
+                        struct xfs_agf  *agf = XFS_BUF_TO_AGF(bp);
+                        agf->agf_lsn = cpu_to_be64(log_item->bli_item.li_lsn);
+                }
+                xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
+        }
+}
+const struct xfs_buf_ops xfs_agf_buf_ops = {    /* verifier hooks passed to the AGF buffer read in xfs_read_agf() */
+        .verify_read = xfs_agf_read_verify,     /* CRC check, then content check */
+        .verify_write = xfs_agf_write_verify,   /* content check, then LSN/CRC update */
+};
+/*
+ * Read in the allocation group header (free/alloc section).
+ *
+ * Issues a transactional read of the AGF of agno with xfs_agf_buf_ops as
+ * the verifier.  flags is passed straight to the buffer read.  A read error
+ * is returned as is; a successful read may leave *bpp NULL, in which case 0
+ * is returned and nothing else is done.  Otherwise the buffer reference
+ * value is set to XFS_AGF_REF.
+ */
+int                                     /* error */
+xfs_read_agf(
+        struct xfs_mount        *mp,    /* mount point structure */
+        struct xfs_trans        *tp,    /* transaction pointer */
+        xfs_agnumber_t          agno,   /* allocation group number */
+        int                     flags,  /* XFS_BUF_ */
+        struct xfs_buf          **bpp)  /* buffer for the ag freelist header */
+{
+        int             error;
+        trace_xfs_read_agf(mp, agno);
+        ASSERT(agno != NULLAGNUMBER);
+        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+                        XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+                        XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
+        if (error)
+                return error;
+        if (*bpp) {
+                ASSERT(!(*bpp)->b_error);
+                xfs_buf_set_ref(*bpp, XFS_AGF_REF);
+        }
+        return 0;
+}
+/*
+ * Read in the allocation group header (free/alloc section).
+ *
+ * Wrapper around xfs_read_agf() that turns XFS_ALLOC_FLAG_TRYLOCK in flags
+ * into XBF_TRYLOCK for the buffer read.  If the read returns no buffer
+ * (*bpp is NULL) this returns 0 without touching the per-AG structure.
+ *
+ * The first time an AGF is read for an AG (pagf_init clear) the in-core
+ * per-AG free space counters and btree levels are loaded from the on-disk
+ * AGF, the pagb_* fields (lock, count, rb-tree root) are initialised and
+ * pagf_init is set.  On later reads, DEBUG builds assert that the in-core
+ * values still match the on-disk AGF unless the filesystem has been
+ * forcibly shut down.
+ */
+int                                     /* error */
+xfs_alloc_read_agf(
+        struct xfs_mount        *mp,    /* mount point structure */
+        struct xfs_trans        *tp,    /* transaction pointer */
+        xfs_agnumber_t          agno,   /* allocation group number */
+        int                     flags,  /* XFS_ALLOC_FLAG_... */
+        struct xfs_buf          **bpp)  /* buffer for the ag freelist header */
+{
+        struct xfs_agf          *agf;           /* ag freelist header */
+        struct xfs_perag        *pag;           /* per allocation group data */
+        int                     error;
+        trace_xfs_alloc_read_agf(mp, agno);
+        ASSERT(agno != NULLAGNUMBER);
+        error = xfs_read_agf(mp, tp, agno,
+                        (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
+                        bpp);
+        if (error)
+                return error;
+        if (!*bpp)
+                return 0;       /* no buffer and no error: nothing to initialise from */
+        ASSERT(!(*bpp)->b_error);
+        agf = XFS_BUF_TO_AGF(*bpp);
+        pag = xfs_perag_get(mp, agno);
+        if (!pag->pagf_init) {
+                pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
+                pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
+                pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
+                pag->pagf_longest = be32_to_cpu(agf->agf_longest);
+                pag->pagf_levels[XFS_BTNUM_BNOi] =
+                        be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
+                pag->pagf_levels[XFS_BTNUM_CNTi] =
+                        be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+                spin_lock_init(&pag->pagb_lock);
+                pag->pagb_count = 0;
+                pag->pagb_tree = RB_ROOT;
+                pag->pagf_init = 1;
+        }
+#ifdef DEBUG
+        else if (!XFS_FORCED_SHUTDOWN(mp)) {    /* continues the if above; DEBUG builds only */
+                ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
+                ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
+                ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
+                ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
+                ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
+                       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
+                ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
+                       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
+        }
+#endif
+        xfs_perag_put(pag);
+        return 0;
+}
+/*
+ * Allocate an extent (variable-size).
+ * Depending on the allocation type, we either look in a single allocation
+ * group or loop over the allocation groups to find the result.
+ *
+ * On return args->fsbno holds the allocated block, or NULLFSBLOCK if
+ * nothing was allocated; the latter is not an error and 0 is still
+ * returned.  Inconsistent arguments are reported the same way
+ * (NULLFSBLOCK, return 0).  A non-zero return is an error from
+ * xfs_alloc_fix_freelist() or xfs_alloc_ag_vextent().
+ *
+ * The caller's type is saved in args->otype; args->type, args->agno,
+ * args->agbno and args->pag are rewritten as the search proceeds.  The
+ * perag reference taken for the AG being tried is dropped before moving
+ * on to the next AG and before returning.
+ */
+int                             /* error */
+xfs_alloc_vextent(
+        xfs_alloc_arg_t *args)  /* allocation argument structure */
+{
+        xfs_agblock_t   agsize; /* allocation group size */
+        int             error;
+        int             flags;  /* XFS_ALLOC_FLAG_... locking flags */
+        xfs_extlen_t    minleft;/* minimum left value, temp copy */
+        xfs_mount_t     *mp;    /* mount structure pointer */
+        xfs_agnumber_t  sagno;  /* starting allocation group number */
+        xfs_alloctype_t type;   /* input allocation type */
+        int             bump_rotor = 0; /* set when the start AG was taken from m_agfrotor */
+        int             no_min = 0;     /* set for the final pass that forces minleft to 0 */
+        xfs_agnumber_t  rotorstep = xfs_rotorstep; /* inode32 agf stepper */
+        mp = args->mp;
+        type = args->otype = args->type;
+        args->agbno = NULLAGBLOCK;
+        /*
+         * Just fix this up, for the case where the last a.g. is shorter
+         * (or there's only one a.g.) and the caller couldn't easily figure
+         * that out (xfs_bmap_alloc).
+         */
+        agsize = mp->m_sb.sb_agblocks;
+        if (args->maxlen > agsize)
+                args->maxlen = agsize;
+        if (args->alignment == 0)
+                args->alignment = 1;
+        ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
+        ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
+        ASSERT(args->minlen <= args->maxlen);
+        ASSERT(args->minlen <= agsize);
+        ASSERT(args->mod < args->prod);
+        if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
+            XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
+            args->minlen > args->maxlen || args->minlen > agsize ||
+            args->mod >= args->prod) {
+                args->fsbno = NULLFSBLOCK;
+                trace_xfs_alloc_vextent_badargs(args);
+                return 0;       /* bad arguments: "nothing allocated", not an error */
+        }
+        minleft = args->minleft;
+        switch (type) {
+        case XFS_ALLOCTYPE_THIS_AG:
+        case XFS_ALLOCTYPE_NEAR_BNO:
+        case XFS_ALLOCTYPE_THIS_BNO:
+                /*
+                 * These three force us into a single a.g.
+                 */
+                args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+                args->pag = xfs_perag_get(mp, args->agno);
+                args->minleft = 0;      /* zero only for the freelist fixup; restored below */
+                error = xfs_alloc_fix_freelist(args, 0);
+                args->minleft = minleft;
+                if (error) {
+                        trace_xfs_alloc_vextent_nofix(args);
+                        goto error0;
+                }
+                if (!args->agbp) {
+                        trace_xfs_alloc_vextent_noagbp(args);
+                        break;
+                }
+                args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+                if ((error = xfs_alloc_ag_vextent(args)))
+                        goto error0;
+                break;
+        case XFS_ALLOCTYPE_START_BNO:
+                /*
+                 * Try near allocation first, then anywhere-in-ag after
+                 * the first a.g. fails.
+                 */
+                if ((args->userdata  == XFS_ALLOC_INITIAL_USER_DATA) &&
+                    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
+                        args->fsbno = XFS_AGB_TO_FSB(mp,
+                                        ((mp->m_agfrotor / rotorstep) %
+                                        mp->m_sb.sb_agcount), 0);
+                        bump_rotor = 1;
+                }
+                args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+                args->type = XFS_ALLOCTYPE_NEAR_BNO;
+                /* FALLTHROUGH */
+        case XFS_ALLOCTYPE_ANY_AG:
+        case XFS_ALLOCTYPE_START_AG:
+        case XFS_ALLOCTYPE_FIRST_AG:
+                /*
+                 * Rotate through the allocation groups looking for a winner.
+                 */
+                if (type == XFS_ALLOCTYPE_ANY_AG) {
+                        /*
+                         * Start with the last place we left off.
+                         */
+                        args->agno = sagno = (mp->m_agfrotor / rotorstep) %
+                                        mp->m_sb.sb_agcount;
+                        args->type = XFS_ALLOCTYPE_THIS_AG;
+                        flags = XFS_ALLOC_FLAG_TRYLOCK;
+                } else if (type == XFS_ALLOCTYPE_FIRST_AG) {
+                        /*
+                         * Start with allocation group given by bno.
+                         */
+                        args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+                        args->type = XFS_ALLOCTYPE_THIS_AG;
+                        sagno = 0;
+                        flags = 0;
+                } else {
+                        if (type == XFS_ALLOCTYPE_START_AG)
+                                args->type = XFS_ALLOCTYPE_THIS_AG;
+                        /*
+                         * Start with the given allocation group.
+                         */
+                        args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+                        flags = XFS_ALLOC_FLAG_TRYLOCK;
+                }
+                /*
+                 * Loop over allocation groups twice; first time with
+                 * trylock set, second time without.
+                 * If the pass without trylock also comes back to the
+                 * starting a.g. empty-handed, one more pass is made with
+                 * minleft forced to 0 (no_min) before giving up.
+                 */
+                for (;;) {
+                        args->pag = xfs_perag_get(mp, args->agno);
+                        if (no_min) args->minleft = 0;
+                        error = xfs_alloc_fix_freelist(args, flags);
+                        args->minleft = minleft;
+                        if (error) {
+                                trace_xfs_alloc_vextent_nofix(args);
+                                goto error0;
+                        }
+                        /*
+                         * If we get a buffer back then the allocation will fly.
+                         */
+                        if (args->agbp) {
+                                if ((error = xfs_alloc_ag_vextent(args)))
+                                        goto error0;
+                                break;
+                        }
+                        trace_xfs_alloc_vextent_loopfailed(args);
+                        /*
+                         * Didn't work, figure out the next iteration.
+                         */
+                        if (args->agno == sagno &&
+                            type == XFS_ALLOCTYPE_START_BNO)
+                                args->type = XFS_ALLOCTYPE_THIS_AG;
+                        /*
+                         * For the first allocation, we can try any AG to get
+                         * space.  However, if we already have allocated a
+                         * block, we don't want to try AGs whose number is below
+                         * sagno. Otherwise, we may end up with out-of-order
+                         * locking of AGF, which might cause deadlock.
+                         */
+                        if (++(args->agno) == mp->m_sb.sb_agcount) {
+                                if (args->firstblock != NULLFSBLOCK)
+                                        args->agno = sagno;
+                                else
+                                        args->agno = 0;
+                        }
+                        /*
+                         * Reached the starting a.g., must either be done
+                         * or switch to non-trylock mode.
+                         */
+                        if (args->agno == sagno) {
+                                if (no_min == 1) {
+                                        args->agbno = NULLAGBLOCK;
+                                        trace_xfs_alloc_vextent_allfailed(args);
+                                        break;  /* pag reference is dropped after the switch */
+                                }
+                                if (flags == 0) {
+                                        no_min = 1;
+                                } else {
+                                        flags = 0;
+                                        if (type == XFS_ALLOCTYPE_START_BNO) {
+                                                args->agbno = XFS_FSB_TO_AGBNO(mp,
+                                                        args->fsbno);
+                                                args->type = XFS_ALLOCTYPE_NEAR_BNO;
+                                        }
+                                }
+                        }
+                        xfs_perag_put(args->pag);
+                }
+                if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
+                        if (args->agno == sagno)
+                                mp->m_agfrotor = (mp->m_agfrotor + 1) %
+                                        (mp->m_sb.sb_agcount * rotorstep);
+                        else
+                                mp->m_agfrotor = (args->agno * rotorstep + 1) %
+                                        (mp->m_sb.sb_agcount * rotorstep);
+                }
+                break;
+        default:
+                ASSERT(0);
+                /*
+                 * NOTREACHED
+                 * NOTE(review): if this is ever reached with ASSERT()
+                 * compiled out, args->pag has not been assigned by this
+                 * function before the xfs_perag_put() below - confirm no
+                 * caller can pass an unknown type.
+                 */
+        }
+        if (args->agbno == NULLAGBLOCK)
+                args->fsbno = NULLFSBLOCK;
+        else {
+                args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
+#ifdef DEBUG
+                ASSERT(args->len >= args->minlen);
+                ASSERT(args->len <= args->maxlen);
+                ASSERT(args->agbno % args->alignment == 0);
+                XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
+                        args->len);
+#endif
+        }
+        xfs_perag_put(args->pag);
+        return 0;
+error0:
+        xfs_perag_put(args->pag);
+        return error;
+}
+/*
+ * Free an extent.
+ * Just break up the extent address and hand off to xfs_free_ag_extent
+ * after fixing up the freelist.
+ *
+ * Returns 0 on success, -EFSCORRUPTED if the block number or the extent
+ * length does not fit the filesystem geometry / the AG length recorded in
+ * the AGF, or the error from xfs_alloc_fix_freelist() or
+ * xfs_free_ag_extent().  On success the extent is also inserted into the
+ * busy extent list.
+ */
+int                             /* error */
+xfs_free_extent(
+        xfs_trans_t     *tp,    /* transaction pointer */
+        xfs_fsblock_t   bno,    /* starting block number of extent */
+        xfs_extlen_t    len)    /* length of extent */
+{
+        xfs_alloc_arg_t args;
+        xfs_agblock_t   agf_length;     /* AG length taken from the AGF */
+        int             error;
+        ASSERT(len != 0);
+        memset(&args, 0, sizeof(xfs_alloc_arg_t));
+        args.tp = tp;
+        args.mp = tp->t_mountp;
+        /*
+         * validate that the block number is legal - this enables us to detect
+         * and handle a silent filesystem corruption rather than crashing.
+         */
+        args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
+        if (args.agno >= args.mp->m_sb.sb_agcount)
+                return -EFSCORRUPTED;
+        args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
+        if (args.agbno >= args.mp->m_sb.sb_agblocks)
+                return -EFSCORRUPTED;
+        args.pag = xfs_perag_get(args.mp, args.agno);
+        ASSERT(args.pag);
+        error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+        if (error)
+                goto error0;
+        /*
+         * validate the extent size is legal now we have the agf locked.
+         * The length is compared against the room left after agbno rather
+         * than forming agbno + len, so that a corrupt, very large len
+         * cannot wrap the sum around and slip past the check (assumes
+         * both are unsigned 32-bit quantities - confirm against the type
+         * definitions).
+         */
+        agf_length = be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length);
+        if (args.agbno > agf_length || len > agf_length - args.agbno) {
+                error = -EFSCORRUPTED;
+                goto error0;
+        }
+        error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
+        if (!error)
+                xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
+error0:
+        xfs_perag_put(args.pag);
+        return error;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
new file mode 100644
index 000000000000..feacb061bab7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ALLOC_H__
+#define __XFS_ALLOC_H__
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_trans;
+extern struct workqueue_struct *xfs_alloc_wq;
+/*
+ * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
+ *
+ * Each type is a distinct single-bit value.  XFS_ALLOC_TYPES further down
+ * expands to a list of { value, "name" } pairs, one per type - presumably
+ * consumed by the tracing code mentioned below; confirm.
+ */
+#define XFS_ALLOCTYPE_ANY_AG    0x01    /* allocate anywhere, use rotor */
+#define XFS_ALLOCTYPE_FIRST_AG  0x02    /* ... start at ag 0 */
+#define XFS_ALLOCTYPE_START_AG  0x04    /* anywhere, start in this a.g. */
+#define XFS_ALLOCTYPE_THIS_AG   0x08    /* anywhere in this a.g. */
+#define XFS_ALLOCTYPE_START_BNO 0x10    /* near this block else anywhere */
+#define XFS_ALLOCTYPE_NEAR_BNO  0x20    /* in this a.g. and near this block */
+#define XFS_ALLOCTYPE_THIS_BNO  0x40    /* at exactly this block */
+/* this should become an enum again when the tracing code is fixed */
+typedef unsigned int xfs_alloctype_t;
+#define XFS_ALLOC_TYPES \
+        { XFS_ALLOCTYPE_ANY_AG,         "ANY_AG" }, \
+        { XFS_ALLOCTYPE_FIRST_AG,       "FIRST_AG" }, \
+        { XFS_ALLOCTYPE_START_AG,       "START_AG" }, \
+        { XFS_ALLOCTYPE_THIS_AG,        "THIS_AG" }, \
+        { XFS_ALLOCTYPE_START_BNO,      "START_BNO" }, \
+        { XFS_ALLOCTYPE_NEAR_BNO,       "NEAR_BNO" }, \
+        { XFS_ALLOCTYPE_THIS_BNO,       "THIS_BNO" }
+/*
+ * Flags for xfs_alloc_fix_freelist.
+ * The same flag word is also accepted by xfs_alloc_read_agf(), which maps
+ * XFS_ALLOC_FLAG_TRYLOCK to XBF_TRYLOCK for the AGF buffer read.
+ */
+#define XFS_ALLOC_FLAG_TRYLOCK  0x00000001  /* use trylock for buffer locking */
+#define XFS_ALLOC_FLAG_FREEING  0x00000002  /* indicate caller is freeing extents */
+/*
+ * In order to avoid ENOSPC-related deadlock caused by
+ * out-of-order locking of AGF buffer (PV 947395), we place
+ * constraints on the relationship among actual allocations for
+ * data blocks, freelist blocks, and potential file data bmap
+ * btree blocks. However, these restrictions may result in no
+ * actual space allocated for a delayed extent, for example, a data
+ * block in a certain AG is allocated but there is no additional
+ * block for the additional bmap btree block due to a split of the
+ * bmap btree of the file. The result of this may lead to an
+ * infinite loop in xfssyncd when the file gets flushed to disk and
+ * all delayed extents need to be actually allocated. To get around
+ * this, we explicitly set aside a few blocks which will not be
+ * reserved in delayed allocation. Considering the minimum number of
+ * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap
+ * btree requires 1 fsb, so we set the number of set-aside blocks
+ * to 4 + 4*agcount.
+ *
+ * NOTE(review): the text above budgets 1 fsb for the bmap btree split but
+ * the constant term of the macro is 4 - confirm which one is intended.
+ */
+#define XFS_ALLOC_SET_ASIDE(mp)  (4 + ((mp)->m_sb.sb_agcount * 4))
+/*
+ * When deciding how much space to allocate out of an AG, we limit the
+ * allocation maximum size to the size of the AG. However, we cannot use all
+ * the blocks in the AG - some are permanently used by metadata. These
+ * blocks are generally:
+ *      - the AG superblock, AGF, AGI and AGFL
+ *      - the AGF (bno and cnt) and AGI btree root blocks
+ *      - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
+ *
+ * The AG headers are sector sized, so the amount of space they take up is
+ * dependent on filesystem geometry. The others are all single blocks.
+ *
+ * In the macro, the four headers are counted as 4 sectors converted to
+ * filesystem blocks, and the constant 7 is presumably the 3 btree root
+ * blocks plus the 4 AGFL blocks listed above.
+ */
+#define XFS_ALLOC_AG_MAX_USABLE(mp)     \
+        ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
+/*
+ * Argument structure for xfs_alloc routines.
+ * This is turned into a structure to avoid having 20 arguments passed
+ * down several levels of the stack.
+ *
+ * xfs_alloc_vextent() reads fsbno as the requested location and overwrites
+ * it with the result (NULLFSBLOCK when nothing was allocated).  It saves
+ * the caller's type in otype, clamps maxlen to the AG size, turns an
+ * alignment of 0 into 1, and rewrites type, agno, agbno and pag while it
+ * searches.
+ */
+typedef struct xfs_alloc_arg {
+        struct xfs_trans *tp;           /* transaction pointer */
+        struct xfs_mount *mp;           /* file system mount point */
+        struct xfs_buf  *agbp;          /* buffer for a.g. freelist header */
+        struct xfs_perag *pag;          /* per-ag struct for this agno */
+        xfs_fsblock_t   fsbno;          /* file system block number */
+        xfs_agnumber_t  agno;           /* allocation group number */
+        xfs_agblock_t   agbno;          /* allocation group-relative block # */
+        xfs_extlen_t    minlen;         /* minimum size of extent */
+        xfs_extlen_t    maxlen;         /* maximum size of extent */
+        xfs_extlen_t    mod;            /* mod value for extent size */
+        xfs_extlen_t    prod;           /* prod value for extent size */
+        xfs_extlen_t    minleft;        /* min blocks must be left after us */
+        xfs_extlen_t    total;          /* total blocks needed in xaction */
+        xfs_extlen_t    alignment;      /* align answer to multiple of this */
+        xfs_extlen_t    minalignslop;   /* slop for minlen+alignment calcs */
+        xfs_extlen_t    len;            /* output: actual size of extent */
+        xfs_alloctype_t type;           /* allocation type XFS_ALLOCTYPE_... */
+        xfs_alloctype_t otype;          /* original allocation type */
+        char            wasdel;         /* set if allocation was prev delayed */
+        char            wasfromfl;      /* set if allocation is from freelist */
+        char            isfl;           /* set if is freelist blocks - !acctg */
+        char            userdata;       /* set if this is user data (XFS_ALLOC_..._USER...) */
+        xfs_fsblock_t   firstblock;     /* io first block allocated */
+} xfs_alloc_arg_t;
+/*
+ * Defines for userdata
+ * (values of the userdata field of struct xfs_alloc_arg).
+ */
+#define XFS_ALLOC_USERDATA              1       /* allocation is for user data */
+#define XFS_ALLOC_INITIAL_USER_DATA     2       /* special case start of file */
+/*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+                struct xfs_perag *pag);
+/*
+ * Compute and fill in value of m_ag_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+        struct xfs_mount        *mp);   /* file system mount structure */
+/*
+ * Get a block from the freelist.
+ * The block number is returned in *bnop; it is set to NULLAGBLOCK if the
+ * freelist is empty.
+ */
+int                             /* error */
+xfs_alloc_get_freelist(
+        struct xfs_trans *tp,   /* transaction pointer */
+        struct xfs_buf  *agbp,  /* buffer containing the agf structure */
+        xfs_agblock_t   *bnop,  /* block address retrieved from freelist */
+        int             btreeblk); /* destination is a AGF btree */
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+        struct xfs_trans *tp,   /* transaction pointer */
+        struct xfs_buf  *bp,    /* buffer for a.g. freelist header */
+        int             fields);/* mask of fields to be logged (XFS_AGF_...) */
+/*
+ * Interface for inode allocation to force the pag data to be initialized.
+ */
+int                             /* error */
+xfs_alloc_pagf_init(
+        struct xfs_mount *mp,   /* file system mount structure */
+        struct xfs_trans *tp,   /* transaction pointer */
+        xfs_agnumber_t  agno,   /* allocation group number */
+        int             flags); /* XFS_ALLOC_FLAGS_... */
+/*
+ * Put the block on the freelist for the allocation group.
+ */
+int                             /* error */
+xfs_alloc_put_freelist(
+        struct xfs_trans *tp,   /* transaction pointer */
+        struct xfs_buf  *agbp,  /* buffer for a.g. freelist header */
+        struct xfs_buf  *agflbp,/* buffer for a.g. free block array */
+        xfs_agblock_t   bno,    /* block being freed */
+        int             btreeblk); /* owner was a AGF btree */
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int                                     /* error  */
+xfs_alloc_read_agf(
+        struct xfs_mount *mp,           /* mount point structure */
+        struct xfs_trans *tp,           /* transaction pointer */
+        xfs_agnumber_t  agno,           /* allocation group number */
+        int             flags,          /* XFS_ALLOC_FLAG_... */
+        struct xfs_buf  **bpp);         /* buffer for the ag freelist header */
+/*
+ * Allocate an extent (variable-size).
+ */
+int                             /* error */
+xfs_alloc_vextent(
+        xfs_alloc_arg_t *args); /* allocation argument structure */
+/*
+ * Free an extent.
+ */
+int                             /* error */
+xfs_free_extent(
+        struct xfs_trans *tp,   /* transaction pointer */
+        xfs_fsblock_t   bno,    /* starting block number of extent */
+        xfs_extlen_t    len);   /* length of extent */
+int                                     /* error */
+xfs_alloc_lookup_le(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_agblock_t           bno,    /* starting block of extent */
+        xfs_extlen_t            len,    /* length of extent */
+        int                     *stat); /* success/failure */
+int                             /* error */
+xfs_alloc_lookup_ge(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_agblock_t           bno,    /* starting block of extent */
+        xfs_extlen_t            len,    /* length of extent */
+        int                     *stat); /* success/failure */
+int                                     /* error */
+xfs_alloc_get_rec(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_agblock_t           *bno,   /* output: starting block of extent */
+        xfs_extlen_t            *len,   /* output: length of extent */
+        int                     *stat); /* output: success/failure */
+#endif  /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
new file mode 100644
index 000000000000..e0e83e24d3ef
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -0,0 +1,504 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+/*
+ * Build a second cursor for the same btree as an existing cursor.  The mount,
+ * transaction, AGF buffer, AG number and btree type are all taken from the
+ * cursor being duplicated.
+ */
+STATIC struct xfs_btree_cur *
+xfs_allocbt_dup_cursor(
+        struct xfs_btree_cur    *cur)
+{
+        struct xfs_buf          *agf_bp = cur->bc_private.a.agbp;
+        xfs_agnumber_t          ag = cur->bc_private.a.agno;
+        return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, agf_bp, ag,
+                                       cur->bc_btnum);
+}
+/*
+ * Record a new root block for the cursor's btree in the AGF and adjust the
+ * stored tree height by inc.  The height is kept both in the on-disk AGF and
+ * in the in-core per-AG structure; the AGF root and level fields are logged
+ * afterwards.
+ */
+STATIC void
+xfs_allocbt_set_root(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *ptr,
+        int                     inc)
+{
+        struct xfs_buf          *agf_bp = cur->bc_private.a.agbp;
+        struct xfs_agf          *hdr = XFS_BUF_TO_AGF(agf_bp);
+        int                     which = cur->bc_btnum;
+        struct xfs_perag        *perag;
+        perag = xfs_perag_get(cur->bc_mp, be32_to_cpu(hdr->agf_seqno));
+        ASSERT(ptr->s != 0);
+        /* on-disk AGF: new root pointer, height moved by inc */
+        hdr->agf_roots[which] = ptr->s;
+        be32_add_cpu(&hdr->agf_levels[which], inc);
+        /* in-core copy of the height follows the on-disk one */
+        perag->pagf_levels[which] += inc;
+        xfs_perag_put(perag);
+        xfs_alloc_log_agf(cur->bc_tp, agf_bp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+/*
+ * Get one block for the btree from the AG freelist.
+ *
+ * Returns a nonzero error if the freelist call fails.  Otherwise returns 0
+ * with *stat set to 1 and the block number stored in new, or with *stat set
+ * to 0 when the freelist handed back NULLAGBLOCK.  The start argument is not
+ * used here.
+ */
+STATIC int
+xfs_allocbt_alloc_block(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *start,
+        union xfs_btree_ptr     *new,
+        int                     *stat)
+{
+        xfs_agblock_t           agbno;
+        int                     err;
+        int                     got;
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        /* The block has to come off the freelist; a failure here is fatal. */
+        err = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+                                     &agbno, 1);
+        if (err) {
+                XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+                return err;
+        }
+        got = (agbno != NULLAGBLOCK);
+        if (got) {
+                xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno,
+                                      agbno, 1, false);
+                xfs_trans_agbtree_delta(cur->bc_tp, 1);
+                new->s = cpu_to_be32(agbno);
+        }
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = got;
+        return 0;
+}
+/*
+ * Give a btree block back to the AG freelist.  On success the block is also
+ * entered as a busy extent with XFS_EXTENT_BUSY_SKIP_DISCARD, the
+ * transaction's btree block delta is decremented and the buffer is
+ * invalidated.  Returns 0 or the error from the freelist call.
+ */
+STATIC int
+xfs_allocbt_free_block(
+        struct xfs_btree_cur    *cur,
+        struct xfs_buf          *bp)
+{
+        struct xfs_buf          *agf_bp = cur->bc_private.a.agbp;
+        struct xfs_agf          *hdr = XFS_BUF_TO_AGF(agf_bp);
+        xfs_agblock_t           agbno;
+        int                     err;
+        agbno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
+        err = xfs_alloc_put_freelist(cur->bc_tp, agf_bp, NULL, agbno, 1);
+        if (!err) {
+                xfs_extent_busy_insert(cur->bc_tp,
+                                       be32_to_cpu(hdr->agf_seqno), agbno, 1,
+                                       XFS_EXTENT_BUSY_SKIP_DISCARD);
+                xfs_trans_agbtree_delta(cur->bc_tp, -1);
+                xfs_trans_binval(cur->bc_tp, bp);
+        }
+        return err;
+}
+/*
+ * Keep the "longest extent" value of the AGF, and its in-core copy in the
+ * per-AG structure, in step with the by-count btree.  The reason argument
+ * says which operation triggered the call; when the operation did not touch
+ * the last record the function returns without changing anything.
+ */
+STATIC void
+xfs_allocbt_update_lastrec(
+        struct xfs_btree_cur    *cur,
+        struct xfs_btree_block  *block,
+        union xfs_btree_rec     *rec,
+        int                     ptr,
+        int                     reason)
+{
+        struct xfs_buf          *agf_bp = cur->bc_private.a.agbp;
+        struct xfs_agf          *hdr = XFS_BUF_TO_AGF(agf_bp);
+        struct xfs_perag        *perag;
+        __be32                  longest;
+        int                     nrecs;
+        ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+        if (reason == LASTREC_UPDATE) {
+                /*
+                 * Only an update of the final record of the block changes
+                 * the size of the longest extent in the AG.
+                 */
+                if (ptr != xfs_btree_get_numrecs(block))
+                        return;
+                longest = rec->alloc.ar_blockcount;
+        } else if (reason == LASTREC_INSREC) {
+                /* an insert matters only if it beats the current longest */
+                if (be32_to_cpu(rec->alloc.ar_blockcount) <=
+                    be32_to_cpu(hdr->agf_longest))
+                        return;
+                longest = rec->alloc.ar_blockcount;
+        } else if (reason == LASTREC_DELREC) {
+                nrecs = xfs_btree_get_numrecs(block);
+                if (ptr <= nrecs)
+                        return;
+                ASSERT(ptr == nrecs + 1);
+                /* the new last record, if any, supplies the length */
+                longest = 0;
+                if (nrecs)
+                        longest = XFS_ALLOC_REC_ADDR(cur->bc_mp, block,
+                                                     nrecs)->ar_blockcount;
+        } else {
+                ASSERT(0);
+                return;
+        }
+        hdr->agf_longest = longest;
+        perag = xfs_perag_get(cur->bc_mp, be32_to_cpu(hdr->agf_seqno));
+        perag->pagf_longest = be32_to_cpu(longest);
+        xfs_perag_put(perag);
+        xfs_alloc_log_agf(cur->bc_tp, agf_bp, XFS_AGF_LONGEST);
+}
+/*
+ * Minimum record count for a block at the given level: slot 0 of the mount's
+ * m_alloc_mnr table for level 0, slot 1 for every other level.
+ */
+STATIC int
+xfs_allocbt_get_minrecs(
+        struct xfs_btree_cur    *cur,
+        int                     level)
+{
+        struct xfs_mount        *mp = cur->bc_mp;
+        if (level == 0)
+                return mp->m_alloc_mnr[0];
+        return mp->m_alloc_mnr[1];
+}
+/*
+ * Maximum record count for a block at the given level: slot 0 of the mount's
+ * m_alloc_mxr table for level 0, slot 1 for every other level.
+ */
+STATIC int
+xfs_allocbt_get_maxrecs(
+        struct xfs_btree_cur    *cur,
+        int                     level)
+{
+        struct xfs_mount        *mp = cur->bc_mp;
+        if (level == 0)
+                return mp->m_alloc_mxr[0];
+        return mp->m_alloc_mxr[1];
+}
+/*
+ * Fill in a key from a record by copying the start block and block count
+ * unchanged (both stay in on-disk byte order).
+ */
+STATIC void
+xfs_allocbt_init_key_from_rec(
+        union xfs_btree_key     *key,
+        union xfs_btree_rec     *rec)
+{
+        __be32                  startblock = rec->alloc.ar_startblock;
+        __be32                  blockcount = rec->alloc.ar_blockcount;
+        ASSERT(startblock != 0);
+        key->alloc.ar_startblock = startblock;
+        key->alloc.ar_blockcount = blockcount;
+}
+/*
+ * Fill in a record from a key by copying the start block and block count
+ * unchanged (both stay in on-disk byte order).
+ */
+STATIC void
+xfs_allocbt_init_rec_from_key(
+        union xfs_btree_key     *key,
+        union xfs_btree_rec     *rec)
+{
+        __be32                  startblock = key->alloc.ar_startblock;
+        __be32                  blockcount = key->alloc.ar_blockcount;
+        ASSERT(startblock != 0);
+        rec->alloc.ar_startblock = startblock;
+        rec->alloc.ar_blockcount = blockcount;
+}
+/*
+ * Fill in an on-disk record from the in-core record held in the cursor,
+ * converting both fields to big-endian.
+ */
+STATIC void
+xfs_allocbt_init_rec_from_cur(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_rec     *rec)
+{
+        xfs_alloc_rec_incore_t  *irec = &cur->bc_rec.a;
+        ASSERT(irec->ar_startblock != 0);
+        rec->alloc.ar_startblock = cpu_to_be32(irec->ar_startblock);
+        rec->alloc.ar_blockcount = cpu_to_be32(irec->ar_blockcount);
+}
+/*
+ * Set ptr to the root block of the cursor's btree, as recorded in the AGF
+ * attached to the cursor.
+ */
+STATIC void
+xfs_allocbt_init_ptr_from_cur(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *ptr)
+{
+        struct xfs_agf          *hdr = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+        int                     which = cur->bc_btnum;
+        ASSERT(cur->bc_private.a.agno == be32_to_cpu(hdr->agf_seqno));
+        ASSERT(hdr->agf_roots[which] != 0);
+        ptr->s = hdr->agf_roots[which];
+}
+/*
+ * Signed difference between an on-disk key and the in-core record held in
+ * the cursor (key minus cursor record).  A by-block cursor compares start
+ * blocks only; any other cursor compares block counts first and uses the
+ * start blocks to break a tie.
+ */
+STATIC __int64_t
+xfs_allocbt_key_diff(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_key     *key)
+{
+        xfs_alloc_rec_incore_t  *want = &cur->bc_rec.a;
+        __int64_t               startdiff;
+        __int64_t               countdiff;
+        startdiff = (__int64_t)be32_to_cpu(key->alloc.ar_startblock) -
+                        want->ar_startblock;
+        if (cur->bc_btnum == XFS_BTNUM_BNO)
+                return startdiff;
+        countdiff = (__int64_t)be32_to_cpu(key->alloc.ar_blockcount) -
+                        want->ar_blockcount;
+        return countdiff ? countdiff : startdiff;
+}
+/*
+ * Header checks that apply only to the CRC-enabled block format: the
+ * superblock must have CRCs enabled, and the uuid, block number and (when a
+ * perag is attached to the buffer) the owner must match.
+ */
+static bool
+xfs_allocbt_crc_hdr_ok(
+        struct xfs_buf          *bp,
+        struct xfs_mount        *mp,
+        struct xfs_btree_block  *block,
+        struct xfs_perag        *pag)
+{
+        if (!xfs_sb_version_hascrc(&mp->m_sb))
+                return false;
+        if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+                return false;
+        if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+                return false;
+        return !pag || be32_to_cpu(block->bb_u.s.bb_owner) == pag->pag_agno;
+}
+/*
+ * Level check: against the height recorded in an initialised perag when one
+ * is available, otherwise against the maximum tree depth of the mount.
+ */
+static bool
+xfs_allocbt_level_ok(
+        struct xfs_mount        *mp,
+        struct xfs_perag        *pag,
+        unsigned int            level,
+        int                     btree_idx)
+{
+        if (pag && pag->pagf_init)
+                return level < pag->pagf_levels[btree_idx];
+        return level < mp->m_ag_maxlevels;
+}
+/*
+ * A sibling pointer is acceptable when it is non-zero and is either
+ * NULLAGBLOCK or a block number below the AG size.
+ */
+static bool
+xfs_allocbt_sibling_ok(
+        struct xfs_mount        *mp,
+        __be32                  sib)
+{
+        if (!sib)
+                return false;
+        if (sib == cpu_to_be32(NULLAGBLOCK))
+                return true;
+        return be32_to_cpu(sib) < mp->m_sb.sb_agblocks;
+}
+/*
+ * Structural check of a free space btree block held in bp.  Returns true
+ * when the magic number, header, level, record count and sibling pointers
+ * are all acceptable, false otherwise.
+ */
+static bool
+xfs_allocbt_verify(
+        struct xfs_buf          *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+        struct xfs_perag        *pag = bp->b_pag;
+        unsigned int            level = be16_to_cpu(block->bb_level);
+        /*
+         * magic number and level verification
+         *
+         * During growfs operations, we can't verify the exact level or owner as
+         * the perag is not fully initialised and hence not attached to the
+         * buffer.  In this case, check against the maximum tree depth.
+         *
+         * Similarly, during log recovery we will have a perag structure
+         * attached, but the agf information will not yet have been initialised
+         * from the on disk AGF. Again, we can only check against maximum limits
+         * in this case.
+         */
+        switch (block->bb_magic) {
+        case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
+                if (!xfs_allocbt_crc_hdr_ok(bp, mp, block, pag))
+                        return false;
+                /* fall through */
+        case cpu_to_be32(XFS_ABTB_MAGIC):
+                if (!xfs_allocbt_level_ok(mp, pag, level, XFS_BTNUM_BNOi))
+                        return false;
+                break;
+        case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
+                if (!xfs_allocbt_crc_hdr_ok(bp, mp, block, pag))
+                        return false;
+                /* fall through */
+        case cpu_to_be32(XFS_ABTC_MAGIC):
+                if (!xfs_allocbt_level_ok(mp, pag, level, XFS_BTNUM_CNTi))
+                        return false;
+                break;
+        default:
+                return false;
+        }
+        /* numrecs verification */
+        if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
+                return false;
+        /* sibling pointer verification */
+        if (!xfs_allocbt_sibling_ok(mp, block->bb_u.s.bb_leftsib))
+                return false;
+        return xfs_allocbt_sibling_ok(mp, block->bb_u.s.bb_rightsib);
+}
+/*
+ * Read-side verifier: check the block CRC first and, only if that passes,
+ * the block structure.  A failure is recorded on the buffer as -EFSBADCRC or
+ * -EFSCORRUPTED.  Any error present on the buffer afterwards is traced and
+ * reported.
+ */
+static void
+xfs_allocbt_read_verify(
+        struct xfs_buf  *bp)
+{
+        int             err = 0;
+        if (!xfs_btree_sblock_verify_crc(bp))
+                err = -EFSBADCRC;
+        else if (!xfs_allocbt_verify(bp))
+                err = -EFSCORRUPTED;
+        if (err)
+                xfs_buf_ioerror(bp, err);
+        if (!bp->b_error)
+                return;
+        trace_xfs_btree_corrupt(bp, _RET_IP_);
+        xfs_verifier_error(bp);
+}
+/*
+ * Write-side verifier: a block that passes the structure check gets its CRC
+ * recalculated; a block that fails is traced, flagged -EFSCORRUPTED on the
+ * buffer and reported, and no CRC is calculated.
+ */
+static void
+xfs_allocbt_write_verify(
+        struct xfs_buf  *bp)
+{
+        if (xfs_allocbt_verify(bp)) {
+                xfs_btree_sblock_calc_crc(bp);
+                return;
+        }
+        trace_xfs_btree_corrupt(bp, _RET_IP_);
+        xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        xfs_verifier_error(bp);
+}
+const struct xfs_buf_ops xfs_allocbt_buf_ops = {        /* buffer verifiers for free space btree blocks */
+        .verify_read = xfs_allocbt_read_verify,         /* CRC check, then structure check */
+        .verify_write = xfs_allocbt_write_verify,       /* structure check, then CRC recalculation */
+};
+#if defined(DEBUG) || defined(XFS_WARN)
+/*
+ * Return nonzero when key k1 sorts strictly before key k2.  A by-block
+ * cursor orders by start block; any other cursor orders by block count and
+ * uses the start block to break a tie.
+ */
+STATIC int
+xfs_allocbt_keys_inorder(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_key     *k1,
+        union xfs_btree_key     *k2)
+{
+        xfs_agblock_t           bno1 = be32_to_cpu(k1->alloc.ar_startblock);
+        xfs_agblock_t           bno2 = be32_to_cpu(k2->alloc.ar_startblock);
+        xfs_extlen_t            len1;
+        xfs_extlen_t            len2;
+        if (cur->bc_btnum == XFS_BTNUM_BNO)
+                return bno1 < bno2;
+        len1 = be32_to_cpu(k1->alloc.ar_blockcount);
+        len2 = be32_to_cpu(k2->alloc.ar_blockcount);
+        if (len1 != len2)
+                return len1 < len2;
+        return bno1 < bno2;
+}
+/*
+ * Return nonzero when record r1 is correctly ordered before record r2.  For
+ * a by-block cursor r1 must end at or before the start of r2; for any other
+ * cursor the order is by block count, with the start block breaking a tie.
+ */
+STATIC int
+xfs_allocbt_recs_inorder(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_rec     *r1,
+        union xfs_btree_rec     *r2)
+{
+        xfs_agblock_t           bno1 = be32_to_cpu(r1->alloc.ar_startblock);
+        xfs_agblock_t           bno2 = be32_to_cpu(r2->alloc.ar_startblock);
+        xfs_extlen_t            len1 = be32_to_cpu(r1->alloc.ar_blockcount);
+        xfs_extlen_t            len2;
+        if (cur->bc_btnum == XFS_BTNUM_BNO)
+                return bno1 + len1 <= bno2;
+        len2 = be32_to_cpu(r2->alloc.ar_blockcount);
+        if (len1 != len2)
+                return len1 < len2;
+        return bno1 < bno2;
+}
+#endif  /* DEBUG || XFS_WARN */
+static const struct xfs_btree_ops xfs_allocbt_ops = {   /* installed as bc_ops by xfs_allocbt_init_cursor() */
+        .rec_len                = sizeof(xfs_alloc_rec_t),
+        .key_len                = sizeof(xfs_alloc_key_t),
+        .dup_cursor             = xfs_allocbt_dup_cursor,
+        .set_root               = xfs_allocbt_set_root,
+        .alloc_block            = xfs_allocbt_alloc_block,      /* takes blocks from the AG freelist */
+        .free_block             = xfs_allocbt_free_block,       /* returns blocks to the AG freelist */
+        .update_lastrec         = xfs_allocbt_update_lastrec,   /* asserts an XFS_BTNUM_CNT cursor */
+        .get_minrecs            = xfs_allocbt_get_minrecs,
+        .get_maxrecs            = xfs_allocbt_get_maxrecs,
+        .init_key_from_rec      = xfs_allocbt_init_key_from_rec,
+        .init_rec_from_key      = xfs_allocbt_init_rec_from_key,
+        .init_rec_from_cur      = xfs_allocbt_init_rec_from_cur,
+        .init_ptr_from_cur      = xfs_allocbt_init_ptr_from_cur,
+        .key_diff               = xfs_allocbt_key_diff,
+        .buf_ops                = &xfs_allocbt_buf_ops,         /* read/write verifiers */
+#if defined(DEBUG) || defined(XFS_WARN)
+        .keys_inorder           = xfs_allocbt_keys_inorder,     /* ordering checks, only built under DEBUG or XFS_WARN */
+        .recs_inorder           = xfs_allocbt_recs_inorder,
+#endif
+};
+/*
+ * Allocate and set up a cursor for one of the two free space btrees of an
+ * allocation group.  The tree height is read from the AGF in agbp.  A
+ * by-count cursor additionally gets XFS_BTREE_LASTREC_UPDATE, and
+ * XFS_BTREE_CRC_BLOCKS is set when the superblock has CRCs enabled.
+ */
+struct xfs_btree_cur *                  /* new alloc btree cursor */
+xfs_allocbt_init_cursor(
+        struct xfs_mount        *mp,            /* file system mount point */
+        struct xfs_trans        *tp,            /* transaction pointer */
+        struct xfs_buf          *agbp,          /* buffer for agf structure */
+        xfs_agnumber_t          agno,           /* allocation group number */
+        xfs_btnum_t             btnum)          /* btree identifier */
+{
+        struct xfs_agf          *hdr = XFS_BUF_TO_AGF(agbp);
+        struct xfs_btree_cur    *newcur;
+        ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
+        /* zeroed allocation: fields not assigned below start out as 0 */
+        newcur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+        newcur->bc_mp = mp;
+        newcur->bc_tp = tp;
+        newcur->bc_ops = &xfs_allocbt_ops;
+        newcur->bc_btnum = btnum;
+        newcur->bc_blocklog = mp->m_sb.sb_blocklog;
+        newcur->bc_private.a.agbp = agbp;
+        newcur->bc_private.a.agno = agno;
+        if (btnum != XFS_BTNUM_CNT) {
+                newcur->bc_nlevels =
+                        be32_to_cpu(hdr->agf_levels[XFS_BTNUM_BNO]);
+        } else {
+                newcur->bc_nlevels =
+                        be32_to_cpu(hdr->agf_levels[XFS_BTNUM_CNT]);
+                newcur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
+        }
+        if (xfs_sb_version_hascrc(&mp->m_sb))
+                newcur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+        return newcur;
+}
+/*
+ * Calculate how many entries fit in an alloc btree block of blocklen bytes
+ * once the block header has been subtracted.  With leaf set an entry is one
+ * record; otherwise an entry is one key plus one pointer.
+ */
+int
+xfs_allocbt_maxrecs(
+        struct xfs_mount        *mp,
+        int                     blocklen,
+        int                     leaf)
+{
+        int                     payload = blocklen - XFS_ALLOC_BLOCK_LEN(mp);
+        size_t                  entsize;
+        if (leaf)
+                entsize = sizeof(xfs_alloc_rec_t);
+        else
+                entsize = sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t);
+        return payload / entsize;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
new file mode 100644
index 000000000000..45e189e7e81c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ALLOC_BTREE_H__
+#define __XFS_ALLOC_BTREE_H__
+/*
+ * Freespace on-disk structures
+ */
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+/*
+ * Btree block header size depends on a superblock flag.
+ *
+ * When xfs_sb_version_hascrc() is true for the mount's superblock the header
+ * is XFS_BTREE_SBLOCK_CRC_LEN bytes long, otherwise XFS_BTREE_SBLOCK_LEN.
+ */
+#define XFS_ALLOC_BLOCK_LEN(mp) \
+        (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+                XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ *
+ * index is 1-based: index 1 addresses the first entry, which starts
+ * XFS_ALLOC_BLOCK_LEN(mp) bytes into the block.  For XFS_ALLOC_PTR_ADDR the
+ * pointer array begins after space for maxrecs keys.
+ */
+#define XFS_ALLOC_REC_ADDR(mp, block, index) \
+        ((xfs_alloc_rec_t *) \
+                ((char *)(block) + \
+                 XFS_ALLOC_BLOCK_LEN(mp) + \
+                 (((index) - 1) * sizeof(xfs_alloc_rec_t))))
+#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
+        ((xfs_alloc_key_t *) \
+                ((char *)(block) + \
+                 XFS_ALLOC_BLOCK_LEN(mp) + \
+                 ((index) - 1) * sizeof(xfs_alloc_key_t)))
+#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
+        ((xfs_alloc_ptr_t *) \
+                ((char *)(block) + \
+                 XFS_ALLOC_BLOCK_LEN(mp) + \
+                 (maxrecs) * sizeof(xfs_alloc_key_t) + \
+                 ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,        /* mp */
+                struct xfs_trans *, struct xfs_buf *,           /* tp, agbp (buffer for agf structure) */
+                xfs_agnumber_t, xfs_btnum_t);                   /* agno, btnum */
+extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);   /* mp, blocklen, leaf */
+#endif  /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
new file mode 100644
index 000000000000..353fb425faef
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -0,0 +1,1459 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_dinode.h"
+/*
+ * xfs_attr.c
+ *
+ * Provide the external interfaces to manage attribute lists.
+ */
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+/*
+ * Internal routines when attribute list fits inside the inode.
+ */
+STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
+/*
+ * Internal routines when attribute list is one block.
+ */
+STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+/*
+ * Internal routines when attribute list is more than one block.
+ */
+STATIC int xfs_attr_node_get(xfs_da_args_t *args);
+STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
+STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
+STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
+/*
+ * Prepare a da_args structure for an attribute operation on inode dp.
+ *
+ * The structure is zeroed and then filled with the attr geometry, the
+ * attribute fork selector, the inode, the flags, and the name together with
+ * its length and hash value.  Returns -EINVAL for a NULL name (args is left
+ * untouched), -EFAULT when the name is MAXNAMELEN bytes or longer, and 0
+ * otherwise.
+ */
+STATIC int
+xfs_attr_args_init(
+        struct xfs_da_args      *args,
+        struct xfs_inode        *dp,
+        const unsigned char     *name,
+        int                     flags)
+{
+        if (name == NULL)
+                return -EINVAL;
+        memset(args, 0, sizeof(*args));
+        args->geo = dp->i_mount->m_attr_geo;
+        args->whichfork = XFS_ATTR_FORK;
+        args->dp = dp;
+        args->flags = flags;
+        args->name = name;
+        args->namelen = strlen((const char *)name);
+        if (args->namelen < MAXNAMELEN) {
+                args->hashval = xfs_da_hashname(args->name, args->namelen);
+                return 0;
+        }
+        return -EFAULT;         /* match IRIX behaviour */
+}
+/*
+ * Return 1 when the inode can hold attributes worth looking up, 0 when it
+ * cannot: either XFS_IFORK_Q() is false for it, or its attribute fork is in
+ * extents format with no extents.
+ */
+int
+xfs_inode_hasattr(
+        struct xfs_inode        *ip)
+{
+        if (!XFS_IFORK_Q(ip))
+                return 0;
+        if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)
+                return 1;
+        return ip->i_d.di_anextents != 0;
+}
+/*========================================================================
+ * Overall external interface routines.
+ *========================================================================*/
+/*
+ * Look up the attribute called name on inode ip.
+ *
+ * value and *valuelenp are handed to the lookup through the da_args
+ * structure; once a lookup has run, *valuelenp is rewritten from
+ * args.valuelen.  The lookup routine is chosen from the attribute fork
+ * format while the inode is held with the lock mode returned by
+ * xfs_ilock_attr_map_shared().
+ *
+ * Returns -EIO on a shut down filesystem, -ENOATTR when the inode has no
+ * attributes, and an argument setup error as is.  An -EEXIST result from
+ * the lookup is reported as 0; every other lookup result is passed through.
+ */
+int
+xfs_attr_get(
+        struct xfs_inode        *ip,
+        const unsigned char     *name,
+        unsigned char           *value,
+        int                     *valuelenp,
+        int                     flags)
+{
+        struct xfs_da_args      da;
+        uint                    ilock;
+        int                     rc;
+        XFS_STATS_INC(xs_attr_get);
+        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+                return -EIO;
+        /* unlocked check: skip the setup for inodes without attributes */
+        if (!xfs_inode_hasattr(ip))
+                return -ENOATTR;
+        rc = xfs_attr_args_init(&da, ip, name, flags);
+        if (rc)
+                return rc;
+        da.value = value;
+        da.valuelen = *valuelenp;
+        ilock = xfs_ilock_attr_map_shared(ip);
+        /* check again now that the inode is locked */
+        if (!xfs_inode_hasattr(ip)) {
+                rc = -ENOATTR;
+        } else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+                rc = xfs_attr_shortform_getvalue(&da);
+        } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) {
+                rc = xfs_attr_leaf_get(&da);
+        } else {
+                rc = xfs_attr_node_get(&da);
+        }
+        xfs_iunlock(ip, ilock);
+        *valuelenp = da.valuelen;
+        if (rc == -EEXIST)
+                return 0;
+        return rc;
+}
+/*
+ * Calculate how many blocks we need for the new attribute.
+ *
+ * *local is set by xfs_attr_leaf_newentsize().  The return value is the
+ * block count to reserve.
+ */
+STATIC int
+xfs_attr_calc_size(
+        struct xfs_da_args      *args,
+        int                     *local)
+{
+        struct xfs_mount        *mp = args->dp->i_mount;
+        int                     entsize;
+        int                     nblks;
+        uint                    rmtblks;
+        /*
+         * Work out how much space the new attribute takes and whether it
+         * will be "local" or "remote" (note: local != inline).
+         */
+        entsize = xfs_attr_leaf_newentsize(args, local);
+        nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+        if (!*local) {
+                /*
+                 * Out of line attribute, cannot double split, but
+                 * make room for the attribute value itself.
+                 */
+                rmtblks = xfs_attr3_rmt_blocks(mp, args->valuelen);
+                nblks += rmtblks;
+                nblks += XFS_NEXTENTADD_SPACE_RES(mp, rmtblks, XFS_ATTR_FORK);
+                return nblks;
+        }
+        /* an entry over half a block makes a double split possible */
+        if (entsize > (args->geo->blksize / 2))
+                nblks *= 2;
+        return nblks;
+}
+int                                     /* error */
+xfs_attr_set(                           /* store the value of a named attribute on inode dp */
+        struct xfs_inode        *dp,            /* inode whose attribute fork is modified */
+        const unsigned char     *name,          /* attribute name; measured with strlen() in xfs_attr_args_init() */
+        unsigned char           *value,         /* attribute value, passed on as args.value */
+        int                     valuelen,       /* length of value, passed on as args.valuelen */
+        int                     flags)          /* ATTR_ROOT and ATTR_KERNOTIME are examined here */
+{
+        struct xfs_mount        *mp = dp->i_mount;
+        struct xfs_da_args      args;
+        struct xfs_bmap_free    flist;
+        struct xfs_trans_res    tres;
+        xfs_fsblock_t           firstblock;
+        int                     rsvd = (flags & ATTR_ROOT) != 0;        /* set for ATTR_ROOT requests */
+        int                     error, err2, committed, local;
+        XFS_STATS_INC(xs_attr_set);
+        if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+                return -EIO;
+        error = xfs_attr_args_init(&args, dp, name, flags);
+        if (error)
+                return error;
+        args.value = value;
+        args.valuelen = valuelen;
+        args.firstblock = &firstblock;
+        args.flist = &flist;
+        args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+        args.total = xfs_attr_calc_size(&args, &local);        /* block count used for the reservations below */
+        error = xfs_qm_dqattach(dp, 0);
+        if (error)
+                return error;
+        /*
+         * If the inode doesn't have an attribute fork, add one.
+         * (inode must not be locked when we call this routine)
+         * The fork is sized for a shortform header plus this one entry.
+         */
+        if (XFS_IFORK_Q(dp) == 0) {
+                int sf_size = sizeof(xfs_attr_sf_hdr_t) +
+                        XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
+                error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+                if (error)
+                        return error;
+        }
+        /*
+         * Start our first transaction of the day.
+         *
+         * All future transactions during this code must be "chained" off
+         * this one via the trans_dup() call.  All transactions will contain
+         * the inode, and the inode will always be marked with trans_ihold().
+         * Since the inode will be locked in all transactions, we must log
+         * the inode in every transaction to let it float upward through
+         * the log.
+         */
+        args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
+        /*
+         * Root fork attributes can use reserved data blocks for this
+         * operation if necessary
+         */
+        if (rsvd)
+                args.trans->t_flags |= XFS_TRANS_RESERVE;
+        tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+                         M_RES(mp)->tr_attrsetrt.tr_logres * args.total;       /* fixed part plus a per-block part */
+        tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+        tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+        error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
+        if (error) {
+                xfs_trans_cancel(args.trans, 0);
+                return error;
+        }
+        xfs_ilock(dp, XFS_ILOCK_EXCL);  /* held until one of the returns below */
+        error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
+                                rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+                                       XFS_QMOPT_RES_REGBLKS);
+        if (error) {
+                xfs_iunlock(dp, XFS_ILOCK_EXCL);
+                xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+                return error;
+        }
+        xfs_trans_ijoin(args.trans, dp, 0);
+        /*
+         * If the attribute list is non-existent or a shortform list,
+         * upgrade it to a single-leaf-block attribute list.
+         * The upgrade only happens when the shortform add returns -ENOSPC.
+         */
+        if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
+            (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+             dp->i_d.di_anextents == 0)) {
+                /*
+                 * Build initial attribute list (if required).
+                 */
+                if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
+                        xfs_attr_shortform_create(&args);
+                /*
+                 * Try to add the attr to the attribute list in
+                 * the inode.
+                 */
+                error = xfs_attr_shortform_addname(&args);
+                if (error != -ENOSPC) {
+                        /*
+                         * Commit the shortform mods, and we're done.
+                         * NOTE: this is also the error path (EEXIST, etc).
+                         * The shortform error, if any, takes precedence
+                         * over the commit result in the return value.
+                         */
+                        ASSERT(args.trans != NULL);
+                        /*
+                         * If this is a synchronous mount, make sure that
+                         * the transaction goes to disk before returning
+                         * to the user.
+                         */
+                        if (mp->m_flags & XFS_MOUNT_WSYNC)
+                                xfs_trans_set_sync(args.trans);
+                        if (!error && (flags & ATTR_KERNOTIME) == 0) {
+                                xfs_trans_ichgtime(args.trans, dp,
+                                                        XFS_ICHGTIME_CHG);
+                        }
+                        err2 = xfs_trans_commit(args.trans,
+                                                 XFS_TRANS_RELEASE_LOG_RES);
+                        xfs_iunlock(dp, XFS_ILOCK_EXCL);
+                        return error ? error : err2;
+                }
+                /*
+                 * It won't fit in the shortform, transform to a leaf block.
+                 * GROT: another possible req'mt for a double-split btree op.
+                 */
+                xfs_bmap_init(args.flist, args.firstblock);
+                error = xfs_attr_shortform_to_leaf(&args);
+                if (!error) {
+                        error = xfs_bmap_finish(&args.trans, args.flist,
+                                                &committed);
+                }
+                if (error) {
+                        ASSERT(committed);      /* NOTE(review): committed is only written by
+                                                 * xfs_bmap_finish(); when the error came from
+                                                 * xfs_attr_shortform_to_leaf() it is read here
+                                                 * uninitialized -- confirm intent */
+                        args.trans = NULL;      /* NOTE(review): makes out: skip the cancel; presumably
+                                                 * the transaction is already gone after a failed
+                                                 * xfs_bmap_finish() -- verify for the
+                                                 * shortform_to_leaf failure case */
+                        xfs_bmap_cancel(&flist);
+                        goto out;
+                }
+                /*
+                 * bmap_finish() may have committed the last trans and started
+                 * a new one.  We need the inode to be in all transactions.
+                 */
+                if (committed)
+                        xfs_trans_ijoin(args.trans, dp, 0);
+                /*
+                 * Commit the leaf transformation.  We'll need another (linked)
+                 * transaction to add the new attribute to the leaf.
+                 */
+                error = xfs_trans_roll(&args.trans, dp);
+                if (error)
+                        goto out;
+        }
+        if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))      /* one-block fork: leaf add; otherwise node add */
+                error = xfs_attr_leaf_addname(&args);
+        else
+                error = xfs_attr_node_addname(&args);
+        if (error)
+                goto out;
+        /*
+         * If this is a synchronous mount, make sure that the
+         * transaction goes to disk before returning to the user.
+         */
+        if (mp->m_flags & XFS_MOUNT_WSYNC)
+                xfs_trans_set_sync(args.trans);
+        if ((flags & ATTR_KERNOTIME) == 0)
+                xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+        /*
+         * Commit the last in the sequence of transactions.
+         */
+        xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
+        error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+        xfs_iunlock(dp, XFS_ILOCK_EXCL);
+        return error;
+out:    /* failure exit: cancel the transaction if one is still held, then unlock */
+        if (args.trans) {
+                xfs_trans_cancel(args.trans,
+                        XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+        }
+        xfs_iunlock(dp, XFS_ILOCK_EXCL);
+        return error;
+}
+/*
+ * Generic handler routine to remove a name from an attribute list.
+ * Transitions attribute list from Btree to shortform as necessary.
+ *
+ * dp is the inode that owns the attribute, name is handed unchanged to
+ * xfs_attr_args_init(), and flags carries ATTR_* bits.  Of those bits this
+ * function itself only tests ATTR_ROOT (sets XFS_TRANS_RESERVE on the
+ * transaction) and ATTR_KERNOTIME (skips the change-time update).
+ *
+ * The function allocates and reserves its own transaction, takes
+ * XFS_ILOCK_EXCL on dp once the reservation has succeeded, and drops the
+ * lock again on every path that took it.
+ *
+ * Returns 0 once the final transaction has committed, or a negative errno:
+ * -EIO if XFS_FORCED_SHUTDOWN() is true for the mount, -ENOATTR if
+ * xfs_inode_hasattr() is false (checked both before and after the inode is
+ * locked), otherwise whatever the setup, format-specific removal or commit
+ * call returned.
+ */
+int
+xfs_attr_remove(
+        struct xfs_inode        *dp,
+        const unsigned char     *name,
+        int                     flags)
+{
+        struct xfs_mount        *mp = dp->i_mount;
+        struct xfs_da_args      args;
+        struct xfs_bmap_free    flist;
+        xfs_fsblock_t           firstblock;
+        int                     error;
+        XFS_STATS_INC(xs_attr_remove);
+        if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+                return -EIO;
+        if (!xfs_inode_hasattr(dp))
+                return -ENOATTR;
+        error = xfs_attr_args_init(&args, dp, name, flags);
+        if (error)
+                return error;
+        args.firstblock = &firstblock;
+        args.flist = &flist;
+        /*
+         * We have no control over the attribute names that userspace passes
+         * us to remove, so we have to allow the name lookup prior to
+         * attribute removal to fail.
+         */
+        args.op_flags = XFS_DA_OP_OKNOENT;
+        error = xfs_qm_dqattach(dp, 0);
+        if (error)
+                return error;
+        /*
+         * Start our first transaction of the day.
+         *
+         * All future transactions during this code must be "chained" off
+         * this one via the trans_dup() call.  All transactions will contain
+         * the inode, and the inode will always be marked with trans_ihold().
+         * Since the inode will be locked in all transactions, we must log
+         * the inode in every transaction to let it float upward through
+         * the log.
+         *
+         * NOTE(review): neither trans_dup() nor trans_ihold() is called in
+         * this function; helpers it dispatches to, such as
+         * xfs_attr_node_removename(), chain transactions through
+         * xfs_bmap_finish() and xfs_trans_roll().  The wording above may
+         * predate that arrangement -- confirm.
+         */
+        args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
+        /*
+         * Root fork attributes can use reserved data blocks for this
+         * operation if necessary
+         */
+        if (flags & ATTR_ROOT)
+                args.trans->t_flags |= XFS_TRANS_RESERVE;
+        error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
+                                  XFS_ATTRRM_SPACE_RES(mp), 0);
+        if (error) {
+                xfs_trans_cancel(args.trans, 0);
+                return error;
+        }
+        xfs_ilock(dp, XFS_ILOCK_EXCL);
+        /*
+         * No need to make quota reservations here. We expect to release some
+         * blocks, not allocate them, in the common case.
+         *
+         * Join the locked inode to the transaction, recheck for attributes
+         * now that the lock is held, and then dispatch on the attribute fork
+         * layout: inline (shortform), a single leaf block, or a node tree.
+         */
+        xfs_trans_ijoin(args.trans, dp, 0);
+        if (!xfs_inode_hasattr(dp)) {
+                error = -ENOATTR;
+        } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+                ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+                error = xfs_attr_shortform_remove(&args);
+        } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+                error = xfs_attr_leaf_removename(&args);
+        } else {
+                error = xfs_attr_node_removename(&args);
+        }
+        if (error)
+                goto out;
+        /*
+         * If this is a synchronous mount, make sure that the
+         * transaction goes to disk before returning to the user.
+         */
+        if (mp->m_flags & XFS_MOUNT_WSYNC)
+                xfs_trans_set_sync(args.trans);
+        if ((flags & ATTR_KERNOTIME) == 0)
+                xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+        /*
+         * Commit the last in the sequence of transactions.
+         */
+        xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
+        error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+        xfs_iunlock(dp, XFS_ILOCK_EXCL);
+        return error;
+out:
+        if (args.trans) {       /* helpers NULL it when bmap_finish fails */
+                xfs_trans_cancel(args.trans,
+                        XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+        }
+        xfs_iunlock(dp, XFS_ILOCK_EXCL);
+        return error;
+}
+/*========================================================================
+ * External routines when attribute list is inside the inode
+ *========================================================================*/
+/*
+ * Add (or replace) a name in the shortform attribute list.
+ * This is the external entry point for the shortform case.
+ *
+ * Returns 0 after the entry has been added, -ENOATTR for an ATTR_REPLACE of
+ * a name the lookup did not find, -EEXIST for an ATTR_CREATE of a name it
+ * did find, and -ENOSPC when the name or value length reaches
+ * XFS_ATTR_SF_ENTSIZE_MAX or xfs_attr_shortform_bytesfit() returns zero for
+ * the enlarged list.  When an existing entry is being replaced it is removed
+ * before the space checks run, so it is already gone if -ENOSPC is returned.
+ */
+STATIC int
+xfs_attr_shortform_addname(xfs_da_args_t *args)
+{
+        int status;
+        int total_bytes;
+        int fork_offset;
+        trace_xfs_attr_sf_addname(args);
+        status = xfs_attr_shortform_lookup(args);
+        if (status == -ENOATTR && (args->flags & ATTR_REPLACE))
+                return status;
+        if (status == -EEXIST) {
+                /* A pure create must not touch an existing attribute. */
+                if (args->flags & ATTR_CREATE)
+                        return status;
+                /* Replace: drop the old entry, the new one is added below. */
+                status = xfs_attr_shortform_remove(args);
+                ASSERT(status == 0);
+        }
+        /* Over-long names or values are never stored in shortform. */
+        if (args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
+            args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+                return -ENOSPC;
+        /* Size of the list once the new entry has been appended. */
+        total_bytes = XFS_ATTR_SF_TOTSIZE(args->dp) +
+                      XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+        fork_offset = xfs_attr_shortform_bytesfit(args->dp, total_bytes);
+        if (fork_offset == 0)
+                return -ENOSPC;
+        xfs_attr_shortform_add(args, fork_offset);
+        return 0;
+}
+/*========================================================================
+ * External routines when attribute list is one block
+ *========================================================================*/
+/*
+ * Add a name to the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ * NOTE(review): that remark presumably describes the state on entry; the
+ * value being added may still be stored remotely (see the args->rmtblkno
+ * handling below) -- confirm.
+ *
+ * Block 0 of the attribute fork is read and searched for the name.  An
+ * ATTR_REPLACE of a missing name returns -ENOATTR and an ATTR_CREATE of an
+ * existing name returns -EEXIST, in both cases after releasing the buffer.
+ * Replacing an existing name is done as an atomic rename: the new entry is
+ * added first, the incomplete flags of old and new are flipped, and then the
+ * old entry (plus its remote value, if any) is removed.
+ *
+ * If xfs_attr3_leaf_add() reports -ENOSPC the list is converted to node
+ * format and the request is handed to xfs_attr_node_addname().
+ *
+ * args->trans is rolled along the way.  Returns 0 or a negative errno; when
+ * a format conversion or the xfs_bmap_finish() after it fails, args->trans
+ * is NULL on return.
+ */
+STATIC int
+xfs_attr_leaf_addname(xfs_da_args_t *args)
+{
+        xfs_inode_t *dp;
+        struct xfs_buf *bp;
+        int retval, error, committed, forkoff;
+        trace_xfs_attr_leaf_addname(args);
+        /*
+         * Read the (only) block in the attribute list in.
+         */
+        dp = args->dp;
+        args->blkno = 0;
+        error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+        if (error)
+                return error;
+        /*
+         * Look up the given attribute in the leaf block.  Figure out if
+         * the given flags produce an error or call for an atomic rename.
+         */
+        retval = xfs_attr3_leaf_lookup_int(bp, args);
+        if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+                xfs_trans_brelse(args->trans, bp);
+                return retval;
+        } else if (retval == -EEXIST) {
+                if (args->flags & ATTR_CREATE) {        /* pure create op */
+                        xfs_trans_brelse(args->trans, bp);
+                        return retval;
+                }
+                trace_xfs_attr_leaf_replace(args);
+                /* save the attribute state for later removal */
+                args->op_flags |= XFS_DA_OP_RENAME;     /* an atomic rename */
+                args->blkno2 = args->blkno;             /* set 2nd entry info */
+                args->index2 = args->index;
+                args->rmtblkno2 = args->rmtblkno;
+                args->rmtblkcnt2 = args->rmtblkcnt;
+                args->rmtvaluelen2 = args->rmtvaluelen;
+                /*
+                 * clear the remote attr state now that it is saved so that the
+                 * values reflect the state of the attribute we are about to
+                 * add, not the attribute we just found and will remove later.
+                 */
+                args->rmtblkno = 0;
+                args->rmtblkcnt = 0;
+                args->rmtvaluelen = 0;
+        }
+        /*
+         * Add the attribute to the leaf block, transitioning to a Btree
+         * if required.  Only -ENOSPC is acted upon; any other value
+         * returned by xfs_attr3_leaf_add() is not examined here.
+         */
+        retval = xfs_attr3_leaf_add(bp, args);
+        if (retval == -ENOSPC) {
+                /*
+                 * Promote the attribute list to the Btree format, then
+                 * Commit that transaction so that the node_addname() call
+                 * can manage its own transactions.
+                 *
+                 * NOTE(review): if xfs_attr3_leaf_to_node() fails,
+                 * xfs_bmap_finish() is skipped and "committed" has not been
+                 * assigned when the ASSERT in the error branch reads it.
+                 * The same shape recurs in the shortform conversion further
+                 * down -- confirm this is acceptable.
+                 */
+                xfs_bmap_init(args->flist, args->firstblock);
+                error = xfs_attr3_leaf_to_node(args);
+                if (!error) {
+                        error = xfs_bmap_finish(&args->trans, args->flist,
+                                                &committed);
+                }
+                if (error) {
+                        ASSERT(committed);
+                        args->trans = NULL;
+                        xfs_bmap_cancel(args->flist);
+                        return error;
+                }
+                /*
+                 * bmap_finish() may have committed the last trans and started
+                 * a new one.  We need the inode to be in all transactions.
+                 */
+                if (committed)
+                        xfs_trans_ijoin(args->trans, dp, 0);
+                /*
+                 * Commit the current trans (including the inode) and start
+                 * a new one.
+                 */
+                error = xfs_trans_roll(&args->trans, dp);
+                if (error)
+                        return error;
+                /*
+                 * Fob the whole rest of the problem off on the Btree code.
+                 */
+                error = xfs_attr_node_addname(args);
+                return error;
+        }
+        /*
+         * Commit the transaction that added the attr name so that
+         * later routines can manage their own transactions.
+         */
+        error = xfs_trans_roll(&args->trans, dp);
+        if (error)
+                return error;
+        /*
+         * If there was an out-of-line value, allocate the blocks we
+         * identified for its storage and copy the value.  This is done
+         * after we create the attribute so that we don't overflow the
+         * maximum size of a transaction and/or hit a deadlock.
+         */
+        if (args->rmtblkno > 0) {
+                error = xfs_attr_rmtval_set(args);
+                if (error)
+                        return error;
+        }
+        /*
+         * If this is an atomic rename operation, we must "flip" the
+         * incomplete flags on the "new" and "old" attribute/value pairs
+         * so that one disappears and one appears atomically.  Then we
+         * must remove the "old" attribute/value pair.
+         */
+        if (args->op_flags & XFS_DA_OP_RENAME) {
+                /*
+                 * In a separate transaction, set the incomplete flag on the
+                 * "old" attr and clear the incomplete flag on the "new" attr.
+                 */
+                error = xfs_attr3_leaf_flipflags(args);
+                if (error)
+                        return error;
+                /*
+                 * Dismantle the "old" attribute/value pair by removing
+                 * a "remote" value (if it exists).  The location saved in
+                 * the *2 fields during the lookup is restored first.
+                 */
+                args->index = args->index2;
+                args->blkno = args->blkno2;
+                args->rmtblkno = args->rmtblkno2;
+                args->rmtblkcnt = args->rmtblkcnt2;
+                args->rmtvaluelen = args->rmtvaluelen2;
+                if (args->rmtblkno) {
+                        error = xfs_attr_rmtval_remove(args);
+                        if (error)
+                                return error;
+                }
+                /*
+                 * Read in the block containing the "old" attr, then
+                 * remove the "old" attr from that block (neat, huh!)
+                 */
+                error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+                                           -1, &bp);
+                if (error)
+                        return error;
+                xfs_attr3_leaf_remove(bp, args);
+                /*
+                 * If the result is small enough, shrink it all into the inode.
+                 */
+                if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
+                        xfs_bmap_init(args->flist, args->firstblock);
+                        error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+                        /* bp is gone due to xfs_da_shrink_inode */
+                        if (!error) {
+                                error = xfs_bmap_finish(&args->trans,
+                                                        args->flist,
+                                                        &committed);
+                        }
+                        if (error) {
+                                ASSERT(committed);
+                                args->trans = NULL;
+                                xfs_bmap_cancel(args->flist);
+                                return error;
+                        }
+                        /*
+                         * bmap_finish() may have committed the last trans
+                         * and started a new one.  We need the inode to be
+                         * in all transactions.
+                         */
+                        if (committed)
+                                xfs_trans_ijoin(args->trans, dp, 0);
+                }
+                /*
+                 * Commit the remove and start the next trans in series.
+                 */
+                error = xfs_trans_roll(&args->trans, dp);
+        } else if (args->rmtblkno > 0) {
+                /*
+                 * Added a "remote" value, just clear the incomplete flag.
+                 */
+                error = xfs_attr3_leaf_clearflag(args);
+        }
+        return error;
+}
+/*
+ * Remove a name from an attribute list that consists of one leaf block.
+ *
+ * xfs_attr_remove() only dispatches here when bmap_one_block() reports a
+ * single block in the attribute fork (ie: no remote blks).
+ *
+ * Returns -ENOATTR, with the buffer released, if the lookup reports that the
+ * name is absent; a negative errno if reading the leaf or converting the
+ * list back to shortform fails; and 0 otherwise.  After a failed conversion
+ * args->trans has been set to NULL.
+ */
+STATIC int
+xfs_attr_leaf_removename(xfs_da_args_t *args)
+{
+        xfs_inode_t *ip = args->dp;
+        struct xfs_buf *leaf_bp;
+        int rc;
+        int did_commit;
+        int sf_forkoff;
+        trace_xfs_attr_leaf_removename(args);
+        /*
+         * The (only) leaf block is block 0 of the attribute fork.
+         */
+        args->blkno = 0;
+        rc = xfs_attr3_leaf_read(args->trans, ip, args->blkno, -1, &leaf_bp);
+        if (rc)
+                return rc;
+        rc = xfs_attr3_leaf_lookup_int(leaf_bp, args);
+        if (rc == -ENOATTR) {
+                xfs_trans_brelse(args->trans, leaf_bp);
+                return rc;
+        }
+        xfs_attr3_leaf_remove(leaf_bp, args);
+        /*
+         * Nothing more to do unless what is left is small enough to be
+         * shrunk back into the inode.
+         */
+        sf_forkoff = xfs_attr_shortform_allfit(leaf_bp, ip);
+        if (!sf_forkoff)
+                return 0;
+        xfs_bmap_init(args->flist, args->firstblock);
+        rc = xfs_attr3_leaf_to_shortform(leaf_bp, args, sf_forkoff);
+        /* leaf_bp is gone due to xfs_da_shrink_inode */
+        if (rc == 0)
+                rc = xfs_bmap_finish(&args->trans, args->flist, &did_commit);
+        if (rc != 0) {
+                ASSERT(did_commit);
+                args->trans = NULL;
+                xfs_bmap_cancel(args->flist);
+                return rc;
+        }
+        /*
+         * bmap_finish() may have committed the last trans and started a new
+         * one; the inode has to be part of every transaction in the chain.
+         */
+        if (did_commit)
+                xfs_trans_ijoin(args->trans, ip, 0);
+        return 0;
+}
+/*
+ * Look up a name in a single-leaf-block attribute list and fetch its value.
+ *
+ * Block 0 of the attribute fork is read and searched.  If the lookup returns
+ * anything other than -EEXIST, that value is returned as is.  Otherwise the
+ * value is fetched with xfs_attr3_leaf_getvalue(); if that succeeds and
+ * leaves args->rmtblkno > 0, the out-of-line value is read with
+ * xfs_attr_rmtval_get() unless ATTR_KERNOVAL is set.  The leaf buffer is
+ * released before returning on every path that read it.
+ */
+STATIC int
+xfs_attr_leaf_get(xfs_da_args_t *args)
+{
+        struct xfs_buf *leaf_bp;
+        int rc;
+        int found;
+        trace_xfs_attr_leaf_get(args);
+        args->blkno = 0;
+        rc = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1,
+                                 &leaf_bp);
+        if (rc)
+                return rc;
+        rc = xfs_attr3_leaf_lookup_int(leaf_bp, args);
+        found = (rc == -EEXIST);
+        if (found)
+                rc = xfs_attr3_leaf_getvalue(leaf_bp, args);
+        /* Done with the leaf whether or not the name was there. */
+        xfs_trans_brelse(args->trans, leaf_bp);
+        if (!found || rc)
+                return rc;
+        /* Only chase a remote value when the caller wants the value. */
+        if (args->rmtblkno > 0 && !(args->flags & ATTR_KERNOVAL))
+                rc = xfs_attr_rmtval_get(args);
+        return rc;
+}
+/*========================================================================
+ * External routines when attribute list size > geo->blksize
+ *========================================================================*/
+/*
+ * Add a name to a Btree-format attribute list.
+ *
+ * This will involve walking down the Btree, and may involve splitting
+ * leaf nodes and even splitting intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ *
+ * "Remote" attribute values confuse the issue and atomic rename operations
+ * add a whole extra layer of confusion on top of that.
+ *
+ * Returns 0 on success or a negative errno.  For an ATTR_REPLACE of a name
+ * the lookup did not find, or an ATTR_CREATE of a name it did find, the
+ * lookup result (-ENOATTR or -EEXIST) is returned through retval and
+ * nothing is added.  args->trans is rolled along the way and is NULL on
+ * return when a split/join/conversion or the xfs_bmap_finish() after it
+ * failed.
+ */
+STATIC int
+xfs_attr_node_addname(xfs_da_args_t *args)
+{
+        xfs_da_state_t *state;
+        xfs_da_state_blk_t *blk;
+        xfs_inode_t *dp;
+        xfs_mount_t *mp;
+        int committed, retval, error;
+        trace_xfs_attr_node_addname(args);
+        /*
+         * Fill in bucket of arguments/results/context to carry around.
+         */
+        dp = args->dp;
+        mp = dp->i_mount;
+restart:        /* re-entered after a lone leaf has been converted to a node */
+        state = xfs_da_state_alloc();
+        state->args = args;
+        state->mp = mp;
+        /*
+         * Search to see if name already exists, and get back a pointer
+         * to where it should go.
+         */
+        error = xfs_da3_node_lookup_int(state, &retval);
+        if (error)
+                goto out;
+        blk = &state->path.blk[ state->path.active-1 ];
+        ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+        if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+                goto out;       /* error is 0 here, so retval is returned */
+        } else if (retval == -EEXIST) {
+                if (args->flags & ATTR_CREATE)
+                        goto out;       /* likewise returns retval */
+                trace_xfs_attr_node_replace(args);
+                /* save the attribute state for later removal */
+                args->op_flags |= XFS_DA_OP_RENAME;     /* atomic rename op */
+                args->blkno2 = args->blkno;             /* set 2nd entry info */
+                args->index2 = args->index;
+                args->rmtblkno2 = args->rmtblkno;
+                args->rmtblkcnt2 = args->rmtblkcnt;
+                args->rmtvaluelen2 = args->rmtvaluelen;
+                /*
+                 * clear the remote attr state now that it is saved so that the
+                 * values reflect the state of the attribute we are about to
+                 * add, not the attribute we just found and will remove later.
+                 */
+                args->rmtblkno = 0;
+                args->rmtblkcnt = 0;
+                args->rmtvaluelen = 0;
+        }
+        retval = xfs_attr3_leaf_add(blk->bp, state->args);
+        if (retval == -ENOSPC) {
+                if (state->path.active == 1) {
+                        /*
+                         * It's really a single leaf node, but it had
+                         * out-of-line values so it looked like it *might*
+                         * have been a b-tree.
+                         *
+                         * Convert it to node format and start over from the
+                         * lookup with a fresh state structure.
+                         *
+                         * NOTE(review): if xfs_attr3_leaf_to_node() fails,
+                         * xfs_bmap_finish() is skipped and "committed" has
+                         * not been assigned when the ASSERT in the error
+                         * branch reads it.  The split and join error
+                         * branches below have the same shape -- confirm
+                         * this is acceptable.
+                         */
+                        xfs_da_state_free(state);
+                        state = NULL;
+                        xfs_bmap_init(args->flist, args->firstblock);
+                        error = xfs_attr3_leaf_to_node(args);
+                        if (!error) {
+                                error = xfs_bmap_finish(&args->trans,
+                                                        args->flist,
+                                                        &committed);
+                        }
+                        if (error) {
+                                ASSERT(committed);
+                                args->trans = NULL;
+                                xfs_bmap_cancel(args->flist);
+                                goto out;
+                        }
+                        /*
+                         * bmap_finish() may have committed the last trans
+                         * and started a new one.  We need the inode to be
+                         * in all transactions.
+                         */
+                        if (committed)
+                                xfs_trans_ijoin(args->trans, dp, 0);
+                        /*
+                         * Commit the node conversion and start the next
+                         * trans in the chain.
+                         */
+                        error = xfs_trans_roll(&args->trans, dp);
+                        if (error)
+                                goto out;
+                        goto restart;
+                }
+                /*
+                 * Split as many Btree elements as required.
+                 * This code tracks the new and old attr's location
+                 * in the index/blkno/rmtblkno/rmtblkcnt fields and
+                 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
+                 */
+                xfs_bmap_init(args->flist, args->firstblock);
+                error = xfs_da3_split(state);
+                if (!error) {
+                        error = xfs_bmap_finish(&args->trans, args->flist,
+                                                &committed);
+                }
+                if (error) {
+                        ASSERT(committed);
+                        args->trans = NULL;
+                        xfs_bmap_cancel(args->flist);
+                        goto out;
+                }
+                /*
+                 * bmap_finish() may have committed the last trans and started
+                 * a new one.  We need the inode to be in all transactions.
+                 */
+                if (committed)
+                        xfs_trans_ijoin(args->trans, dp, 0);
+        } else {
+                /*
+                 * Addition succeeded, update Btree hashvals.
+                 */
+                xfs_da3_fixhashpath(state, &state->path);
+        }
+        /*
+         * Kill the state structure, we're done with it and need to
+         * allow the buffers to come back later.
+         *
+         * state stays NULL until the rename path allocates a new one, so
+         * the direct "return error" statements before that point do not
+         * skip any cleanup that the out label would have done.
+         */
+        xfs_da_state_free(state);
+        state = NULL;
+        /*
+         * Commit the leaf addition or btree split and start the next
+         * trans in the chain.
+         */
+        error = xfs_trans_roll(&args->trans, dp);
+        if (error)
+                goto out;
+        /*
+         * If there was an out-of-line value, allocate the blocks we
+         * identified for its storage and copy the value.  This is done
+         * after we create the attribute so that we don't overflow the
+         * maximum size of a transaction and/or hit a deadlock.
+         */
+        if (args->rmtblkno > 0) {
+                error = xfs_attr_rmtval_set(args);
+                if (error)
+                        return error;
+        }
+        /*
+         * If this is an atomic rename operation, we must "flip" the
+         * incomplete flags on the "new" and "old" attribute/value pairs
+         * so that one disappears and one appears atomically.  Then we
+         * must remove the "old" attribute/value pair.
+         */
+        if (args->op_flags & XFS_DA_OP_RENAME) {
+                /*
+                 * In a separate transaction, set the incomplete flag on the
+                 * "old" attr and clear the incomplete flag on the "new" attr.
+                 */
+                error = xfs_attr3_leaf_flipflags(args);
+                if (error)
+                        goto out;
+                /*
+                 * Dismantle the "old" attribute/value pair by removing
+                 * a "remote" value (if it exists).  The location saved in
+                 * the *2 fields during the lookup is restored first.
+                 */
+                args->index = args->index2;
+                args->blkno = args->blkno2;
+                args->rmtblkno = args->rmtblkno2;
+                args->rmtblkcnt = args->rmtblkcnt2;
+                args->rmtvaluelen = args->rmtvaluelen2;
+                if (args->rmtblkno) {
+                        error = xfs_attr_rmtval_remove(args);
+                        if (error)
+                                return error;
+                }
+                /*
+                 * Re-find the "old" attribute entry after any split ops.
+                 * The INCOMPLETE flag means that we will find the "old"
+                 * attr, not the "new" one.
+                 */
+                args->flags |= XFS_ATTR_INCOMPLETE;
+                state = xfs_da_state_alloc();
+                state->args = args;
+                state->mp = mp;
+                state->inleaf = 0;
+                error = xfs_da3_node_lookup_int(state, &retval);
+                if (error)
+                        goto out;
+                /*
+                 * Remove the name and update the hashvals in the tree.
+                 */
+                blk = &state->path.blk[ state->path.active-1 ];
+                ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+                error = xfs_attr3_leaf_remove(blk->bp, args);
+                xfs_da3_fixhashpath(state, &state->path);
+                /*
+                 * Check to see if the tree needs to be collapsed.
+                 *
+                 * NOTE(review): retval tested here still holds the result of
+                 * the xfs_da3_node_lookup_int() call above; the value
+                 * returned by xfs_attr3_leaf_remove() was stored in error and
+                 * is overwritten below without being tested.  Confirm that
+                 * this is intended.
+                 */
+                if (retval && (state->path.active > 1)) {
+                        xfs_bmap_init(args->flist, args->firstblock);
+                        error = xfs_da3_join(state);
+                        if (!error) {
+                                error = xfs_bmap_finish(&args->trans,
+                                                        args->flist,
+                                                        &committed);
+                        }
+                        if (error) {
+                                ASSERT(committed);
+                                args->trans = NULL;
+                                xfs_bmap_cancel(args->flist);
+                                goto out;
+                        }
+                        /*
+                         * bmap_finish() may have committed the last trans
+                         * and started a new one.  We need the inode to be
+                         * in all transactions.
+                         */
+                        if (committed)
+                                xfs_trans_ijoin(args->trans, dp, 0);
+                }
+                /*
+                 * Commit and start the next trans in the chain.
+                 */
+                error = xfs_trans_roll(&args->trans, dp);
+                if (error)
+                        goto out;
+        } else if (args->rmtblkno > 0) {
+                /*
+                 * Added a "remote" value, just clear the incomplete flag.
+                 */
+                error = xfs_attr3_leaf_clearflag(args);
+                if (error)
+                        goto out;
+        }
+        retval = error = 0;
+out:
+        if (state)
+                xfs_da_state_free(state);
+        if (error)
+                return error;
+        return retval;  /* 0, or the lookup result from an early goto out */
+}
+/*
+ * Remove a name from a B-tree attribute list.
+ *
+ * This will involve walking down the Btree, and may involve joining
+ * leaf nodes and even joining intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+STATIC int
+xfs_attr_node_removename(xfs_da_args_t *args)
+{
+        xfs_da_state_t *state;
+        xfs_da_state_blk_t *blk;
+        xfs_inode_t *dp;
+        struct xfs_buf *bp;
+        int retval, error, committed, forkoff;
+        trace_xfs_attr_node_removename(args);
+        /*
+         * Tie a string around our finger to remind us where we are.
+         */
+        dp = args->dp;
+        state = xfs_da_state_alloc();
+        state->args = args;
+        state->mp = dp->i_mount;
+        /*
+         * Search to see if name exists, and get back a pointer to it.
+         */
+        error = xfs_da3_node_lookup_int(state, &retval);
+        if (error || (retval != -EEXIST)) {
+                if (error == 0)
+                        error = retval;
+                goto out;
+        }
+        /*
+         * If there is an out-of-line value, de-allocate the blocks.
+         * This is done before we remove the attribute so that we don't
+         * overflow the maximum size of a transaction and/or hit a deadlock.
+         */
+        blk = &state->path.blk[ state->path.active-1 ];
+        ASSERT(blk->bp != NULL);
+        ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+        if (args->rmtblkno > 0) {
+                /*
+                 * Fill in disk block numbers in the state structure
+                 * so that we can get the buffers back after we commit
+                 * several transactions in the following calls.
+                 */
+                error = xfs_attr_fillstate(state);
+                if (error)
+                        goto out;
+                /*
+                 * Mark the attribute as INCOMPLETE, then bunmapi() the
+                 * remote value.
+                 */
+                error = xfs_attr3_leaf_setflag(args);
+                if (error)
+                        goto out;
+                error = xfs_attr_rmtval_remove(args);
+                if (error)
+                        goto out;
+                /*
+                 * Refill the state structure with buffers, the prior calls
+                 * released our buffers.
+                 */
+                error = xfs_attr_refillstate(state);
+                if (error)
+                        goto out;
+        }
+        /*
+         * Remove the name and update the hashvals in the tree.
+         */
+        blk = &state->path.blk[ state->path.active-1 ];
+        ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+        retval = xfs_attr3_leaf_remove(blk->bp, args);
+        xfs_da3_fixhashpath(state, &state->path);
+        /*
+         * Check to see if the tree needs to be collapsed.
+         */
+        if (retval && (state->path.active > 1)) {
+                xfs_bmap_init(args->flist, args->firstblock);
+                error = xfs_da3_join(state);
+                if (!error) {
+                        error = xfs_bmap_finish(&args->trans, args->flist,
+                                                &committed);
+                }
+                if (error) {
+                        ASSERT(committed);
+                        args->trans = NULL;
+                        xfs_bmap_cancel(args->flist);
+                        goto out;
+                }
+                /*
+                 * bmap_finish() may have committed the last trans and started
+                 * a new one.  We need the inode to be in all transactions.
+                 */
+                if (committed)
+                        xfs_trans_ijoin(args->trans, dp, 0);
+                /*
+                 * Commit the Btree join operation and start a new trans.
+                 */
+                error = xfs_trans_roll(&args->trans, dp);
+                if (error)
+                        goto out;
+        }
+        /*
+         * If the result is small enough, push it all into the inode.
+         */
+        if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+                /*
+                 * Have to get rid of the copy of this dabuf in the state.
+                 */
+                ASSERT(state->path.active == 1);
+                ASSERT(state->path.blk[0].bp);
+                state->path.blk[0].bp = NULL;
+                error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp);
+                if (error)
+                        goto out;
+                if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
+                        xfs_bmap_init(args->flist, args->firstblock);
+                        error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+                        /* bp is gone due to xfs_da_shrink_inode */
+                        if (!error) {
+                                error = xfs_bmap_finish(&args->trans,
+                                                        args->flist,
+                                                        &committed);
+                        }
+                        if (error) {
+                                ASSERT(committed);
+                                args->trans = NULL;
+                                xfs_bmap_cancel(args->flist);
+                                goto out;
+                        }
+                        /*
+                         * bmap_finish() may have committed the last trans
+                         * and started a new one.  We need the inode to be
+                         * in all transactions.
+                         */
+                        if (committed)
+                                xfs_trans_ijoin(args->trans, dp, 0);
+                } else
+                        xfs_trans_brelse(args->trans, bp);
+        }
+        error = 0;
+out:
+        xfs_da_state_free(state);
+        return error;
+}
+/*
+ * Detach the buffers held in the state structure, remembering where each
+ * one lives on disk.
+ *
+ * For every active level of both "path" and "altpath", the disk address of
+ * an attached buffer is saved in disk_blkno and the buffer pointer is
+ * cleared; a level without a buffer gets a disk_blkno of zero.  This is
+ * done so that we can quickly reattach ourselves to those buffers after
+ * some set of transaction commits have released them.
+ *
+ * Always returns 0.
+ */
+STATIC int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+        xfs_da_state_path_t *path;
+        xfs_da_state_blk_t *cur;
+        int which, depth;
+        trace_xfs_attr_fillstate(state->args);
+        /*
+         * Pass 0 walks "path", pass 1 walks "altpath"; both get exactly
+         * the same treatment.
+         */
+        for (which = 0; which < 2; which++) {
+                path = which ? &state->altpath : &state->path;
+                ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+                for (depth = 0; depth < path->active; depth++) {
+                        cur = &path->blk[depth];
+                        /* zero marks "no buffer was attached here" */
+                        cur->disk_blkno = cur->bp ? XFS_BUF_ADDR(cur->bp) : 0;
+                        cur->bp = NULL;
+                }
+        }
+        return 0;
+}
+/*
+ * Reattach buffers to the state structure, using the disk block numbers
+ * that were stored in it.
+ *
+ * For every active level of both "path" and "altpath", a non-zero
+ * disk_blkno causes the block to be read back through xfs_da3_node_read()
+ * into that level's buffer pointer; a zero disk_blkno leaves the level
+ * without a buffer.  This is done after some set of transaction commits
+ * have released those buffers from our grip.
+ *
+ * Returns 0 on success, or the first read error encountered (levels after
+ * the failing one are not touched).
+ */
+STATIC int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+        xfs_da_state_path_t *path;
+        xfs_da_state_blk_t *cur;
+        int which, depth, error;
+        trace_xfs_attr_refillstate(state->args);
+        /*
+         * Pass 0 walks "path", pass 1 walks "altpath"; both get exactly
+         * the same treatment.
+         */
+        for (which = 0; which < 2; which++) {
+                path = which ? &state->altpath : &state->path;
+                ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+                for (depth = 0; depth < path->active; depth++) {
+                        cur = &path->blk[depth];
+                        if (!cur->disk_blkno) {
+                                /* nothing was attached at this level */
+                                cur->bp = NULL;
+                                continue;
+                        }
+                        error = xfs_da3_node_read(state->args->trans,
+                                                  state->args->dp,
+                                                  cur->blkno,
+                                                  cur->disk_blkno,
+                                                  &cur->bp, XFS_ATTR_FORK);
+                        if (error)
+                                return error;
+                }
+        }
+        return 0;
+}
+/*
+ * Look up a filename in a node attribute list.
+ *
+ * This routine gets called for any attribute fork that has more than one
+ * block, ie: both true Btree attr lists and for single-leaf-blocks with
+ * "remote" values taking up more blocks.
+ *
+ * Returns the lookup error if the lookup itself failed, the lookup result
+ * if it was anything other than -EEXIST, and otherwise the result of
+ * fetching the value (including the remote value fetch when one is done).
+ */
+STATIC int
+xfs_attr_node_get(xfs_da_args_t *args)
+{
+        xfs_da_state_t *state;
+        xfs_da_state_blk_t *blk;
+        int error, retval;
+        int level;
+        trace_xfs_attr_node_get(args);
+        state = xfs_da_state_alloc();
+        state->args = args;
+        state->mp = args->dp->i_mount;
+        /*
+         * Search to see if name exists, and get back a pointer to it.
+         */
+        error = xfs_da3_node_lookup_int(state, &retval);
+        if (error) {
+                retval = error;
+                goto release;
+        }
+        if (retval != -EEXIST)
+                goto release;
+        blk = &state->path.blk[ state->path.active-1 ];
+        ASSERT(blk->bp != NULL);
+        ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+        /*
+         * Get the value, local or "remote"
+         */
+        retval = xfs_attr3_leaf_getvalue(blk->bp, args);
+        if (retval)
+                goto release;
+        if ((args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL))
+                retval = xfs_attr_rmtval_get(args);
+release:
+        /*
+         * If not in a transaction, we have to release all the buffers.
+         */
+        level = 0;
+        while (level < state->path.active) {
+                xfs_trans_brelse(args->trans, state->path.blk[level].bp);
+                state->path.blk[level].bp = NULL;
+                level++;
+        }
+        xfs_da_state_free(state);
+        return retval;
+}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
new file mode 100644
index 000000000000..b1f73dbbf3d8
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -0,0 +1,2697 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+#include "xfs_dir2.h"
+/*
+ * xfs_attr_leaf.c
+ *
+ * Routines to implement leaf blocks of attributes as Btrees of hashed names.
+ */
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+/*
+ * Routines used for growing the Btree.
+ *
+ * Everything below, up to the first function body, is a forward
+ * declaration of a file-local (STATIC) helper; the definitions are
+ * presumably further down in this file.
+ */
+STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args,
+                                 xfs_dablk_t which_block, struct xfs_buf **bpp);
+STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer,
+                                   struct xfs_attr3_icleaf_hdr *ichdr,
+                                   struct xfs_da_args *args, int freemap_index);
+STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args,
+                                   struct xfs_attr3_icleaf_hdr *ichdr,
+                                   struct xfs_buf *leaf_buffer);
+STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state,
+                                                   xfs_da_state_blk_t *blk1,
+                                                   xfs_da_state_blk_t *blk2);
+STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
+                        xfs_da_state_blk_t *leaf_blk_1,
+                        struct xfs_attr3_icleaf_hdr *ichdr1,
+                        xfs_da_state_blk_t *leaf_blk_2,
+                        struct xfs_attr3_icleaf_hdr *ichdr2,
+                        int *number_entries_in_blk1,
+                        int *number_usedbytes_in_blk1);
+/*
+ * Utility routines.
+ */
+STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
+                        struct xfs_attr_leafblock *src_leaf,
+                        struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
+                        struct xfs_attr_leafblock *dst_leaf,
+                        struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
+                        int move_count);
+STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
+/*
+ * Decode an on-disk attribute leaf header into the in-core header.
+ *
+ * The on-disk magic number selects which of the two header layouts is
+ * read: XFS_ATTR3_LEAF_MAGIC blocks are decoded through
+ * struct xfs_attr3_leaf_hdr, anything else through the header embedded in
+ * struct xfs_attr_leafblock.  Multi-byte fields are converted from
+ * big-endian to CPU order; "holes" is copied as-is.
+ */
+void
+xfs_attr3_leaf_hdr_from_disk(
+        struct xfs_attr3_icleaf_hdr     *to,
+        struct xfs_attr_leafblock       *from)
+{
+        int     slot;
+        ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+               from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+        if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+                struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;
+                /* block linkage and identity */
+                to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+                to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+                to->back = be32_to_cpu(hdr3->info.hdr.back);
+                /* space accounting */
+                to->count = be16_to_cpu(hdr3->count);
+                to->usedbytes = be16_to_cpu(hdr3->usedbytes);
+                to->firstused = be16_to_cpu(hdr3->firstused);
+                to->holes = hdr3->holes;
+                for (slot = 0; slot < XFS_ATTR_LEAF_MAPSIZE; slot++) {
+                        to->freemap[slot].base = be16_to_cpu(hdr3->freemap[slot].base);
+                        to->freemap[slot].size = be16_to_cpu(hdr3->freemap[slot].size);
+                }
+        } else {
+                /* block linkage and identity */
+                to->magic = be16_to_cpu(from->hdr.info.magic);
+                to->forw = be32_to_cpu(from->hdr.info.forw);
+                to->back = be32_to_cpu(from->hdr.info.back);
+                /* space accounting */
+                to->count = be16_to_cpu(from->hdr.count);
+                to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
+                to->firstused = be16_to_cpu(from->hdr.firstused);
+                to->holes = from->hdr.holes;
+                for (slot = 0; slot < XFS_ATTR_LEAF_MAPSIZE; slot++) {
+                        to->freemap[slot].base = be16_to_cpu(from->hdr.freemap[slot].base);
+                        to->freemap[slot].size = be16_to_cpu(from->hdr.freemap[slot].size);
+                }
+        }
+}
+/*
+ * Encode the in-core attribute leaf header into an on-disk leaf block.
+ *
+ * The in-core magic number selects which of the two header layouts is
+ * written: XFS_ATTR3_LEAF_MAGIC goes out through
+ * struct xfs_attr3_leaf_hdr, anything else through the header embedded in
+ * struct xfs_attr_leafblock.  Multi-byte fields are converted from CPU
+ * order to big-endian, "holes" is copied as-is and the pad1 field is
+ * zeroed.
+ */
+void
+xfs_attr3_leaf_hdr_to_disk(
+        struct xfs_attr_leafblock       *to,
+        struct xfs_attr3_icleaf_hdr     *from)
+{
+        int     slot;
+        ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
+               from->magic == XFS_ATTR3_LEAF_MAGIC);
+        if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+                struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to;
+                /* block linkage and identity */
+                hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+                hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+                hdr3->info.hdr.back = cpu_to_be32(from->back);
+                /* space accounting */
+                hdr3->count = cpu_to_be16(from->count);
+                hdr3->usedbytes = cpu_to_be16(from->usedbytes);
+                hdr3->firstused = cpu_to_be16(from->firstused);
+                hdr3->holes = from->holes;
+                hdr3->pad1 = 0;
+                for (slot = 0; slot < XFS_ATTR_LEAF_MAPSIZE; slot++) {
+                        hdr3->freemap[slot].base = cpu_to_be16(from->freemap[slot].base);
+                        hdr3->freemap[slot].size = cpu_to_be16(from->freemap[slot].size);
+                }
+        } else {
+                /* block linkage and identity */
+                to->hdr.info.magic = cpu_to_be16(from->magic);
+                to->hdr.info.forw = cpu_to_be32(from->forw);
+                to->hdr.info.back = cpu_to_be32(from->back);
+                /* space accounting */
+                to->hdr.count = cpu_to_be16(from->count);
+                to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
+                to->hdr.firstused = cpu_to_be16(from->firstused);
+                to->hdr.holes = from->holes;
+                to->hdr.pad1 = 0;
+                for (slot = 0; slot < XFS_ATTR_LEAF_MAPSIZE; slot++) {
+                        to->hdr.freemap[slot].base = cpu_to_be16(from->freemap[slot].base);
+                        to->hdr.freemap[slot].size = cpu_to_be16(from->freemap[slot].size);
+                }
+        }
+}
+/*
+ * Sanity check the header of an attribute leaf block held in @bp.
+ *
+ * On a CRC-enabled filesystem the block must carry the v3 leaf magic, the
+ * superblock's UUID and a block number that matches the buffer; otherwise
+ * it must carry the older leaf magic.  In both cases a leaf with no
+ * entries is rejected.
+ *
+ * Returns true if the block passes these checks, false otherwise.
+ */
+static bool
+xfs_attr3_leaf_verify(
+        struct xfs_buf          *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        struct xfs_da3_node_hdr *hdr3;
+        struct xfs_attr3_icleaf_hdr ichdr;
+        xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr);
+        if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+                if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
+                        return false;
+        } else {
+                hdr3 = bp->b_addr;
+                if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC ||
+                    !uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid) ||
+                    be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+                        return false;
+        }
+        /* XXX: need to range check rest of attr header values */
+        /* XXX: hash order check? */
+        return ichdr.count != 0;
+}
+/*
+ * Write-side verifier for attribute leaf buffers.
+ *
+ * A block that fails xfs_attr3_leaf_verify() is flagged with
+ * -EFSCORRUPTED and reported.  A good block on a CRC-enabled filesystem
+ * has its LSN stamped from the attached buffer log item (when there is
+ * one) and its checksum recomputed; on other filesystems nothing more is
+ * done.
+ */
+static void
+xfs_attr3_leaf_write_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        struct xfs_buf_log_item *bip = bp->b_fspriv;
+        struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
+        if (xfs_attr3_leaf_verify(bp)) {
+                if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                        if (bip)
+                                hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+                        xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
+                }
+                return;
+        }
+        /* structural check failed: flag and report the buffer */
+        xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        xfs_verifier_error(bp);
+}
+/*
+ * Read-side verifier for attribute leaf buffers.
+ *
+ * On a CRC-enabled filesystem the checksum is checked first and a
+ * mismatch is flagged as -EFSBADCRC; otherwise (or when the checksum is
+ * fine) the header is checked with xfs_attr3_leaf_verify() and a failure
+ * is flagged as -EFSCORRUPTED.  Any error present on the buffer afterwards
+ * is reported through xfs_verifier_error().
+ *
+ * leaf/node format detection on trees is sketchy, so a node read can be
+ * done on leaf level blocks when detection identifies the tree as a node
+ * format tree incorrectly. In this case, the verifier needs to be swapped
+ * to match the correct format of the block being read.
+ * NOTE(review): no swap happens in this function itself; presumably the
+ * node read verifier hands leaf blocks over to this one - confirm.
+ */
+static void
+xfs_attr3_leaf_read_verify(
+        struct xfs_buf          *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        bool                    crc_bad;
+        crc_bad = xfs_sb_version_hascrc(&mp->m_sb) &&
+                  !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
+        if (crc_bad) {
+                xfs_buf_ioerror(bp, -EFSBADCRC);
+        } else if (!xfs_attr3_leaf_verify(bp)) {
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        }
+        if (bp->b_error)
+                xfs_verifier_error(bp);
+}
+/*
+ * Buffer operations for attribute leaf blocks: pairs the read-side and
+ * write-side verifiers defined above.  xfs_attr3_leaf_read() passes this
+ * table to xfs_da_read_buf().
+ */
+const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+        .verify_read = xfs_attr3_leaf_read_verify,
+        .verify_write = xfs_attr3_leaf_write_verify,
+};
+/*
+ * Read an attribute leaf block of inode @dp through the attr leaf
+ * verifiers.
+ *
+ * @tp:        transaction to read under, may be NULL
+ * @dp:        inode owning the attribute fork
+ * @bno:       logical block number within the attribute fork
+ * @mappedbno: passed straight through to xfs_da_read_buf()
+ * @bpp:       where the resulting buffer pointer is returned
+ *
+ * When the read succeeds inside a transaction and a buffer was actually
+ * returned, the buffer is tagged as an attr leaf buffer.
+ *
+ * Returns 0 on success or the error from xfs_da_read_buf().
+ */
+int
+xfs_attr3_leaf_read(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *dp,
+        xfs_dablk_t             bno,
+        xfs_daddr_t             mappedbno,
+        struct xfs_buf          **bpp)
+{
+        int                     err;
+        err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+                                XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
+        /*
+         * Only tag a buffer that really exists.
+         * NOTE(review): xfs_da_read_buf() can presumably report success
+         * without handing back a buffer (e.g. when the caller asked for
+         * holes to be tolerated via mappedbno) - confirm; without this
+         * check a NULL *bpp would be passed on to
+         * xfs_trans_buf_set_type().
+         */
+        if (!err && tp && *bpp)
+                xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
+        return err;
+}
+/*========================================================================
+ * Namespace helper routines
+ *========================================================================*/
+/*
+ * Compare the namespace requested through the caller's argument flags
+ * with the namespace recorded in an entry's on-disk flags.
+ *
+ * Returns 1 when the namespace bits are the same, 0 when they differ.
+ */
+STATIC int
+xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
+{
+        int wanted = XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
+        int found = XFS_ATTR_NSP_ONDISK(ondisk_flags);
+        return found == wanted;
+}
+/*========================================================================
+ * External routines when attribute fork size < XFS_LITINO(mp).
+ *========================================================================*/
+/*
+ * Query whether the requested number of additional bytes of extended
+ * attribute space will be able to fit inline.
+ *
+ * @dp:    inode whose literal area is being examined
+ * @bytes: size the inline attribute data would have
+ *
+ * Returns zero if not, else the di_forkoff fork offset to be used in the
+ * literal area for attribute data once the new bytes have been added.
+ *
+ * di_forkoff must be 8 byte aligned, hence is stored as a >>3 value;
+ * special case for dev/uuid inodes, they have fixed size data forks.
+ */
+int
+xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
+{
+        int offset;
+        int minforkoff; /* lower limit on valid forkoff locations */
+        int maxforkoff; /* upper limit on valid forkoff locations */
+        int dsize;
+        xfs_mount_t *mp = dp->i_mount;
+        /*
+         * Highest fork offset (in 8 byte units) that still leaves "bytes"
+         * of room at the end of the literal area; rounded down.
+         */
+        offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
+        /*
+         * Dev and uuid inodes: the fork offset is pinned right behind the
+         * fixed size data, so the answer is either that offset or "no".
+         */
+        switch (dp->i_d.di_format) {
+        case XFS_DINODE_FMT_DEV:
+                minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+                return (offset >= minforkoff) ? minforkoff : 0;
+        case XFS_DINODE_FMT_UUID:
+                minforkoff = roundup(sizeof(uuid_t), 8) >> 3;
+                return (offset >= minforkoff) ? minforkoff : 0;
+        }
+        /*
+         * If the requested numbers of bytes is smaller or equal to the
+         * current attribute fork size we can always proceed.
+         *
+         * Note that if_bytes in the data fork might actually be larger than
+         * the current data fork size is due to delalloc extents. In that
+         * case either the extent count will go down when they are converted
+         * to real extents, or the delalloc conversion will take care of the
+         * literal area rebalancing.
+         */
+        if (bytes <= XFS_IFORK_ASIZE(dp))
+                return dp->i_d.di_forkoff;
+        /*
+         * For attr2 we can try to move the forkoff if there is space in the
+         * literal area, but for the old format we are done if there is no
+         * space in the fixed attribute fork.
+         */
+        if (!(mp->m_flags & XFS_MOUNT_ATTR2))
+                return 0;
+        /* default: the data fork needs as much room as it uses in core */
+        dsize = dp->i_df.if_bytes;
+        switch (dp->i_d.di_format) {
+        case XFS_DINODE_FMT_EXTENTS:
+                /*
+                 * If there is no attr fork and the data fork is extents,
+                 * determine if creating the default attr fork will result
+                 * in the extents form migrating to btree. If so, the
+                 * minimum offset only needs to be the space required for
+                 * the btree root.
+                 */
+                if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
+                    xfs_default_attroffset(dp))
+                        dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+                break;
+        case XFS_DINODE_FMT_BTREE:
+                /*
+                 * If we have a data btree then keep forkoff if we have one,
+                 * otherwise we are adding a new attr, so then we set
+                 * minforkoff to where the btree root can finish so we have
+                 * plenty of room for attrs
+                 */
+                if (dp->i_d.di_forkoff) {
+                        if (offset < dp->i_d.di_forkoff)
+                                return 0;
+                        return dp->i_d.di_forkoff;
+                }
+                dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
+                break;
+        }
+        /*
+         * A data fork btree root must have space for at least
+         * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
+         */
+        minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
+        minforkoff = roundup(minforkoff, 8) >> 3;
+        /* attr fork btree root can have at least this many key/ptr pairs */
+        maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
+                        XFS_BMDR_SPACE_CALC(MINABTPTRS);
+        maxforkoff = maxforkoff >> 3;   /* rounded down */
+        /*
+         * Clamp the candidate offset to maxforkoff, accept it as-is when
+         * it lies in [minforkoff, maxforkoff), and report "does not fit"
+         * when it falls below minforkoff.
+         */
+        if (offset >= maxforkoff)
+                return maxforkoff;
+        if (offset >= minforkoff)
+                return offset;
+        return 0;
+}
+/*
+ * Switch on the ATTR2 superblock bit (implies also FEATURES2)
+ *
+ * Nothing is done unless the mount has XFS_MOUNT_ATTR2 set and the
+ * superblock does not carry the attr2 bit yet.  The bit is re-checked
+ * under m_sb_lock before it is set, and the superblock change is logged
+ * only after the lock has been dropped.
+ */
+STATIC void
+xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
+{
+        int added = 0;
+        if (!(mp->m_flags & XFS_MOUNT_ATTR2))
+                return;
+        if (xfs_sb_version_hasattr2(&mp->m_sb))
+                return;
+        spin_lock(&mp->m_sb_lock);
+        /* somebody may have set the bit while we were not holding the lock */
+        if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
+                xfs_sb_version_addattr2(&mp->m_sb);
+                added = 1;
+        }
+        spin_unlock(&mp->m_sb_lock);
+        if (added)
+                xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+}
+/*
+ * Create the initial contents of a shortform attribute list.
+ *
+ * The attribute fork of args->dp must exist and be empty.  A fork still
+ * in extents format is switched to local (inline) format; then room for
+ * the shortform header is allocated, the header is initialised to "no
+ * entries, total size == header size" and the inode is logged.
+ */
+void
+xfs_attr_shortform_create(xfs_da_args_t *args)
+{
+        xfs_attr_sf_hdr_t *hdr;
+        xfs_inode_t *dp;
+        xfs_ifork_t *ifp;
+        trace_xfs_attr_sf_create(args);
+        dp = args->dp;
+        ASSERT(dp != NULL);
+        ifp = dp->i_afp;
+        ASSERT(ifp != NULL);
+        ASSERT(ifp->if_bytes == 0);
+        if (dp->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+                ASSERT(ifp->if_flags & XFS_IFINLINE);
+        } else {
+                dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
+                ifp->if_flags &= ~XFS_IFEXTENTS;        /* just in case */
+                ifp->if_flags |= XFS_IFINLINE;
+        }
+        xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
+        /* the fork data now holds exactly one, still empty, header */
+        hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
+        hdr->totsize = cpu_to_be16(sizeof(*hdr));
+        hdr->count = 0;
+        xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+}
+/*
+ * Add a name/value pair to the shortform attribute list.
+ * Overflow from the inode has already been checked for.
+ *
+ * @args:    name, value and flags of the attribute to add
+ * @forkoff: fork offset to store in the inode before the entry is added
+ *
+ * The new entry is appended after the existing ones.  On DEBUG builds the
+ * walk over the existing entries also asserts that no entry with the same
+ * name and namespace is present already.
+ */
+void
+xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
+{
+        xfs_attr_shortform_t *sform;
+        xfs_attr_sf_entry_t *ent;
+        int n, offset, size;
+        xfs_mount_t *mp;
+        xfs_inode_t *dp;
+        xfs_ifork_t *ifp;
+        trace_xfs_attr_sf_add(args);
+        dp = args->dp;
+        mp = dp->i_mount;
+        dp->i_d.di_forkoff = forkoff;
+        ifp = dp->i_afp;
+        ASSERT(ifp->if_flags & XFS_IFINLINE);
+        sform = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+        /* step over every existing entry to find the end of the list */
+        ent = &sform->list[0];
+        n = 0;
+        while (n < sform->hdr.count) {
+#ifdef DEBUG
+                if (ent->namelen == args->namelen &&
+                    memcmp(args->name, ent->nameval, args->namelen) == 0 &&
+                    xfs_attr_namesp_match(args->flags, ent->flags)) {
+                        ASSERT(0);
+                }
+#endif
+                ent = XFS_ATTR_SF_NEXTENTRY(ent);
+                n++;
+        }
+        size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+        /*
+         * Remember the insertion point as an offset: the fork data is
+         * re-fetched after the realloc below.
+         */
+        offset = (char *)ent - (char *)sform;
+        xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
+        sform = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+        ent = (xfs_attr_sf_entry_t *)((char *)sform + offset);
+        ent->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+        ent->namelen = args->namelen;
+        ent->valuelen = args->valuelen;
+        /* the value bytes follow the name bytes directly */
+        memcpy(ent->nameval, args->name, args->namelen);
+        memcpy(&ent->nameval[args->namelen], args->value, args->valuelen);
+        be16_add_cpu(&sform->hdr.totsize, size);
+        sform->hdr.count++;
+        xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+        xfs_sbversion_add_attr2(mp, args->trans);
+}
+/*
+ * After the last attribute is removed revert to original inode format,
+ * making all literal area available to the data fork once more.
+ *
+ * The in-core attribute fork is destroyed, the fork offset is cleared,
+ * the attr format is set back to extents and the inode core is logged in
+ * transaction @tp.
+ */
+STATIC void
+xfs_attr_fork_reset(
+        struct xfs_inode        *ip,
+        struct xfs_trans        *tp)
+{
+        xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+        ip->i_d.di_forkoff = 0;
+        /* by now there must be neither attr extents nor an in-core fork */
+        ASSERT(ip->i_d.di_anextents == 0);
+        ASSERT(ip->i_afp == NULL);
+        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+/*
+ * Remove an attribute from the shortform attribute list structure.
+ *
+ * The entry is located by name length, name bytes and namespace.  Its
+ * bytes are squeezed out of the fork data and the header is updated.  If
+ * only the header is left afterwards (and the attr2, non-btree,
+ * not-an-add conditions checked below hold) the whole attribute fork is
+ * reset; otherwise the fork data is shrunk and the fork offset is
+ * recomputed.
+ *
+ * Returns 0 on success, -ENOATTR if no matching entry exists.
+ */
+int
+xfs_attr_shortform_remove(xfs_da_args_t *args)
+{
+        xfs_attr_shortform_t *sform;
+        xfs_attr_sf_entry_t *ent;
+        int start, entsize = 0, stop, totsize, nents, i;
+        xfs_mount_t *mp;
+        xfs_inode_t *dp;
+        trace_xfs_attr_sf_remove(args);
+        dp = args->dp;
+        mp = dp->i_mount;
+        sform = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+        nents = sform->hdr.count;
+        /*
+         * Walk the entries, tracking the byte offset ("start") of the one
+         * currently being looked at.
+         */
+        start = sizeof(xfs_attr_sf_hdr_t);
+        ent = &sform->list[0];
+        for (i = 0; i < nents; i++) {
+                entsize = XFS_ATTR_SF_ENTSIZE(ent);
+                if (ent->namelen == args->namelen &&
+                    memcmp(ent->nameval, args->name, args->namelen) == 0 &&
+                    xfs_attr_namesp_match(args->flags, ent->flags))
+                        break;
+                ent = XFS_ATTR_SF_NEXTENTRY(ent);
+                start += entsize;
+        }
+        if (i == nents)
+                return -ENOATTR;
+        /*
+         * Fix up the attribute fork data, covering the hole
+         */
+        stop = start + entsize;
+        totsize = be16_to_cpu(sform->hdr.totsize);
+        if (stop != totsize) {
+                memmove(&((char *)sform)[start], &((char *)sform)[stop],
+                        totsize - stop);
+        }
+        sform->hdr.count--;
+        be16_add_cpu(&sform->hdr.totsize, -entsize);
+        /*
+         * Fix up the start offset of the attribute fork
+         */
+        totsize -= entsize;
+        if (totsize != sizeof(xfs_attr_sf_hdr_t) ||
+            !(mp->m_flags & XFS_MOUNT_ATTR2) ||
+            (dp->i_d.di_format == XFS_DINODE_FMT_BTREE) ||
+            (args->op_flags & XFS_DA_OP_ADDNAME)) {
+                /* keep the fork, just give back the removed entry's bytes */
+                xfs_idata_realloc(dp, -entsize, XFS_ATTR_FORK);
+                dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
+                ASSERT(dp->i_d.di_forkoff);
+                ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
+                                (args->op_flags & XFS_DA_OP_ADDNAME) ||
+                                !(mp->m_flags & XFS_MOUNT_ATTR2) ||
+                                dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
+                xfs_trans_log_inode(args->trans, dp,
+                                        XFS_ILOG_CORE | XFS_ILOG_ADATA);
+        } else {
+                /* nothing but the header is left: drop the fork itself */
+                xfs_attr_fork_reset(dp, args->trans);
+        }
+        xfs_sbversion_add_attr2(mp, args->trans);
+        return 0;
+}
+/*
+ * Look up a name in a shortform attribute list structure.
+ *
+ * An entry matches when its name length, name bytes and namespace all
+ * agree with @args.
+ *
+ * Returns -EEXIST if a matching entry is found, -ENOATTR otherwise.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_lookup(xfs_da_args_t *args)
+{
+        xfs_attr_shortform_t *sform;
+        xfs_attr_sf_entry_t *ent;
+        int n;
+        xfs_ifork_t *ifp;
+        trace_xfs_attr_sf_lookup(args);
+        ifp = args->dp->i_afp;
+        ASSERT(ifp->if_flags & XFS_IFINLINE);
+        sform = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+        ent = &sform->list[0];
+        n = 0;
+        while (n < sform->hdr.count) {
+                if (ent->namelen == args->namelen &&
+                    memcmp(args->name, ent->nameval, args->namelen) == 0 &&
+                    xfs_attr_namesp_match(args->flags, ent->flags))
+                        return -EEXIST;
+                ent = XFS_ATTR_SF_NEXTENTRY(ent);
+                n++;
+        }
+        return -ENOATTR;
+}
+/*
+ * Retrieve the value of a named attribute from a shortform attribute
+ * list structure.
+ *
+ * An entry matches when its name length, name bytes and namespace all
+ * agree with @args.  For a match, args->valuelen is always set to the
+ * stored value length.
+ *
+ * Returns:
+ *   -EEXIST   entry found; the value was copied into args->value, unless
+ *             ATTR_KERNOVAL is set, in which case only the length is
+ *             reported
+ *   -ERANGE   entry found, but the caller's args->valuelen was smaller
+ *             than the stored value (nothing is copied)
+ *   -ENOATTR  no matching entry
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_getvalue(xfs_da_args_t *args)
+{
+        xfs_attr_shortform_t *sform;
+        xfs_attr_sf_entry_t *ent;
+        int n, lenonly, toosmall;
+        ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
+        sform = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
+        ent = &sform->list[0];
+        for (n = 0; n < sform->hdr.count;
+                                ent = XFS_ATTR_SF_NEXTENTRY(ent), n++) {
+                if (ent->namelen != args->namelen ||
+                    memcmp(args->name, ent->nameval, args->namelen) != 0 ||
+                    !xfs_attr_namesp_match(args->flags, ent->flags))
+                        continue;
+                /*
+                 * Decide what to do before args->valuelen is overwritten
+                 * with the stored length.
+                 */
+                lenonly = (args->flags & ATTR_KERNOVAL) != 0;
+                toosmall = !lenonly && (args->valuelen < ent->valuelen);
+                args->valuelen = ent->valuelen;
+                if (toosmall)
+                        return -ERANGE;
+                if (!lenonly) {
+                        memcpy(args->value, &ent->nameval[args->namelen],
+                                                    args->valuelen);
+                }
+                return -EEXIST;
+        }
+        return -ENOATTR;
+}
+/*
+ * Convert from using the shortform to the leaf.
+ *
+ * The inline attribute data is saved in a temporary buffer, the inline
+ * fork data is released and the fork is switched to extents format, and a
+ * new leaf block (which must be block 0) is created.  Every shortform
+ * entry is then re-added to that leaf.
+ *
+ * If growing the inode or creating the leaf fails, the saved shortform
+ * data is put back where that is still possible and the error of the step
+ * that failed is returned.  Returns 0 on success.
+ */
+int
+xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
+{
+        xfs_inode_t *dp;
+        xfs_attr_shortform_t *sf;
+        xfs_attr_sf_entry_t *sfe;
+        xfs_da_args_t nargs;
+        char *tmpbuffer;
+        int error, i, size;
+        xfs_dablk_t blkno;
+        struct xfs_buf *bp;
+        xfs_ifork_t *ifp;
+        trace_xfs_attr_sf_to_leaf(args);
+        dp = args->dp;
+        ifp = dp->i_afp;
+        sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+        size = be16_to_cpu(sf->hdr.totsize);
+        /* work from a private copy; the inline data is released below */
+        tmpbuffer = kmem_alloc(size, KM_SLEEP);
+        ASSERT(tmpbuffer != NULL);
+        memcpy(tmpbuffer, ifp->if_u1.if_data, size);
+        sf = (xfs_attr_shortform_t *)tmpbuffer;
+        xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+        xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+        bp = NULL;
+        error = xfs_da_grow_inode(args, &blkno);
+        if (error) {
+                /*
+                 * If we hit an IO error middle of the transaction inside
+                 * grow_inode(), we may have inconsistent data. Bail out.
+                 */
+                if (error == -EIO)
+                        goto out;
+                xfs_idata_realloc(dp, size, XFS_ATTR_FORK);     /* try to put */
+                memcpy(ifp->if_u1.if_data, tmpbuffer, size);    /* it back */
+                goto out;
+        }
+        ASSERT(blkno == 0);
+        error = xfs_attr3_leaf_create(args, blkno, &bp);
+        if (error) {
+                /*
+                 * xfs_attr3_leaf_create() can fail before it hands back a
+                 * buffer, in which case bp is still NULL and there is
+                 * nothing to invalidate; only shrink when we have a buffer.
+                 * The creation error is deliberately left in "error" so the
+                 * failure is reported even when the shrink succeeds.
+                 */
+                if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0))
+                        goto out;
+                xfs_idata_realloc(dp, size, XFS_ATTR_FORK);     /* try to put */
+                memcpy(ifp->if_u1.if_data, tmpbuffer, size);    /* it back */
+                goto out;
+        }
+        /* args template shared by every entry added to the new leaf */
+        memset((char *)&nargs, 0, sizeof(nargs));
+        nargs.dp = dp;
+        nargs.geo = args->geo;
+        nargs.firstblock = args->firstblock;
+        nargs.flist = args->flist;
+        nargs.total = args->total;
+        nargs.whichfork = XFS_ATTR_FORK;
+        nargs.trans = args->trans;
+        nargs.op_flags = XFS_DA_OP_OKNOENT;
+        sfe = &sf->list[0];
+        for (i = 0; i < sf->hdr.count; i++) {
+                nargs.name = sfe->nameval;
+                nargs.namelen = sfe->namelen;
+                nargs.value = &sfe->nameval[nargs.namelen];
+                nargs.valuelen = sfe->valuelen;
+                nargs.hashval = xfs_da_hashname(sfe->nameval,
+                                                sfe->namelen);
+                nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
+                error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
+                ASSERT(error == -ENOATTR);
+                error = xfs_attr3_leaf_add(bp, &nargs);
+                ASSERT(error != -ENOSPC);
+                if (error)
+                        goto out;
+                sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+        }
+        error = 0;
+out:
+        kmem_free(tmpbuffer);
+        return error;
+}
+/*
+ * Decide whether every entry of the leaf block in @bp could be stored in
+ * shortform in inode @dp.
+ *
+ * INCOMPLETE entries are ignored.  Returns 0 as soon as an entry is found
+ * that is not local, or whose name or value length reaches
+ * XFS_ATTR_SF_ENTSIZE_MAX.  Returns -1 when no entry contributes any bytes
+ * on an attr2 mount with a non-btree inode format.  Otherwise returns the
+ * result of xfs_attr_shortform_bytesfit() for the computed size.
+ */
+int
+xfs_attr_shortform_allfit(
+        struct xfs_buf          *bp,
+        struct xfs_inode        *dp)
+{
+        struct xfs_attr_leafblock *leaf = bp->b_addr;
+        struct xfs_attr_leaf_entry *entries;
+        xfs_attr_leaf_name_local_t *local;
+        struct xfs_attr3_icleaf_hdr leafhdr;
+        int                     total;
+        int                     nlen;
+        int                     vlen;
+        int                     idx;
+        xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+        entries = xfs_attr3_leaf_entryp(leaf);
+        /* a shortform list always starts with its header */
+        total = sizeof(struct xfs_attr_sf_hdr);
+        for (idx = 0; idx < leafhdr.count; idx++) {
+                if (entries[idx].flags & XFS_ATTR_INCOMPLETE)
+                        continue;               /* don't copy partial entries */
+                if (!(entries[idx].flags & XFS_ATTR_LOCAL))
+                        return 0;
+                local = xfs_attr3_leaf_name_local(leaf, idx);
+                nlen = local->namelen;
+                if (nlen >= XFS_ATTR_SF_ENTSIZE_MAX)
+                        return 0;
+                vlen = be16_to_cpu(local->valuelen);
+                if (vlen >= XFS_ATTR_SF_ENTSIZE_MAX)
+                        return 0;
+                total += sizeof(struct xfs_attr_sf_entry) - 1 + nlen + vlen;
+        }
+        if (total == sizeof(struct xfs_attr_sf_hdr) &&
+            (dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
+            dp->i_d.di_format != XFS_DINODE_FMT_BTREE)
+                return -1;
+        return xfs_attr_shortform_bytesfit(dp, total);
+}
+/*
+ * Convert a leaf attribute list to shortform attribute list
+ *
+ * @bp is the buffer holding the leaf block, @args describes the inode and
+ * transaction, and @forkoff is handed to xfs_attr_shortform_add() for
+ * every entry that is copied.  A @forkoff of -1 means no shortform list is
+ * rebuilt at all: the attribute fork is reset instead.
+ *
+ * The leaf contents are first saved in a temporary copy, because the
+ * block itself is zeroed and released via xfs_da_shrink_inode() before
+ * the entries are re-added in shortform.  INCOMPLETE entries and entries
+ * with a zero nameidx are not carried over.
+ *
+ * Returns 0 on success, -ENOMEM if the temporary copy cannot be allocated,
+ * or the error from xfs_da_shrink_inode().
+ */
+int
+xfs_attr3_leaf_to_shortform(
+        struct xfs_buf          *bp,
+        struct xfs_da_args      *args,
+        int                     forkoff)
+{
+        struct xfs_attr_leafblock *leaf;
+        struct xfs_attr3_icleaf_hdr ichdr;
+        struct xfs_attr_leaf_entry *entry;
+        struct xfs_attr_leaf_name_local *name_loc;
+        struct xfs_da_args      nargs;
+        struct xfs_inode        *dp = args->dp;
+        char                    *tmpbuffer;
+        int                     error;
+        int                     i;
+        trace_xfs_attr_leaf_to_sf(args);
+        tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+        if (!tmpbuffer)
+                return -ENOMEM;
+        memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+        leaf = (xfs_attr_leafblock_t *)tmpbuffer;   /* work on the copy */
+        xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+        entry = xfs_attr3_leaf_entryp(leaf);
+        /* XXX (dgc): buffer is about to be marked stale - why zero it? */
+        memset(bp->b_addr, 0, args->geo->blksize);
+        /*
+         * Clean out the prior contents of the attribute list.
+         * From here on only the saved copy in tmpbuffer is read.
+         */
+        error = xfs_da_shrink_inode(args, 0, bp);
+        if (error)
+                goto out;
+        if (forkoff == -1) {
+                ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
+                ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
+                xfs_attr_fork_reset(dp, args->trans);
+                goto out;       /* error is 0 here */
+        }
+        xfs_attr_shortform_create(args);
+        /*
+         * Copy the attributes
+         *
+         * nargs carries the fields that are the same for every entry;
+         * the per-entry name/value/hash/flags are filled in inside the
+         * loop.
+         */
+        memset((char *)&nargs, 0, sizeof(nargs));
+        nargs.geo = args->geo;
+        nargs.dp = dp;
+        nargs.firstblock = args->firstblock;
+        nargs.flist = args->flist;
+        nargs.total = args->total;
+        nargs.whichfork = XFS_ATTR_FORK;
+        nargs.trans = args->trans;
+        nargs.op_flags = XFS_DA_OP_OKNOENT;
+        for (i = 0; i < ichdr.count; entry++, i++) {
+                if (entry->flags & XFS_ATTR_INCOMPLETE)
+                        continue;       /* don't copy partial entries */
+                if (!entry->nameidx)
+                        continue;
+                ASSERT(entry->flags & XFS_ATTR_LOCAL);
+                name_loc = xfs_attr3_leaf_name_local(leaf, i);
+                nargs.name = name_loc->nameval;
+                nargs.namelen = name_loc->namelen;
+                nargs.value = &name_loc->nameval[nargs.namelen];
+                nargs.valuelen = be16_to_cpu(name_loc->valuelen);
+                nargs.hashval = be32_to_cpu(entry->hashval);
+                nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
+                xfs_attr_shortform_add(&nargs, forkoff);
+        }
+        error = 0;
+out:
+        kmem_free(tmpbuffer);
+        return error;
+}
+/*
+ * Convert from using a single leaf to a root node and a leaf.
+ *
+ * A new block is added to the attribute fork and the contents of the
+ * existing leaf (read from block 0) are copied into it.  Block 0 is then
+ * set up through xfs_da3_node_create() as the new root node, holding a
+ * single btree entry that points at the relocated leaf and carries the
+ * hashval of that leaf's last entry.
+ *
+ * Returns 0 on success, or the first error reported by the grow, read,
+ * get_buf or node_create step.
+ */
+int
+xfs_attr3_leaf_to_node(
+        struct xfs_da_args      *args)
+{
+        struct xfs_attr_leafblock *leaf;
+        struct xfs_attr3_icleaf_hdr icleafhdr;
+        struct xfs_attr_leaf_entry *entries;
+        struct xfs_da_node_entry *btree;
+        struct xfs_da3_icnode_hdr icnodehdr;
+        struct xfs_da_intnode   *node;
+        struct xfs_inode        *dp = args->dp;
+        struct xfs_mount        *mp = dp->i_mount;
+        struct xfs_buf          *bp1 = NULL;
+        struct xfs_buf          *bp2 = NULL;
+        xfs_dablk_t             blkno;
+        int                     error;
+        trace_xfs_attr_leaf_to_node(args);
+        error = xfs_da_grow_inode(args, &blkno);
+        if (error)
+                goto out;
+        error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1);
+        if (error)
+                goto out;
+        error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK);
+        if (error)
+                goto out;
+        /*
+         * copy leaf to new buffer, update identifiers
+         *
+         * bp1 is the existing leaf at block 0, bp2 the newly allocated
+         * block.  On CRC-enabled filesystems the header records the
+         * block's own address, so it is rewritten for the new location.
+         */
+        xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
+        bp2->b_ops = bp1->b_ops;
+        memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
+                hdr3->blkno = cpu_to_be64(bp2->b_bn);
+        }
+        xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
+        /*
+         * Set up the new root node.
+         * bp1 is passed by address and is used as the node buffer below.
+         */
+        error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
+        if (error)
+                goto out;
+        node = bp1->b_addr;
+        dp->d_ops->node_hdr_from_disk(&icnodehdr, node);
+        btree = dp->d_ops->node_tree_p(node);
+        leaf = bp2->b_addr;
+        xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
+        entries = xfs_attr3_leaf_entryp(leaf);
+        /* both on-disk, don't endian-flip twice */
+        btree[0].hashval = entries[icleafhdr.count - 1].hashval;
+        btree[0].before = cpu_to_be32(blkno);   /* points at the moved leaf */
+        icnodehdr.count = 1;
+        dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
+        xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
+        error = 0;
+out:
+        return error;
+}
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+/*
+ * Create the initial contents of a leaf attribute list
+ * or a leaf in a node attribute list.
+ *
+ * Gets a buffer for @blkno in the attribute fork, zeroes it and writes an
+ * empty leaf header: firstused is the block size and freemap[0] covers
+ * everything from the end of the header to the end of the block.  On
+ * CRC-enabled filesystems the v3 magic is used and the block number,
+ * owner inode number and filesystem UUID are stamped into the header.
+ * The whole block is logged in args->trans.
+ *
+ * On success the buffer is returned in *@bpp and 0 is returned.  If the
+ * buffer cannot be obtained, that error is returned and *@bpp is left
+ * untouched.
+ */
+STATIC int
+xfs_attr3_leaf_create(
+        struct xfs_da_args      *args,
+        xfs_dablk_t             blkno,
+        struct xfs_buf          **bpp)
+{
+        struct xfs_attr_leafblock *leaf;
+        struct xfs_attr3_icleaf_hdr ichdr;
+        struct xfs_inode        *dp = args->dp;
+        struct xfs_mount        *mp = dp->i_mount;
+        struct xfs_buf          *bp;
+        int                     error;
+        trace_xfs_attr_leaf_create(args);
+        error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
+                                            XFS_ATTR_FORK);
+        if (error)
+                return error;
+        bp->b_ops = &xfs_attr3_leaf_buf_ops;
+        xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
+        leaf = bp->b_addr;
+        memset(leaf, 0, args->geo->blksize);
+        memset(&ichdr, 0, sizeof(ichdr));
+        ichdr.firstused = args->geo->blksize;   /* no names stored yet */
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
+                ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
+                hdr3->blkno = cpu_to_be64(bp->b_bn);
+                hdr3->owner = cpu_to_be64(dp->i_ino);
+                uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+                ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
+        } else {
+                ichdr.magic = XFS_ATTR_LEAF_MAGIC;
+                ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr);
+        }
+        ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
+        xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+        xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
+        *bpp = bp;
+        return 0;
+}
+/*
+ * Split the leaf node, rebalance, then add the new entry.
+ *
+ * @oldblk describes the existing leaf.  @newblk is filled in here with
+ * the buffer, block number and magic of a freshly allocated leaf.  After
+ * the entries have been rebalanced and the new block linked in, the new
+ * attribute (state->args) is added to @oldblk when state->inleaf is set
+ * and to @newblk otherwise.
+ *
+ * Returns the error from the grow, create or link step, or else the
+ * result of xfs_attr3_leaf_add().  The cached hashval of both blocks is
+ * refreshed whatever xfs_attr3_leaf_add() returned.
+ */
+int
+xfs_attr3_leaf_split(
+        struct xfs_da_state     *state,
+        struct xfs_da_state_blk *oldblk,
+        struct xfs_da_state_blk *newblk)
+{
+        xfs_dablk_t blkno;
+        int error;
+        trace_xfs_attr_leaf_split(state->args);
+        /*
+         * Allocate space for a new leaf node.
+         */
+        ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
+        error = xfs_da_grow_inode(state->args, &blkno);
+        if (error)
+                return error;
+        error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
+        if (error)
+                return error;
+        newblk->blkno = blkno;
+        newblk->magic = XFS_ATTR_LEAF_MAGIC;
+        /*
+         * Rebalance the entries across the two leaves.
+         * NOTE: rebalance() currently depends on the 2nd block being empty.
+         */
+        xfs_attr3_leaf_rebalance(state, oldblk, newblk);
+        error = xfs_da3_blk_link(state, oldblk, newblk);
+        if (error)
+                return error;
+        /*
+         * Save info on "old" attribute for "atomic rename" ops, leaf_add()
+         * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
+         * "new" attrs info.  Will need the "old" info to remove it later.
+         *
+         * Insert the "new" entry in the correct block.
+         */
+        if (state->inleaf) {
+                trace_xfs_attr_leaf_add_old(state->args);
+                error = xfs_attr3_leaf_add(oldblk->bp, state->args);
+        } else {
+                trace_xfs_attr_leaf_add_new(state->args);
+                error = xfs_attr3_leaf_add(newblk->bp, state->args);
+        }
+        /*
+         * Update last hashval in each block since we added the name.
+         */
+        oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
+        newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
+        return error;
+}
+/*
+ * Add a name to the leaf attribute list structure.
+ *
+ * args->index is the slot in the entry table at which the new entry is
+ * to be inserted.  The freemap regions are scanned from the last to the
+ * first for one big enough to hold the new name/value; a region whose
+ * base lies below firstused must also have room for one more entry-table
+ * slot.  If no region fits, the block is compacted (provided it has holes
+ * or the free space counted so far is at least the entry size) and the
+ * insert is retried in freemap[0].
+ *
+ * Returns the result of xfs_attr3_leaf_add_work(), or -ENOSPC when the
+ * entry cannot be made to fit.  The in-core header is written back to the
+ * buffer and logged on every path except the -ENOSPC return taken before
+ * compaction.
+ */
+int
+xfs_attr3_leaf_add(
+        struct xfs_buf          *bp,
+        struct xfs_da_args      *args)
+{
+        struct xfs_attr_leafblock *leaf;
+        struct xfs_attr3_icleaf_hdr ichdr;
+        int                     tablesize;
+        int                     entsize;
+        int                     sum;
+        int                     tmp;
+        int                     i;
+        trace_xfs_attr_leaf_add(args);
+        leaf = bp->b_addr;
+        xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+        ASSERT(args->index >= 0 && args->index <= ichdr.count);
+        entsize = xfs_attr_leaf_newentsize(args, NULL);
+        /*
+         * Search through freemap for first-fit on new name length.
+         * (may need to figure in size of entry struct too)
+         *
+         * tablesize is the header plus the entry table as it would be
+         * with one more entry.  "sum" accumulates the size of every
+         * region that was looked at but not used.
+         */
+        tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t)
+                                        + xfs_attr3_leaf_hdr_size(leaf);
+        for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) {
+                if (tablesize > ichdr.firstused) {
+                        sum += ichdr.freemap[i].size;
+                        continue;
+                }
+                if (!ichdr.freemap[i].size)
+                        continue;       /* no space in this map */
+                tmp = entsize;
+                if (ichdr.freemap[i].base < ichdr.firstused)
+                        tmp += sizeof(xfs_attr_leaf_entry_t);
+                if (ichdr.freemap[i].size >= tmp) {
+                        tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i);
+                        goto out_log_hdr;
+                }
+                sum += ichdr.freemap[i].size;
+        }
+        /*
+         * If there are no holes in the address space of the block,
+         * and we don't have enough freespace, then compaction will do us
+         * no good and we should just give up.
+         */
+        if (!ichdr.holes && sum < entsize)
+                return -ENOSPC;
+        /*
+         * Compact the entries to coalesce free space.
+         * This may change the hdr->count via dropping INCOMPLETE entries.
+         */
+        xfs_attr3_leaf_compact(args, &ichdr, bp);
+        /*
+         * After compaction, the block is guaranteed to have only one
+         * free region, in freemap[0].  If it is not big enough, give up.
+         */
+        if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
+                tmp = -ENOSPC;
+                goto out_log_hdr;
+        }
+        tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
+out_log_hdr:
+        xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+        xfs_trans_log_buf(args->trans, bp,
+                XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+                                xfs_attr3_leaf_hdr_size(leaf)));
+        return tmp;     /* add_work result or -ENOSPC */
+}
+/*
+ * Add a name to a leaf attribute list structure.
+ *
+ * Inserts the new entry at args->index in the entry table of the leaf in
+ * @bp and carves the name/value space from the end of the free region
+ * freemap[@mapindex].  @ichdr is the caller's in-core copy of the leaf
+ * header; its count, firstused, freemap and usedbytes fields are updated
+ * here but the header is not written back to the buffer by this function.
+ *
+ * Whether the value is stored locally is decided by the flag that
+ * xfs_attr_leaf_newentsize() returns through its second argument.  A
+ * local value is copied into the block together with the name.  For a
+ * remote value only the name is stored, the entry is marked INCOMPLETE
+ * and args->rmtblkno, rmtblkcnt and rmtvaluelen are set up.
+ *
+ * Always returns 0.
+ */
+STATIC int
+xfs_attr3_leaf_add_work(
+        struct xfs_buf          *bp,
+        struct xfs_attr3_icleaf_hdr *ichdr,
+        struct xfs_da_args      *args,
+        int                     mapindex)
+{
+        struct xfs_attr_leafblock *leaf;
+        struct xfs_attr_leaf_entry *entry;
+        struct xfs_attr_leaf_name_local *name_loc;
+        struct xfs_attr_leaf_name_remote *name_rmt;
+        struct xfs_mount        *mp;
+        int                     tmp;
+        int                     i;
+        trace_xfs_attr_leaf_add_work(args);
+        leaf = bp->b_addr;
+        ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE);
+        ASSERT(args->index >= 0 && args->index <= ichdr->count);
+        /*
+         * Force open some space in the entry array and fill it in.
+         * Entries from args->index onwards are shifted up by one slot.
+         */
+        entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+        if (args->index < ichdr->count) {
+                tmp  = ichdr->count - args->index;
+                tmp *= sizeof(xfs_attr_leaf_entry_t);
+                memmove(entry + 1, entry, tmp);
+                xfs_trans_log_buf(args->trans, bp,
+                    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
+        }
+        ichdr->count++;
+        /*
+         * Allocate space for the new string (at the end of the run).
+         * The region is shrunk first, so base + size is the offset of
+         * the space just taken from its tail.
+         */
+        mp = args->trans->t_mountp;
+        ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize);
+        ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
+        ASSERT(ichdr->freemap[mapindex].size >=
+                xfs_attr_leaf_newentsize(args, NULL));
+        ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize);
+        ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
+        ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp);
+        entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
+                                     ichdr->freemap[mapindex].size);
+        entry->hashval = cpu_to_be32(args->hashval);
+        entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
+        entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+        if (args->op_flags & XFS_DA_OP_RENAME) {
+                entry->flags |= XFS_ATTR_INCOMPLETE;
+                if ((args->blkno2 == args->blkno) &&
+                    (args->index2 <= args->index)) {
+                        args->index2++;
+                }
+        }
+        xfs_trans_log_buf(args->trans, bp,
+                          XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+        ASSERT((args->index == 0) ||
+               (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
+        ASSERT((args->index == ichdr->count - 1) ||
+               (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
+        /*
+         * For "remote" attribute values, simply note that we need to
+         * allocate space for the "remote" value.  We can't actually
+         * allocate the extents in this transaction, and we can't decide
+         * which blocks they should be as we might allocate more blocks
+         * as part of this transaction (a split operation for example).
+         */
+        if (entry->flags & XFS_ATTR_LOCAL) {
+                name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+                name_loc->namelen = args->namelen;
+                name_loc->valuelen = cpu_to_be16(args->valuelen);
+                memcpy((char *)name_loc->nameval, args->name, args->namelen);
+                memcpy((char *)&name_loc->nameval[args->namelen], args->value,
+                                   be16_to_cpu(name_loc->valuelen));
+        } else {
+                name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+                name_rmt->namelen = args->namelen;
+                memcpy((char *)name_rmt->name, args->name, args->namelen);
+                entry->flags |= XFS_ATTR_INCOMPLETE;
+                /* just in case */
+                name_rmt->valuelen = 0;
+                name_rmt->valueblk = 0;
+                args->rmtblkno = 1;
+                args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+                args->rmtvaluelen = args->valuelen;
+        }
+        xfs_trans_log_buf(args->trans, bp,
+             XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
+                                   xfs_attr_leaf_entsize(leaf, args->index)));
+        /*
+         * Update the control info for this leaf node
+         *
+         * "tmp" below is the offset at which the entry table ended
+         * before this insert; any free region starting exactly there has
+         * just lost one entry-table slot to the new entry.
+         */
+        if (be16_to_cpu(entry->nameidx) < ichdr->firstused)
+                ichdr->firstused = be16_to_cpu(entry->nameidx);
+        ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t)
+                                        + xfs_attr3_leaf_hdr_size(leaf));
+        tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t)
+                                        + xfs_attr3_leaf_hdr_size(leaf);
+        for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+                if (ichdr->freemap[i].base == tmp) {
+                        ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
+                        ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t);
+                }
+        }
+        ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
+        return 0;
+}
+/*
+ * Garbage collect a leaf attribute list block by copying it to a new buffer.
+ *
+ * The block in @bp is copied to a temporary buffer, the real buffer is
+ * zeroed and its on-disk header bytes restored, and all entries are then
+ * moved back from the copy with xfs_attr3_leaf_moveents().
+ *
+ * @ichdr_dst is the caller's in-core header for the block.  It is reset
+ * here to describe an empty leaf (count 0, no holes, a single free region
+ * in freemap[0]) before the entries are moved back, and is passed to
+ * xfs_attr3_leaf_moveents() as the destination header.
+ *
+ * The whole buffer is logged in args->trans.
+ */
+STATIC void
+xfs_attr3_leaf_compact(
+        struct xfs_da_args      *args,
+        struct xfs_attr3_icleaf_hdr *ichdr_dst,
+        struct xfs_buf          *bp)
+{
+        struct xfs_attr_leafblock *leaf_src;
+        struct xfs_attr_leafblock *leaf_dst;
+        struct xfs_attr3_icleaf_hdr ichdr_src;
+        struct xfs_trans        *trans = args->trans;
+        char                    *tmpbuffer;
+        trace_xfs_attr_leaf_compact(args);
+        tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+        memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+        memset(bp->b_addr, 0, args->geo->blksize);
+        leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;   /* saved copy */
+        leaf_dst = bp->b_addr;                          /* rebuilt in place */
+        /*
+         * Copy the on-disk header back into the destination buffer to ensure
+         * all the information in the header that is not part of the incore
+         * header structure is preserved.
+         */
+        memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
+        /* Initialise the incore headers */
+        ichdr_src = *ichdr_dst; /* struct copy */
+        ichdr_dst->firstused = args->geo->blksize;
+        ichdr_dst->usedbytes = 0;
+        ichdr_dst->count = 0;
+        ichdr_dst->holes = 0;
+        ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
+        ichdr_dst->freemap[0].size = ichdr_dst->firstused -
+                                                ichdr_dst->freemap[0].base;
+        /* write the header back to initialise the underlying buffer */
+        xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
+        /*
+         * Copy all entry's in the same (sorted) order,
+         * but allocate name/value pairs packed and in sequence.
+         */
+        xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0,
+                                leaf_dst, ichdr_dst, 0, ichdr_src.count);
+        /*
+         * this logs the entire buffer, but the caller must write the header
+         * back to the buffer when it is finished modifying it.
+         */
+        xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
+        kmem_free(tmpbuffer);
+}
+/*
+ * Decide the relative order of two leaf blocks from their hash values.
+ *
+ * Returns 1 when both leaves hold entries and either the first or the last
+ * hashval of leaf2 is lower than the corresponding hashval of leaf1, i.e.
+ * leaf2 should go before leaf1.  Returns 0 in every other case, including
+ * when either leaf is empty.
+ */
+static int
+xfs_attr3_leaf_order(
+        struct xfs_buf  *leaf1_bp,
+        struct xfs_attr3_icleaf_hdr *leaf1hdr,
+        struct xfs_buf  *leaf2_bp,
+        struct xfs_attr3_icleaf_hdr *leaf2hdr)
+{
+        struct xfs_attr_leaf_entry *ents1;
+        struct xfs_attr_leaf_entry *ents2;
+        ents1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr);
+        ents2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr);
+        /* nothing to compare unless both blocks have entries */
+        if (!(leaf1hdr->count > 0 && leaf2hdr->count > 0))
+                return 0;
+        /* compare the lowest hashvals */
+        if (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval))
+                return 1;
+        /* compare the highest hashvals */
+        if (be32_to_cpu(ents2[leaf2hdr->count - 1].hashval) <
+            be32_to_cpu(ents1[leaf1hdr->count - 1].hashval))
+                return 1;
+        return 0;
+}
+/*
+ * Buffer-only front end to xfs_attr3_leaf_order(): decode the on-disk
+ * header of each leaf buffer into an in-core header and return the result
+ * of comparing the two blocks.
+ */
+int
+xfs_attr_leaf_order(
+        struct xfs_buf  *leaf1_bp,
+        struct xfs_buf  *leaf2_bp)
+{
+        struct xfs_attr3_icleaf_hdr hdr_first;
+        struct xfs_attr3_icleaf_hdr hdr_second;
+        int                     reversed;
+        xfs_attr3_leaf_hdr_from_disk(&hdr_first, leaf1_bp->b_addr);
+        xfs_attr3_leaf_hdr_from_disk(&hdr_second, leaf2_bp->b_addr);
+        reversed = xfs_attr3_leaf_order(leaf1_bp, &hdr_first,
+                                        leaf2_bp, &hdr_second);
+        return reversed;
+}
+/*
+ * Redistribute the attribute list entries between two leaf nodes,
+ * taking into account the size of the new entry.
+ *
+ * NOTE: if new block is empty, then it will get the upper half of the
+ * old block.  At present, all (one) callers pass in an empty second block.
+ *
+ * This code adjusts the args->index/blkno and args->index2/blkno2 fields
+ * to match what it is doing in splitting the attribute leaf block.  Those
+ * values are used in "atomic rename" operations on attributes.  Note that
+ * the "new" and "old" values can end up in different blocks.
+ */
+STATIC void
+xfs_attr3_leaf_rebalance(
+        struct xfs_da_state     *state,
+        struct xfs_da_state_blk *blk1,
+        struct xfs_da_state_blk *blk2)
+{
+        struct xfs_da_args      *args;
+        struct xfs_attr_leafblock *leaf1;
+        struct xfs_attr_leafblock *leaf2;
+        struct xfs_attr3_icleaf_hdr ichdr1;
+        struct xfs_attr3_icleaf_hdr ichdr2;
+        struct xfs_attr_leaf_entry *entries1;
+        struct xfs_attr_leaf_entry *entries2;
+        int                     count;
+        int                     totallen;
+        int                     max;
+        int                     space;
+        int                     swap;
+        /*
+         * Set up environment.
+         */
+        ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
+        ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
+        leaf1 = blk1->bp->b_addr;
+        leaf2 = blk2->bp->b_addr;
+        xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+        xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+        ASSERT(ichdr2.count == 0);
+        args = state->args;
+        trace_xfs_attr_leaf_rebalance(args);
+        /*
+         * Check ordering of blocks, reverse if it makes things simpler.
+         *
+         * NOTE: Given that all (current) callers pass in an empty
+         * second block, this code should never set "swap".
+         */
+        swap = 0;
+        if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) {
+                struct xfs_da_state_blk *tmp_blk;
+                struct xfs_attr3_icleaf_hdr tmp_ichdr;
+                tmp_blk = blk1;
+                blk1 = blk2;
+                blk2 = tmp_blk;
+                /* struct copies to swap them rather than reconverting */
+                tmp_ichdr = ichdr1;
+                ichdr1 = ichdr2;
+                ichdr2 = tmp_ichdr;
+                leaf1 = blk1->bp->b_addr;
+                leaf2 = blk2->bp->b_addr;
+                swap = 1;
+        }
+        /*
+         * Examine entries until we reduce the absolute difference in
+         * byte usage between the two blocks to a minimum.  Then get
+         * the direction to copy and the number of elements to move.
+         *
+         * "inleaf" is true if the new entry should be inserted into blk1.
+         * If "swap" is also true, then reverse the sense of "inleaf".
+         */
+        state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1,
+                                                      blk2, &ichdr2,
+                                                      &count, &totallen);
+        if (swap)
+                state->inleaf = !state->inleaf;
+        /*
+         * Move any entries required from leaf to leaf:
+         */
+        if (count < ichdr1.count) {
+                /*
+                 * Figure the total bytes to be added to the destination leaf.
+                 */
+                /* number entries being moved */
+                count = ichdr1.count - count;
+                space  = ichdr1.usedbytes - totallen;
+                space += count * sizeof(xfs_attr_leaf_entry_t);
+                /*
+                 * leaf2 is the destination, compact it if it looks tight.
+                 */
+                max  = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+                max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t);
+                if (space > max)
+                        xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp);
+                /*
+                 * Move high entries from leaf1 to low end of leaf2.
+                 */
+                xfs_attr3_leaf_moveents(args, leaf1, &ichdr1,
+                                ichdr1.count - count, leaf2, &ichdr2, 0, count);
+        } else if (count > ichdr1.count) {
+                /*
+                 * I assert that since all callers pass in an empty
+                 * second buffer, this code should never execute.
+                 */
+                ASSERT(0);
+                /*
+                 * Figure the total bytes to be added to the destination leaf.
+                 */
+                /* number entries being moved */
+                count -= ichdr1.count;
+                space  = totallen - ichdr1.usedbytes;
+                space += count * sizeof(xfs_attr_leaf_entry_t);
+                /*
+                 * leaf1 is the destination, compact it if it looks tight.
+                 */
+                max  = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+                max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t);
+                if (space > max)
+                        xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp);
+                /*
+                 * Move low entries from leaf2 to high end of leaf1.
+                 */
+                xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1,
+                                        ichdr1.count, count);
+        }
+        xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
+        xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
+        xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
+        xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
+        /*
+         * Copy out last hashval in each block for B-tree code.
+         */
+        entries1 = xfs_attr3_leaf_entryp(leaf1);
+        entries2 = xfs_attr3_leaf_entryp(leaf2);
+        blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval);
+        blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval);
+        /*
+         * Adjust the expected index for insertion.
+         * NOTE: this code depends on the (current) situation that the
+         * second block was originally empty.
+         *
+         * If the insertion point moved to the 2nd block, we must adjust
+         * the index.  We must also track the entry just following the
+         * new entry for use in an "atomic rename" operation, that entry
+         * is always the "old" entry and the "new" entry is what we are
+         * inserting.  The index/blkno fields refer to the "old" entry,
+         * while the index2/blkno2 fields refer to the "new" entry.
+         */
+        if (blk1->index > ichdr1.count) {
+                ASSERT(state->inleaf == 0);
+                blk2->index = blk1->index - ichdr1.count;
+                args->index = args->index2 = blk2->index;
+                args->blkno = args->blkno2 = blk2->blkno;
+        } else if (blk1->index == ichdr1.count) {
+                if (state->inleaf) {
+                        args->index = blk1->index;
+                        args->blkno = blk1->blkno;
+                        args->index2 = 0;
+                        args->blkno2 = blk2->blkno;
+                } else {
+                        /*
+                         * On a double leaf split, the original attr location
+                         * is already stored in blkno2/index2, so don't
+                         * overwrite it, otherwise we corrupt the tree.
+                         */
+                        blk2->index = blk1->index - ichdr1.count;
+                        args->index = blk2->index;
+                        args->blkno = blk2->blkno;
+                        if (!state->extravalid) {
+                                /*
+                                 * set the new attr location to match the old
+                                 * one and let the higher level split code
+                                 * decide where in the leaf to place it.
+                                 */
+                                args->index2 = blk2->index;
+                                args->blkno2 = blk2->blkno;
+                        }
+                }
+        } else {
+                ASSERT(state->inleaf == 1);
+                args->index = args->index2 = blk1->index;
+                args->blkno = args->blkno2 = blk1->blkno;
+        }
+}
+/*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ *
+ * The entries of blk1 followed by those of blk2 are walked as a single
+ * sequence.  Each step adds one entry table slot plus that entry's
+ * xfs_attr_leaf_entsize() to a running total, and the walk stops at the
+ * first entry that would leave the total further from "half" than the
+ * previous step did.  The entry about to be inserted is slotted into the
+ * walk at position blk1->index.
+ *
+ * Outputs:
+ *   *countarg     - number of existing entries counted before stopping
+ *   *usedbytesarg - sum of xfs_attr_leaf_entsize() over those entries
+ *                   (table slots and the new entry are subtracted out)
+ *   return value  - non-zero if the new entry was counted before stopping
+ *
+ * GROT: Is this really necessary?  With other than a 512 byte blocksize,
+ * GROT: there will always be enough room in either block for a new entry.
+ * GROT: Do a double-split for this case?
+ */
+STATIC int
+xfs_attr3_leaf_figure_balance(
+        struct xfs_da_state             *state,
+        struct xfs_da_state_blk         *blk1,
+        struct xfs_attr3_icleaf_hdr     *ichdr1,
+        struct xfs_da_state_blk         *blk2,
+        struct xfs_attr3_icleaf_hdr     *ichdr2,
+        int                             *countarg,
+        int                             *usedbytesarg)
+{
+        struct xfs_attr_leafblock       *leaf1 = blk1->bp->b_addr;
+        struct xfs_attr_leafblock       *leaf2 = blk2->bp->b_addr;
+        struct xfs_attr_leaf_entry      *entry;
+        int                             count;          /* existing entries counted so far */
+        int                             max;            /* total entries in both blocks */
+        int                             index;          /* index within the leaf being walked */
+        int                             totallen = 0;   /* running byte total */
+        int                             half;           /* target byte total */
+        int                             lastdelta;      /* |half - totallen| after last step */
+        int                             foundit = 0;    /* new entry has been counted */
+        int                             tmp;
+        /*
+         * Examine entries until we reduce the absolute difference in
+         * byte usage between the two blocks to a minimum.
+         *
+         * "half" is half of everything that has to be stored: one table
+         * slot per existing entry plus one for the new entry, the used
+         * bytes of both blocks, and the size of the new entry.
+         * lastdelta starts at the block size so the first step is
+         * accepted unless it is further than that from "half".
+         */
+        max = ichdr1->count + ichdr2->count;
+        half = (max + 1) * sizeof(*entry);
+        half += ichdr1->usedbytes + ichdr2->usedbytes +
+                        xfs_attr_leaf_newentsize(state->args, NULL);
+        half /= 2;
+        lastdelta = state->args->geo->blksize;
+        entry = xfs_attr3_leaf_entryp(leaf1);
+        for (count = index = 0; count < max; entry++, index++, count++) {
+#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A))
+                /*
+                 * The new entry is in the first block, account for it.
+                 * If adding it would move the total away from "half" we
+                 * stop here with foundit still clear.
+                 */
+                if (count == blk1->index) {
+                        tmp = totallen + sizeof(*entry) +
+                                xfs_attr_leaf_newentsize(state->args, NULL);
+                        if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+                                break;
+                        lastdelta = XFS_ATTR_ABS(half - tmp);
+                        totallen = tmp;
+                        foundit = 1;
+                }
+                /*
+                 * Wrap around into the second block if necessary.
+                 * From here on the local leaf1 refers to blk2's leaf and
+                 * index restarts from zero.
+                 */
+                if (count == ichdr1->count) {
+                        leaf1 = leaf2;
+                        entry = xfs_attr3_leaf_entryp(leaf1);
+                        index = 0;
+                }
+                /*
+                 * Figure out if next leaf entry would be too much.
+                 */
+                tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
+                                                                        index);
+                if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+                        break;
+                lastdelta = XFS_ATTR_ABS(half - tmp);
+                totallen = tmp;
+#undef XFS_ATTR_ABS
+        }
+        /*
+         * Calculate the number of usedbytes that will end up in lower block.
+         * If new entry not in lower block, fix up the count.
+         *
+         * The table slots of the counted entries are removed from the
+         * total first; when the new entry was counted, its slot and size
+         * are removed as well.
+         */
+        totallen -= count * sizeof(*entry);
+        if (foundit) {
+                totallen -= sizeof(*entry) +
+                                xfs_attr_leaf_newentsize(state->args, NULL);
+        }
+        *countarg = count;
+        *usedbytesarg = totallen;
+        return foundit;
+}
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ *
+ * The block examined is the last active block in state->path.  The
+ * decision is passed back through *action:
+ *   0 - the block is over 50% full, no sibling can absorb it, or
+ *       xfs_da3_path_shift() reported a non-zero result; nothing to do.
+ *   1 - the block can be collapsed into a sibling; state->path and
+ *       state->altpath have been set up.
+ *   2 - the block is empty; state->path and state->altpath have been
+ *       set up.
+ *
+ * The function's own return value is 0, or the error returned by
+ * xfs_attr3_leaf_read() or xfs_da3_path_shift().
+ *
+ * GROT: allow for INCOMPLETE entries in calculation.
+ */
+int
+xfs_attr3_leaf_toosmall(
+        struct xfs_da_state     *state,
+        int                     *action)
+{
+        struct xfs_attr_leafblock *leaf;
+        struct xfs_da_state_blk *blk;
+        struct xfs_attr3_icleaf_hdr ichdr;
+        struct xfs_buf          *bp;
+        xfs_dablk_t             blkno;
+        int                     bytes;
+        int                     forward;
+        int                     error;
+        int                     retval;
+        int                     i;
+        trace_xfs_attr_leaf_toosmall(state->args);
+        /*
+         * Check for the degenerate case of the block being over 50% full.
+         * If so, it's not worth even looking to see if we might be able
+         * to coalesce with a sibling.
+         *
+         * "bytes" is the header size plus the entry table plus usedbytes.
+         */
+        blk = &state->path.blk[ state->path.active-1 ];
+        leaf = blk->bp->b_addr;
+        xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+        bytes = xfs_attr3_leaf_hdr_size(leaf) +
+                ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
+                ichdr.usedbytes;
+        if (bytes > (state->args->geo->blksize >> 1)) {
+                *action = 0;    /* blk over 50%, don't try to join */
+                return 0;
+        }
+        /*
+         * Check for the degenerate case of the block being empty.
+         * If the block is empty, we'll simply delete it, no need to
+         * coalesce it with a sibling block.  We choose (arbitrarily)
+         * to merge with the forward block unless it is NULL.
+         */
+        if (ichdr.count == 0) {
+                /*
+                 * Make altpath point to the block we want to keep and
+                 * path point to the block we want to drop (this one).
+                 * A non-zero retval from the shift turns this into
+                 * "do nothing" (*action = 0).
+                 */
+                forward = (ichdr.forw != 0);
+                memcpy(&state->altpath, &state->path, sizeof(state->path));
+                error = xfs_da3_path_shift(state, &state->altpath, forward,
+                                                 0, &retval);
+                if (error)
+                        return error;
+                if (retval) {
+                        *action = 0;
+                } else {
+                        *action = 2;
+                }
+                return 0;
+        }
+        /*
+         * Examine each sibling block to see if we can coalesce with
+         * at least 25% free space to spare.  We need to figure out
+         * whether to merge with the forward or the backward block.
+         * We prefer coalescing with the lower numbered sibling so as
+         * to shrink an attribute list over time.
+         *
+         * For each sibling, "bytes" is the block size less a quarter of
+         * the block size, less the used bytes of both blocks, less a
+         * combined entry table, less one header.  A non-negative result
+         * means the merged contents fit.  A sibling pointer of zero is
+         * skipped.  The sibling buffer is only read to sample its header
+         * and is released again before the result is tested.
+         */
+        /* start with smaller blk num */
+        forward = ichdr.forw < ichdr.back;
+        for (i = 0; i < 2; forward = !forward, i++) {
+                struct xfs_attr3_icleaf_hdr ichdr2;
+                if (forward)
+                        blkno = ichdr.forw;
+                else
+                        blkno = ichdr.back;
+                if (blkno == 0)
+                        continue;
+                error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
+                                        blkno, -1, &bp);
+                if (error)
+                        return error;
+                xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
+                bytes = state->args->geo->blksize -
+                        (state->args->geo->blksize >> 2) -
+                        ichdr.usedbytes - ichdr2.usedbytes -
+                        ((ichdr.count + ichdr2.count) *
+                                        sizeof(xfs_attr_leaf_entry_t)) -
+                        xfs_attr3_leaf_hdr_size(leaf);
+                xfs_trans_brelse(state->args->trans, bp);
+                if (bytes >= 0)
+                        break;  /* fits with at least 25% to spare */
+        }
+        if (i >= 2) {   /* loop ran out: neither sibling was usable */
+                *action = 0;
+                return 0;
+        }
+        /*
+         * Make altpath point to the block we want to keep (the lower
+         * numbered block) and path point to the block we want to drop.
+         *
+         * Both paths start out identical.  If the chosen sibling has the
+         * lower block number, altpath is shifted onto it; otherwise path
+         * is shifted onto the sibling and altpath stays on this block.
+         */
+        memcpy(&state->altpath, &state->path, sizeof(state->path));
+        if (blkno < blk->blkno) {
+                error = xfs_da3_path_shift(state, &state->altpath, forward,
+                                                 0, &retval);
+        } else {
+                error = xfs_da3_path_shift(state, &state->path, forward,
+                                                 0, &retval);
+        }
+        if (error)
+                return error;
+        if (retval) {
+                *action = 0;
+        } else {
+                *action = 1;
+        }
+        return 0;
+}
+/*
+ * Remove a name from the leaf attribute list structure.
+ *
+ * The entry removed is the one at args->index in the leaf held by bp.
+ * Its bytes in the name area are zeroed, the entry table is closed up,
+ * the free map and header are updated, and the modified ranges are
+ * logged to args->trans.
+ *
+ * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
+ * If two leaves are 37% full, when combined they will leave 25% free.
+ */
+int
+xfs_attr3_leaf_remove(
+        struct xfs_buf          *bp,
+        struct xfs_da_args      *args)
+{
+        struct xfs_attr_leafblock *leaf;
+        struct xfs_attr3_icleaf_hdr ichdr;
+        struct xfs_attr_leaf_entry *entry;
+        int                     before;
+        int                     after;
+        int                     smallest;
+        int                     entsize;
+        int                     tablesize;
+        int                     tmp;
+        int                     i;
+        trace_xfs_attr_leaf_remove(args);
+        leaf = bp->b_addr;
+        xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+        ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
+        ASSERT(args->index >= 0 && args->index < ichdr.count);
+        ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
+                                        xfs_attr3_leaf_hdr_size(leaf));
+        entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+        ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+        ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+        /*
+         * Scan through free region table:
+         *    check for adjacency of free'd entry with an existing one,
+         *    find smallest free region in case we need to replace it,
+         *    adjust any map that borders the entry table,
+         *
+         * "before" is a map region that ends exactly where the entry's
+         * bytes start, "after" is one that starts exactly where they end.
+         * tmp tracks the smallest region size seen so far, seeded from
+         * freemap[0].  A region whose base equals the end of the entry
+         * table is grown downwards by one table slot, since the table is
+         * about to shrink by one entry.
+         */
+        tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
+                                        + xfs_attr3_leaf_hdr_size(leaf);
+        tmp = ichdr.freemap[0].size;
+        before = after = -1;
+        smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
+        entsize = xfs_attr_leaf_entsize(leaf, args->index);
+        for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+                ASSERT(ichdr.freemap[i].base < args->geo->blksize);
+                ASSERT(ichdr.freemap[i].size < args->geo->blksize);
+                if (ichdr.freemap[i].base == tablesize) {
+                        ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
+                        ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
+                }
+                if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
+                                be16_to_cpu(entry->nameidx)) {
+                        before = i;
+                } else if (ichdr.freemap[i].base ==
+                                (be16_to_cpu(entry->nameidx) + entsize)) {
+                        after = i;
+                } else if (ichdr.freemap[i].size < tmp) {
+                        tmp = ichdr.freemap[i].size;
+                        smallest = i;
+                }
+        }
+        /*
+         * Coalesce adjacent freemap regions,
+         * or replace the smallest region.
+         *
+         * With both neighbours present, "before" absorbs the freed bytes
+         * and the whole of "after", and "after" is cleared.  With only
+         * one neighbour, that region is extended over the freed bytes.
+         */
+        if ((before >= 0) || (after >= 0)) {
+                if ((before >= 0) && (after >= 0)) {
+                        ichdr.freemap[before].size += entsize;
+                        ichdr.freemap[before].size += ichdr.freemap[after].size;
+                        ichdr.freemap[after].base = 0;
+                        ichdr.freemap[after].size = 0;
+                } else if (before >= 0) {
+                        ichdr.freemap[before].size += entsize;
+                } else {
+                        ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
+                        ichdr.freemap[after].size += entsize;
+                }
+        } else {
+                /*
+                 * Replace smallest region (if it is smaller than free'd entry)
+                 */
+                if (ichdr.freemap[smallest].size < entsize) {
+                        ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
+                        ichdr.freemap[smallest].size = entsize;
+                }
+        }
+        /*
+         * Did we remove the first entry?
+         * From here on "smallest" is reused as a flag: non-zero when the
+         * removed entry's bytes started at ichdr.firstused.
+         */
+        if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
+                smallest = 1;
+        else
+                smallest = 0;
+        /*
+         * Compress the remaining entries and zero out the removed stuff.
+         *
+         * The memmove length covers (count - index) table slots starting
+         * at the slot after the removed one, which is one slot more than
+         * the live entries that follow it; the last slot of the old
+         * table is zeroed explicitly afterwards.
+         */
+        memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
+        ichdr.usedbytes -= entsize;
+        xfs_trans_log_buf(args->trans, bp,
+             XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
+                                   entsize));
+        tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
+        memmove(entry, entry + 1, tmp);
+        ichdr.count--;
+        xfs_trans_log_buf(args->trans, bp,
+            XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));
+        entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
+        memset(entry, 0, sizeof(xfs_attr_leaf_entry_t));
+        /*
+         * If we removed the first entry, re-find the first used byte
+         * in the name area.  Note that if the entry was the "firstused",
+         * then we don't have a "hole" in our block resulting from
+         * removing the name.
+         *
+         * The search takes the lowest nameidx of the remaining entries,
+         * starting from the block size (so an empty leaf yields the
+         * block size).  If the stored firstused then reads back as zero,
+         * tmp - XFS_ATTR_LEAF_NAME_ALIGN is stored instead.
+         * NOTE(review): presumably this covers ichdr.firstused being too
+         * narrow to hold the block size for the largest block sizes --
+         * confirm against the header definition.
+         */
+        if (smallest) {
+                tmp = args->geo->blksize;
+                entry = xfs_attr3_leaf_entryp(leaf);
+                for (i = ichdr.count - 1; i >= 0; entry++, i--) {
+                        ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+                        ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+                        if (be16_to_cpu(entry->nameidx) < tmp)
+                                tmp = be16_to_cpu(entry->nameidx);
+                }
+                ichdr.firstused = tmp;
+                if (!ichdr.firstused)
+                        ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
+        } else {
+                ichdr.holes = 1;        /* mark as needing compaction */
+        }
+        xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+        xfs_trans_log_buf(args->trans, bp,
+                          XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+                                          xfs_attr3_leaf_hdr_size(leaf)));
+        /*
+         * Report whether the leaf's usage (used bytes plus header plus
+         * entry table) is now below args->geo->magicpct; the caller may
+         * want to "join" the leaf with a sibling if so.
+         */
+        tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
+              ichdr.count * sizeof(xfs_attr_leaf_entry_t);
+        return tmp < args->geo->magicpct; /* leaf is < 37% full */
+}
+/*
+ * Move all the attribute list entries from drop_leaf into save_leaf.
+ *
+ * drop_blk->hashval is set to the last hashval of the dropped leaf
+ * before anything moves, and save_blk->hashval to the last hashval of
+ * the merged leaf afterwards.  The result of xfs_attr3_leaf_order()
+ * selects whether the dropped entries go in front of (non-zero) or
+ * behind (zero) the entries already in the save leaf.  The whole save
+ * buffer is logged to state->args->trans.
+ */
+void
+xfs_attr3_leaf_unbalance(
+        struct xfs_da_state     *state,
+        struct xfs_da_state_blk *drop_blk,
+        struct xfs_da_state_blk *save_blk)
+{
+        struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
+        struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
+        struct xfs_attr3_icleaf_hdr drophdr;
+        struct xfs_attr3_icleaf_hdr savehdr;
+        struct xfs_attr_leaf_entry *entry;
+        trace_xfs_attr_leaf_unbalance(state->args);
+        drop_leaf = drop_blk->bp->b_addr;       /* same value as the initializer */
+        save_leaf = save_blk->bp->b_addr;       /* same value as the initializer */
+        xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf);
+        xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf);
+        entry = xfs_attr3_leaf_entryp(drop_leaf);
+        /*
+         * Save last hashval from dying block for later Btree fixup.
+         */
+        drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);
+        /*
+         * Check if we need a temp buffer, or can we do it in place.
+         * Note that we don't check "leaf" for holes because we will
+         * always be dropping it, toosmall() decided that for us already.
+         */
+        if (savehdr.holes == 0) {
+                /*
+                 * dest leaf has no holes, so we add there.  May need
+                 * to make some room in the entry array.
+                 *
+                 * The dropped entries are inserted either at index 0 or
+                 * at savehdr.count (the end) of the save leaf.
+                 */
+                if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+                                         drop_blk->bp, &drophdr)) {
+                        xfs_attr3_leaf_moveents(state->args,
+                                                drop_leaf, &drophdr, 0,
+                                                save_leaf, &savehdr, 0,
+                                                drophdr.count);
+                } else {
+                        xfs_attr3_leaf_moveents(state->args,
+                                                drop_leaf, &drophdr, 0,
+                                                save_leaf, &savehdr,
+                                                savehdr.count, drophdr.count);
+                }
+        } else {
+                /*
+                 * Destination has holes, so we make a temporary copy
+                 * of the leaf and add them both to that.
+                 *
+                 * The temporary leaf is a zeroed block-sized buffer that
+                 * is copied over save_leaf and freed once both sets of
+                 * entries have been moved into it.
+                 */
+                struct xfs_attr_leafblock *tmp_leaf;
+                struct xfs_attr3_icleaf_hdr tmphdr;
+                tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
+                /*
+                 * Copy the header into the temp leaf so that all the stuff
+                 * not in the incore header is present and gets copied back in
+                 * once we've moved all the entries.
+                 *
+                 * The incore temp header starts zeroed (no entries, no
+                 * used bytes) with only magic, forw and back taken from
+                 * the save leaf, and firstused set to the block size.
+                 */
+                memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
+                memset(&tmphdr, 0, sizeof(tmphdr));
+                tmphdr.magic = savehdr.magic;
+                tmphdr.forw = savehdr.forw;
+                tmphdr.back = savehdr.back;
+                tmphdr.firstused = state->args->geo->blksize;
+                /* write the header to the temp buffer to initialise it */
+                xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
+                if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+                                         drop_blk->bp, &drophdr)) {
+                        xfs_attr3_leaf_moveents(state->args,
+                                                drop_leaf, &drophdr, 0,
+                                                tmp_leaf, &tmphdr, 0,
+                                                drophdr.count);
+                        xfs_attr3_leaf_moveents(state->args,
+                                                save_leaf, &savehdr, 0,
+                                                tmp_leaf, &tmphdr, tmphdr.count,
+                                                savehdr.count);
+                } else {
+                        xfs_attr3_leaf_moveents(state->args,
+                                                save_leaf, &savehdr, 0,
+                                                tmp_leaf, &tmphdr, 0,
+                                                savehdr.count);
+                        xfs_attr3_leaf_moveents(state->args,
+                                                drop_leaf, &drophdr, 0,
+                                                tmp_leaf, &tmphdr, tmphdr.count,
+                                                drophdr.count);
+                }
+                memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
+                savehdr = tmphdr; /* struct copy */
+                kmem_free(tmp_leaf);
+        }
+        xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
+        xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
+                                           state->args->geo->blksize - 1);
+        /*
+         * Copy out last hashval in each block for B-tree code.
+         */
+        entry = xfs_attr3_leaf_entryp(save_leaf);
+        save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
+}
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+/*
+ * Look up a name in a leaf attribute list structure.
+ * This is the internal routine, it uses the caller's buffer.
+ *
+ * Note that duplicate keys are allowed, but only check within the
+ * current leaf node.  The Btree code must check in adjacent leaf nodes.
+ *
+ * Return in args->index the index into the entry[] array of either
+ * the found entry, or where the entry should have been (insert before
+ * that entry).
+ *
+ * Don't change the args->value unless we find the attribute.
+ *
+ * Returns -EEXIST when a matching entry is found and -ENOATTR when it
+ * is not.  A match requires the same hashval, the same INCOMPLETE
+ * flag state as args->flags, the same name length and bytes, and
+ * xfs_attr_namesp_match() to accept the entry's flags.  For a matching
+ * remote entry args->rmtvaluelen, args->rmtblkno and args->rmtblkcnt
+ * are filled in as well.
+ */
+int
+xfs_attr3_leaf_lookup_int(
+        struct xfs_buf          *bp,
+        struct xfs_da_args      *args)
+{
+        struct xfs_attr_leafblock *leaf;
+        struct xfs_attr3_icleaf_hdr ichdr;
+        struct xfs_attr_leaf_entry *entry;
+        struct xfs_attr_leaf_entry *entries;
+        struct xfs_attr_leaf_name_local *name_loc;
+        struct xfs_attr_leaf_name_remote *name_rmt;
+        xfs_dahash_t            hashval;
+        int                     probe;
+        int                     span;
+        trace_xfs_attr_leaf_lookup(args);
+        leaf = bp->b_addr;
+        xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+        entries = xfs_attr3_leaf_entryp(leaf);
+        ASSERT(ichdr.count < args->geo->blksize / 8);
+        /*
+         * Binary search.  (note: small blocks will skip this loop)
+         *
+         * probe starts in the middle of the table and is moved by a span
+         * that halves each pass.  The loop ends on an exact hashval match
+         * or once span has dropped to 4 or less; the linear scans below
+         * finish the job.  entry and probe are kept in step throughout.
+         */
+        hashval = args->hashval;
+        probe = span = ichdr.count / 2;
+        for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
+                span /= 2;
+                if (be32_to_cpu(entry->hashval) < hashval)
+                        probe += span;
+                else if (be32_to_cpu(entry->hashval) > hashval)
+                        probe -= span;
+                else
+                        break;
+        }
+        ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count));
+        ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval);
+        /*
+         * Since we may have duplicate hashval's, find the first matching
+         * hashval in the leaf.
+         *
+         * First back up while the current hashval is not below the
+         * target, then step forward while it is below the target.  If
+         * that runs off the end of the table or lands on a different
+         * hashval, the name is not here and probe is the insert point.
+         */
+        while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
+                entry--;
+                probe--;
+        }
+        while (probe < ichdr.count &&
+               be32_to_cpu(entry->hashval) < hashval) {
+                entry++;
+                probe++;
+        }
+        if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
+                args->index = probe;
+                return -ENOATTR;
+        }
+        /*
+         * Duplicate keys may be present, so search all of them for a match.
+         */
+        for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
+                        entry++, probe++) {
+/*
+ * GROT: Add code to remove incomplete entries.
+ */
+                /*
+                 * If we are looking for INCOMPLETE entries, show only those.
+                 * If we are looking for complete entries, show only those.
+                 */
+                if ((args->flags & XFS_ATTR_INCOMPLETE) !=
+                    (entry->flags & XFS_ATTR_INCOMPLETE)) {
+                        continue;
+                }
+                if (entry->flags & XFS_ATTR_LOCAL) {
+                        name_loc = xfs_attr3_leaf_name_local(leaf, probe);
+                        if (name_loc->namelen != args->namelen)
+                                continue;
+                        if (memcmp(args->name, name_loc->nameval,
+                                                        args->namelen) != 0)
+                                continue;
+                        if (!xfs_attr_namesp_match(args->flags, entry->flags))
+                                continue;
+                        args->index = probe;
+                        return -EEXIST;
+                } else {
+                        name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
+                        if (name_rmt->namelen != args->namelen)
+                                continue;
+                        if (memcmp(args->name, name_rmt->name,
+                                                        args->namelen) != 0)
+                                continue;
+                        if (!xfs_attr_namesp_match(args->flags, entry->flags))
+                                continue;
+                        args->index = probe;
+                        args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+                        args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+                        args->rmtblkcnt = xfs_attr3_rmt_blocks(
+                                                        args->dp->i_mount,
+                                                        args->rmtvaluelen);
+                        return -EEXIST;
+                }
+        }
+        args->index = probe;    /* first entry past the run of equal hashvals */
+        return -ENOATTR;
+}
+/*
+ * Fetch the value of the attribute at args->index from the leaf in bp.
+ *
+ * A local entry has its value copied into args->value.  A remote entry
+ * only has its location recorded in args->rmtvaluelen, args->rmtblkno
+ * and args->rmtblkcnt.  Either way args->valuelen is set to the length
+ * of the attribute's value.  With ATTR_KERNOVAL set in args->flags only
+ * the length is reported and nothing is copied.
+ *
+ * Returns 0, or -ERANGE if ATTR_KERNOVAL is clear and the incoming
+ * args->valuelen is smaller than the value.
+ */
+int
+xfs_attr3_leaf_getvalue(
+        struct xfs_buf          *bp,
+        struct xfs_da_args      *args)
+{
+        struct xfs_attr_leafblock *leaf = bp->b_addr;
+        struct xfs_attr3_icleaf_hdr ichdr;
+        struct xfs_attr_leaf_entry *entry;
+        struct xfs_attr_leaf_name_local *name_loc;
+        struct xfs_attr_leaf_name_remote *name_rmt;
+        int                     valuelen;
+        xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+        ASSERT(ichdr.count < args->geo->blksize / 8);
+        ASSERT(args->index < ichdr.count);
+        entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+        if (!(entry->flags & XFS_ATTR_LOCAL)) {
+                /*
+                 * Remote value: record where it lives, then report its
+                 * length.  The caller's buffer size only matters when
+                 * the value itself has been asked for.
+                 */
+                name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+                ASSERT(name_rmt->namelen == args->namelen);
+                ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
+                args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+                args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+                args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+                                                       args->rmtvaluelen);
+                if (!(args->flags & ATTR_KERNOVAL) &&
+                    args->valuelen < args->rmtvaluelen) {
+                        args->valuelen = args->rmtvaluelen;
+                        return -ERANGE;
+                }
+                args->valuelen = args->rmtvaluelen;
+                return 0;
+        }
+        /*
+         * Local value: it is stored directly after the name bytes.
+         */
+        name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
+        ASSERT(name_loc->namelen == args->namelen);
+        ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
+        valuelen = be16_to_cpu(name_loc->valuelen);
+        if (!(args->flags & ATTR_KERNOVAL) && args->valuelen < valuelen) {
+                args->valuelen = valuelen;
+                return -ERANGE;
+        }
+        args->valuelen = valuelen;
+        if (args->flags & ATTR_KERNOVAL)
+                return 0;
+        memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
+        return 0;
+}
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+/*
+ * Move the indicated entries from one leaf to another.
+ *
+ * Transfers @count entries, beginning at index @start_s of @leaf_s, into
+ * @leaf_d at index @start_d.  Destination entries at or above @start_d are
+ * shifted up to make room, the name/value bytes of each moved entry are
+ * packed immediately below the destination's current firstused, and the
+ * vacated source entries and name/value bytes are zeroed.
+ *
+ * Only the in-core headers (@ichdr_s, @ichdr_d) are updated here; nothing in
+ * this routine converts them back to disk format or logs either buffer.
+ *
+ * NOTE: this routine modifies both source and destination leaves.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_attr3_leaf_moveents(
+        struct xfs_da_args              *args,
+        struct xfs_attr_leafblock       *leaf_s,
+        struct xfs_attr3_icleaf_hdr     *ichdr_s,
+        int                             start_s,
+        struct xfs_attr_leafblock       *leaf_d,
+        struct xfs_attr3_icleaf_hdr     *ichdr_d,
+        int                             start_d,
+        int                             count)
+{
+        struct xfs_attr_leaf_entry      *entry_s;
+        struct xfs_attr_leaf_entry      *entry_d;
+        int                             desti;
+        int                             tmp;
+        int                             i;
+        /*
+         * Check for nothing to do.
+         */
+        if (count == 0)
+                return;
+        /*
+         * Set up environment.
+         */
+        ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
+               ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
+        ASSERT(ichdr_s->magic == ichdr_d->magic);
+        ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
+        ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
+                                        + xfs_attr3_leaf_hdr_size(leaf_s));
+        ASSERT(ichdr_d->count < args->geo->blksize / 8);
+        ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
+                                        + xfs_attr3_leaf_hdr_size(leaf_d));
+        ASSERT(start_s < ichdr_s->count);
+        ASSERT(start_d <= ichdr_d->count);
+        ASSERT(count <= ichdr_s->count);
+        /*
+         * Move the entries in the destination leaf up to make a hole?
+         */
+        if (start_d < ichdr_d->count) {
+                tmp  = ichdr_d->count - start_d;
+                tmp *= sizeof(xfs_attr_leaf_entry_t);
+                entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+                entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
+                memmove(entry_d, entry_s, tmp);
+        }
+        /*
+         * Copy all entries in the same (sorted) order,
+         * but allocate attribute info packed and in sequence.
+         */
+        entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+        entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+        desti = start_d;
+        for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
+                ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
+                tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
+#ifdef GROT
+                /*
+                 * Code to drop INCOMPLETE entries.  Difficult to use as we
+                 * may also need to change the insertion index.  Code turned
+                 * off for 6.2, should be revisited later.
+                 */
+                if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
+                        memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+                        ichdr_s->usedbytes -= tmp;
+                        ichdr_s->count -= 1;
+                        entry_d--;      /* to compensate for ++ in loop hdr */
+                        desti--;
+                        if ((start_s + i) < offset)
+                                result++;       /* insertion index adjustment */
+                } else {
+#endif /* GROT */
+                        /* carve the name/value space off the low end of used space */
+                        ichdr_d->firstused -= tmp;
+                        /* both on-disk, don't endian flip twice */
+                        entry_d->hashval = entry_s->hashval;
+                        entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
+                        entry_d->flags = entry_s->flags;
+                        ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
+                                                        <= args->geo->blksize);
+                        memmove(xfs_attr3_leaf_name(leaf_d, desti),
+                                xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
+                        ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
+                                                        <= args->geo->blksize);
+                        memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+                        ichdr_s->usedbytes -= tmp;
+                        ichdr_d->usedbytes += tmp;
+                        ichdr_s->count -= 1;
+                        ichdr_d->count += 1;
+                        tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
+                                        + xfs_attr3_leaf_hdr_size(leaf_d);
+                        ASSERT(ichdr_d->firstused >= tmp);
+#ifdef GROT
+                }
+#endif /* GROT */
+        }
+        /*
+         * Zero out the entries we just copied.
+         *
+         * ichdr_s->count was decremented once per moved entry above, so
+         * start_s == ichdr_s->count means the moved entries were the tail
+         * of the source entry table and nothing needs to slide down.
+         */
+        if (start_s == ichdr_s->count) {
+                tmp = count * sizeof(xfs_attr_leaf_entry_t);
+                entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+                ASSERT(((char *)entry_s + tmp) <=
+                       ((char *)leaf_s + args->geo->blksize));
+                memset(entry_s, 0, tmp);
+        } else {
+                /*
+                 * Move the remaining entries down to fill the hole,
+                 * then zero the entries at the top.
+                 *
+                 * ichdr_s->count has already been reduced by count, so the
+                 * entries still sitting above the hole number
+                 * (ichdr_s->count - start_s).
+                 */
+                tmp  = (ichdr_s->count - start_s) * sizeof(xfs_attr_leaf_entry_t);
+                entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
+                entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+                memmove(entry_d, entry_s, tmp);
+                tmp = count * sizeof(xfs_attr_leaf_entry_t);
+                entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
+                ASSERT(((char *)entry_s + tmp) <=
+                       ((char *)leaf_s + args->geo->blksize));
+                memset(entry_s, 0, tmp);
+        }
+        /*
+         * Fill in the freemap information: the destination gets a single
+         * free region between the end of its entry table and firstused.
+         */
+        ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
+        ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
+        ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
+        ichdr_d->freemap[1].base = 0;
+        ichdr_d->freemap[2].base = 0;
+        ichdr_d->freemap[1].size = 0;
+        ichdr_d->freemap[2].size = 0;
+        ichdr_s->holes = 1;     /* leaf may not be compact */
+}
+/*
+ * Return the hash value stored in the final entry of an attribute leaf
+ * block, or 0 when the block holds no entries.  When @count is non-NULL
+ * the number of entries in the block is passed back through it.
+ */
+xfs_dahash_t
+xfs_attr_leaf_lasthash(
+        struct xfs_buf  *bp,
+        int             *count)
+{
+        struct xfs_attr3_icleaf_hdr     hdr;
+        struct xfs_attr_leaf_entry      *last;
+        xfs_attr3_leaf_hdr_from_disk(&hdr, bp->b_addr);
+        if (count)
+                *count = hdr.count;
+        if (hdr.count == 0)
+                return 0;
+        last = xfs_attr3_leaf_entryp(bp->b_addr) + (hdr.count - 1);
+        return be32_to_cpu(last->hashval);
+}
+/*
+ * Work out how many bytes of this leaf block the attribute at @index
+ * occupies.  For a remote attribute only the in-block portion is counted.
+ */
+STATIC int
+xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
+{
+        struct xfs_attr_leaf_entry *entry = &xfs_attr3_leaf_entryp(leaf)[index];
+        xfs_attr_leaf_name_local_t *local;
+        if (!(entry->flags & XFS_ATTR_LOCAL)) {
+                xfs_attr_leaf_name_remote_t *remote;
+                remote = xfs_attr3_leaf_name_remote(leaf, index);
+                return xfs_attr_leaf_entsize_remote(remote->namelen);
+        }
+        local = xfs_attr3_leaf_name_local(leaf, index);
+        return xfs_attr_leaf_entsize_local(local->namelen,
+                                           be16_to_cpu(local->valuelen));
+}
+/*
+ * Compute the number of bytes a new attribute would need inside a leaf
+ * block.  The attribute is treated as "local" when its local-format size is
+ * strictly below the per-block local maximum, and as "remote" otherwise (in
+ * which case only the in-block remote entry size is returned).
+ *
+ * If @local is non-NULL it is set to 1 for a local attribute and 0 for a
+ * remote one.
+ */
+int
+xfs_attr_leaf_newentsize(
+        struct xfs_da_args      *args,
+        int                     *local)
+{
+        int                     local_size;
+        int                     is_local;
+        local_size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen);
+        is_local = local_size < xfs_attr_leaf_entsize_local_max(args->geo->blksize);
+        if (local)
+                *local = is_local ? 1 : 0;
+        if (is_local)
+                return local_size;
+        return xfs_attr_leaf_entsize_remote(args->namelen);
+}
+/*========================================================================
+ * Manage the INCOMPLETE flag in a leaf entry
+ *========================================================================*/
+/*
+ * Clear the INCOMPLETE flag on the leaf entry at args->blkno/args->index.
+ *
+ * When args->rmtblkno is non-zero the entry's remote name structure is also
+ * given the value block number and length from @args.  The changes are
+ * logged and the transaction is rolled; the return value is the error from
+ * reading the leaf or from rolling the transaction.
+ */
+int
+xfs_attr3_leaf_clearflag(
+        struct xfs_da_args      *args)
+{
+        struct xfs_attr_leafblock       *leaf;
+        struct xfs_attr_leaf_entry      *ent;
+        struct xfs_attr_leaf_name_remote *rmt;
+        struct xfs_buf                  *leaf_bp;
+        int                             err;
+#ifdef DEBUG
+        struct xfs_attr3_icleaf_hdr     hdr;
+        xfs_attr_leaf_name_local_t      *loc;
+        int                             ondisk_namelen;
+        char                            *ondisk_name;
+#endif /* DEBUG */
+        trace_xfs_attr_leaf_clearflag(args);
+        /* Bring in the leaf block that holds the entry. */
+        err = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1,
+                                  &leaf_bp);
+        if (err)
+                return err;
+        leaf = leaf_bp->b_addr;
+        ent = xfs_attr3_leaf_entryp(leaf) + args->index;
+        ASSERT(ent->flags & XFS_ATTR_INCOMPLETE);
+#ifdef DEBUG
+        /* Debug builds cross-check that the entry really is the one in args. */
+        xfs_attr3_leaf_hdr_from_disk(&hdr, leaf);
+        ASSERT(args->index < hdr.count);
+        ASSERT(args->index >= 0);
+        if (!(ent->flags & XFS_ATTR_LOCAL)) {
+                rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+                ondisk_namelen = rmt->namelen;
+                ondisk_name = (char *)rmt->name;
+        } else {
+                loc = xfs_attr3_leaf_name_local(leaf, args->index);
+                ondisk_namelen = loc->namelen;
+                ondisk_name = (char *)loc->nameval;
+        }
+        ASSERT(be32_to_cpu(ent->hashval) == args->hashval);
+        ASSERT(ondisk_namelen == args->namelen);
+        ASSERT(memcmp(ondisk_name, args->name, ondisk_namelen) == 0);
+#endif /* DEBUG */
+        ent->flags &= ~XFS_ATTR_INCOMPLETE;
+        xfs_trans_log_buf(args->trans, leaf_bp,
+                          XFS_DA_LOGRANGE(leaf, ent, sizeof(*ent)));
+        if (args->rmtblkno) {
+                /* Record where the remote value lives and how long it is. */
+                ASSERT((ent->flags & XFS_ATTR_LOCAL) == 0);
+                rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+                rmt->valueblk = cpu_to_be32(args->rmtblkno);
+                rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+                xfs_trans_log_buf(args->trans, leaf_bp,
+                                  XFS_DA_LOGRANGE(leaf, rmt, sizeof(*rmt)));
+        }
+        /* Commit the flag change and start the next transaction in series. */
+        return xfs_trans_roll(&args->trans, args->dp);
+}
+/*
+ * Set the INCOMPLETE flag on the leaf entry at args->blkno/args->index.
+ *
+ * For a non-local entry the value block number and length in its remote
+ * name structure are zeroed as well.  The changes are logged and the
+ * transaction is rolled; the return value is the error from reading the
+ * leaf or from rolling the transaction.
+ */
+int
+xfs_attr3_leaf_setflag(
+        struct xfs_da_args      *args)
+{
+        struct xfs_attr_leafblock       *leaf;
+        struct xfs_attr_leaf_entry      *ent;
+        struct xfs_attr_leaf_name_remote *rmt;
+        struct xfs_buf                  *leaf_bp;
+        int                             err;
+#ifdef DEBUG
+        struct xfs_attr3_icleaf_hdr     hdr;
+#endif
+        trace_xfs_attr_leaf_setflag(args);
+        /* Bring in the leaf block that holds the entry. */
+        err = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1,
+                                  &leaf_bp);
+        if (err)
+                return err;
+        leaf = leaf_bp->b_addr;
+#ifdef DEBUG
+        xfs_attr3_leaf_hdr_from_disk(&hdr, leaf);
+        ASSERT(args->index < hdr.count);
+        ASSERT(args->index >= 0);
+#endif
+        ent = xfs_attr3_leaf_entryp(leaf) + args->index;
+        ASSERT((ent->flags & XFS_ATTR_INCOMPLETE) == 0);
+        ent->flags |= XFS_ATTR_INCOMPLETE;
+        xfs_trans_log_buf(args->trans, leaf_bp,
+                          XFS_DA_LOGRANGE(leaf, ent, sizeof(*ent)));
+        if (!(ent->flags & XFS_ATTR_LOCAL)) {
+                /* Remote entry: drop its reference to the value blocks. */
+                rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+                rmt->valueblk = 0;
+                rmt->valuelen = 0;
+                xfs_trans_log_buf(args->trans, leaf_bp,
+                                  XFS_DA_LOGRANGE(leaf, rmt, sizeof(*rmt)));
+        }
+        /* Commit the flag change and start the next transaction in series. */
+        return xfs_trans_roll(&args->trans, args->dp);
+}
+/*
+ * In a single transaction, clear the INCOMPLETE flag on the leaf entry
+ * given by args->blkno/index and set the INCOMPLETE flag on the leaf
+ * entry given by args->blkno2/index2.
+ *
+ * Note that they could be in different blocks, or in the same block.
+ *
+ * When args->rmtblkno is non-zero, the first entry's remote name structure
+ * is also given the value block number and length from @args.  When the
+ * second entry is not local, its remote value block number and length are
+ * zeroed.  Returns the error from reading either leaf or from rolling the
+ * transaction.
+ */
+int
+xfs_attr3_leaf_flipflags(
+        struct xfs_da_args      *args)
+{
+        struct xfs_attr_leafblock *leaf1;
+        struct xfs_attr_leafblock *leaf2;
+        struct xfs_attr_leaf_entry *entry1;
+        struct xfs_attr_leaf_entry *entry2;
+        struct xfs_attr_leaf_name_remote *name_rmt;
+        struct xfs_buf          *bp1;
+        struct xfs_buf          *bp2;
+        int error;
+#ifdef DEBUG
+        struct xfs_attr3_icleaf_hdr ichdr1;
+        struct xfs_attr3_icleaf_hdr ichdr2;
+        xfs_attr_leaf_name_local_t *name_loc;
+        int namelen1, namelen2;
+        char *name1, *name2;
+#endif /* DEBUG */
+        trace_xfs_attr_leaf_flipflags(args);
+        /*
+         * Read the block containing the "old" attr
+         */
+        error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+        if (error)
+                return error;
+        /*
+         * Read the block containing the "new" attr, if it is different
+         */
+        if (args->blkno2 != args->blkno) {
+                error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
+                                           -1, &bp2);
+                if (error)
+                        return error;
+        } else {
+                /* both entries live in the same block: share the buffer */
+                bp2 = bp1;
+        }
+        leaf1 = bp1->b_addr;
+        entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];
+        leaf2 = bp2->b_addr;
+        entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
+#ifdef DEBUG
+        /*
+         * Debug-only sanity checks: both indices are in range and the two
+         * entries carry the same hash value and the same name.
+         */
+        xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+        ASSERT(args->index < ichdr1.count);
+        ASSERT(args->index >= 0);
+        xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+        ASSERT(args->index2 < ichdr2.count);
+        ASSERT(args->index2 >= 0);
+        if (entry1->flags & XFS_ATTR_LOCAL) {
+                name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
+                namelen1 = name_loc->namelen;
+                name1 = (char *)name_loc->nameval;
+        } else {
+                name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+                namelen1 = name_rmt->namelen;
+                name1 = (char *)name_rmt->name;
+        }
+        if (entry2->flags & XFS_ATTR_LOCAL) {
+                name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
+                namelen2 = name_loc->namelen;
+                name2 = (char *)name_loc->nameval;
+        } else {
+                name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
+                namelen2 = name_rmt->namelen;
+                name2 = (char *)name_rmt->name;
+        }
+        ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval));
+        ASSERT(namelen1 == namelen2);
+        ASSERT(memcmp(name1, name2, namelen1) == 0);
+#endif /* DEBUG */
+        ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
+        ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
+        /* First entry: mark it complete and log the entry. */
+        entry1->flags &= ~XFS_ATTR_INCOMPLETE;
+        xfs_trans_log_buf(args->trans, bp1,
+                          XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
+        if (args->rmtblkno) {
+                /* record the remote value's block number and length */
+                ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
+                name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+                name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+                name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+                xfs_trans_log_buf(args->trans, bp1,
+                         XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
+        }
+        /* Second entry: mark it incomplete and log the entry. */
+        entry2->flags |= XFS_ATTR_INCOMPLETE;
+        xfs_trans_log_buf(args->trans, bp2,
+                          XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
+        if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
+                /* remote entry: clear its value block number and length */
+                name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
+                name_rmt->valueblk = 0;
+                name_rmt->valuelen = 0;
+                xfs_trans_log_buf(args->trans, bp2,
+                         XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
+        }
+        /*
+         * Commit the flag value change and start the next trans in series.
+         */
+        error = xfs_trans_roll(&args->trans, args->dp);
+        return error;
+}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
new file mode 100644
index 000000000000..e2929da7c3ba
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ATTR_LEAF_H__
+#define __XFS_ATTR_LEAF_H__
+struct attrlist;
+struct attrlist_cursor_kern;
+struct xfs_attr_list_context;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_inode;
+struct xfs_trans;
+/*
+ * Used to keep a list of "remote value" extents when unlinking an inode.
+ * Each element describes one remote value by the block number where its
+ * bytes are stored and the value's length in bytes.
+ */
+typedef struct xfs_attr_inactive_list {
+        xfs_dablk_t     valueblk;       /* block number of value bytes */
+        int             valuelen;       /* number of bytes in value */
+} xfs_attr_inactive_list_t;
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+/*
+ * Internal routines when attribute fork size < XFS_LITINO(mp).
+ */
+void    xfs_attr_shortform_create(struct xfs_da_args *args);
+void    xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
+int     xfs_attr_shortform_lookup(struct xfs_da_args *args);
+int     xfs_attr_shortform_getvalue(struct xfs_da_args *args);
+int     xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int     xfs_attr_shortform_remove(struct xfs_da_args *args);
+int     xfs_attr_shortform_list(struct xfs_attr_list_context *context);
+int     xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
+int     xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
+/*
+ * Internal routines when attribute fork size == XFS_LBSIZE(mp).
+ */
+int     xfs_attr3_leaf_to_node(struct xfs_da_args *args);
+int     xfs_attr3_leaf_to_shortform(struct xfs_buf *bp,
+                                   struct xfs_da_args *args, int forkoff);
+/*
+ * INCOMPLETE flag management: clear it, set it, or clear it on one entry
+ * while setting it on another.  Each of these rolls args->trans.
+ */
+int     xfs_attr3_leaf_clearflag(struct xfs_da_args *args);
+int     xfs_attr3_leaf_setflag(struct xfs_da_args *args);
+int     xfs_attr3_leaf_flipflags(struct xfs_da_args *args);
+/*
+ * Routines used for growing the Btree.
+ */
+int     xfs_attr3_leaf_split(struct xfs_da_state *state,
+                                   struct xfs_da_state_blk *oldblk,
+                                   struct xfs_da_state_blk *newblk);
+int     xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf,
+                                        struct xfs_da_args *args);
+int     xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
+int     xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
+                                 struct xfs_da_args *args);
+int     xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
+                                    struct xfs_da_args *args);
+int     xfs_attr3_leaf_list_int(struct xfs_buf *bp,
+                                      struct xfs_attr_list_context *context);
+/*
+ * Routines used for shrinking the Btree.
+ */
+int     xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void    xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
+                                       struct xfs_da_state_blk *drop_blk,
+                                       struct xfs_da_state_blk *save_blk);
+int     xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
+/*
+ * Utility routines.
+ */
+/* hash value of the last entry in the leaf (0 if empty); *count, if given, gets the entry count */
+xfs_dahash_t    xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count);
+int     xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
+                                   struct xfs_buf *leaf2_bp);
+/* in-leaf byte size for a new attribute; *local, if given, is set to 1 (local) or 0 (remote) */
+int     xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
+int     xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+                        xfs_dablk_t bno, xfs_daddr_t mappedbno,
+                        struct xfs_buf **bpp);
+/* conversion between the on-disk leaf header and the in-core header structure */
+void    xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
+                                     struct xfs_attr_leafblock *from);
+void    xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
+                                   struct xfs_attr3_icleaf_hdr *from);
+#endif  /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
new file mode 100644
index 000000000000..7510ab8058a4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+#include "xfs_error.h"
+#define ATTR_RMTVALUE_MAPSIZE   1       /* # of map entries at once */
+/*
+ * Number of filesystem blocks needed to hold a remote attribute value of
+ * @attrlen bytes.  On CRC-enabled filesystems every block carries a header,
+ * so the usable space per block is smaller than the block size and a plain
+ * length-to-FSB conversion is not enough.
+ */
+int
+xfs_attr3_rmt_blocks(
+        struct xfs_mount *mp,
+        int             attrlen)
+{
+        int             payload;
+        if (!xfs_sb_version_hascrc(&mp->m_sb))
+                return XFS_B_TO_FSB(mp, attrlen);
+        /* round up to whole blocks of usable payload space */
+        payload = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+        return (attrlen + payload - 1) / payload;
+}
+/*
+ * Remote attribute header checking is done in two stages.  The buffer
+ * verifier handles the CRC, location and bounds checks; this helper, used
+ * when unpacking a value, confirms that the header describes exactly the
+ * piece we expect: same block number, offset within the value, byte count
+ * and owning inode.
+ */
+static bool
+xfs_attr3_rmt_hdr_ok(
+        void                    *ptr,
+        xfs_ino_t               ino,
+        uint32_t                offset,
+        uint32_t                size,
+        xfs_daddr_t             bno)
+{
+        struct xfs_attr3_rmt_hdr *hdr = ptr;
+        return bno == be64_to_cpu(hdr->rm_blkno) &&
+               offset == be32_to_cpu(hdr->rm_offset) &&
+               size == be32_to_cpu(hdr->rm_bytes) &&
+               ino == be64_to_cpu(hdr->rm_owner);
+}
+/*
+ * Structural check of a single remote attribute block header.
+ *
+ * @ptr points at the start of the block, @fsbsize is the size of that block
+ * in bytes and @bno is the disk address the block is expected to record.
+ * Returns false on filesystems without CRC support, or when the magic
+ * number, filesystem UUID, block number, byte count, offset range or owner
+ * field is not acceptable; returns true otherwise.
+ */
+static bool
+xfs_attr3_rmt_verify(
+        struct xfs_mount        *mp,
+        void                    *ptr,
+        int                     fsbsize,
+        xfs_daddr_t             bno)
+{
+        struct xfs_attr3_rmt_hdr *rmt = ptr;
+        if (!xfs_sb_version_hascrc(&mp->m_sb))
+                return false;
+        if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+                return false;
+        if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
+                return false;
+        if (be64_to_cpu(rmt->rm_blkno) != bno)
+                return false;
+        /* the payload must fit in the block after the header */
+        if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
+                return false;
+        /*
+         * Widen before adding: both fields are 32-bit values read from
+         * disk, and a corrupt rm_offset close to the 32-bit limit would
+         * otherwise wrap the sum around and slip under XATTR_SIZE_MAX.
+         */
+        if ((uint64_t)be32_to_cpu(rmt->rm_offset) +
+                                be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
+                return false;
+        if (rmt->rm_owner == 0)
+                return false;
+        return true;
+}
+/*
+ * Read verifier for remote attribute buffers.  On CRC-enabled filesystems
+ * the buffer is walked one attribute-geometry block at a time; each block
+ * must pass the checksum test and then the header check.  The first failure
+ * records -EFSBADCRC or -EFSCORRUPTED on the buffer and stops the walk.
+ * Nothing is checked on non-CRC filesystems.
+ */
+static void
+xfs_attr3_rmt_read_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount *mp = bp->b_target->bt_mount;
+        int             blksize = mp->m_attr_geo->blksize;
+        char            *ptr;
+        xfs_daddr_t     bno;
+        int             len;
+        /* no verification of non-crc buffers */
+        if (!xfs_sb_version_hascrc(&mp->m_sb))
+                return;
+        ptr = bp->b_addr;
+        bno = bp->b_bn;
+        len = BBTOB(bp->b_length);
+        ASSERT(len >= blksize);
+        while (len > 0) {
+                int     failure = 0;
+                if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF))
+                        failure = -EFSBADCRC;
+                else if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno))
+                        failure = -EFSCORRUPTED;
+                if (failure) {
+                        xfs_buf_ioerror(bp, failure);
+                        break;
+                }
+                /* step to the next block in the buffer */
+                len -= blksize;
+                ptr += blksize;
+                bno += BTOBB(blksize);
+        }
+        if (bp->b_error)
+                xfs_verifier_error(bp);
+        else
+                ASSERT(len == 0);
+}
+/*
+ * Write verifier for remote attribute buffers.  On CRC-enabled filesystems
+ * every block in the buffer has its header checked; a failure records
+ * -EFSCORRUPTED on the buffer and returns immediately.  For a block that
+ * passes, the LSN from the buffer log item (when one is attached) is stamped
+ * into the header and the block checksum is recomputed.  Nothing is done on
+ * non-CRC filesystems.
+ */
+static void
+xfs_attr3_rmt_write_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount *mp = bp->b_target->bt_mount;
+        struct xfs_buf_log_item *bip = bp->b_fspriv;
+        int             blksize = mp->m_attr_geo->blksize;
+        char            *ptr;
+        xfs_daddr_t     bno;
+        int             len;
+        /* no verification of non-crc buffers */
+        if (!xfs_sb_version_hascrc(&mp->m_sb))
+                return;
+        ptr = bp->b_addr;
+        bno = bp->b_bn;
+        len = BBTOB(bp->b_length);
+        ASSERT(len >= blksize);
+        for (; len > 0; len -= blksize, ptr += blksize, bno += BTOBB(blksize)) {
+                struct xfs_attr3_rmt_hdr *hdr = (struct xfs_attr3_rmt_hdr *)ptr;
+                if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
+                        xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                        xfs_verifier_error(bp);
+                        return;
+                }
+                if (bip)
+                        hdr->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+                xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
+        }
+        ASSERT(len == 0);
+}
+/*
+ * Buffer operations for remote attribute value buffers: pairs the read and
+ * write verifiers defined above.
+ */
+const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+        .verify_read = xfs_attr3_rmt_read_verify,
+        .verify_write = xfs_attr3_rmt_write_verify,
+};
+/*
+ * Initialise the remote attribute header at @ptr for one block of a value.
+ * Returns the number of bytes the header occupies, which is 0 on filesystems
+ * without CRC support (no header is written there).
+ */
+STATIC int
+xfs_attr3_rmt_hdr_set(
+        struct xfs_mount        *mp,
+        void                    *ptr,
+        xfs_ino_t               ino,
+        uint32_t                offset,
+        uint32_t                size,
+        xfs_daddr_t             bno)
+{
+        struct xfs_attr3_rmt_hdr *hdr;
+        if (!xfs_sb_version_hascrc(&mp->m_sb))
+                return 0;
+        hdr = ptr;
+        /* identity of the block */
+        hdr->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
+        uuid_copy(&hdr->rm_uuid, &mp->m_sb.sb_uuid);
+        hdr->rm_owner = cpu_to_be64(ino);
+        hdr->rm_blkno = cpu_to_be64(bno);
+        /* which slice of the attribute value this block holds */
+        hdr->rm_offset = cpu_to_be32(offset);
+        hdr->rm_bytes = cpu_to_be32(size);
+        return sizeof(*hdr);
+}
+/*
+ * Helper functions to copy attribute data in and out of the on-disk extents
+ */
+/*
+ * Copy attribute value bytes out of buffer @bp into *@dst.
+ *
+ * The buffer is walked one attribute-geometry block at a time.  On
+ * CRC-enabled filesystems each block's header is checked against the
+ * expected inode, offset, length and disk address before its payload is
+ * copied; a mismatch is reported with xfs_alert() and -EFSCORRUPTED is
+ * returned.  *@offset, *@valuelen and *@dst are advanced past the bytes
+ * copied so the caller can continue with the next buffer.  Returns 0 on
+ * success.
+ */
+STATIC int
+xfs_attr_rmtval_copyout(
+        struct xfs_mount *mp,
+        struct xfs_buf  *bp,
+        xfs_ino_t       ino,
+        int             *offset,
+        int             *valuelen,
+        __uint8_t       **dst)
+{
+        char            *src = bp->b_addr;
+        xfs_daddr_t     bno = bp->b_bn;
+        int             len = BBTOB(bp->b_length);
+        int             blksize = mp->m_attr_geo->blksize;
+        ASSERT(len >= blksize);
+        while (len > 0 && *valuelen > 0) {
+                int hdr_size = 0;
+                int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+                /* never copy more than what is left of the value */
+                byte_cnt = min(*valuelen, byte_cnt);
+                if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                        if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset,
+                                                  byte_cnt, bno)) {
+                                xfs_alert(mp,
+"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/0x%x/0x%llx)",
+                                        bno, *offset, byte_cnt, ino);
+                                return -EFSCORRUPTED;
+                        }
+                        hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
+                }
+                memcpy(*dst, src + hdr_size, byte_cnt);
+                /* roll buffer forwards */
+                len -= blksize;
+                src += blksize;
+                bno += BTOBB(blksize);
+                /* roll attribute data forwards */
+                *valuelen -= byte_cnt;
+                *dst += byte_cnt;
+                *offset += byte_cnt;
+        }
+        return 0;
+}
+/*
+ * Copy attribute value bytes from *@src into buffer @bp.
+ *
+ * The buffer is filled one attribute-geometry block at a time: a header is
+ * written first (a no-op of size 0 on non-CRC filesystems), followed by as
+ * much of the value as fits.  Any unused space at the end of a block is
+ * zeroed.  *@offset, *@valuelen and *@src are advanced past the bytes
+ * consumed so the caller can continue with the next buffer.
+ */
+STATIC void
+xfs_attr_rmtval_copyin(
+        struct xfs_mount *mp,
+        struct xfs_buf  *bp,
+        xfs_ino_t       ino,
+        int             *offset,
+        int             *valuelen,
+        __uint8_t       **src)
+{
+        int             blksize = mp->m_attr_geo->blksize;
+        int             len = BBTOB(bp->b_length);
+        xfs_daddr_t     bno = bp->b_bn;
+        char            *dst = bp->b_addr;
+        ASSERT(len >= blksize);
+        while (len > 0 && *valuelen > 0) {
+                int     space = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
+                int     chunk = min(*valuelen, space);
+                int     hdr_len;
+                int     used;
+                hdr_len = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
+                                                chunk, bno);
+                memcpy(dst + hdr_len, *src, chunk);
+                used = hdr_len + chunk;
+                /*
+                 * A partially filled block has to be the final one: pad it
+                 * out with zeroes and double check nothing is left over.
+                 */
+                if (used < blksize) {
+                        ASSERT(*valuelen - chunk == 0);
+                        ASSERT(len == blksize);
+                        memset(dst + used, 0, blksize - used);
+                }
+                /* advance through the buffer */
+                bno += BTOBB(blksize);
+                dst += blksize;
+                len -= blksize;
+                /* advance through the attribute value */
+                *offset += chunk;
+                *src += chunk;
+                *valuelen -= chunk;
+        }
+}
+/*
+ * Read the value associated with an attribute from the out-of-line buffer
+ * that we stored it in.
+ *
+ * The value is looked up in the attribute fork starting at block
+ * args->rmtblkno for args->rmtblkcnt blocks, and args->rmtvaluelen bytes
+ * are copied into args->value.  Returns 0 on success, or the error
+ * returned by the mapping, buffer read or copy-out step that failed.
+ */
+int
+xfs_attr_rmtval_get(
+        struct xfs_da_args      *args)
+{
+        struct xfs_bmbt_irec    map[ATTR_RMTVALUE_MAPSIZE];
+        struct xfs_mount        *mp = args->dp->i_mount;
+        struct xfs_buf          *bp;
+        xfs_dablk_t             lblkno = args->rmtblkno;
+        __uint8_t               *dst = args->value;
+        int                     valuelen;
+        int                     nmap;
+        int                     error;
+        int                     blkcnt = args->rmtblkcnt;
+        int                     i;
+        int                     offset = 0; /* running byte offset handed to copyout */
+        trace_xfs_attr_rmtval_get(args);
+        ASSERT(!(args->flags & ATTR_KERNOVAL));
+        ASSERT(args->rmtvaluelen == args->valuelen);
+        valuelen = args->rmtvaluelen;
+        while (valuelen > 0) {
+                nmap = ATTR_RMTVALUE_MAPSIZE; /* ask for as many mappings as map[] can hold */
+                error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+                                       blkcnt, map, &nmap,
+                                       XFS_BMAPI_ATTRFORK);
+                if (error)
+                        return error;
+                ASSERT(nmap >= 1);
+                for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+                        xfs_daddr_t     dblkno;
+                        int             dblkcnt;
+                        ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+                               (map[i].br_startblock != HOLESTARTBLOCK));
+                        dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+                        dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+                        error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+                                                   dblkno, dblkcnt, 0, &bp,
+                                                   &xfs_attr3_rmt_buf_ops);
+                        if (error)
+                                return error;
+                        error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+                                                        &offset, &valuelen,
+                                                        &dst);
+                        xfs_buf_relse(bp); /* released before the copy-out error is checked */
+                        if (error)
+                                return error;
+                        /* roll attribute extent map forwards */
+                        lblkno += map[i].br_blockcount;
+                        blkcnt -= map[i].br_blockcount;
+                }
+        }
+        ASSERT(valuelen == 0);
+        return 0;
+}
+/*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ *
+ * Works in two passes.  First a hole of the required size is located in
+ * the attribute fork and blocks are allocated for it, rolling the
+ * transaction after every extent.  Then the value in args->value is copied
+ * into those blocks and each buffer is written synchronously.  On success
+ * args->rmtblkno and args->rmtblkcnt describe where the value was placed.
+ *
+ * Returns 0 on success, -ENOMEM if a buffer cannot be obtained, or the
+ * error returned by the helper that failed.
+ */
+int
+xfs_attr_rmtval_set(
+        struct xfs_da_args      *args)
+{
+        struct xfs_inode        *dp = args->dp;
+        struct xfs_mount        *mp = dp->i_mount;
+        struct xfs_bmbt_irec    map;
+        xfs_dablk_t             lblkno;
+        xfs_fileoff_t           lfileoff = 0;
+        __uint8_t               *src = args->value;
+        int                     blkcnt;
+        int                     valuelen;
+        int                     nmap;
+        int                     error;
+        int                     offset = 0;
+        trace_xfs_attr_rmtval_set(args);
+        /*
+         * Find a "hole" in the attribute address space large enough for
+         * us to drop the new attribute's value into. Because CRC enabled
+         * attributes have headers, we can't just do a straight byte to FSB
+         * conversion and have to take the header space into account.
+         */
+        blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
+        error = xfs_bmap_first_unused(args->trans, dp, blkcnt, &lfileoff,
+                                                   XFS_ATTR_FORK);
+        if (error)
+                return error;
+        args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+        args->rmtblkcnt = blkcnt;
+        /*
+         * Roll through the "value", allocating blocks on disk as required.
+         */
+        while (blkcnt > 0) {
+                int     committed;
+                /*
+                 * Allocate a single extent, up to the size of the value.
+                 */
+                xfs_bmap_init(args->flist, args->firstblock);
+                nmap = 1;
+                error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
+                                  blkcnt,
+                                  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+                                  args->firstblock, args->total, &map, &nmap,
+                                  args->flist);
+                if (!error) {
+                        error = xfs_bmap_finish(&args->trans, args->flist,
+                                                &committed);
+                }
+                if (error) {
+                        /*
+                         * NOTE(review): when xfs_bmapi_write() fails,
+                         * xfs_bmap_finish() is not called and nothing in
+                         * this function has assigned "committed" before
+                         * the assert reads it - confirm the intent.
+                         */
+                        ASSERT(committed);
+                        args->trans = NULL;
+                        xfs_bmap_cancel(args->flist);
+                        return error;
+                }
+                /*
+                 * bmap_finish() may have committed the last trans and started
+                 * a new one.  We need the inode to be in all transactions.
+                 */
+                if (committed)
+                        xfs_trans_ijoin(args->trans, dp, 0);
+                ASSERT(nmap == 1);
+                ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+                       (map.br_startblock != HOLESTARTBLOCK));
+                lblkno += map.br_blockcount;
+                blkcnt -= map.br_blockcount;
+                /*
+                 * Start the next trans in the chain.
+                 */
+                error = xfs_trans_roll(&args->trans, dp);
+                if (error)
+                        return error;
+        }
+        /*
+         * Roll through the "value", copying the attribute value to the
+         * already-allocated blocks.  Blocks are written synchronously
+         * so that we can know they are all on disk before we turn off
+         * the INCOMPLETE flag.
+         */
+        lblkno = args->rmtblkno;
+        blkcnt = args->rmtblkcnt;
+        valuelen = args->rmtvaluelen;
+        while (valuelen > 0) {
+                struct xfs_buf  *bp;
+                xfs_daddr_t     dblkno;
+                int             dblkcnt;
+                ASSERT(blkcnt > 0);
+                xfs_bmap_init(args->flist, args->firstblock);
+                nmap = 1;
+                error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
+                                       blkcnt, &map, &nmap,
+                                       XFS_BMAPI_ATTRFORK);
+                if (error)
+                        return error;
+                ASSERT(nmap == 1);
+                ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+                       (map.br_startblock != HOLESTARTBLOCK));
+                /* two separate statements; the original joined them with a stray comma */
+                dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+                dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+                bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
+                if (!bp)
+                        return -ENOMEM;
+                bp->b_ops = &xfs_attr3_rmt_buf_ops;
+                xfs_attr_rmtval_copyin(mp, bp, dp->i_ino, &offset,
+                                       &valuelen, &src);
+                error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
+                xfs_buf_relse(bp);
+                if (error)
+                        return error;
+                /* roll attribute extent map forwards */
+                lblkno += map.br_blockcount;
+                blkcnt -= map.br_blockcount;
+        }
+        ASSERT(valuelen == 0);
+        return 0;
+}
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ *
+ * The blocks are described by args->rmtblkno / args->rmtblkcnt.  A first
+ * pass marks stale any buffer for those blocks that xfs_incore() finds;
+ * a second pass unmaps the extents from the attribute fork, rolling the
+ * transaction after each xfs_bunmapi() call until it reports completion.
+ *
+ * Returns 0 on success or the error returned by the helper that failed.
+ */
+int
+xfs_attr_rmtval_remove(
+        struct xfs_da_args      *args)
+{
+        struct xfs_mount        *mp = args->dp->i_mount;
+        xfs_dablk_t             lblkno;
+        int                     blkcnt;
+        int                     error;
+        int                     done;
+        trace_xfs_attr_rmtval_remove(args);
+        /*
+         * Roll through the "value", invalidating the attribute value's blocks.
+         */
+        lblkno = args->rmtblkno;
+        blkcnt = args->rmtblkcnt;
+        while (blkcnt > 0) {
+                struct xfs_bmbt_irec    map;
+                struct xfs_buf          *bp;
+                xfs_daddr_t             dblkno;
+                int                     dblkcnt;
+                int                     nmap;
+                /*
+                 * Try to remember where we decided to put the value.
+                 */
+                nmap = 1;
+                error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+                                       blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
+                if (error)
+                        return error;
+                ASSERT(nmap == 1);
+                ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+                       (map.br_startblock != HOLESTARTBLOCK));
+                /* two separate statements; the original joined them with a stray comma */
+                dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+                dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+                /*
+                 * If the "remote" value is in the cache, remove it.
+                 */
+                bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
+                if (bp) {
+                        xfs_buf_stale(bp);
+                        xfs_buf_relse(bp);
+                        bp = NULL;
+                }
+                lblkno += map.br_blockcount;
+                blkcnt -= map.br_blockcount;
+        }
+        /*
+         * Keep de-allocating extents until the remote-value region is gone.
+         */
+        lblkno = args->rmtblkno;
+        blkcnt = args->rmtblkcnt;
+        done = 0;
+        while (!done) {
+                int committed;
+                xfs_bmap_init(args->flist, args->firstblock);
+                error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
+                                    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+                                    1, args->firstblock, args->flist,
+                                    &done);
+                if (!error) {
+                        error = xfs_bmap_finish(&args->trans, args->flist,
+                                                &committed);
+                }
+                if (error) {
+                        /*
+                         * NOTE(review): when xfs_bunmapi() fails,
+                         * xfs_bmap_finish() is not called and nothing in
+                         * this function has assigned "committed" before
+                         * the assert reads it - confirm the intent.
+                         */
+                        ASSERT(committed);
+                        args->trans = NULL;
+                        xfs_bmap_cancel(args->flist);
+                        return error;
+                }
+                /*
+                 * bmap_finish() may have committed the last trans and started
+                 * a new one.  We need the inode to be in all transactions.
+                 */
+                if (committed)
+                        xfs_trans_ijoin(args->trans, args->dp, 0);
+                /*
+                 * Close out trans and start the next one in the chain.
+                 */
+                error = xfs_trans_roll(&args->trans, args->dp);
+                if (error)
+                        return error;
+        }
+        return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
new file mode 100644
index 000000000000..5a9acfa156d7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ATTR_REMOTE_H__
+#define __XFS_ATTR_REMOTE_H__
+int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); /* block count used to hold an attrlen-byte remote value */
+int xfs_attr_rmtval_get(struct xfs_da_args *args); /* read a remote value into args->value */
+int xfs_attr_rmtval_set(struct xfs_da_args *args); /* allocate blocks for a remote value and write it out */
+int xfs_attr_rmtval_remove(struct xfs_da_args *args); /* invalidate and unmap a remote value's blocks */
+#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
new file mode 100644
index 000000000000..919756e3ba53
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ATTR_SF_H__
+#define __XFS_ATTR_SF_H__
+/*
+ * Attribute storage when stored inside the inode.
+ *
+ * Small attribute lists are packed as tightly as possible so as
+ * to fit into the literal area of the inode.
+ */
+/*
+ * Entries are packed toward the top as tight as possible.
+ *
+ * A shortform attribute list is one header followed by the entries.  The
+ * list[1] and nameval[1] members are one-element placeholders for
+ * variable-length data; the XFS_ATTR_SF_ENTSIZE*() macros subtract the
+ * placeholder byte from sizeof(xfs_attr_sf_entry_t) when sizing an entry.
+ */
+typedef struct xfs_attr_shortform {
+        struct xfs_attr_sf_hdr {        /* constant-structure header block */
+                __be16  totsize;        /* total bytes in shortform list */
+                __u8    count;  /* count of active entries */
+        } hdr;
+        struct xfs_attr_sf_entry {
+                __uint8_t namelen;      /* actual length of name (no NUL terminator) */
+                __uint8_t valuelen;     /* actual length of value (no NUL terminator) */
+                __uint8_t flags;        /* flags bits (see xfs_attr_leaf.h) */
+                __uint8_t nameval[1];   /* name bytes immediately followed by value bytes */
+        } list[1];                      /* variable sized array */
+} xfs_attr_shortform_t;
+typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
+typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
+/*
+ * We generate this then sort it, attr_list() must return things in hash-order.
+ *
+ * Each element describes one shortform entry: its position in the
+ * original list, a copy of its length and flag bytes, its hash value and
+ * a pointer to its name.
+ */
+typedef struct xfs_attr_sf_sort {
+        __uint8_t       entno;          /* entry number in original list */
+        __uint8_t       namelen;        /* length of name value (no null) */
+        __uint8_t       valuelen;       /* length of value */
+        __uint8_t       flags;          /* flags bits (see xfs_attr_leaf.h) */
+        xfs_dahash_t    hash;           /* this entry's hash value */
+        unsigned char   *name;          /* name value, pointer into buffer */
+} xfs_attr_sf_sort_t;
+#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)   /* bytes used by an entry with these name/value lengths */ \
+        (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen)))
+#define XFS_ATTR_SF_ENTSIZE_MAX                 /* largest value a __uint8_t length field can hold */ \
+        ((1 << (NBBY*(int)sizeof(__uint8_t))) - 1)
+#define XFS_ATTR_SF_ENTSIZE(sfep)               /* bytes used by the existing entry sfep */ \
+        ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
+#define XFS_ATTR_SF_NEXTENTRY(sfep)             /* entry that starts right after sfep */ \
+        ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
+#define XFS_ATTR_SF_TOTSIZE(dp)                 /* hdr.totsize of dp's attr fork data, in CPU byte order */ \
+        (be16_to_cpu(((xfs_attr_shortform_t *)  \
+                ((dp)->i_afp->if_u1.if_data))->hdr.totsize))
+#endif  /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h
new file mode 100644
index 000000000000..e1649c0d3e02
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bit.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BIT_H__
+#define __XFS_BIT_H__
+/*
+ * XFS bit manipulation routines.
+ */
+/*
+ * Bit masks with the n high (xfs_mask64hi) or n low (xfs_mask32lo,
+ * xfs_mask64lo) bits set.
+ */
+/*
+ * Return a 64-bit mask with the n most significant bits set.
+ * n <= 0 yields 0 and n >= 64 yields all ones, so the shift count below
+ * always stays within 1..63 and never reaches the undefined full-width
+ * shift.
+ */
+static inline __uint64_t xfs_mask64hi(int n)
+{
+        if (n <= 0)
+                return 0;
+        if (n >= 64)
+                return (__uint64_t)-1;
+        return (__uint64_t)-1 << (64 - (n));
+}
+/*
+ * Return a 32-bit mask with the n least significant bits set.
+ * n <= 0 yields 0 and n >= 32 yields all ones; shifting a 32-bit value
+ * by 32 or more is undefined, so that case is answered directly.
+ */
+static inline __uint32_t xfs_mask32lo(int n)
+{
+        if (n <= 0)
+                return 0;
+        if (n >= 32)
+                return (__uint32_t)-1;
+        return ((__uint32_t)1 << (n)) - 1;
+}
+/*
+ * Return a 64-bit mask with the n least significant bits set.
+ * n <= 0 yields 0 and n >= 64 yields all ones; shifting a 64-bit value
+ * by 64 or more is undefined, so that case is answered directly.
+ */
+static inline __uint64_t xfs_mask64lo(int n)
+{
+        if (n <= 0)
+                return 0;
+        if (n >= 64)
+                return (__uint64_t)-1;
+        return ((__uint64_t)1 << (n)) - 1;
+}
+/*
+ * Zero-based index of the highest set bit in a 32-bit value, computed as
+ * fls(v) - 1; -1 if none set.
+ */
+static inline int xfs_highbit32(__uint32_t v)
+{
+        int     msb = fls(v);
+        return msb - 1;
+}
+/*
+ * Zero-based index of the highest set bit in a 64-bit value, computed as
+ * fls64(v) - 1; -1 if none set.
+ */
+static inline int xfs_highbit64(__uint64_t v)
+{
+        int     msb = fls64(v);
+        return msb - 1;
+}
+/*
+ * Zero-based index of the lowest set bit in a 32-bit value, computed as
+ * ffs(v) - 1; -1 if none set.
+ */
+static inline int xfs_lowbit32(__uint32_t v)
+{
+        int     lsb = ffs(v);
+        return lsb - 1;
+}
+/*
+ * Zero-based index of the lowest set bit in a 64-bit value, -1 if none
+ * set.  The value is examined as two 32-bit halves so that the 32-bit
+ * ffs() can be used; a hit in the upper half is offset by 32.
+ */
+static inline int xfs_lowbit64(__uint64_t v)
+{
+        __uint32_t      lo = (__uint32_t)v;
+        __uint32_t      hi = (__uint32_t)(v >> 32);
+        if (lo)
+                return ffs(lo) - 1;
+        if (hi)
+                return ffs(hi) + 32 - 1;
+        return -1;
+}
+/* Return whether bitmap is empty (1 == empty); NOTE(review): units of size are not visible here */
+extern int xfs_bitmap_empty(uint *map, uint size);
+/* Count continuous one bits in map starting with start_bit */
+extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
+/* Find next set bit in map; NOTE(review): whether the search includes start_bit itself is defined by the implementation, confirm there */
+extern int xfs_next_bit(uint *map, uint size, uint start_bit);
+#endif  /* __XFS_BIT_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
new file mode 100644
index 000000000000..94ac88306fa6
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -0,0 +1,5606 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_buf_item.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
+kmem_zone_t             *xfs_bmap_free_item_zone; /* NOTE(review): presumably the allocation zone for bmap free-list items - confirm where it is initialised */
+/*
+ * Miscellaneous helper functions
+ */
+/*
+ * Work out the maximum depth a bmap btree can reach for the given fork
+ * and store it in mp->m_bm_maxlevels[whichfork].  Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+        xfs_mount_t     *mp,            /* file system mount structure */
+        int             whichfork)      /* data or attr fork */
+{
+        int             depth;          /* btree levels counted so far */
+        uint            nblocks;        /* max blocks at the current level */
+        uint            nleafents;      /* max leaf entries possible */
+        int             rootrecs;       /* max records in root block */
+        int             leafrecs;       /* min records in leaf block */
+        int             noderecs;       /* min records in node block */
+        int             rootsize;       /* root block size */
+        /*
+         * The number of extents in a file, and so the number of leaf
+         * entries, is bounded by the type of di_nextents (a signed 32-bit
+         * number, xfs_extnum_t) for the data fork, or of di_anextents
+         * (a signed 16-bit number, xfs_aextnum_t) for the attr fork.
+         *
+         * Under ATTR1 we can no longer assume that every inode has a fork
+         * offset of (xfs_default_attroffset(ip) >> 3): the filesystem may
+         * have been mounted with ATTR2 and then again with ATTR1, leaving
+         * di_forkoff fixed at various positions.  So for both ATTR1 and
+         * ATTR2 assume the worst case of a minimum-sized root.
+         */
+        if (whichfork != XFS_DATA_FORK) {
+                rootsize = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+                nleafents = MAXAEXTNUM;
+        } else {
+                rootsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+                nleafents = MAXEXTNUM;
+        }
+        leafrecs = mp->m_bmap_dmnr[0];
+        noderecs = mp->m_bmap_dmnr[1];
+        rootrecs = xfs_bmdr_maxrecs(rootsize, 0);
+        /* blocks needed at the leaf level, rounded up */
+        nblocks = (nleafents + leafrecs - 1) / leafrecs;
+        depth = 1;
+        while (nblocks > 1) {
+                /* once everything fits in the root, one more level ends it */
+                if (nblocks > rootrecs)
+                        nblocks = (nblocks + noderecs - 1) / noderecs;
+                else
+                        nblocks = 1;
+                depth++;
+        }
+        mp->m_bm_maxlevels[whichfork] = depth;
+}
+/*
+ * Load [off, bno, len] into the cursor's record and run an XFS_LOOKUP_EQ
+ * lookup with it; stat is handed on to xfs_btree_lookup().
+ */
+STATIC int                              /* error */
+xfs_bmbt_lookup_eq(
+        struct xfs_btree_cur    *cur,
+        xfs_fileoff_t           off,
+        xfs_fsblock_t           bno,
+        xfs_filblks_t           len,
+        int                     *stat)  /* success/failure */
+{
+        int                     error;
+        cur->bc_rec.b.br_blockcount = len;
+        cur->bc_rec.b.br_startblock = bno;
+        cur->bc_rec.b.br_startoff = off;
+        error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+        return error;
+}
+/*
+ * Load [off, bno, len] into the cursor's record and run an XFS_LOOKUP_GE
+ * lookup with it; stat is handed on to xfs_btree_lookup().
+ */
+STATIC int                              /* error */
+xfs_bmbt_lookup_ge(
+        struct xfs_btree_cur    *cur,
+        xfs_fileoff_t           off,
+        xfs_fsblock_t           bno,
+        xfs_filblks_t           len,
+        int                     *stat)  /* success/failure */
+{
+        int                     error;
+        cur->bc_rec.b.br_blockcount = len;
+        cur->bc_rec.b.br_startblock = bno;
+        cur->bc_rec.b.br_startoff = off;
+        error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+        return error;
+}
+/*
+ * True when the fork is in extents format and holds more extents than
+ * XFS_IFORK_MAXEXT() allows, i.e. it needs converting to btree format.
+ */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+        if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+                return false;
+        return XFS_IFORK_NEXTENTS(ip, whichfork) >
+                        XFS_IFORK_MAXEXT(ip, whichfork);
+}
+/*
+ * True when the fork is in btree format but its extent count no longer
+ * exceeds XFS_IFORK_MAXEXT(), i.e. it should go back to extents format.
+ */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+        if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+                return false;
+        return XFS_IFORK_NEXTENTS(ip, whichfork) <=
+                        XFS_IFORK_MAXEXT(ip, whichfork);
+}
+/*
+ * Overwrite the record the cursor points at with [off, bno, len, state].
+ * Returns whatever xfs_btree_update() returns: 0 when it works, otherwise
+ * an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_bmbt_update(
+        struct xfs_btree_cur    *cur,
+        xfs_fileoff_t           off,
+        xfs_fsblock_t           bno,
+        xfs_filblks_t           len,
+        xfs_exntst_t            state)
+{
+        union xfs_btree_rec     newrec;
+        int                     error;
+        /* build the on-disk form of the record, then push it into the tree */
+        xfs_bmbt_disk_set_allf(&newrec.bmbt, off, bno, len, state);
+        error = xfs_btree_update(cur, &newrec);
+        return error;
+}
+/*
+ * Worst-case count of indirect (btree) blocks that ip's delayed extent of
+ * length "len" could need.
+ *
+ * Each pass divides the block count of the level below by that level's
+ * maximum record count, rounding up, and adds the result to the total.
+ * Once a level needs only a single block, every remaining level is
+ * counted as one block each.
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+        xfs_inode_t     *ip,            /* incore inode pointer */
+        xfs_filblks_t   len)            /* delayed extent length */
+{
+        xfs_mount_t     *mp = ip->i_mount;      /* mount structure */
+        xfs_filblks_t   total = 0;              /* blocks counted so far */
+        int             maxrecs;                /* max records at this level */
+        int             level = 0;              /* btree level number */
+        while (level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) {
+                /* level 0 uses the first m_bmap_dmxr[] slot, the rest the second */
+                maxrecs = mp->m_bmap_dmxr[level == 0 ? 0 : 1];
+                len += maxrecs - 1;
+                do_div(len, maxrecs);
+                total += len;
+                if (len == 1)
+                        return total + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+                                level - 1;
+                level++;
+        }
+        return total;
+}
+/*
+ * Default attribute fork offset for a newly created inode.
+ *
+ * For 256-byte inodes this is the literal area less the space of a
+ * MINABTPTRS-sized bmap root; otherwise it is the space of a bmap root
+ * with 6 * MINABTPTRS pointers.
+ */
+uint
+xfs_default_attroffset(
+        struct xfs_inode        *ip)
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        uint                    offset;
+        if (mp->m_sb.sb_inodesize != 256)
+                offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+        else
+                offset = XFS_LITINO(mp, ip->i_d.di_version) -
+                                XFS_BMDR_SPACE_CALC(MINABTPTRS);
+        ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
+        return offset;
+}
+/*
+ * Reset the inode's di_forkoff when the attribute fork switches from
+ * local to extent format, to make space available for inline data fork
+ * extents.  Nothing is done for other forks or when the data fork is in
+ * DEV, UUID or BTREE format, and di_forkoff is only ever raised, never
+ * lowered.
+ */
+STATIC void
+xfs_bmap_forkoff_reset(
+        xfs_inode_t     *ip,
+        int             whichfork)
+{
+        uint            dfl_forkoff;
+        if (whichfork != XFS_ATTR_FORK)
+                return;
+        if (ip->i_d.di_format == XFS_DINODE_FMT_DEV ||
+            ip->i_d.di_format == XFS_DINODE_FMT_UUID ||
+            ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+                return;
+        /* di_forkoff is the byte offset shifted right by 3 */
+        dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+        if (dfl_forkoff > ip->i_d.di_forkoff)
+                ip->i_d.di_forkoff = dfl_forkoff;
+}
+/*
+ * Debug/sanity checking code
+ */
+/*
+ * Basic validity check of a bmap btree block held in bp.
+ *
+ * Returns 1 when the block carries one of the two bmap magic numbers,
+ * sits at the expected level and has a record count that is non-zero and
+ * no larger than the m_bmap_dmxr[] limit for that level; 0 otherwise.
+ */
+STATIC int
+xfs_bmap_sanity_check(
+        struct xfs_mount        *mp,
+        struct xfs_buf          *bp,
+        int                     level)
+{
+        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+        if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
+            block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
+                return 0;
+        if (be16_to_cpu(block->bb_level) != level)
+                return 0;
+        if (be16_to_cpu(block->bb_numrecs) == 0)
+                return 0;
+        /* index 0 is used for level 0, index 1 for every other level */
+        if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+                return 0;
+        return 1;
+}
+#ifdef DEBUG
+/*
+ * Find the buffer for disk address bno among the buffers the cursor
+ * already holds, or failing that among the buffer log items attached to
+ * the cursor's transaction.  Returns NULL when cur is NULL or nothing
+ * matches.
+ */
+STATIC struct xfs_buf *
+xfs_bmap_get_bp(
+        struct xfs_btree_cur    *cur,
+        xfs_fsblock_t           bno)
+{
+        struct xfs_log_item_desc *lidp;
+        int                     lvl;
+        if (!cur)
+                return NULL;
+        /* the cursor's buffer array is scanned up to its first empty slot */
+        for (lvl = 0; lvl < XFS_BTREE_MAXLEVELS && cur->bc_bufs[lvl]; lvl++) {
+                if (XFS_BUF_ADDR(cur->bc_bufs[lvl]) == bno)
+                        return cur->bc_bufs[lvl];
+        }
+        /* Chase down all the log items to see if the bp is there */
+        list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
+                struct xfs_buf_log_item *bip;
+                bip = (struct xfs_buf_log_item *)lidp->lid_item;
+                if (bip->bli_item.li_type != XFS_LI_BUF)
+                        continue;
+                if (XFS_BUF_ADDR(bip->bli_buf) == bno)
+                        return bip->bli_buf;
+        }
+        return NULL;
+}
+/*
+ * Debug check of one interior bmap btree block: asserts that the keys
+ * are in strictly increasing br_startoff order and panics if any two
+ * block pointers in the block are equal.  "root" selects the inode-root
+ * pointer layout, which is sized by "sz".
+ */
+STATIC void
+xfs_check_block(
+        struct xfs_btree_block  *block,
+        xfs_mount_t             *mp,
+        int                     root,
+        short                   sz)
+{
+        xfs_bmbt_key_t          *prevp = NULL;  /* key of previous record */
+        xfs_bmbt_key_t          *keyp;          /* key of current record */
+        __be64                  *pp;            /* block pointer of record i */
+        __be64                  *thispa;        /* block pointer of record j */
+        int                     dmxr;
+        int                     i;
+        int                     j;
+        ASSERT(be16_to_cpu(block->bb_level) > 0);
+        dmxr = mp->m_bmap_dmxr[0];
+        for (i = 1; i <= xfs_btree_get_numrecs(block); i++) {
+                keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
+                if (prevp) {
+                        ASSERT(be64_to_cpu(prevp->br_startoff) <
+                               be64_to_cpu(keyp->br_startoff));
+                }
+                prevp = keyp;
+                /*
+                 * Compare the block numbers to see if there are dups.
+                 */
+                if (root)
+                        pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+                else
+                        pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
+                for (j = i + 1; j <= be16_to_cpu(block->bb_numrecs); j++) {
+                        if (root)
+                                thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+                        else
+                                thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
+                        if (*thispa != *pp)
+                                continue;
+                        xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
+                                __func__, j, i,
+                                (unsigned long long)be64_to_cpu(*thispa));
+                        panic("%s: ptrs are equal in node\n",
+                                __func__);
+                }
+        }
+}
+/*
+ * Check that the extents for the inode ip are in the right order in all
+ * btree leaves.
+ *
+ * Does nothing unless the fork is in btree format.  Otherwise the tree is
+ * descended along its leftmost pointers to the first leaf, and the leaves
+ * are then walked through their right-sibling pointers, asserting that each
+ * extent ends at or before the start of the next one (including across leaf
+ * block boundaries).
+ *
+ * Each block is first looked up with xfs_bmap_get_bp(); only buffers that
+ * had to be read here with xfs_btree_read_bufl() are released again, which
+ * is what bp_release tracks.  Any read error or failed sanity check ends in
+ * a panic, so this function never reports an error to its caller.
+ */
+STATIC void
+xfs_bmap_check_leaf_extents(
+        xfs_btree_cur_t         *cur,   /* btree cursor or null */
+        xfs_inode_t             *ip,            /* incore inode pointer */
+        int                     whichfork)      /* data or attr fork */
+{
+        struct xfs_btree_block  *block; /* current btree block */
+        xfs_fsblock_t           bno;    /* block # of "block" */
+        xfs_buf_t               *bp;    /* buffer for "block" */
+        int                     error;  /* error return value */
+        xfs_extnum_t            i=0, j; /* index into the extents list */
+        xfs_ifork_t             *ifp;   /* fork structure */
+        int                     level;  /* btree level, for checking */
+        xfs_mount_t             *mp;    /* file system mount structure */
+        __be64                  *pp;    /* pointer to block address */
+        xfs_bmbt_rec_t          *ep;    /* pointer to current extent */
+        xfs_bmbt_rec_t          last = {0, 0}; /* last extent in prev block */
+        xfs_bmbt_rec_t          *nextp; /* pointer to next extent */
+        int                     bp_release = 0; /* set when bp was read here and must be released */
+        if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+                return;
+        }
+        bno = NULLFSBLOCK;
+        mp = ip->i_mount;
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        block = ifp->if_broot;
+        /*
+         * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+         */
+        level = be16_to_cpu(block->bb_level);
+        ASSERT(level > 0);
+        xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
+        pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+        bno = be64_to_cpu(*pp);
+        ASSERT(bno != NULLDFSBNO);
+        ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+        ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+        /*
+         * Go down the tree until leaf level is reached, following the first
+         * pointer (leftmost) at each level.
+         */
+        while (level-- > 0) {
+                /* See if buf is in cur first */
+                bp_release = 0;
+                bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+                if (!bp) {
+                        bp_release = 1;
+                        error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+                                                XFS_BMAP_BTREE_REF,
+                                                &xfs_bmbt_buf_ops);
+                        if (error)
+                                goto error_norelse;
+                }
+                block = XFS_BUF_TO_BLOCK(bp);
+                XFS_WANT_CORRUPTED_GOTO(
+                        xfs_bmap_sanity_check(mp, bp, level),
+                        error0);
+                if (level == 0)
+                        break;  /* leaf reached: bp stays held for the leaf walk below */
+                /*
+                 * Check this block for basic sanity (increasing keys and
+                 * no duplicate blocks).
+                 */
+                xfs_check_block(block, mp, 0, 0);
+                pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+                bno = be64_to_cpu(*pp);
+                XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+                if (bp_release) {
+                        bp_release = 0;
+                        xfs_trans_brelse(NULL, bp);
+                }
+        }
+        /*
+         * Here with bp and block set to the leftmost leaf node in the tree.
+         * From here on i counts the extent records checked so far.
+         */
+        i = 0;
+        /*
+         * Loop over all leaf nodes checking that all extents are in the right order.
+         */
+        for (;;) {
+                xfs_fsblock_t   nextbno;
+                xfs_extnum_t    num_recs;
+                num_recs = xfs_btree_get_numrecs(block);
+                /*
+                 * Remember the right sibling, if any, before the current
+                 * buffer is released below.  No read-ahead is issued here.
+                 */
+                nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+                /*
+                 * Check all the extents to make sure they are OK.
+                 * If we had a previous block, the last entry should
+                 * conform with the first entry in this one.
+                 */
+                ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+                if (i) {
+                        ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+                               xfs_bmbt_disk_get_blockcount(&last) <=
+                               xfs_bmbt_disk_get_startoff(ep));
+                }
+                for (j = 1; j < num_recs; j++) {
+                        nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+                        ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+                               xfs_bmbt_disk_get_blockcount(ep) <=
+                               xfs_bmbt_disk_get_startoff(nextp));
+                        ep = nextp;
+                }
+                last = *ep;     /* copy out: the buffer may be released just below */
+                i += num_recs;
+                if (bp_release) {
+                        bp_release = 0;
+                        xfs_trans_brelse(NULL, bp);
+                }
+                bno = nextbno;
+                /*
+                 * If we've reached the end, stop.
+                 */
+                if (bno == NULLFSBLOCK)
+                        break;
+                bp_release = 0;
+                bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+                if (!bp) {
+                        bp_release = 1;
+                        error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+                                                XFS_BMAP_BTREE_REF,
+                                                &xfs_bmbt_buf_ops);
+                        if (error)
+                                goto error_norelse;
+                }
+                block = XFS_BUF_TO_BLOCK(bp);
+        }
+        if (bp_release) {       /* bp_release is already 0 whenever the loop exits via break */
+                bp_release = 0;
+                xfs_trans_brelse(NULL, bp);
+        }
+        return;
+error0:
+        xfs_warn(mp, "%s: at error0", __func__);
+        if (bp_release)
+                xfs_trans_brelse(NULL, bp);
+error_norelse:
+        xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+                __func__, i);
+        panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
+        return;
+}
+/*
+ * Add bmap trace insert entries for all the contents of the extent records.
+ *
+ * ip is the incore inode, cnt the number of extent records in the fork
+ * selected by whichfork (asserted to match the fork's if_bytes), and
+ * caller_ip the address recorded as the origin of the trace events.
+ *
+ * The tracepoint is handed the BMAP_* state flags (BMAP_ATTRFORK for the
+ * attribute fork, 0 otherwise), like the other bmap trace calls in this
+ * file, rather than the raw fork index.
+ */
+void
+xfs_bmap_trace_exlist(
+        xfs_inode_t     *ip,            /* incore inode pointer */
+        xfs_extnum_t    cnt,            /* count of entries in the list */
+        int             whichfork,      /* data or attr fork */
+        unsigned long   caller_ip)
+{
+        xfs_extnum_t    idx;            /* extent record index */
+        xfs_ifork_t     *ifp;           /* inode fork pointer */
+        int             state = 0;      /* BMAP_* flags for the trace event */
+        if (whichfork == XFS_ATTR_FORK)
+                state |= BMAP_ATTRFORK;
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+        for (idx = 0; idx < cnt; idx++)
+                trace_xfs_extlist(ip, idx, state, caller_ip);
+}
+/*
+ * Debug check of the mappings bmapi is about to hand back.  Every returned
+ * irec must be non-empty, real (neither delayed nor a hole), in a known
+ * extent state and contiguous with the one before it.  Unless
+ * XFS_BMAPI_ENTIRE was requested each mapping must also lie wholly inside
+ * [bno, bno + len); with XFS_BMAPI_ENTIRE it only has to overlap that range.
+ */
+STATIC void
+xfs_bmap_validate_ret(
+        xfs_fileoff_t           bno,
+        xfs_filblks_t           len,
+        int                     flags,
+        xfs_bmbt_irec_t         *mval,
+        int                     nmap,
+        int                     ret_nmap)
+{
+        xfs_bmbt_irec_t         *map;           /* mapping being checked */
+        xfs_bmbt_irec_t         *prev = NULL;   /* mapping before it, if any */
+        int                     n;              /* mappings checked so far */
+        ASSERT(ret_nmap <= nmap);
+        for (n = 0, map = mval; n < ret_nmap; prev = map, n++, map++) {
+                ASSERT(map->br_blockcount > 0);
+                if (flags & XFS_BMAPI_ENTIRE) {
+                        ASSERT(map->br_startoff < bno + len);
+                        ASSERT(map->br_startoff + map->br_blockcount >
+                               bno);
+                } else {
+                        ASSERT(map->br_startoff >= bno);
+                        ASSERT(map->br_blockcount <= len);
+                        ASSERT(map->br_startoff + map->br_blockcount <=
+                               bno + len);
+                }
+                ASSERT(prev == NULL ||
+                       prev->br_startoff + prev->br_blockcount ==
+                       map->br_startoff);
+                ASSERT(map->br_startblock != DELAYSTARTBLOCK &&
+                       map->br_startblock != HOLESTARTBLOCK);
+                ASSERT(map->br_state == XFS_EXT_NORM ||
+                       map->br_state == XFS_EXT_UNWRITTEN);
+        }
+}
+#else
+#define xfs_bmap_check_leaf_extents(cur, ip, whichfork)         do { } while (0)
+#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
+#endif /* DEBUG */
+/*
+ * bmap free list manipulation functions
+ */
+/*
+ * Queue the extent [bno, bno + len) on flist so that it is freed at
+ * transaction end.  The list is kept sorted by ascending start block; the
+ * new item goes in front of the first item whose start block is not
+ * smaller than bno.
+ */
+void
+xfs_bmap_add_free(
+        xfs_fsblock_t           bno,            /* fs block number of extent */
+        xfs_filblks_t           len,            /* length of extent */
+        xfs_bmap_free_t         *flist,         /* list of extents */
+        xfs_mount_t             *mp)            /* mount point structure */
+{
+        xfs_bmap_free_item_t    *item;          /* item being inserted */
+        xfs_bmap_free_item_t    **linkp;        /* link that will point at it */
+#ifdef DEBUG
+        xfs_agnumber_t          agno = XFS_FSB_TO_AGNO(mp, bno);
+        xfs_agblock_t           agbno = XFS_FSB_TO_AGBNO(mp, bno);
+        ASSERT(bno != NULLFSBLOCK);
+        ASSERT(len > 0);
+        ASSERT(len <= MAXEXTLEN);
+        ASSERT(!isnullstartblock(bno));
+        ASSERT(agno < mp->m_sb.sb_agcount);
+        ASSERT(agbno < mp->m_sb.sb_agblocks);
+        ASSERT(len < mp->m_sb.sb_agblocks);
+        ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+        ASSERT(xfs_bmap_free_item_zone != NULL);
+        item = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+        item->xbfi_startblock = bno;
+        item->xbfi_blockcount = (xfs_extlen_t)len;
+        /* Walk the links until one points at the insertion position. */
+        linkp = &flist->xbf_first;
+        while (*linkp != NULL && (*linkp)->xbfi_startblock < bno)
+                linkp = &(*linkp)->xbfi_next;
+        item->xbfi_next = *linkp;
+        *linkp = item;
+        flist->xbf_count++;
+}
+/*
+ * Unlink "free" from the free item list and return it to its zone.  "prev"
+ * is the item in front of it, or NULL when "free" is the list head.
+ */
+void
+xfs_bmap_del_free(
+        xfs_bmap_free_t         *flist, /* free item list header */
+        xfs_bmap_free_item_t    *prev,  /* previous item on list, if any */
+        xfs_bmap_free_item_t    *free)  /* list item to be freed */
+{
+        xfs_bmap_free_item_t    *successor = free->xbfi_next;
+        if (prev == NULL)
+                flist->xbf_first = successor;
+        else
+                prev->xbfi_next = successor;
+        flist->xbf_count--;
+        kmem_zone_free(xfs_bmap_free_item_zone, free);
+}
+/*
+ * Throw away every item still queued on the free list.
+ */
+void
+xfs_bmap_cancel(
+        xfs_bmap_free_t         *flist) /* list of bmap_free_items */
+{
+        xfs_bmap_free_item_t    *item;          /* item being discarded */
+        xfs_bmap_free_item_t    *following;     /* saved before item is freed */
+        if (flist->xbf_count == 0)
+                return;
+        ASSERT(flist->xbf_first != NULL);
+        item = flist->xbf_first;
+        while (item != NULL) {
+                following = item->xbfi_next;
+                xfs_bmap_del_free(flist, NULL, item);
+                item = following;
+        }
+        ASSERT(flist->xbf_count == 0);
+}
+/*
+ * Inode fork format manipulation functions
+ */
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the file extents are already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ *
+ * The caller must pass a fork that is in btree format with its extents
+ * read in (XFS_IFEXTENTS) and whose root is at level 1 with exactly one
+ * record; all of this is only ASSERTed.  On success 0 is returned and
+ * *logflagsp is set to XFS_ILOG_CORE plus the fork's extent flag.  If
+ * checking or reading the child block fails the error is returned and
+ * *logflagsp is left at 0.
+ */
+STATIC int                              /* error */
+xfs_bmap_btree_to_extents(
+        xfs_trans_t             *tp,    /* transaction pointer */
+        xfs_inode_t             *ip,    /* incore inode pointer */
+        xfs_btree_cur_t         *cur,   /* btree cursor */
+        int                     *logflagsp, /* inode logging flags */
+        int                     whichfork)  /* data or attr fork */
+{
+        /* REFERENCED */
+        struct xfs_btree_block  *cblock;/* child btree block */
+        xfs_fsblock_t           cbno;   /* child block number */
+        xfs_buf_t               *cbp;   /* child block's buffer */
+        int                     error;  /* error return value */
+        xfs_ifork_t             *ifp;   /* inode fork data */
+        xfs_mount_t             *mp;    /* mount point structure */
+        __be64                  *pp;    /* ptr to block address */
+        struct xfs_btree_block  *rblock;/* root btree block */
+        mp = ip->i_mount;
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+        rblock = ifp->if_broot;
+        ASSERT(be16_to_cpu(rblock->bb_level) == 1);
+        ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
+        ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+        pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+        cbno = be64_to_cpu(*pp);        /* the root's only child: the single leaf */
+        *logflagsp = 0;                 /* nothing to log if we bail out below */
+#ifdef DEBUG
+        if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
+                return error;
+#endif
+        error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+                                &xfs_bmbt_buf_ops);
+        if (error)
+                return error;
+        cblock = XFS_BUF_TO_BLOCK(cbp);
+        if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
+                return error;
+        xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);       /* queue the leaf block for freeing */
+        ip->i_d.di_nblocks--;           /* the inode owns one block less */
+        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+        xfs_trans_binval(tp, cbp);
+        if (cur->bc_bufs[0] == cbp)     /* don't leave the cursor pointing at the invalidated buffer */
+                cur->bc_bufs[0] = NULL;
+        xfs_iroot_realloc(ip, -1, whichfork);   /* give up the incore root */
+        ASSERT(ifp->if_broot == NULL);
+        ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
+        XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+        *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+        return 0;
+}
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ *
+ * On success 0 is returned, *curp is set to the new cursor (it must be NULL
+ * on entry), *firstblock is set to the allocated child block and *logflagsp
+ * to the inode fields that need logging.
+ *
+ * On failure the incore root is given back, the fork is switched back to
+ * extents format, the cursor is torn down, *logflagsp is left at 0 and a
+ * negative errno is returned.
+ */
+STATIC int                                      /* error */
+xfs_bmap_extents_to_btree(
+        xfs_trans_t             *tp,            /* transaction pointer */
+        xfs_inode_t             *ip,            /* incore inode pointer */
+        xfs_fsblock_t           *firstblock,    /* first-block-allocated */
+        xfs_bmap_free_t         *flist,         /* blocks freed in xaction */
+        xfs_btree_cur_t         **curp,         /* cursor returned to caller */
+        int                     wasdel,         /* converting a delayed alloc */
+        int                     *logflagsp,     /* inode logging flags */
+        int                     whichfork)      /* data or attr fork */
+{
+        struct xfs_btree_block  *ablock;        /* allocated (child) bt block */
+        xfs_buf_t               *abp;           /* buffer for ablock */
+        xfs_alloc_arg_t         args;           /* allocation arguments */
+        xfs_bmbt_rec_t          *arp;           /* child record pointer */
+        struct xfs_btree_block  *block;         /* btree root block */
+        xfs_btree_cur_t         *cur;           /* bmap btree cursor */
+        xfs_bmbt_rec_host_t     *ep;            /* extent record pointer */
+        int                     error;          /* error return value */
+        xfs_extnum_t            i, cnt;         /* extent record index */
+        xfs_ifork_t             *ifp;           /* inode fork pointer */
+        xfs_bmbt_key_t          *kp;            /* root block key pointer */
+        xfs_mount_t             *mp;            /* mount structure */
+        xfs_extnum_t            nextents;       /* number of file extents */
+        xfs_bmbt_ptr_t          *pp;            /* root block address pointer */
+        mp = ip->i_mount;
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+        /*
+         * Make space in the inode incore.
+         */
+        xfs_iroot_realloc(ip, 1, whichfork);
+        ifp->if_flags |= XFS_IFBROOT;
+        /*
+         * Fill in the root.
+         */
+        block = ifp->if_broot;
+        if (xfs_sb_version_hascrc(&mp->m_sb))
+                xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+                                 XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
+                                 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+        else
+                xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+                                 XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
+                                 XFS_BTREE_LONG_PTRS);
+        /*
+         * Need a cursor.  Can't allocate until bb_level is filled in.
+         */
+        cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+        cur->bc_private.b.firstblock = *firstblock;
+        cur->bc_private.b.flist = flist;
+        cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+        /*
+         * Convert to a btree with two levels, one record in root.
+         */
+        XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+        memset(&args, 0, sizeof(args));
+        args.tp = tp;
+        args.mp = mp;
+        args.firstblock = *firstblock;
+        if (*firstblock == NULLFSBLOCK) {
+                args.type = XFS_ALLOCTYPE_START_BNO;
+                args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+        } else if (flist->xbf_low) {
+                args.type = XFS_ALLOCTYPE_START_BNO;
+                args.fsbno = *firstblock;
+        } else {
+                args.type = XFS_ALLOCTYPE_NEAR_BNO;
+                args.fsbno = *firstblock;
+        }
+        args.minlen = args.maxlen = args.prod = 1;
+        args.wasdel = wasdel;
+        *logflagsp = 0;
+        error = xfs_alloc_vextent(&args);
+        if (error)
+                goto error0;
+        /*
+         * Allocation can't fail, the space was reserved.  Still, refuse to
+         * carry on with a null block number in case the ASSERT is compiled
+         * out, rather than building a root that points at it.
+         */
+        ASSERT(args.fsbno != NULLFSBLOCK);
+        if (args.fsbno == NULLFSBLOCK) {
+                error = -ENOSPC;
+                goto error0;
+        }
+        ASSERT(*firstblock == NULLFSBLOCK ||
+               args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+               (flist->xbf_low &&
+                args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+        /*
+         * Get the child buffer before touching any block accounting, so
+         * that a failure here leaves nothing to unwind except the root.
+         */
+        abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+        if (!abp) {
+                error = -ENOMEM;
+                goto error0;
+        }
+        *firstblock = cur->bc_private.b.firstblock = args.fsbno;
+        cur->bc_private.b.allocated++;
+        ip->i_d.di_nblocks++;
+        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+        /*
+         * Fill in the child block.
+         */
+        abp->b_ops = &xfs_bmbt_buf_ops;
+        ablock = XFS_BUF_TO_BLOCK(abp);
+        if (xfs_sb_version_hascrc(&mp->m_sb))
+                xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+                                XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+                                XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+        else
+                xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+                                XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+                                XFS_BTREE_LONG_PTRS);
+        /*
+         * Copy every real extent into the child block; null-startblock
+         * (delayed allocation) records stay incore only.
+         */
+        arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+        for (cnt = i = 0; i < nextents; i++) {
+                ep = xfs_iext_get_ext(ifp, i);
+                if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
+                        arp->l0 = cpu_to_be64(ep->l0);
+                        arp->l1 = cpu_to_be64(ep->l1);
+                        arp++;
+                        cnt++;
+                }
+        }
+        ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
+        xfs_btree_set_numrecs(ablock, cnt);
+        /*
+         * Fill in the root key and pointer.
+         */
+        kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+        arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+        kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
+        pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+                                                be16_to_cpu(block->bb_level)));
+        *pp = cpu_to_be64(args.fsbno);
+        /*
+         * Do all this logging at the end so that
+         * the root is at the right level.
+         */
+        xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+        xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+        ASSERT(*curp == NULL);
+        *curp = cur;
+        *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
+        return 0;
+error0:
+        /*
+         * Undo the conversion: free the incore root and put the fork back
+         * into extents format so it does not claim a btree root that no
+         * longer exists.
+         */
+        xfs_iroot_realloc(ip, -1, whichfork);
+        XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+        return error;
+}
+/*
+ * Convert a local file to an extents file.
+ * Regular-file data forks must not come through here: their contents would
+ * have to be logged to stay consistent (the bmap-level changes themselves
+ * would be fine).
+ *
+ * This helper handles the bookkeeping for a local fork that holds no bytes:
+ * it resets the fork offset and flips the fork from inline/local to
+ * extents format.
+ */
+void
+xfs_bmap_local_to_extents_empty(
+        struct xfs_inode        *ip,
+        int                     whichfork)
+{
+        struct xfs_ifork        *ifp;
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+        ASSERT(ifp->if_bytes == 0);
+        ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+        xfs_bmap_forkoff_reset(ip, whichfork);
+        /* The fork no longer holds inline data; it now holds an extent list. */
+        ifp->if_flags = (ifp->if_flags & ~XFS_IFINLINE) | XFS_IFEXTENTS;
+        XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+}
+/*
+ * Move the inline contents of a local-format fork into one newly allocated
+ * block and turn the fork into an extents-format fork with a single
+ * one-block extent at file offset 0.  init_fn is called to initialise the
+ * new buffer and copy the data into it.  A fork with no inline bytes is
+ * simply switched to extents format.  *logflagsp always receives the inode
+ * logging flags (0 when the allocation fails); the return value is 0 or the
+ * error from the allocator.
+ */
+STATIC int                              /* error */
+xfs_bmap_local_to_extents(
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_inode_t     *ip,            /* incore inode pointer */
+        xfs_fsblock_t   *firstblock,    /* first block allocated in xaction */
+        xfs_extlen_t    total,          /* total blocks needed by transaction */
+        int             *logflagsp,     /* inode logging flags */
+        int             whichfork,
+        void            (*init_fn)(struct xfs_trans *tp,
+                                   struct xfs_buf *bp,
+                                   struct xfs_inode *ip,
+                                   struct xfs_ifork *ifp))
+{
+        xfs_alloc_arg_t args;           /* allocation arguments */
+        xfs_buf_t       *bp;            /* buffer for extent block */
+        xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
+        xfs_ifork_t     *ifp;           /* inode fork pointer */
+        int             error = 0;
+        int             flags = 0;      /* logging flags returned */
+        int             state;          /* BMAP_* flags for the tracepoint */
+        /*
+         * We don't want to deal with the case of keeping inode data inline yet.
+         * So sending the data fork of a regular inode is invalid.
+         */
+        ASSERT(!S_ISREG(ip->i_d.di_mode) || whichfork != XFS_DATA_FORK);
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+        if (ifp->if_bytes == 0) {
+                /* Nothing to copy out, only the format changes. */
+                xfs_bmap_local_to_extents_empty(ip, whichfork);
+                flags = XFS_ILOG_CORE;
+                goto done;
+        }
+        ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
+                                                                XFS_IFINLINE);
+        /*
+         * Allocate a block.  We know we need only one, since the
+         * file currently fits in an inode.
+         */
+        memset(&args, 0, sizeof(args));
+        args.tp = tp;
+        args.mp = ip->i_mount;
+        args.firstblock = *firstblock;
+        args.total = total;
+        args.minlen = 1;
+        args.maxlen = 1;
+        args.prod = 1;
+        if (*firstblock != NULLFSBLOCK) {
+                args.type = XFS_ALLOCTYPE_NEAR_BNO;
+                args.fsbno = *firstblock;
+        } else {
+                args.type = XFS_ALLOCTYPE_START_BNO;
+                args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+        }
+        error = xfs_alloc_vextent(&args);
+        if (error)
+                goto done;
+        /* Can't fail, the space was reserved. */
+        ASSERT(args.fsbno != NULLFSBLOCK);
+        ASSERT(args.len == 1);
+        *firstblock = args.fsbno;
+        bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+        /* initialise the block and copy the data */
+        init_fn(tp, bp, ip, ifp);
+        /* account for the change in fork size and log everything */
+        xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+        xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+        xfs_bmap_local_to_extents_empty(ip, whichfork);
+        /* Install the single extent that maps the new block. */
+        xfs_iext_add(ifp, 0, 1);
+        ep = xfs_iext_get_ext(ifp, 0);
+        xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+        state = whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0;
+        trace_xfs_bmap_post_update(ip, 0, state, _THIS_IP_);
+        XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+        ip->i_d.di_nblocks = 1;
+        xfs_trans_mod_dquot_byino(tp, ip,
+                XFS_TRANS_DQ_BCOUNT, 1L);
+        flags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+done:
+        *logflagsp = flags;
+        return error;
+}
+/*
+ * Called from xfs_bmap_add_attrfork to handle btree format files.
+ *
+ * If the incore btree root still fits in the data fork area only
+ * XFS_ILOG_DBROOT is added to *flags.  Otherwise the root is pushed out
+ * into a new block with xfs_btree_new_iroot(); -ENOSPC is returned when
+ * that reports that no new root was made.
+ */
+STATIC int                                      /* error */
+xfs_bmap_add_attrfork_btree(
+        xfs_trans_t             *tp,            /* transaction pointer */
+        xfs_inode_t             *ip,            /* incore inode pointer */
+        xfs_fsblock_t           *firstblock,    /* first block allocated */
+        xfs_bmap_free_t         *flist,         /* blocks to free at commit */
+        int                     *flags)         /* inode logging flags */
+{
+        xfs_mount_t             *mp = ip->i_mount; /* file system mount struct */
+        xfs_btree_cur_t         *cur;           /* btree cursor */
+        int                     error;          /* error return value */
+        int                     stat;           /* lookup / newroot status */
+        if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip)) {
+                /* The root still fits where it is. */
+                *flags |= XFS_ILOG_DBROOT;
+                return 0;
+        }
+        cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+        cur->bc_private.b.flist = flist;
+        cur->bc_private.b.firstblock = *firstblock;
+        error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat);
+        if (error)
+                goto error0;
+        /* must be at least one entry */
+        XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
+        error = xfs_btree_new_iroot(cur, flags, &stat);
+        if (error)
+                goto error0;
+        if (stat == 0) {
+                xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+                return -ENOSPC;
+        }
+        *firstblock = cur->bc_private.b.firstblock;
+        cur->bc_private.b.allocated = 0;
+        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+        return 0;
+error0:
+        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+        return error;
+}
+/*
+ * Called from xfs_bmap_add_attrfork to handle extents format files.
+ *
+ * Nothing happens while the extent records still fit in the data fork
+ * area; otherwise the data fork is converted to btree format.  Any cursor
+ * the conversion hands back is deleted here before returning.
+ */
+STATIC int                                      /* error */
+xfs_bmap_add_attrfork_extents(
+        xfs_trans_t             *tp,            /* transaction pointer */
+        xfs_inode_t             *ip,            /* incore inode pointer */
+        xfs_fsblock_t           *firstblock,    /* first block allocated */
+        xfs_bmap_free_t         *flist,         /* blocks to free at commit */
+        int                     *flags)         /* inode logging flags */
+{
+        xfs_btree_cur_t         *cur = NULL;    /* bmap btree cursor */
+        int                     error;          /* error return value */
+        if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
+                return 0;
+        error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
+                flags, XFS_DATA_FORK);
+        if (cur == NULL)
+                return error;
+        cur->bc_private.b.allocated = 0;
+        if (error)
+                xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+        else
+                xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+        return error;
+}
+/*
+ * Called from xfs_bmap_add_attrfork to handle local format files. Each
+ * different data fork content type needs a different callout to do the
+ * conversion. Some are basic and only require special block initialisation
+ * callouts for the data formatting, others (directories) are so specialised
+ * they handle everything themselves.
+ *
+ * Nothing is done while the inline data still fits in the data fork area.
+ *
+ * XXX (dgc): investigate whether directory conversion can use the generic
+ * formatting callout. It should be possible - it's just a very complex
+ * formatter.
+ */
+STATIC int                                      /* error */
+xfs_bmap_add_attrfork_local(
+        xfs_trans_t             *tp,            /* transaction pointer */
+        xfs_inode_t             *ip,            /* incore inode pointer */
+        xfs_fsblock_t           *firstblock,    /* first block allocated */
+        xfs_bmap_free_t         *flist,         /* blocks to free at commit */
+        int                     *flags)         /* inode logging flags */
+{
+        xfs_da_args_t           dargs;          /* args for dir/attr code */
+        if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
+                return 0;
+        /* Symlinks go through the generic local-to-extents conversion. */
+        if (S_ISLNK(ip->i_d.di_mode))
+                return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
+                                                 flags, XFS_DATA_FORK,
+                                                 xfs_symlink_local_to_remote);
+        if (!S_ISDIR(ip->i_d.di_mode)) {
+                /* should only be called for types that support local format data */
+                ASSERT(0);
+                return -EFSCORRUPTED;
+        }
+        /* Directories convert themselves from shortform to block form. */
+        memset(&dargs, 0, sizeof(dargs));
+        dargs.geo = ip->i_mount->m_dir_geo;
+        dargs.dp = ip;
+        dargs.firstblock = firstblock;
+        dargs.flist = flist;
+        dargs.total = dargs.geo->fsbcount;
+        dargs.whichfork = XFS_DATA_FORK;
+        dargs.trans = tp;
+        return xfs_dir2_sf_to_block(&dargs);
+}
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ *
+ * Allocates an XFS_TRANS_ADDAFORK transaction, reserves log space and quota
+ * for XFS_ADDAFORK_SPACE_RES(mp) blocks and holds XFS_ILOCK_EXCL on the inode
+ * until the transaction is committed or cancelled.  di_forkoff is chosen
+ * according to the data fork format, an in-core attr fork (ip->i_afp) is
+ * allocated, and the per-format xfs_bmap_add_attrfork_{local,extents,btree}
+ * helper is called for the data fork.  If the superblock lacks the attr (or,
+ * for version 2, attr2) feature bit it is set and logged via xfs_mod_sb().
+ *
+ * "size" is passed to xfs_attr_shortform_bytesfit() when picking the fork
+ * offset.  A non-zero "rsvd" sets XFS_TRANS_RESERVE on the transaction and
+ * adds XFS_QMOPT_FORCE_RES to the quota reservation flags.
+ *
+ * Returns 0 on success or a negative error code.  If the inode already has
+ * an attr fork once the ilock is held, the transaction is cancelled and 0
+ * is returned.
+ *
+ * NOTE(review): on the error paths taken after ip->i_afp has been allocated,
+ * this function does not undo the i_afp allocation or the di_forkoff and
+ * di_aformat changes - confirm that cancelling with XFS_TRANS_ABORT is
+ * expected to cover that.
+ */
+int                                             /* error code */
+xfs_bmap_add_attrfork(
+        xfs_inode_t             *ip,            /* incore inode pointer */
+        int                     size,           /* space new attribute needs */
+        int                     rsvd)           /* xact may use reserved blks */
+{
+        xfs_fsblock_t           firstblock;     /* 1st block/ag allocated */
+        xfs_bmap_free_t         flist;          /* freed extent records */
+        xfs_mount_t             *mp;            /* mount structure */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        int                     blks;           /* space reservation */
+        int                     version = 1;    /* superblock attr version */
+        int                     committed;      /* xaction was committed */
+        int                     logflags;       /* logging flags */
+        int                     error;          /* error return value */
+        int                     cancel_flags = 0;       /* for xfs_trans_cancel() */
+        ASSERT(XFS_IFORK_Q(ip) == 0);
+        mp = ip->i_mount;
+        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+        tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+        blks = XFS_ADDAFORK_SPACE_RES(mp);
+        if (rsvd)
+                tp->t_flags |= XFS_TRANS_RESERVE;
+        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
+        if (error) {
+                xfs_trans_cancel(tp, 0);        /* ilock not taken yet */
+                return error;
+        }
+        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
+                        XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+                        XFS_QMOPT_RES_REGBLKS);
+        if (error)
+                goto trans_cancel;
+        cancel_flags |= XFS_TRANS_ABORT;        /* used by every later cancel */
+        if (XFS_IFORK_Q(ip))                    /* fork already there; error is 0 */
+                goto trans_cancel;
+        if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+                /*
+                 * For inodes coming from pre-6.2 filesystems.
+                 */
+                ASSERT(ip->i_d.di_aformat == 0);
+                ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+        }
+        ASSERT(ip->i_d.di_anextents == 0);
+        xfs_trans_ijoin(tp, ip, 0);
+        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+        switch (ip->i_d.di_format) {            /* pick di_forkoff (8-byte units) */
+        case XFS_DINODE_FMT_DEV:
+                ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+                break;
+        case XFS_DINODE_FMT_UUID:
+                ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
+                break;
+        case XFS_DINODE_FMT_LOCAL:
+        case XFS_DINODE_FMT_EXTENTS:
+        case XFS_DINODE_FMT_BTREE:
+                ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+                if (!ip->i_d.di_forkoff)        /* 0: fall back to the default */
+                        ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+                else if (mp->m_flags & XFS_MOUNT_ATTR2)
+                        version = 2;
+                break;
+        default:
+                ASSERT(0);
+                error = -EINVAL;
+                goto trans_cancel;
+        }
+        ASSERT(ip->i_afp == NULL);
+        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+        ip->i_afp->if_flags = XFS_IFEXTENTS;
+        logflags = 0;
+        xfs_bmap_init(&flist, &firstblock);
+        switch (ip->i_d.di_format) {            /* per-format data fork helper */
+        case XFS_DINODE_FMT_LOCAL:
+                error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+                        &logflags);
+                break;
+        case XFS_DINODE_FMT_EXTENTS:
+                error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
+                        &flist, &logflags);
+                break;
+        case XFS_DINODE_FMT_BTREE:
+                error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+                        &logflags);
+                break;
+        default:                                /* DEV and UUID: no helper */
+                error = 0;
+                break;
+        }
+        if (logflags)                           /* logged even if helper failed */
+                xfs_trans_log_inode(tp, ip, logflags);
+        if (error)
+                goto bmap_cancel;
+        if (!xfs_sb_version_hasattr(&mp->m_sb) ||
+           (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
+                __int64_t sbfields = 0;         /* superblock fields to log */
+                spin_lock(&mp->m_sb_lock);      /* recheck under the lock */
+                if (!xfs_sb_version_hasattr(&mp->m_sb)) {
+                        xfs_sb_version_addattr(&mp->m_sb);
+                        sbfields |= XFS_SB_VERSIONNUM;
+                }
+                if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
+                        xfs_sb_version_addattr2(&mp->m_sb);
+                        sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+                }
+                if (sbfields) {
+                        spin_unlock(&mp->m_sb_lock);    /* unlock before xfs_mod_sb() */
+                        xfs_mod_sb(tp, sbfields);
+                } else
+                        spin_unlock(&mp->m_sb_lock);
+        }
+        error = xfs_bmap_finish(&tp, &flist, &committed);
+        if (error)
+                goto bmap_cancel;
+        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        return error;
+bmap_cancel:
+        xfs_bmap_cancel(&flist);
+trans_cancel:
+        xfs_trans_cancel(tp, cancel_flags);
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        return error;
+}
+/*
+ * Internal and external extent tree search functions.
+ */
+/*
+ * Read in the extents to if_extents.
+ * All inode fields are set up by caller, we just traverse the btree
+ * and copy the records in. If the file system cannot contain unwritten
+ * extents, the records are checked for no "state" flags.
+ *
+ * The walk starts at the in-inode root (ifp->if_broot), follows the first
+ * pointer of each level down to the leftmost leaf, and then follows the
+ * bb_rightsib links across the leaf level.  Each on-disk record is converted
+ * to host byte order into the slot returned by xfs_iext_get_ext().
+ *
+ * Attribute fork records always get the no-state check; for the data fork
+ * the decision comes from XFS_EXTFMT_INODE(ip).
+ *
+ * Returns 0 on success, the error from xfs_btree_read_bufl() if a block
+ * cannot be read, or -EFSCORRUPTED if a block fails a sanity check, the
+ * leaves hold more records than ifp->if_bytes has room for, or a record
+ * fails the no-state check.
+ */
+int                                     /* error */
+xfs_bmap_read_extents(
+        xfs_trans_t             *tp,    /* transaction pointer */
+        xfs_inode_t             *ip,    /* incore inode */
+        int                     whichfork) /* data or attr fork */
+{
+        struct xfs_btree_block  *block; /* current btree block */
+        xfs_fsblock_t           bno;    /* block # of "block" */
+        xfs_buf_t               *bp;    /* buffer for "block" */
+        int                     error;  /* error return value */
+        xfs_exntfmt_t           exntf;  /* XFS_EXTFMT_NOSTATE, if checking */
+        xfs_extnum_t            i, j;   /* index into the extents list */
+        xfs_ifork_t             *ifp;   /* fork structure */
+        int                     level;  /* btree level, for checking */
+        xfs_mount_t             *mp;    /* file system mount structure */
+        __be64                  *pp;    /* pointer to block address */
+        /* REFERENCED */
+        xfs_extnum_t            room;   /* number of entries there's room for */
+        bno = NULLFSBLOCK;
+        mp = ip->i_mount;
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+                                        XFS_EXTFMT_INODE(ip);
+        block = ifp->if_broot;
+        /*
+         * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+         */
+        level = be16_to_cpu(block->bb_level);
+        ASSERT(level > 0);
+        pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+        bno = be64_to_cpu(*pp);
+        ASSERT(bno != NULLDFSBNO);
+        ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+        ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+        /*
+         * Go down the tree until leaf level is reached, following the first
+         * pointer (leftmost) at each level.
+         *
+         * The loop test decrements level before the block is read, so inside
+         * the body level is the level of the block just read.  Interior
+         * buffers are released before descending; the level 0 buffer is kept
+         * (the loop breaks before the release) for the leaf loop below.
+         */
+        while (level-- > 0) {
+                error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+                                XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+                if (error)
+                        return error;
+                block = XFS_BUF_TO_BLOCK(bp);
+                XFS_WANT_CORRUPTED_GOTO(
+                        xfs_bmap_sanity_check(mp, bp, level),
+                        error0);
+                if (level == 0)
+                        break;
+                pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+                bno = be64_to_cpu(*pp);
+                XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+                xfs_trans_brelse(tp, bp);
+        }
+        /*
+         * Here with bp and block set to the leftmost leaf node in the tree.
+         * room is the number of record slots that ifp->if_bytes accounts for.
+         */
+        room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+        i = 0;
+        /*
+         * Loop over all leaf nodes.  Copy information to the extent records.
+         * i counts the records copied so far across all leaves.
+         */
+        for (;;) {
+                xfs_bmbt_rec_t  *frp;           /* on-disk record being copied */
+                xfs_fsblock_t   nextbno;        /* right sibling of this leaf */
+                xfs_extnum_t    num_recs;       /* records in this leaf */
+                xfs_extnum_t    start;          /* value of i at start of leaf */
+                num_recs = xfs_btree_get_numrecs(block);
+                if (unlikely(i + num_recs > room)) {
+                        ASSERT(i + num_recs <= room);   /* always false here */
+                        xfs_warn(ip->i_mount,
+                                "corrupt dinode %Lu, (btree extents).",
+                                (unsigned long long) ip->i_ino);
+                        XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
+                                XFS_ERRLEVEL_LOW, ip->i_mount, block);
+                        goto error0;
+                }
+                XFS_WANT_CORRUPTED_GOTO(
+                        xfs_bmap_sanity_check(mp, bp, 0),
+                        error0);
+                /*
+                 * Read-ahead the next leaf block, if any.
+                 */
+                nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+                if (nextbno != NULLFSBLOCK)
+                        xfs_btree_reada_bufl(mp, nextbno, 1,
+                                             &xfs_bmbt_buf_ops);
+                /*
+                 * Copy records into the extent records.
+                 * The two 64-bit words of each record are byte-swapped from
+                 * big-endian to host order.
+                 */
+                frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+                start = i;
+                for (j = 0; j < num_recs; j++, i++, frp++) {
+                        xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
+                        trp->l0 = be64_to_cpu(frp->l0);
+                        trp->l1 = be64_to_cpu(frp->l1);
+                }
+                if (exntf == XFS_EXTFMT_NOSTATE) {
+                        /*
+                         * Check all attribute bmap btree records and
+                         * any "older" data bmap btree records for a
+                         * set bit in the "extent flag" position.
+                         * Only the records copied from this leaf
+                         * (start .. start + num_recs - 1) are checked.
+                         */
+                        if (unlikely(xfs_check_nostate_extents(ifp,
+                                        start, num_recs))) {
+                                XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
+                                                 XFS_ERRLEVEL_LOW,
+                                                 ip->i_mount);
+                                goto error0;
+                        }
+                }
+                xfs_trans_brelse(tp, bp);       /* done with this leaf */
+                bno = nextbno;
+                /*
+                 * If we've reached the end, stop.
+                 */
+                if (bno == NULLFSBLOCK)
+                        break;
+                error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+                                XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+                if (error)
+                        return error;           /* previous bp already released */
+                block = XFS_BUF_TO_BLOCK(bp);
+        }
+        ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+        ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+        XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
+        return 0;
+error0:                                 /* reached only while bp is still held */
+        xfs_trans_brelse(tp, bp);
+        return -EFSCORRUPTED;
+}
+/*
+ * Look up file block bno in the in-core extent list of a fork.
+ *
+ * xfs_iext_bno_to_ext() supplies the record and its index.  When the index
+ * lies within the list, *gotp is the decoded record and *eofp is 0; this is
+ * the extent containing bno or, if bno is in a hole, the next extent.  When
+ * the index is past the end of the list, *eofp is 1, NULL is returned and
+ * *gotp is a copy of *prevp if a previous record exists.  *prevp is filled
+ * from the record before the index when there is one; otherwise its
+ * br_startoff is NULLFILEOFF.  *lastxp always receives the index.
+ */
+STATIC xfs_bmbt_rec_host_t *            /* pointer to found extent entry */
+xfs_bmap_search_multi_extents(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        xfs_fileoff_t   bno,            /* block number searched for */
+        int             *eofp,          /* out: end of file found */
+        xfs_extnum_t    *lastxp,        /* out: last extent index */
+        xfs_bmbt_irec_t *gotp,          /* out: extent entry found */
+        xfs_bmbt_irec_t *prevp)         /* out: previous extent entry found */
+{
+        xfs_bmbt_rec_host_t *rec;       /* record at the lookup index */
+        xfs_extnum_t    idx;            /* index found by the lookup */
+        /*
+         * Poison *gotp so that use of a result that was never filled in
+         * (br_startblock in particular) stands out.
+         */
+        gotp->br_state = XFS_EXT_INVALID;
+        gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
+        gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
+#if XFS_BIG_BLKNOS
+        gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
+#else
+        gotp->br_startblock = 0xffffa5a5;
+#endif
+        prevp->br_startoff = NULLFILEOFF;
+        rec = xfs_iext_bno_to_ext(ifp, bno, &idx);
+        if (idx > 0)
+                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), prevp);
+        if (idx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+                /* Index is within the list: decode the record found. */
+                xfs_bmbt_get_all(rec, gotp);
+                *eofp = 0;
+                *lastxp = idx;
+                return rec;
+        }
+        /* Index is past the last record. */
+        if (idx > 0)
+                *gotp = *prevp;
+        *eofp = 1;
+        *lastxp = idx;
+        return NULL;
+}
+/*
+ * Find the extent of the given fork that contains file block bno, using
+ * xfs_bmap_search_multi_extents() for the lookup (see there for how *eofp,
+ * *lastxp, *gotp and *prevp are filled in).
+ *
+ * If the lookup yields a start block of zero, *lastxp is not NULLEXTNUM and
+ * this is not the data fork of a realtime inode, an alert is raised, *lastxp
+ * is set to NULLEXTNUM, *eofp to 1 and NULL is returned.
+ */
+STATIC xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */
+xfs_bmap_search_extents(
+        xfs_inode_t     *ip,            /* incore inode pointer */
+        xfs_fileoff_t   bno,            /* block number searched for */
+        int             fork,           /* data or attr fork */
+        int             *eofp,          /* out: end of file found */
+        xfs_extnum_t    *lastxp,        /* out: last extent index */
+        xfs_bmbt_irec_t *gotp,          /* out: extent entry found */
+        xfs_bmbt_irec_t *prevp)         /* out: previous extent entry found */
+{
+        xfs_bmbt_rec_host_t  *rec;      /* record returned by the lookup */
+        int             block_zero;     /* lookup produced start block 0 */
+        XFS_STATS_INC(xs_look_exlist);
+        rec = xfs_bmap_search_multi_extents(XFS_IFORK_PTR(ip, fork), bno,
+                                            eofp, lastxp, gotp, prevp);
+        block_zero = !(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
+                     !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK);
+        if (!unlikely(block_zero))
+                return rec;
+        /* Report the bogus mapping and make the caller see "past eof". */
+        xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+                        "Access to block zero in inode %llu "
+                        "start_block: %llx start_off: %llx "
+                        "blkcnt: %llx extent-state: %x lastx: %x",
+                (unsigned long long)ip->i_ino,
+                (unsigned long long)gotp->br_startblock,
+                (unsigned long long)gotp->br_startoff,
+                (unsigned long long)gotp->br_blockcount,
+                gotp->br_state, *lastxp);
+        *lastxp = NULLEXTNUM;
+        *eofp = 1;
+        return NULL;
+}
+/*
+ * Find the first file-relative block, at or above the input value of
+ * *first_unused, that begins a run of at least "len" unmapped blocks.
+ * That is either the lowest suitable hole between extents or the first
+ * block past the last extent.  The answer is returned in *first_unused.
+ * For a local (in-inode) fork the answer is always 0.
+ *
+ * The extent list is read in first if it is not yet in core; an error from
+ * that read is returned, otherwise 0.
+ */
+int                                             /* error */
+xfs_bmap_first_unused(
+        xfs_trans_t     *tp,                    /* transaction pointer */
+        xfs_inode_t     *ip,                    /* incore inode */
+        xfs_extlen_t    len,                    /* size of hole to find */
+        xfs_fileoff_t   *first_unused,          /* in: floor, out: answer */
+        int             whichfork)              /* data or attr fork */
+{
+        int             error;                  /* error return value */
+        int             idx;                    /* extent record index */
+        xfs_ifork_t     *ifp;                   /* inode fork pointer */
+        xfs_fileoff_t   lastaddr;               /* end of the extent just seen */
+        xfs_fileoff_t   lowest;                 /* lowest acceptable block */
+        xfs_fileoff_t   max;                    /* current candidate start */
+        xfs_fileoff_t   off;                    /* start of the current extent */
+        xfs_extnum_t    nextents;               /* number of extent entries */
+        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
+               XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
+               XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+        if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+                *first_unused = 0;
+                return 0;
+        }
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+                error = xfs_iread_extents(tp, ip, whichfork);
+                if (error)
+                        return error;
+        }
+        lowest = *first_unused;
+        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+        lastaddr = 0;
+        max = lowest;
+        for (idx = 0; idx < nextents; idx++) {
+                xfs_bmbt_rec_host_t *rec = xfs_iext_get_ext(ifp, idx);
+                off = xfs_bmbt_get_startoff(rec);
+                /*
+                 * Stop as soon as the gap in front of this extent is big
+                 * enough; max already holds the answer.
+                 */
+                if (off >= lowest + len && off - max >= len)
+                        break;
+                lastaddr = off + xfs_bmbt_get_blockcount(rec);
+                max = XFS_FILEOFF_MAX(lastaddr, lowest);
+        }
+        /* Either a hole was found, or max is past the last extent. */
+        *first_unused = max;
+        return 0;
+}
+/*
+ * On return *last_block is the file-relative offset just past the last
+ * mapped block that lies below the input value of *last_block, or 0 if
+ * there is no such block.
+ * This is based on the extent records, not on i_size.
+ * Local forks have no extent records, so the answer for them is 0.
+ * Forks that are not in btree, extents or local format get -EIO.
+ */
+int                                             /* error */
+xfs_bmap_last_before(
+        xfs_trans_t     *tp,                    /* transaction pointer */
+        xfs_inode_t     *ip,                    /* incore inode */
+        xfs_fileoff_t   *last_block,            /* in: limit, out: answer */
+        int             whichfork)              /* data or attr fork */
+{
+        xfs_fileoff_t   target;                 /* block just below the limit */
+        int             eof;                    /* lookup ran off the end */
+        xfs_bmbt_rec_host_t *rec;               /* record found by the lookup */
+        int             error;                  /* error return value */
+        xfs_bmbt_irec_t got;                    /* extent found */
+        xfs_ifork_t     *ifp;                   /* inode fork pointer */
+        xfs_extnum_t    idx;                    /* index of extent found */
+        xfs_bmbt_irec_t prev;                   /* extent before the one found */
+        switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+        case XFS_DINODE_FMT_LOCAL:
+                *last_block = 0;
+                return 0;
+        case XFS_DINODE_FMT_BTREE:
+        case XFS_DINODE_FMT_EXTENTS:
+                break;
+        default:
+                return -EIO;
+        }
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+                error = xfs_iread_extents(tp, ip, whichfork);
+                if (error)
+                        return error;
+        }
+        target = *last_block - 1;
+        rec = xfs_bmap_search_extents(ip, target, whichfork, &eof, &idx, &got,
+                &prev);
+        /*
+         * The target block is mapped: *last_block is already right.
+         */
+        if (!eof && xfs_bmbt_get_startoff(rec) <= target)
+                return 0;
+        /*
+         * The target is in a hole or past the last extent: answer with the
+         * end of the preceding extent, if there is one.
+         */
+        *last_block = (prev.br_startoff == NULLFILEOFF) ? 0 :
+                        prev.br_startoff + prev.br_blockcount;
+        return 0;
+}
+/*
+ * Fetch the last extent record of a fork into *rec, reading the extent
+ * list in first if it is not yet in core.  *is_empty is set to 1 (and *rec
+ * is left untouched) when the fork has no extents, otherwise to 0.
+ * Returns 0, or the error from xfs_iread_extents().
+ */
+int
+xfs_bmap_last_extent(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *ip,
+        int                     whichfork,
+        struct xfs_bmbt_irec    *rec,
+        int                     *is_empty)
+{
+        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+        int                     nextents;
+        if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+                int             error;
+                error = xfs_iread_extents(tp, ip, whichfork);
+                if (error)
+                        return error;
+        }
+        nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+        if (nextents != 0) {
+                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
+                *is_empty = 0;
+        } else {
+                *is_empty = 1;
+        }
+        return 0;
+}
+/*
+ * Decide whether the allocation described by bma lands at or beyond the end
+ * of the file by looking at the last extent of the fork. When we allocate new
+ * data blocks at the end of the file which do not start at the previous data
+ * block, we will try to align the new blocks at stripe unit boundaries.
+ *
+ * bma->aeof is set to 1 if the fork is empty (any new write is then at or
+ * past EOF), if bma->offset is at or past the end of the last extent, or if
+ * it falls at or after the start of a last extent that is delayed allocated.
+ * Otherwise it is 0.  Returns 0, or the error from xfs_bmap_last_extent().
+ */
+STATIC int
+xfs_bmap_isaeof(
+        struct xfs_bmalloca     *bma,
+        int                     whichfork)
+{
+        struct xfs_bmbt_irec    last;
+        int                     is_empty;
+        int                     error;
+        bma->aeof = 0;
+        error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &last,
+                                     &is_empty);
+        if (error)
+                return error;
+        if (is_empty) {
+                bma->aeof = 1;
+                return 0;
+        }
+        /*
+         * Check if we are allocating at or past the end of the last extent,
+         * or at least into the last extent when it is delayed allocated.
+         */
+        if (bma->offset >= last.br_startoff + last.br_blockcount)
+                bma->aeof = 1;
+        else if (bma->offset >= last.br_startoff &&
+                 isnullstartblock(last.br_startblock))
+                bma->aeof = 1;
+        else
+                bma->aeof = 0;
+        return 0;
+}
+/*
+ * Report in *last_block the file-relative block number of the first block
+ * past the last extent of the fork.  This comes from the extent records,
+ * not from i_size.  Local forks have no extent records, and empty forks have
+ * no last extent, so both report 0.  Forks that are not in local, btree or
+ * extents format get -EIO (with *last_block set to 0).
+ */
+int
+xfs_bmap_last_offset(
+        struct xfs_inode        *ip,
+        xfs_fileoff_t           *last_block,
+        int                     whichfork)
+{
+        struct xfs_bmbt_irec    last;
+        int                     is_empty;
+        int                     error;
+        *last_block = 0;
+        switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+        case XFS_DINODE_FMT_LOCAL:
+                return 0;
+        case XFS_DINODE_FMT_BTREE:
+        case XFS_DINODE_FMT_EXTENTS:
+                break;
+        default:
+                return -EIO;
+        }
+        error = xfs_bmap_last_extent(NULL, ip, whichfork, &last, &is_empty);
+        if (error)
+                return error;
+        if (!is_empty)
+                *last_block = last.br_startoff + last.br_blockcount;
+        return 0;
+}
+/*
+ * Tell whether the selected fork of the inode consists of exactly one
+ * block, i.e. a single extent record starting at offset 0 with a block
+ * count of 1 in an extents-format fork.
+ *
+ * On non-DEBUG builds the data fork is answered from the inode size alone
+ * (size equal to the filesystem block size).  On DEBUG builds the extent
+ * record is examined and the size is asserted to agree.
+ */
+int                                     /* 1=>1 block, 0=>otherwise */
+xfs_bmap_one_block(
+        xfs_inode_t     *ip,            /* incore inode */
+        int             whichfork)      /* data or attr fork */
+{
+        xfs_ifork_t     *ifp;           /* inode fork pointer */
+        xfs_bmbt_irec_t irec;           /* decoded first extent */
+#ifndef DEBUG
+        if (whichfork == XFS_DATA_FORK)
+                return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
+#endif  /* !DEBUG */
+        if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1 ||
+            XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+                return 0;
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+        xfs_bmbt_get_all(xfs_iext_get_ext(ifp, 0), &irec);
+        if (irec.br_startoff != 0 || irec.br_blockcount != 1)
+                return 0;
+        /* A one-block data fork must agree with the inode size. */
+        if (whichfork == XFS_DATA_FORK)
+                ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
+        return 1;
+}
+/*
+ * Extent tree manipulation functions used during allocation.
+ */
+/*
+ * Convert a delayed allocation to a real allocation.
+ */
+STATIC int                              /* error */
+xfs_bmap_add_extent_delay_real(
+        struct xfs_bmalloca     *bma)
+{
+        struct xfs_bmbt_irec    *new = &bma->got;
+        int                     diff;   /* temp value */
+        xfs_bmbt_rec_host_t     *ep;    /* extent entry for idx */
+        int                     error;  /* error return value */
+        int                     i;      /* temp state */
+        xfs_ifork_t             *ifp;   /* inode fork pointer */
+        xfs_fileoff_t           new_endoff;     /* end offset of new entry */
+        xfs_bmbt_irec_t         r[3];   /* neighbor extent entries */
+                                        /* left is 0, right is 1, prev is 2 */
+        int                     rval=0; /* return value (logging flags) */
+        int                     state = 0;/* state bits, accessed thru macros */
+        xfs_filblks_t           da_new; /* new count del alloc blocks used */
+        xfs_filblks_t           da_old; /* old count del alloc blocks used */
+        xfs_filblks_t           temp=0; /* value for da_new calculations */
+        xfs_filblks_t           temp2=0;/* value for da_new calculations */
+        int                     tmp_rval;       /* partial logging flags */
+        ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
+        ASSERT(bma->idx >= 0);
+        ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+        ASSERT(!isnullstartblock(new->br_startblock));
+        ASSERT(!bma->cur ||
+               (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+        XFS_STATS_INC(xs_add_exlist);
+#define LEFT            r[0]
+#define RIGHT           r[1]
+#define PREV            r[2]
+        /*
+         * Set up a bunch of variables to make the tests simpler.
+         */
+        ep = xfs_iext_get_ext(ifp, bma->idx);
+        xfs_bmbt_get_all(ep, &PREV);
+        new_endoff = new->br_startoff + new->br_blockcount;
+        ASSERT(PREV.br_startoff <= new->br_startoff);
+        ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+        da_old = startblockval(PREV.br_startblock);
+        da_new = 0;
+        /*
+         * Set flags determining what part of the previous delayed allocation
+         * extent is being replaced by a real allocation.
+         */
+        if (PREV.br_startoff == new->br_startoff)
+                state |= BMAP_LEFT_FILLING;
+        if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+                state |= BMAP_RIGHT_FILLING;
+        /*
+         * Check and set flags if this segment has a left neighbor.
+         * Don't set contiguous if the combined extent would be too large.
+         */
+        if (bma->idx > 0) {
+                state |= BMAP_LEFT_VALID;
+                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT);
+                if (isnullstartblock(LEFT.br_startblock))
+                        state |= BMAP_LEFT_DELAY;
+        }
+        if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+            LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+            LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+            LEFT.br_state == new->br_state &&
+            LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+                state |= BMAP_LEFT_CONTIG;
+        /*
+         * Check and set flags if this segment has a right neighbor.
+         * Don't set contiguous if the combined extent would be too large.
+         * Also check for all-three-contiguous being too large.
+         */
+        if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+                state |= BMAP_RIGHT_VALID;
+                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
+                if (isnullstartblock(RIGHT.br_startblock))
+                        state |= BMAP_RIGHT_DELAY;
+        }
+        if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+            new_endoff == RIGHT.br_startoff &&
+            new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+            new->br_state == RIGHT.br_state &&
+            new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+            ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+                       BMAP_RIGHT_FILLING)) !=
+                      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+                       BMAP_RIGHT_FILLING) ||
+             LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+                        <= MAXEXTLEN))
+                state |= BMAP_RIGHT_CONTIG;
+        error = 0;
+        /*
+         * Switch out based on the FILLING and CONTIG state bits.
+         */
+        switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+                         BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+        case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+             BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+                /*
+                 * Filling in all of a previously delayed allocation extent.
+                 * The left and right neighbors are both contiguous with new.
+                 */
+                bma->idx--;
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+                        LEFT.br_blockcount + PREV.br_blockcount +
+                        RIGHT.br_blockcount);
+                trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
+                bma->ip->i_d.di_nextents--;
+                if (bma->cur == NULL)
+                        rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+                else {
+                        rval = XFS_ILOG_CORE;
+                        error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+                                        RIGHT.br_startblock,
+                                        RIGHT.br_blockcount, &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        error = xfs_btree_delete(bma->cur, &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        error = xfs_btree_decrement(bma->cur, 0, &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+                                        LEFT.br_startblock,
+                                        LEFT.br_blockcount +
+                                        PREV.br_blockcount +
+                                        RIGHT.br_blockcount, LEFT.br_state);
+                        if (error)
+                                goto done;
+                }
+                break;
+        case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+                /*
+                 * Filling in all of a previously delayed allocation extent.
+                 * The left neighbor is contiguous, the right is not.
+                 */
+                bma->idx--;
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+                        LEFT.br_blockcount + PREV.br_blockcount);
+                trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+                if (bma->cur == NULL)
+                        rval = XFS_ILOG_DEXT;
+                else {
+                        rval = 0;
+                        error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
+                                        LEFT.br_startblock, LEFT.br_blockcount,
+                                        &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+                                        LEFT.br_startblock,
+                                        LEFT.br_blockcount +
+                                        PREV.br_blockcount, LEFT.br_state);
+                        if (error)
+                                goto done;
+                }
+                break;
+        case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+                /*
+                 * Filling in all of a previously delayed allocation extent.
+                 * The right neighbor is contiguous, the left is not.
+                 */
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_bmbt_set_startblock(ep, new->br_startblock);
+                xfs_bmbt_set_blockcount(ep,
+                        PREV.br_blockcount + RIGHT.br_blockcount);
+                trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+                if (bma->cur == NULL)
+                        rval = XFS_ILOG_DEXT;
+                else {
+                        rval = 0;
+                        error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+                                        RIGHT.br_startblock,
+                                        RIGHT.br_blockcount, &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
+                                        new->br_startblock,
+                                        PREV.br_blockcount +
+                                        RIGHT.br_blockcount, PREV.br_state);
+                        if (error)
+                                goto done;
+                }
+                break;
+        case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+                /*
+                 * Filling in all of a previously delayed allocation extent.
+                 * Neither the left nor right neighbors are contiguous with
+                 * the new one.
+                 */
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_bmbt_set_startblock(ep, new->br_startblock);
+                trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+                bma->ip->i_d.di_nextents++;
+                if (bma->cur == NULL)
+                        rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+                else {
+                        rval = XFS_ILOG_CORE;
+                        error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+                                        new->br_startblock, new->br_blockcount,
+                                        &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                        bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+                        error = xfs_btree_insert(bma->cur, &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                }
+                break;
+        case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
+                /*
+                 * Filling in the first part of a previous delayed allocation.
+                 * The left neighbor is contiguous.
+                 */
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1),
+                        LEFT.br_blockcount + new->br_blockcount);
+                xfs_bmbt_set_startoff(ep,
+                        PREV.br_startoff + new->br_blockcount);
+                trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
+                temp = PREV.br_blockcount - new->br_blockcount;
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(ep, temp);
+                if (bma->cur == NULL)
+                        rval = XFS_ILOG_DEXT;
+                else {
+                        rval = 0;
+                        error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
+                                        LEFT.br_startblock, LEFT.br_blockcount,
+                                        &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
+                                        LEFT.br_startblock,
+                                        LEFT.br_blockcount +
+                                        new->br_blockcount,
+                                        LEFT.br_state);
+                        if (error)
+                                goto done;
+                }
+                da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+                        startblockval(PREV.br_startblock));
+                xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+                trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+                bma->idx--;
+                break;
+        case BMAP_LEFT_FILLING:
+                /*
+                 * Filling in the first part of a previous delayed allocation.
+                 * The left neighbor is not contiguous.
+                 */
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_bmbt_set_startoff(ep, new_endoff);
+                temp = PREV.br_blockcount - new->br_blockcount;
+                xfs_bmbt_set_blockcount(ep, temp);
+                xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+                bma->ip->i_d.di_nextents++;
+                if (bma->cur == NULL)
+                        rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+                else {
+                        rval = XFS_ILOG_CORE;
+                        error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+                                        new->br_startblock, new->br_blockcount,
+                                        &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                        bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+                        error = xfs_btree_insert(bma->cur, &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                }
+                if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+                        error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+                                        bma->firstblock, bma->flist,
+                                        &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
+                        rval |= tmp_rval;
+                        if (error)
+                                goto done;
+                }
+                da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+                        startblockval(PREV.br_startblock) -
+                        (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+                ep = xfs_iext_get_ext(ifp, bma->idx + 1);
+                xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+                trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+                break;
+        case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+                /*
+                 * Filling in the last part of a previous delayed allocation.
+                 * The right neighbor is contiguous with the new allocation.
+                 */
+                temp = PREV.br_blockcount - new->br_blockcount;
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(ep, temp);
+                xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1),
+                        new->br_startoff, new->br_startblock,
+                        new->br_blockcount + RIGHT.br_blockcount,
+                        RIGHT.br_state);
+                trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+                if (bma->cur == NULL)
+                        rval = XFS_ILOG_DEXT;
+                else {
+                        rval = 0;
+                        error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
+                                        RIGHT.br_startblock,
+                                        RIGHT.br_blockcount, &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        error = xfs_bmbt_update(bma->cur, new->br_startoff,
+                                        new->br_startblock,
+                                        new->br_blockcount +
+                                        RIGHT.br_blockcount,
+                                        RIGHT.br_state);
+                        if (error)
+                                goto done;
+                }
+                da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+                        startblockval(PREV.br_startblock));
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+                trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+                bma->idx++;
+                break;
+        case BMAP_RIGHT_FILLING:
+                /*
+                 * Filling in the last part of a previous delayed allocation.
+                 * The right neighbor is not contiguous.
+                 */
+                temp = PREV.br_blockcount - new->br_blockcount;
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(ep, temp);
+                xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
+                bma->ip->i_d.di_nextents++;
+                if (bma->cur == NULL)
+                        rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+                else {
+                        rval = XFS_ILOG_CORE;
+                        error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+                                        new->br_startblock, new->br_blockcount,
+                                        &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                        bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+                        error = xfs_btree_insert(bma->cur, &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                }
+                if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+                        error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+                                bma->firstblock, bma->flist, &bma->cur, 1,
+                                &tmp_rval, XFS_DATA_FORK);
+                        rval |= tmp_rval;
+                        if (error)
+                                goto done;
+                }
+                da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+                        startblockval(PREV.br_startblock) -
+                        (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+                ep = xfs_iext_get_ext(ifp, bma->idx);
+                xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
+                trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+                bma->idx++;
+                break;
+        case 0:
+                /*
+                 * Filling in the middle part of a previous delayed allocation.
+                 * Contiguity is impossible here.
+                 * This case is avoided almost all the time.
+                 *
+                 * We start with a delayed allocation:
+                 *
+                 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
+                 *  PREV @ idx
+                 *
+                 * and we are allocating:
+                 *                     +rrrrrrrrrrrrrrrrr+
+                 *                            new
+                 *
+                 * and we set it up for insertion as:
+                 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
+                 *                            new
+                 *  PREV @ idx          LEFT              RIGHT
+                 *                      inserted at idx + 1
+                 */
+                temp = new->br_startoff - PREV.br_startoff;
+                temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_);
+                xfs_bmbt_set_blockcount(ep, temp);      /* truncate PREV */
+                LEFT = *new;
+                RIGHT.br_state = PREV.br_state;
+                RIGHT.br_startblock = nullstartblock(
+                                (int)xfs_bmap_worst_indlen(bma->ip, temp2));
+                RIGHT.br_startoff = new_endoff;
+                RIGHT.br_blockcount = temp2;
+                /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
+                xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
+                bma->ip->i_d.di_nextents++;
+                if (bma->cur == NULL)
+                        rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+                else {
+                        rval = XFS_ILOG_CORE;
+                        error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
+                                        new->br_startblock, new->br_blockcount,
+                                        &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                        bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
+                        error = xfs_btree_insert(bma->cur, &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                }
+                if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+                        error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+                                        bma->firstblock, bma->flist, &bma->cur,
+                                        1, &tmp_rval, XFS_DATA_FORK);
+                        rval |= tmp_rval;
+                        if (error)
+                                goto done;
+                }
+                temp = xfs_bmap_worst_indlen(bma->ip, temp);
+                temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
+                diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
+                        (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+                if (diff > 0) {
+                        error = xfs_icsb_modify_counters(bma->ip->i_mount,
+                                        XFS_SBS_FDBLOCKS,
+                                        -((int64_t)diff), 0);
+                        ASSERT(!error);
+                        if (error)
+                                goto done;
+                }
+                ep = xfs_iext_get_ext(ifp, bma->idx);
+                xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+                trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
+                xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2),
+                        nullstartblock((int)temp2));
+                trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
+                bma->idx++;
+                da_new = temp + temp2;
+                break;
+        case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+        case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+        case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+        case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+        case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+        case BMAP_LEFT_CONTIG:
+        case BMAP_RIGHT_CONTIG:
+                /*
+                 * These cases are all impossible.
+                 */
+                ASSERT(0);
+        }
+        /* convert to a btree if necessary */
+        if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+                int     tmp_logflags;   /* partial log flag return val */
+                ASSERT(bma->cur == NULL);
+                error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+                                bma->firstblock, bma->flist, &bma->cur,
+                                da_old > 0, &tmp_logflags, XFS_DATA_FORK);
+                bma->logflags |= tmp_logflags;
+                if (error)
+                        goto done;
+        }
+        /* adjust for changes in reserved delayed indirect blocks */
+        if (da_old || da_new) {
+                temp = da_new;
+                if (bma->cur)
+                        temp += bma->cur->bc_private.b.allocated;
+                ASSERT(temp <= da_old);
+                if (temp < da_old)
+                        xfs_icsb_modify_counters(bma->ip->i_mount,
+                                        XFS_SBS_FDBLOCKS,
+                                        (int64_t)(da_old - temp), 0);
+        }
+        /* clear out the allocated field, done with it now in any case. */
+        if (bma->cur)
+                bma->cur->bc_private.b.allocated = 0;
+        xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
+done:
+        bma->logflags |= rval;
+        return error;
+#undef  LEFT
+#undef  RIGHT
+#undef  PREV
+}
+/*
+ * Convert an unwritten allocation to a real allocation or vice versa.
+ */
+STATIC int                              /* error */
+xfs_bmap_add_extent_unwritten_real(
+        struct xfs_trans        *tp,
+        xfs_inode_t             *ip,    /* incore inode pointer */
+        xfs_extnum_t            *idx,   /* extent number to update/insert */
+        xfs_btree_cur_t         **curp, /* if *curp is null, not a btree */
+        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
+        xfs_fsblock_t           *first, /* pointer to firstblock variable */
+        xfs_bmap_free_t         *flist, /* list of extents to be freed */
+        int                     *logflagsp) /* inode logging flags */
+{
+        xfs_btree_cur_t         *cur;   /* btree cursor */
+        xfs_bmbt_rec_host_t     *ep;    /* extent entry for idx */
+        int                     error;  /* error return value */
+        int                     i;      /* temp state */
+        xfs_ifork_t             *ifp;   /* inode fork pointer */
+        xfs_fileoff_t           new_endoff;     /* end offset of new entry */
+        xfs_exntst_t            newext; /* new extent state */
+        xfs_exntst_t            oldext; /* old extent state */
+        xfs_bmbt_irec_t         r[3];   /* neighbor extent entries */
+                                        /* left is 0, right is 1, prev is 2 */
+        int                     rval=0; /* return value (logging flags) */
+        int                     state = 0;/* state bits, accessed thru macros */
+        *logflagsp = 0;
+        cur = *curp;
+        ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+        ASSERT(*idx >= 0);
+        ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+        ASSERT(!isnullstartblock(new->br_startblock));
+        XFS_STATS_INC(xs_add_exlist);
+#define LEFT            r[0]
+#define RIGHT           r[1]
+#define PREV            r[2]
+        /*
+         * Set up a bunch of variables to make the tests simpler.
+         */
+        error = 0;
+        ep = xfs_iext_get_ext(ifp, *idx);
+        xfs_bmbt_get_all(ep, &PREV);
+        newext = new->br_state;
+        oldext = (newext == XFS_EXT_UNWRITTEN) ?
+                XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+        ASSERT(PREV.br_state == oldext);
+        new_endoff = new->br_startoff + new->br_blockcount;
+        ASSERT(PREV.br_startoff <= new->br_startoff);
+        ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+        /*
+         * Set flags determining what part of the previous oldext allocation
+         * extent is being replaced by a newext allocation.
+         */
+        if (PREV.br_startoff == new->br_startoff)
+                state |= BMAP_LEFT_FILLING;
+        if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+                state |= BMAP_RIGHT_FILLING;
+        /*
+         * Check and set flags if this segment has a left neighbor.
+         * Don't set contiguous if the combined extent would be too large.
+         */
+        if (*idx > 0) {
+                state |= BMAP_LEFT_VALID;
+                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
+                if (isnullstartblock(LEFT.br_startblock))
+                        state |= BMAP_LEFT_DELAY;
+        }
+        if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+            LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+            LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+            LEFT.br_state == newext &&
+            LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+                state |= BMAP_LEFT_CONTIG;
+        /*
+         * Check and set flags if this segment has a right neighbor.
+         * Don't set contiguous if the combined extent would be too large.
+         * Also check for all-three-contiguous being too large.
+         */
+        if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+                state |= BMAP_RIGHT_VALID;
+                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
+                if (isnullstartblock(RIGHT.br_startblock))
+                        state |= BMAP_RIGHT_DELAY;
+        }
+        if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+            new_endoff == RIGHT.br_startoff &&
+            new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+            newext == RIGHT.br_state &&
+            new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+            ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+                       BMAP_RIGHT_FILLING)) !=
+                      (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+                       BMAP_RIGHT_FILLING) ||
+             LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+                        <= MAXEXTLEN))
+                state |= BMAP_RIGHT_CONTIG;
+        /*
+         * Switch out based on the FILLING and CONTIG state bits.
+         */
+        switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+                         BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+        case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+             BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+                /*
+                 * Setting all of a previous oldext extent to newext.
+                 * The left and right neighbors are both contiguous with new.
+                 */
+                --*idx;
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+                        LEFT.br_blockcount + PREV.br_blockcount +
+                        RIGHT.br_blockcount);
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                xfs_iext_remove(ip, *idx + 1, 2, state);
+                ip->i_d.di_nextents -= 2;
+                if (cur == NULL)
+                        rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+                else {
+                        rval = XFS_ILOG_CORE;
+                        if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+                                        RIGHT.br_startblock,
+                                        RIGHT.br_blockcount, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_btree_delete(cur, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_btree_decrement(cur, 0, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_btree_delete(cur, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_btree_decrement(cur, 0, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+                                LEFT.br_startblock,
+                                LEFT.br_blockcount + PREV.br_blockcount +
+                                RIGHT.br_blockcount, LEFT.br_state)))
+                                goto done;
+                }
+                break;
+        case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+                /*
+                 * Setting all of a previous oldext extent to newext.
+                 * The left neighbor is contiguous, the right is not.
+                 */
+                --*idx;
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+                        LEFT.br_blockcount + PREV.br_blockcount);
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                xfs_iext_remove(ip, *idx + 1, 1, state);
+                ip->i_d.di_nextents--;
+                if (cur == NULL)
+                        rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+                else {
+                        rval = XFS_ILOG_CORE;
+                        if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+                                        PREV.br_startblock, PREV.br_blockcount,
+                                        &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_btree_delete(cur, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_btree_decrement(cur, 0, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+                                LEFT.br_startblock,
+                                LEFT.br_blockcount + PREV.br_blockcount,
+                                LEFT.br_state)))
+                                goto done;
+                }
+                break;
+        case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+                /*
+                 * Setting all of a previous oldext extent to newext.
+                 * The right neighbor is contiguous, the left is not.
+                 */
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(ep,
+                        PREV.br_blockcount + RIGHT.br_blockcount);
+                xfs_bmbt_set_state(ep, newext);
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                xfs_iext_remove(ip, *idx + 1, 1, state);
+                ip->i_d.di_nextents--;
+                if (cur == NULL)
+                        rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+                else {
+                        rval = XFS_ILOG_CORE;
+                        if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+                                        RIGHT.br_startblock,
+                                        RIGHT.br_blockcount, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_btree_delete(cur, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_btree_decrement(cur, 0, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_bmbt_update(cur, new->br_startoff,
+                                new->br_startblock,
+                                new->br_blockcount + RIGHT.br_blockcount,
+                                newext)))
+                                goto done;
+                }
+                break;
+        case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+                /*
+                 * Setting all of a previous oldext extent to newext.
+                 * Neither the left nor right neighbors are contiguous with
+                 * the new one.
+                 */
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                xfs_bmbt_set_state(ep, newext);
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                if (cur == NULL)
+                        rval = XFS_ILOG_DEXT;
+                else {
+                        rval = 0;
+                        if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+                                        new->br_startblock, new->br_blockcount,
+                                        &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_bmbt_update(cur, new->br_startoff,
+                                new->br_startblock, new->br_blockcount,
+                                newext)))
+                                goto done;
+                }
+                break;
+        case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
+                /*
+                 * Setting the first part of a previous oldext extent to newext.
+                 * The left neighbor is contiguous.
+                 */
+                trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
+                        LEFT.br_blockcount + new->br_blockcount);
+                xfs_bmbt_set_startoff(ep,
+                        PREV.br_startoff + new->br_blockcount);
+                trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                xfs_bmbt_set_startblock(ep,
+                        new->br_startblock + new->br_blockcount);
+                xfs_bmbt_set_blockcount(ep,
+                        PREV.br_blockcount - new->br_blockcount);
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                --*idx;
+                if (cur == NULL)
+                        rval = XFS_ILOG_DEXT;
+                else {
+                        rval = 0;
+                        if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+                                        PREV.br_startblock, PREV.br_blockcount,
+                                        &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_bmbt_update(cur,
+                                PREV.br_startoff + new->br_blockcount,
+                                PREV.br_startblock + new->br_blockcount,
+                                PREV.br_blockcount - new->br_blockcount,
+                                oldext)))
+                                goto done;
+                        if ((error = xfs_btree_decrement(cur, 0, &i)))
+                                goto done;
+                        error = xfs_bmbt_update(cur, LEFT.br_startoff,
+                                LEFT.br_startblock,
+                                LEFT.br_blockcount + new->br_blockcount,
+                                LEFT.br_state);
+                        if (error)
+                                goto done;
+                }
+                break;
+        case BMAP_LEFT_FILLING:
+                /*
+                 * Setting the first part of a previous oldext extent to newext.
+                 * The left neighbor is not contiguous.
+                 */
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
+                xfs_bmbt_set_startoff(ep, new_endoff);
+                xfs_bmbt_set_blockcount(ep,
+                        PREV.br_blockcount - new->br_blockcount);
+                xfs_bmbt_set_startblock(ep,
+                        new->br_startblock + new->br_blockcount);
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                xfs_iext_insert(ip, *idx, 1, new, state);
+                ip->i_d.di_nextents++;
+                if (cur == NULL)
+                        rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+                else {
+                        rval = XFS_ILOG_CORE;
+                        if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+                                        PREV.br_startblock, PREV.br_blockcount,
+                                        &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_bmbt_update(cur,
+                                PREV.br_startoff + new->br_blockcount,
+                                PREV.br_startblock + new->br_blockcount,
+                                PREV.br_blockcount - new->br_blockcount,
+                                oldext)))
+                                goto done;
+                        cur->bc_rec.b = *new;
+                        if ((error = xfs_btree_insert(cur, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                }
+                break;
+        case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+                /*
+                 * Setting the last part of a previous oldext extent to newext.
+                 * The right neighbor is contiguous with the new allocation.
+                 */
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(ep,
+                        PREV.br_blockcount - new->br_blockcount);
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                ++*idx;
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+                        new->br_startoff, new->br_startblock,
+                        new->br_blockcount + RIGHT.br_blockcount, newext);
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                if (cur == NULL)
+                        rval = XFS_ILOG_DEXT;
+                else {
+                        rval = 0;
+                        if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+                                        PREV.br_startblock,
+                                        PREV.br_blockcount, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+                                PREV.br_startblock,
+                                PREV.br_blockcount - new->br_blockcount,
+                                oldext)))
+                                goto done;
+                        if ((error = xfs_btree_increment(cur, 0, &i)))
+                                goto done;
+                        if ((error = xfs_bmbt_update(cur, new->br_startoff,
+                                new->br_startblock,
+                                new->br_blockcount + RIGHT.br_blockcount,
+                                newext)))
+                                goto done;
+                }
+                break;
+        case BMAP_RIGHT_FILLING:
+                /*
+                 * Setting the last part of a previous oldext extent to newext.
+                 * The right neighbor is not contiguous.
+                 */
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(ep,
+                        PREV.br_blockcount - new->br_blockcount);
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                ++*idx;
+                xfs_iext_insert(ip, *idx, 1, new, state);
+                ip->i_d.di_nextents++;
+                if (cur == NULL)
+                        rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+                else {
+                        rval = XFS_ILOG_CORE;
+                        if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+                                        PREV.br_startblock, PREV.br_blockcount,
+                                        &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+                                PREV.br_startblock,
+                                PREV.br_blockcount - new->br_blockcount,
+                                oldext)))
+                                goto done;
+                        if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+                                        new->br_startblock, new->br_blockcount,
+                                        &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                        cur->bc_rec.b.br_state = XFS_EXT_NORM;
+                        if ((error = xfs_btree_insert(cur, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                }
+                break;
+        case 0:
+                /*
+                 * Setting the middle part of a previous oldext extent to
+                 * newext.  Contiguity is impossible here.
+                 * One extent becomes three extents.
+                 */
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(ep,
+                        new->br_startoff - PREV.br_startoff);
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                r[0] = *new;
+                r[1].br_startoff = new_endoff;
+                r[1].br_blockcount =
+                        PREV.br_startoff + PREV.br_blockcount - new_endoff;
+                r[1].br_startblock = new->br_startblock + new->br_blockcount;
+                r[1].br_state = oldext;
+                ++*idx;
+                xfs_iext_insert(ip, *idx, 2, &r[0], state);
+                ip->i_d.di_nextents += 2;
+                if (cur == NULL)
+                        rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+                else {
+                        rval = XFS_ILOG_CORE;
+                        if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+                                        PREV.br_startblock, PREV.br_blockcount,
+                                        &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        /* new right extent - oldext */
+                        if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
+                                r[1].br_startblock, r[1].br_blockcount,
+                                r[1].br_state)))
+                                goto done;
+                        /* new left extent - oldext */
+                        cur->bc_rec.b = PREV;
+                        cur->bc_rec.b.br_blockcount =
+                                new->br_startoff - PREV.br_startoff;
+                        if ((error = xfs_btree_insert(cur, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        /*
+                         * Reset the cursor to the position of the new extent
+                         * we are about to insert as we can't trust it after
+                         * the previous insert.
+                         */
+                        if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+                                        new->br_startblock, new->br_blockcount,
+                                        &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                        /* new middle extent - newext */
+                        cur->bc_rec.b.br_state = new->br_state;
+                        if ((error = xfs_btree_insert(cur, &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                }
+                break;
+        case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+        case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+        case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+        case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+        case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+        case BMAP_LEFT_CONTIG:
+        case BMAP_RIGHT_CONTIG:
+                /*
+                 * These cases are all impossible.
+                 */
+                ASSERT(0);
+        }
+        /* convert to a btree if necessary */
+        if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
+                int     tmp_logflags;   /* partial log flag return val */
+                ASSERT(cur == NULL);
+                error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur,
+                                0, &tmp_logflags, XFS_DATA_FORK);
+                *logflagsp |= tmp_logflags;
+                if (error)
+                        goto done;
+        }
+        /* clear out the allocated field, done with it now in any case. */
+        if (cur) {
+                cur->bc_private.b.allocated = 0;
+                *curp = cur;
+        }
+        xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
+done:
+        *logflagsp |= rval;
+        return error;
+#undef  LEFT
+#undef  RIGHT
+#undef  PREV
+}
+/*
+ * Convert a hole to a delayed allocation.
+ *
+ * Adds the delayed allocation extent described by @new to the data fork
+ * extent records of @ip at position *idx.  If the record at *idx - 1
+ * and/or the record at *idx is itself a delayed allocation that abuts
+ * @new in file offset, and the combined length does not exceed MAXEXTLEN,
+ * @new is merged into the neighbor(s) instead of being inserted as a
+ * separate record.  When a merge with the left neighbor happens, *idx is
+ * decremented so that it refers to the merged record.
+ *
+ * For a merged record the indirect block reservation carried in the
+ * startblock field is recomputed with xfs_bmap_worst_indlen() for the
+ * combined length.  If the summed old reservation differs from the new
+ * one, the difference is handed to xfs_icsb_modify_counters() for
+ * XFS_SBS_FDBLOCKS.
+ *
+ * This function only operates on XFS_DATA_FORK and returns nothing.
+ */
+STATIC void
+xfs_bmap_add_extent_hole_delay(
+        xfs_inode_t             *ip,    /* incore inode pointer */
+        xfs_extnum_t            *idx,   /* extent number to update/insert */
+        xfs_bmbt_irec_t         *new)   /* new data to add to file extents */
+{
+        xfs_ifork_t             *ifp;   /* inode fork pointer */
+        xfs_bmbt_irec_t         left;   /* left neighbor extent entry */
+        xfs_filblks_t           newlen=0;       /* new indirect size */
+        xfs_filblks_t           oldlen=0;       /* old indirect size */
+        xfs_bmbt_irec_t         right;  /* right neighbor extent entry */
+        int                     state;  /* state bits, accessed thru macros */
+        xfs_filblks_t           temp=0; /* temp for indirect calculations */
+        ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+        state = 0;
+        ASSERT(isnullstartblock(new->br_startblock));
+        /*
+         * Check and set flags if this segment has a left neighbor
+         * A left neighbor with a null startblock is flagged as a delayed
+         * allocation.  "left" is only filled in when BMAP_LEFT_VALID is
+         * set, so later code tests that flag before reading it.
+         */
+        if (*idx > 0) {
+                state |= BMAP_LEFT_VALID;
+                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
+                if (isnullstartblock(left.br_startblock))
+                        state |= BMAP_LEFT_DELAY;
+        }
+        /*
+         * Check and set flags if the current (right) segment exists.
+         * If it doesn't exist, we're converting the hole at end-of-file.
+         * "right" is only filled in when BMAP_RIGHT_VALID is set.
+         */
+        if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+                state |= BMAP_RIGHT_VALID;
+                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
+                if (isnullstartblock(right.br_startblock))
+                        state |= BMAP_RIGHT_DELAY;
+        }
+        /*
+         * Set contiguity flags on the left and right neighbors.
+         * Don't let extents get too large, even if the pieces are contiguous.
+         * Only delayed allocation neighbors whose file offsets abut the new
+         * extent qualify.  If the left neighbor is already contiguous, the
+         * right neighbor is only accepted when all three extents together
+         * still fit in MAXEXTLEN.
+         */
+        if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
+            left.br_startoff + left.br_blockcount == new->br_startoff &&
+            left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+                state |= BMAP_LEFT_CONTIG;
+        if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
+            new->br_startoff + new->br_blockcount == right.br_startoff &&
+            new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+            (!(state & BMAP_LEFT_CONTIG) ||
+             (left.br_blockcount + new->br_blockcount +
+              right.br_blockcount <= MAXEXTLEN)))
+                state |= BMAP_RIGHT_CONTIG;
+        /*
+         * Switch out based on the contiguity flags.
+         */
+        switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+        case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+                /*
+                 * New allocation is contiguous with delayed allocations
+                 * on the left and on the right.
+                 * Merge all three into a single extent record.
+                 * *idx is moved back to the left neighbor, which becomes
+                 * the merged record; the old right neighbor record at
+                 * *idx + 1 is then removed.
+                 */
+                --*idx;
+                temp = left.br_blockcount + new->br_blockcount +
+                        right.br_blockcount;
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
+                oldlen = startblockval(left.br_startblock) +
+                        startblockval(new->br_startblock) +
+                        startblockval(right.br_startblock);
+                newlen = xfs_bmap_worst_indlen(ip, temp);
+                xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+                        nullstartblock((int)newlen));
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                xfs_iext_remove(ip, *idx + 1, 1, state);
+                break;
+        case BMAP_LEFT_CONTIG:
+                /*
+                 * New allocation is contiguous with a delayed allocation
+                 * on the left.
+                 * Merge the new allocation with the left neighbor.
+                 * *idx is moved back to the left neighbor, whose length
+                 * and indirect reservation are updated in place.
+                 */
+                --*idx;
+                temp = left.br_blockcount + new->br_blockcount;
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
+                oldlen = startblockval(left.br_startblock) +
+                        startblockval(new->br_startblock);
+                newlen = xfs_bmap_worst_indlen(ip, temp);
+                xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+                        nullstartblock((int)newlen));
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                break;
+        case BMAP_RIGHT_CONTIG:
+                /*
+                 * New allocation is contiguous with a delayed allocation
+                 * on the right.
+                 * Merge the new allocation with the right neighbor.
+                 * The right neighbor's record at *idx is rewritten in place
+                 * so that it starts at the new extent's file offset; *idx
+                 * is left unchanged.
+                 */
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                temp = new->br_blockcount + right.br_blockcount;
+                oldlen = startblockval(new->br_startblock) +
+                        startblockval(right.br_startblock);
+                newlen = xfs_bmap_worst_indlen(ip, temp);
+                xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+                        new->br_startoff,
+                        nullstartblock((int)newlen), temp, right.br_state);
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                break;
+        case 0:
+                /*
+                 * New allocation is not contiguous with another
+                 * delayed allocation.
+                 * Insert a new entry.
+                 * No reservation changes hands here, so oldlen and newlen
+                 * are both zeroed and the accounting below is skipped.
+                 */
+                oldlen = newlen = 0;
+                xfs_iext_insert(ip, *idx, 1, new, state);
+                break;
+        }
+        if (oldlen != newlen) {         /* only possible after a merge */
+                ASSERT(oldlen > newlen);        /* merged reservation is expected to shrink */
+                xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
+                        (int64_t)(oldlen - newlen), 0);
+                /*
+                 * Nothing to do for disk quota accounting here.
+                 * NOTE(review): the positive delta presumably gives the
+                 * surplus reservation back to the free block count;
+                 * confirm against xfs_icsb_modify_counters().
+                 */
+        }
+}
+/*
+ * Convert a hole to a real allocation.
+ *
+ * Adds the real (non-delayed) extent in bma->got to the extent records of
+ * the @whichfork fork of bma->ip at position bma->idx.  The new extent is
+ * merged into the record at bma->idx - 1 and/or bma->idx when that
+ * neighbor is a real allocation that is contiguous in both file offset and
+ * disk block, has the same extent state, and the combined length does not
+ * exceed MAXEXTLEN.  When a merge with the left neighbor happens, bma->idx
+ * is decremented so that it refers to the merged record.
+ *
+ * If bma->cur is non-NULL the same change is applied to the bmap btree
+ * through that cursor; otherwise the fork's extent logging flag from
+ * xfs_ilog_fext() is requested instead.  Whenever the number of records
+ * changes, the fork extent count is adjusted with XFS_IFORK_NEXT_SET() and
+ * XFS_ILOG_CORE is requested.
+ *
+ * After the update the fork is converted to btree format if
+ * xfs_bmap_needs_btree() says so.  The logging flags gathered here are
+ * ORed into bma->logflags on every exit path, including errors.
+ *
+ * Returns 0 on success or the error from one of the btree helpers.
+ * NOTE(review): XFS_WANT_CORRUPTED_GOTO is defined elsewhere; it presumably
+ * sets an error code before jumping to "done" - confirm.
+ */
+STATIC int                              /* error */
+xfs_bmap_add_extent_hole_real(
+        struct xfs_bmalloca     *bma,
+        int                     whichfork)
+{
+        struct xfs_bmbt_irec    *new = &bma->got;
+        int                     error;  /* error return value */
+        int                     i;      /* temp state */
+        xfs_ifork_t             *ifp;   /* inode fork pointer */
+        xfs_bmbt_irec_t         left;   /* left neighbor extent entry */
+        xfs_bmbt_irec_t         right;  /* right neighbor extent entry */
+        int                     rval=0; /* return value (logging flags) */
+        int                     state;  /* state bits, accessed thru macros */
+        ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+        ASSERT(bma->idx >= 0);
+        ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+        ASSERT(!isnullstartblock(new->br_startblock));
+        ASSERT(!bma->cur ||
+               !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+        XFS_STATS_INC(xs_add_exlist);
+        state = 0;
+        if (whichfork == XFS_ATTR_FORK)
+                state |= BMAP_ATTRFORK;
+        /*
+         * Check and set flags if this segment has a left neighbor.
+         * "left" is only filled in when BMAP_LEFT_VALID is set, so later
+         * code tests that flag before reading it.
+         */
+        if (bma->idx > 0) {
+                state |= BMAP_LEFT_VALID;
+                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
+                if (isnullstartblock(left.br_startblock))
+                        state |= BMAP_LEFT_DELAY;
+        }
+        /*
+         * Check and set flags if this segment has a current value.
+         * Not true if we're inserting into the "hole" at eof.
+         * "right" is only filled in when BMAP_RIGHT_VALID is set.
+         */
+        if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+                state |= BMAP_RIGHT_VALID;
+                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
+                if (isnullstartblock(right.br_startblock))
+                        state |= BMAP_RIGHT_DELAY;
+        }
+        /*
+         * We're inserting a real allocation between "left" and "right".
+         * Set the contiguity flags.  Don't let extents get too large.
+         * A neighbor only counts as contiguous if it is not a delayed
+         * allocation and matches the new extent in file offset, disk block
+         * and extent state.  If the left neighbor is already contiguous,
+         * the right neighbor is only accepted when all three extents
+         * together still fit in MAXEXTLEN.
+         */
+        if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+            left.br_startoff + left.br_blockcount == new->br_startoff &&
+            left.br_startblock + left.br_blockcount == new->br_startblock &&
+            left.br_state == new->br_state &&
+            left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+                state |= BMAP_LEFT_CONTIG;
+        if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+            new->br_startoff + new->br_blockcount == right.br_startoff &&
+            new->br_startblock + new->br_blockcount == right.br_startblock &&
+            new->br_state == right.br_state &&
+            new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+            (!(state & BMAP_LEFT_CONTIG) ||
+             left.br_blockcount + new->br_blockcount +
+             right.br_blockcount <= MAXEXTLEN))
+                state |= BMAP_RIGHT_CONTIG;
+        error = 0;
+        /*
+         * Select which case we're in here, and implement it.
+         */
+        switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+        case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+                /*
+                 * New allocation is contiguous with real allocations on the
+                 * left and on the right.
+                 * Merge all three into a single extent record.
+                 * bma->idx is moved back to the left neighbor, which is
+                 * grown to the merged length; the right neighbor record is
+                 * removed and the fork extent count drops by one.  With a
+                 * btree cursor, the right record is looked up and deleted,
+                 * the cursor is stepped back one record and that record is
+                 * updated to the merged length.
+                 */
+                --bma->idx;
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+                        left.br_blockcount + new->br_blockcount +
+                        right.br_blockcount);
+                trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+                XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+                        XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
+                if (bma->cur == NULL) {
+                        rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+                } else {
+                        rval = XFS_ILOG_CORE;
+                        error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
+                                        right.br_startblock, right.br_blockcount,
+                                        &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        error = xfs_btree_delete(bma->cur, &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        error = xfs_btree_decrement(bma->cur, 0, &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        error = xfs_bmbt_update(bma->cur, left.br_startoff,
+                                        left.br_startblock,
+                                        left.br_blockcount +
+                                                new->br_blockcount +
+                                                right.br_blockcount,
+                                        left.br_state);
+                        if (error)
+                                goto done;
+                }
+                break;
+        case BMAP_LEFT_CONTIG:
+                /*
+                 * New allocation is contiguous with a real allocation
+                 * on the left.
+                 * Merge the new allocation with the left neighbor.
+                 * bma->idx is moved back to the left neighbor.  The record
+                 * count is unchanged, so XFS_ILOG_CORE is not requested.
+                 */
+                --bma->idx;
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+                        left.br_blockcount + new->br_blockcount);
+                trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+                if (bma->cur == NULL) {
+                        rval = xfs_ilog_fext(whichfork);
+                } else {
+                        rval = 0;
+                        error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
+                                        left.br_startblock, left.br_blockcount,
+                                        &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        error = xfs_bmbt_update(bma->cur, left.br_startoff,
+                                        left.br_startblock,
+                                        left.br_blockcount +
+                                                new->br_blockcount,
+                                        left.br_state);
+                        if (error)
+                                goto done;
+                }
+                break;
+        case BMAP_RIGHT_CONTIG:
+                /*
+                 * New allocation is contiguous with a real allocation
+                 * on the right.
+                 * Merge the new allocation with the right neighbor.
+                 * The right neighbor's record at bma->idx is rewritten in
+                 * place to start at the new extent's file offset and disk
+                 * block; bma->idx and the record count are unchanged.
+                 */
+                trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+                xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
+                        new->br_startoff, new->br_startblock,
+                        new->br_blockcount + right.br_blockcount,
+                        right.br_state);
+                trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+                if (bma->cur == NULL) {
+                        rval = xfs_ilog_fext(whichfork);
+                } else {
+                        rval = 0;
+                        error = xfs_bmbt_lookup_eq(bma->cur,
+                                        right.br_startoff,
+                                        right.br_startblock,
+                                        right.br_blockcount, &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        error = xfs_bmbt_update(bma->cur, new->br_startoff,
+                                        new->br_startblock,
+                                        new->br_blockcount +
+                                                right.br_blockcount,
+                                        right.br_state);
+                        if (error)
+                                goto done;
+                }
+                break;
+        case 0:
+                /*
+                 * New allocation is not contiguous with another
+                 * real allocation.
+                 * Insert a new entry.
+                 * The fork extent count grows by one.  With a btree cursor,
+                 * the lookup must not find an existing record (i == 0)
+                 * before the new record is inserted.
+                 * NOTE(review): only br_state is set explicitly before the
+                 * insert; the lookup presumably fills in the rest of
+                 * cur->bc_rec.b - confirm against xfs_bmbt_lookup_eq().
+                 */
+                xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+                XFS_IFORK_NEXT_SET(bma->ip, whichfork,
+                        XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
+                if (bma->cur == NULL) {
+                        rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+                } else {
+                        rval = XFS_ILOG_CORE;
+                        error = xfs_bmbt_lookup_eq(bma->cur,
+                                        new->br_startoff,
+                                        new->br_startblock,
+                                        new->br_blockcount, &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                        bma->cur->bc_rec.b.br_state = new->br_state;
+                        error = xfs_btree_insert(bma->cur, &i);
+                        if (error)
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                }
+                break;
+        }
+        /*
+         * convert to a btree if necessary
+         * The conversion's own logging flags are merged into bma->logflags
+         * before its error is checked, so they are kept even on failure.
+         */
+        if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+                int     tmp_logflags;   /* partial log flag return val */
+                ASSERT(bma->cur == NULL);
+                error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
+                                bma->firstblock, bma->flist, &bma->cur,
+                                0, &tmp_logflags, whichfork);
+                bma->logflags |= tmp_logflags;
+                if (error)
+                        goto done;
+        }
+        /* clear out the allocated field, done with it now in any case. */
+        if (bma->cur)
+                bma->cur->bc_private.b.allocated = 0;
+        xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
+done:
+        bma->logflags |= rval;  /* reached on error paths too */
+        return error;
+}
+/*
+ * Functions used in the extent read, allocate and remove paths
+ */
+/*
+ * Adjust the size of the new extent based on di_extsize and rt extsize.
+ *
+ * *offp / *lenp carry the requested file offset and length in, and the
+ * aligned offset and length out.  They are only written on the final
+ * successful return; the early "return 0" paths and the -EINVAL paths leave
+ * them untouched.
+ *
+ * Returns 0 when the request is left alone (convert is set, or a non-delay,
+ * non-eof request lies entirely within *gotp) or when it was aligned, and
+ * -EINVAL when rt is set and the result cannot be made a multiple of
+ * mp->m_sb.sb_rextsize while still covering the original request.
+ */
+int
+xfs_bmap_extsize_align(
+        xfs_mount_t     *mp,
+        xfs_bmbt_irec_t *gotp,          /* next extent pointer */
+        xfs_bmbt_irec_t *prevp,         /* previous extent pointer */
+        xfs_extlen_t    extsz,          /* align to this extent size */
+        int             rt,             /* is this a realtime inode? */
+        int             eof,            /* is extent at end-of-file? */
+        int             delay,          /* creating delalloc extent? */
+        int             convert,        /* overwriting unwritten extent? */
+        xfs_fileoff_t   *offp,          /* in/out: aligned offset */
+        xfs_extlen_t    *lenp)          /* in/out: aligned length */
+{
+        xfs_fileoff_t   orig_off;       /* original offset */
+        xfs_extlen_t    orig_alen;      /* original length */
+        xfs_fileoff_t   orig_end;       /* original off+len */
+        xfs_fileoff_t   nexto;          /* next file offset */
+        xfs_fileoff_t   prevo;          /* previous file offset */
+        xfs_fileoff_t   align_off;      /* temp for offset */
+        xfs_extlen_t    align_alen;     /* temp for length */
+        xfs_extlen_t    temp;           /* temp for calculations */
+        if (convert)                    /* conversion request: *offp and *lenp stay as they are */
+                return 0;
+        orig_off = align_off = *offp;
+        orig_alen = align_alen = *lenp;
+        orig_end = orig_off + orig_alen;
+        /*
+         * If this request overlaps an existing extent, then don't
+         * attempt to perform any additional alignment.
+         */
+        if (!delay && !eof &&
+            (orig_off >= gotp->br_startoff) &&
+            (orig_end <= gotp->br_startoff + gotp->br_blockcount)) {
+                return 0;
+        }
+        /*
+         * If the file offset is unaligned vs. the extent size
+         * we need to align it.  This will be possible unless
+         * the file was previously written with a kernel that didn't
+         * perform this alignment, or if a truncate shot us in the
+         * foot.
+         *
+         * The start is moved down by (orig_off mod extsz) and the length
+         * grows by the same amount, so the end of the range is unchanged
+         * by this step.
+         */
+        temp = do_mod(orig_off, extsz);
+        if (temp) {
+                align_alen += temp;
+                align_off -= temp;
+        }
+        /*
+         * Same adjustment for the end of the requested area.
+         * I.e. round align_alen up to the next multiple of extsz.
+         */
+        if ((temp = (align_alen % extsz))) {
+                align_alen += extsz - temp;
+        }
+        /*
+         * If the previous block overlaps with this proposed allocation
+         * then move the start forward without adjusting the length.
+         *
+         * prevo is the lowest offset the aligned start may take: the start
+         * of the previous record when its startblock is HOLESTARTBLOCK,
+         * otherwise the first offset past it, or 0 when there is no
+         * previous record (br_startoff == NULLFILEOFF).
+         */
+        if (prevp->br_startoff != NULLFILEOFF) {
+                if (prevp->br_startblock == HOLESTARTBLOCK)
+                        prevo = prevp->br_startoff;
+                else
+                        prevo = prevp->br_startoff + prevp->br_blockcount;
+        } else
+                prevo = 0;
+        if (align_off != orig_off && align_off < prevo)
+                align_off = prevo;
+        /*
+         * If the next block overlaps with this proposed allocation
+         * then move the start back without adjusting the length,
+         * but not before offset 0.
+         * This may of course make the start overlap previous block,
+         * and if we hit the offset 0 limit then the next block
+         * can still overlap too.
+         *
+         * nexto is the offset the aligned end must not pass.  It is
+         * NULLFILEOFF at eof or when there is no next record.  Otherwise
+         * it is the start of *gotp, except that the end of *gotp is used
+         * when delay is set and *gotp is a hole (HOLESTARTBLOCK), or when
+         * delay is clear and *gotp is delayed (DELAYSTARTBLOCK).
+         */
+        if (!eof && gotp->br_startoff != NULLFILEOFF) {
+                if ((delay && gotp->br_startblock == HOLESTARTBLOCK) ||
+                    (!delay && gotp->br_startblock == DELAYSTARTBLOCK))
+                        nexto = gotp->br_startoff + gotp->br_blockcount;
+                else
+                        nexto = gotp->br_startoff;
+        } else
+                nexto = NULLFILEOFF;
+        if (!eof &&
+            align_off + align_alen != orig_end &&
+            align_off + align_alen > nexto)
+                align_off = nexto > align_alen ? nexto - align_alen : 0;
+        /*
+         * If we're now overlapping the next or previous extent that
+         * means we can't fit an extsz piece in this hole.  Just move
+         * the start forward to the first valid spot and set
+         * the length so we hit the end.
+         */
+        if (align_off != orig_off && align_off < prevo)
+                align_off = prevo;
+        if (align_off + align_alen != orig_end &&
+            align_off + align_alen > nexto &&
+            nexto != NULLFILEOFF) {
+                ASSERT(nexto > prevo);
+                align_alen = nexto - align_off;
+        }
+        /*
+         * If realtime, and the result isn't a multiple of the realtime
+         * extent size we need to remove blocks until it is.
+         * Here temp is the excess (align_alen mod sb_rextsize) that has to
+         * be trimmed away.
+         */
+        if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
+                /*
+                 * We're not covering the original request, or
+                 * we won't be able to once we fix the length.
+                 */
+                if (orig_off < align_off ||
+                    orig_end > align_off + align_alen ||
+                    align_alen - temp < orig_alen)
+                        return -EINVAL;
+                /*
+                 * Try to fix it by moving the start up.
+                 */
+                if (align_off + temp <= orig_off) {
+                        align_alen -= temp;
+                        align_off += temp;
+                }
+                /*
+                 * Try to fix it by moving the end in.
+                 */
+                else if (align_off + align_alen - temp >= orig_end)
+                        align_alen -= temp;
+                /*
+                 * Set the start to the minimum then trim the length.
+                 */
+                else {
+                        align_alen -= orig_off - align_off;
+                        align_off = orig_off;
+                        align_alen -= align_alen % mp->m_sb.sb_rextsize;
+                }
+                /*
+                 * Result doesn't cover the request, fail it.
+                 */
+                if (orig_off < align_off || orig_end > align_off + align_alen)
+                        return -EINVAL;
+        } else {
+                ASSERT(orig_off >= align_off);
+                ASSERT(orig_end <= align_off + align_alen);
+        }
+#ifdef DEBUG
+        if (!eof && gotp->br_startoff != NULLFILEOFF)
+                ASSERT(align_off + align_alen <= gotp->br_startoff);
+        if (prevp->br_startoff != NULLFILEOFF)
+                ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount);
+#endif
+        *lenp = align_alen;
+        *offp = align_off;
+        return 0;
+}
+#define XFS_ALLOC_GAP_UNITS     4       /* a gap is "large" once it exceeds this many multiples of ap->length */
+void
+xfs_bmap_adjacent(
+        struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
+{       /*
+         * Refine the target start block ap->blkno from the extents on
+         * either side of the allocation (ap->prev and ap->got).  ap->blkno
+         * is the only field of *ap written here; when neither neighbour
+         * yields a usable block it keeps the value the caller stored.
+         *
+         * ISVALID(x, y) below accepts block x as follows: in the realtime
+         * case x must be below sb_rblocks; otherwise x must lie in the same
+         * AG as y, with an AG number below sb_agcount and an AG-relative
+         * block number below sb_agblocks.
+         */
+        xfs_fsblock_t   adjust;         /* adjustment to block numbers */
+        xfs_agnumber_t  fb_agno;        /* ag number of ap->firstblock */
+        xfs_mount_t     *mp;            /* mount point structure */
+        int             nullfb;         /* true if ap->firstblock isn't set */
+        int             rt;             /* true if inode is realtime */
+#define ISVALID(x,y)    \
+        (rt ? \
+                (x) < mp->m_sb.sb_rblocks : \
+                XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
+                XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
+                XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
+        mp = ap->ip->i_mount;
+        nullfb = *ap->firstblock == NULLFSBLOCK;
+        rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
+        fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
+        /*
+         * If allocating at eof, and there's a previous real block,
+         * try to use its last block as our starting point.
+         * (If we are at eof but this test fails, neither branch runs and
+         * ap->blkno is left as the caller set it.)
+         */
+        if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
+            !isnullstartblock(ap->prev.br_startblock) &&
+            ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
+                    ap->prev.br_startblock)) {
+                ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
+                /*
+                 * Adjust for the gap between prevp and us.
+                 * The adjustment is dropped if it would give an invalid
+                 * block number.
+                 */
+                adjust = ap->offset -
+                        (ap->prev.br_startoff + ap->prev.br_blockcount);
+                if (adjust &&
+                    ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
+                        ap->blkno += adjust;
+        }
+        /*
+         * If not at eof, then compare the two neighbor blocks.
+         * Figure out whether either one gives us a good starting point,
+         * and pick the better one.
+         */
+        else if (!ap->eof) {
+                xfs_fsblock_t   gotbno;         /* right side block number */
+                xfs_fsblock_t   gotdiff=0;      /* right side difference */
+                xfs_fsblock_t   prevbno;        /* left side block number */
+                xfs_fsblock_t   prevdiff=0;     /* left side difference */
+                /*
+                 * If there's a previous (left) block, select a requested
+                 * start block based on it.
+                 * Note that prevbno is assigned inside the condition; a
+                 * zero result also sends us to the "default" else branch.
+                 */
+                if (ap->prev.br_startoff != NULLFILEOFF &&
+                    !isnullstartblock(ap->prev.br_startblock) &&
+                    (prevbno = ap->prev.br_startblock +
+                               ap->prev.br_blockcount) &&
+                    ISVALID(prevbno, ap->prev.br_startblock)) {
+                        /*
+                         * Calculate gap to end of previous block.
+                         */
+                        adjust = prevdiff = ap->offset -
+                                (ap->prev.br_startoff +
+                                 ap->prev.br_blockcount);
+                        /*
+                         * Figure the startblock based on the previous block's
+                         * end and the gap size.
+                         * Heuristic!
+                         * If the gap is large relative to the piece we're
+                         * allocating, or using it gives us an invalid block
+                         * number, then just use the end of the previous block.
+                         */
+                        if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
+                            ISVALID(prevbno + prevdiff,
+                                    ap->prev.br_startblock))
+                                prevbno += adjust;
+                        else
+                                prevdiff += adjust;     /* adjust == prevdiff here, so this doubles the distance used in the comparison below */
+                        /*
+                         * If the firstblock forbids it, can't use it,
+                         * must use default.
+                         */
+                        if (!rt && !nullfb &&
+                            XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
+                                prevbno = NULLFSBLOCK;
+                }
+                /*
+                 * No previous block or can't follow it, just default.
+                 */
+                else
+                        prevbno = NULLFSBLOCK;
+                /*
+                 * If there's a following (right) block, select a requested
+                 * start block based on it.
+                 */
+                if (!isnullstartblock(ap->got.br_startblock)) {
+                        /*
+                         * Calculate gap to start of next block.
+                         */
+                        adjust = gotdiff = ap->got.br_startoff - ap->offset;
+                        /*
+                         * Figure the startblock based on the next block's
+                         * start and the gap size.
+                         */
+                        gotbno = ap->got.br_startblock;
+                        /*
+                         * Heuristic!
+                         * If the gap is large relative to the piece we're
+                         * allocating, or using it gives us an invalid block
+                         * number, then just use the start of the next block
+                         * offset by our length.
+                         * If that is invalid as well, gotbno stays at the
+                         * start of the next block and gotdiff is doubled.
+                         */
+                        if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
+                            ISVALID(gotbno - gotdiff, gotbno))
+                                gotbno -= adjust;
+                        else if (ISVALID(gotbno - ap->length, gotbno)) {
+                                gotbno -= ap->length;
+                                gotdiff += adjust - ap->length;
+                        } else
+                                gotdiff += adjust;
+                        /*
+                         * If the firstblock forbids it, can't use it,
+                         * must use default.
+                         */
+                        if (!rt && !nullfb &&
+                            XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
+                                gotbno = NULLFSBLOCK;
+                }
+                /*
+                 * No next block, just default.
+                 */
+                else
+                        gotbno = NULLFSBLOCK;
+                /*
+                 * If both valid, pick the better one, else the only good
+                 * one, else ap->blkno is already set (to 0 or the inode block).
+                 * "Better" means the smaller diff; a tie goes to prevbno.
+                 */
+                if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
+                        ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
+                else if (prevbno != NULLFSBLOCK)
+                        ap->blkno = prevbno;
+                else if (gotbno != NULLFSBLOCK)
+                        ap->blkno = gotbno;
+        }
+#undef ISVALID
+}
+/*
+ * Raise *blen to the longest free extent of allocation group @ag when that
+ * is larger than the value already stored there.
+ *
+ * If the perag has pagf_init clear, xfs_alloc_pagf_init() is tried with
+ * XFS_ALLOC_FLAG_TRYLOCK first.  Should pagf_init still be clear afterwards,
+ * *notinit is set to 1 and *blen is left alone.
+ *
+ * Returns the error from xfs_alloc_pagf_init(), or 0.
+ */
+static int
+xfs_bmap_longest_free_extent(
+        struct xfs_trans        *tp,
+        xfs_agnumber_t          ag,
+        xfs_extlen_t            *blen,
+        int                     *notinit)
+{
+        struct xfs_mount        *mount = tp->t_mountp;
+        struct xfs_perag        *perag = xfs_perag_get(mount, ag);
+        xfs_extlen_t            ag_longest;
+        int                     need_init = !perag->pagf_init;
+        int                     rc = 0;
+        if (need_init)
+                rc = xfs_alloc_pagf_init(mount, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
+        if (!rc) {
+                if (need_init && !perag->pagf_init) {
+                        /* Still not initialised after the attempt. */
+                        *notinit = 1;
+                } else {
+                        ag_longest = xfs_alloc_longest_free_extent(mount, perag);
+                        if (ag_longest > *blen)
+                                *blen = ag_longest;
+                }
+        }
+        /* Pairs with the xfs_perag_get() above on every path. */
+        xfs_perag_put(perag);
+        return rc;
+}
+/*
+ * Choose args->minlen from the best free extent length seen so far (*blen).
+ *
+ *  - notinit set, or *blen below ap->minlen: use ap->minlen.  The AG scan
+ *    used trylock semantics, so there may still be space for the request.
+ *  - *blen below args->maxlen: use *blen, the best length seen.
+ *  - otherwise: an extent of at least maxlen was seen, so use args->maxlen.
+ */
+static void
+xfs_bmap_select_minlen(
+        struct xfs_bmalloca     *ap,
+        struct xfs_alloc_arg    *args,
+        xfs_extlen_t            *blen,
+        int                     notinit)
+{
+        if (notinit || *blen < ap->minlen) {
+                args->minlen = ap->minlen;
+                return;
+        }
+        /* Clamp the best length seen to the requested maximum. */
+        args->minlen = (*blen < args->maxlen) ? *blen : args->maxlen;
+}
+/*
+ * Prepare @args for a XFS_ALLOCTYPE_START_BNO allocation (used by
+ * xfs_bmap_btalloc() when no firstblock has been set).
+ *
+ * Starting from the AG that holds args->fsbno (AG 0 if that maps to
+ * NULLAGNUMBER), walk the AGs in order, wrapping at sb_agcount, and collect
+ * the longest free extent into *blen.  The walk stops once *blen reaches
+ * args->maxlen or the starting AG comes round again.  args->minlen is then
+ * derived from what was seen.
+ *
+ * Returns the first error from xfs_bmap_longest_free_extent(), or 0.
+ */
+STATIC int
+xfs_bmap_btalloc_nullfb(
+        struct xfs_bmalloca     *ap,
+        struct xfs_alloc_arg    *args,
+        xfs_extlen_t            *blen)
+{
+        struct xfs_mount        *mp = ap->ip->i_mount;
+        xfs_agnumber_t          first_ag;
+        xfs_agnumber_t          cur_ag;
+        int                     saw_uninit = 0;
+        int                     rc;
+        args->type = XFS_ALLOCTYPE_START_BNO;
+        args->total = ap->total;
+        first_ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+        if (first_ag == NULLAGNUMBER)
+                first_ag = 0;
+        cur_ag = first_ag;
+        for (;;) {
+                /* Good enough already: an extent of maxlen has been seen. */
+                if (*blen >= args->maxlen)
+                        break;
+                rc = xfs_bmap_longest_free_extent(args->tp, cur_ag, blen,
+                                                  &saw_uninit);
+                if (rc)
+                        return rc;
+                /* Step to the next AG, wrapping round to AG 0. */
+                cur_ag++;
+                if (cur_ag == mp->m_sb.sb_agcount)
+                        cur_ag = 0;
+                if (cur_ag == first_ag)
+                        break;
+        }
+        xfs_bmap_select_minlen(ap, args, blen, saw_uninit);
+        return 0;
+}
+/*
+ * Prepare @args for a XFS_ALLOCTYPE_NEAR_BNO allocation on the filestreams
+ * path of xfs_bmap_btalloc().
+ *
+ * The AG holding args->fsbno (AG 0 if that maps to NULLAGNUMBER) is examined
+ * first.  If its longest free extent is shorter than args->maxlen,
+ * xfs_filestream_new_ag() is asked for another AG and that one is examined
+ * as well.  args->minlen is derived from the result, and both ap->blkno and
+ * args->fsbno are pointed at block 0 of the AG finally selected.
+ *
+ * Returns the first error encountered, or 0.
+ */
+STATIC int
+xfs_bmap_btalloc_filestreams(
+        struct xfs_bmalloca     *ap,
+        struct xfs_alloc_arg    *args,
+        xfs_extlen_t            *blen)
+{
+        struct xfs_mount        *mp = ap->ip->i_mount;
+        xfs_agnumber_t          stream_ag;
+        int                     saw_uninit = 0;
+        int                     rc;
+        args->type = XFS_ALLOCTYPE_NEAR_BNO;
+        args->total = ap->total;
+        stream_ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+        if (stream_ag == NULLAGNUMBER)
+                stream_ag = 0;
+        rc = xfs_bmap_longest_free_extent(args->tp, stream_ag, blen,
+                                          &saw_uninit);
+        if (!rc && *blen < args->maxlen) {
+                /* Current AG is too fragmented: pick a new one and retry. */
+                rc = xfs_filestream_new_ag(ap, &stream_ag);
+                if (!rc)
+                        rc = xfs_bmap_longest_free_extent(args->tp, stream_ag,
+                                                          blen, &saw_uninit);
+        }
+        if (rc)
+                return rc;
+        xfs_bmap_select_minlen(ap, args, blen, saw_uninit);
+        /*
+         * The stream may have moved, so aim the failure fallback at the
+         * start of the AG that was selected.
+         */
+        args->fsbno = XFS_AGB_TO_FSB(mp, stream_ag, 0);
+        ap->blkno = args->fsbno;
+        return 0;
+}
+STATIC int
+xfs_bmap_btalloc(
+        struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
+{       /*
+         * Allocate an extent for *ap through xfs_alloc_vextent().
+         *
+         * Returns a non-zero error from the AG-scanning helpers or from
+         * xfs_alloc_vextent(), otherwise 0.  A 0 return does not by itself
+         * mean space was found: when every attempt comes back with
+         * NULLFSBLOCK, ap->blkno is set to NULLFSBLOCK and ap->length to 0.
+         * On success ap->blkno / ap->length describe the extent obtained,
+         * *ap->firstblock is filled in if it was unset, and the inode
+         * block count, delayed-block count (if ap->wasdel) and quota are
+         * updated.
+         */
+        xfs_mount_t     *mp;            /* mount point structure */
+        xfs_alloctype_t atype = 0;      /* type for allocation routines */
+        xfs_extlen_t    align;          /* minimum allocation alignment */
+        xfs_agnumber_t  fb_agno;        /* ag number of ap->firstblock */
+        xfs_agnumber_t  ag;
+        xfs_alloc_arg_t args;
+        xfs_extlen_t    blen;
+        xfs_extlen_t    nextminlen = 0;
+        int             nullfb;         /* true if ap->firstblock isn't set */
+        int             isaligned;
+        int             tryagain;
+        int             error;
+        int             stripe_align;
+        ASSERT(ap->length);
+        mp = ap->ip->i_mount;
+        /*
+         * stripe alignment for allocation is determined by mount parameters
+         * (m_swidth when XFS_MOUNT_SWALLOC is set, else m_dalign, else 0)
+         */
+        stripe_align = 0;
+        if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+                stripe_align = mp->m_swidth;
+        else if (mp->m_dalign)
+                stripe_align = mp->m_dalign;
+        align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
+        if (unlikely(align)) {
+                error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+                                                align, 0, ap->eof, 0, ap->conv,
+                                                &ap->offset, &ap->length);
+                ASSERT(!error);         /* rt is passed as 0, and the callee's only failure returns sit in its rt branch */
+                ASSERT(ap->length);
+        }
+        nullfb = *ap->firstblock == NULLFSBLOCK;
+        fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
+        if (nullfb) {
+                if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
+                        ag = xfs_filestream_lookup_ag(ap->ip);
+                        ag = (ag != NULLAGNUMBER) ? ag : 0;
+                        ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
+                } else {
+                        ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
+                }
+        } else
+                ap->blkno = *ap->firstblock;
+        xfs_bmap_adjacent(ap);
+        /*
+         * If allowed, use ap->blkno; otherwise must use firstblock since
+         * it's in the right allocation group.
+         */
+        if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno)
+                ;
+        else
+                ap->blkno = *ap->firstblock;
+        /*
+         * Normal allocation, done through xfs_alloc_vextent.
+         */
+        tryagain = isaligned = 0;
+        memset(&args, 0, sizeof(args));
+        args.tp = ap->tp;
+        args.mp = mp;
+        args.fsbno = ap->blkno;
+        /* Trim the allocation back to the maximum an AG can fit. */
+        args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
+        args.firstblock = *ap->firstblock;
+        blen = 0;
+        if (nullfb) {
+                /*
+                 * Search for an allocation group with a single extent large
+                 * enough for the request.  If one isn't found, then adjust
+                 * the minimum allocation size to the largest space found.
+                 * Both helpers set args.type, args.total and args.minlen
+                 * and report the longest extent seen through blen.
+                 */
+                if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+                        error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
+                else
+                        error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
+                if (error)
+                        return error;
+        } else if (ap->flist->xbf_low) {
+                if (xfs_inode_is_filestream(ap->ip))
+                        args.type = XFS_ALLOCTYPE_FIRST_AG;
+                else
+                        args.type = XFS_ALLOCTYPE_START_BNO;
+                args.total = args.minlen = ap->minlen;
+        } else {
+                args.type = XFS_ALLOCTYPE_NEAR_BNO;
+                args.total = ap->total;
+                args.minlen = ap->minlen;
+        }
+        /*
+         * apply extent size hints if obtained earlier
+         * (otherwise prod/mod are derived from PAGE_CACHE_SIZE when the
+         * filesystem block size is smaller than that)
+         */
+        if (unlikely(align)) {
+                args.prod = align;
+                if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
+                        args.mod = (xfs_extlen_t)(args.prod - args.mod);
+        } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
+                args.prod = 1;
+                args.mod = 0;
+        } else {
+                args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
+                if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
+                        args.mod = (xfs_extlen_t)(args.prod - args.mod);
+        }
+        /*
+         * If we are not low on available data blocks, and the
+         * underlying logical volume manager is a stripe, and
+         * the file offset is zero then try to allocate data
+         * blocks on stripe unit boundary.
+         * NOTE: ap->aeof is only set if the allocation length
+         * is >= the stripe unit and the allocation offset is
+         * at the end of file.
+         * NOTE(review): the test below checks only xbf_low and ap->aeof;
+         * stripe_align itself is not tested and is 0 when no stripe
+         * geometry is configured - presumably ap->aeof is never set in
+         * that case, confirm against the caller.
+         */
+        if (!ap->flist->xbf_low && ap->aeof) {
+                if (!ap->offset) {
+                        args.alignment = stripe_align;
+                        atype = args.type;
+                        isaligned = 1;
+                        /*
+                         * Adjust for alignment
+                         */
+                        if (blen > args.alignment && blen <= args.maxlen)
+                                args.minlen = blen - args.alignment;
+                        args.minalignslop = 0;
+                } else {
+                        /*
+                         * First try an exact bno allocation.
+                         * If it fails then do a near or start bno
+                         * allocation with alignment turned on.
+                         */
+                        atype = args.type;
+                        tryagain = 1;
+                        args.type = XFS_ALLOCTYPE_THIS_BNO;
+                        args.alignment = 1;
+                        /*
+                         * Compute the minlen+alignment for the
+                         * next case.  Set slop so that the value
+                         * of minlen+alignment+slop doesn't go up
+                         * between the calls.
+                         */
+                        if (blen > stripe_align && blen <= args.maxlen)
+                                nextminlen = blen - stripe_align;
+                        else
+                                nextminlen = args.minlen;
+                        if (nextminlen + stripe_align > args.minlen + 1)
+                                args.minalignslop =
+                                        nextminlen + stripe_align -
+                                        args.minlen - 1;
+                        else
+                                args.minalignslop = 0;
+                }
+        } else {
+                args.alignment = 1;
+                args.minalignslop = 0;
+        }
+        args.minleft = ap->minleft;
+        args.wasdel = ap->wasdel;
+        args.isfl = 0;
+        args.userdata = ap->userdata;
+        if ((error = xfs_alloc_vextent(&args)))         /* first attempt */
+                return error;
+        if (tryagain && args.fsbno == NULLFSBLOCK) {
+                /*
+                 * Exact allocation failed. Now try with alignment
+                 * turned on.
+                 */
+                args.type = atype;
+                args.fsbno = ap->blkno;
+                args.alignment = stripe_align;
+                args.minlen = nextminlen;
+                args.minalignslop = 0;
+                isaligned = 1;
+                if ((error = xfs_alloc_vextent(&args)))
+                        return error;
+        }
+        if (isaligned && args.fsbno == NULLFSBLOCK) {
+                /*
+                 * allocation failed, so turn off alignment and
+                 * try again.
+                 */
+                args.type = atype;
+                args.fsbno = ap->blkno;
+                args.alignment = 0;
+                if ((error = xfs_alloc_vextent(&args)))
+                        return error;
+        }
+        if (args.fsbno == NULLFSBLOCK && nullfb &&      /* still nothing: drop minlen back to the caller's minimum */
+            args.minlen > ap->minlen) {
+                args.minlen = ap->minlen;
+                args.type = XFS_ALLOCTYPE_START_BNO;
+                args.fsbno = ap->blkno;
+                if ((error = xfs_alloc_vextent(&args)))
+                        return error;
+        }
+        if (args.fsbno == NULLFSBLOCK && nullfb) {      /* final attempt: FIRST_AG, total = minlen, no minleft */
+                args.fsbno = 0;
+                args.type = XFS_ALLOCTYPE_FIRST_AG;
+                args.total = ap->minlen;
+                args.minleft = 0;
+                if ((error = xfs_alloc_vextent(&args)))
+                        return error;
+                ap->flist->xbf_low = 1;                 /* set whether or not this attempt found space */
+        }
+        if (args.fsbno != NULLFSBLOCK) {
+                /*
+                 * check the allocation happened at the same or higher AG than
+                 * the first block that was allocated.
+                 */
+                ASSERT(*ap->firstblock == NULLFSBLOCK ||
+                       XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
+                       XFS_FSB_TO_AGNO(mp, args.fsbno) ||
+                       (ap->flist->xbf_low &&
+                        XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
+                        XFS_FSB_TO_AGNO(mp, args.fsbno)));
+                ap->blkno = args.fsbno;
+                if (*ap->firstblock == NULLFSBLOCK)
+                        *ap->firstblock = args.fsbno;
+                ASSERT(nullfb || fb_agno == args.agno ||
+                       (ap->flist->xbf_low && fb_agno < args.agno));
+                ap->length = args.len;
+                ap->ip->i_d.di_nblocks += args.len;
+                xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+                if (ap->wasdel)
+                        ap->ip->i_delayed_blks -= args.len;
+                /*
+                 * Adjust the disk quota also. This was reserved
+                 * earlier.
+                 */
+                xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+                        ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
+                                        XFS_TRANS_DQ_BCOUNT,
+                        (long) args.len);
+        } else {
+                ap->blkno = NULLFSBLOCK;
+                ap->length = 0;
+        }
+        return 0;
+}
+/*
+ * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
+ * It dispatches to the realtime allocator when the inode is a realtime inode
+ * and the allocation is for user data (ap->userdata), and to the btree
+ * allocator in every other case.  The chosen allocator's return value is
+ * passed straight back.
+ */
+STATIC int
+xfs_bmap_alloc(
+        struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
+{
+        int     use_rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
+        return use_rt ? xfs_bmap_rtalloc(ap) : xfs_bmap_btalloc(ap);
+}
+/*
+ * Fill *mval from the extent *got, trimmed to the required bounds.
+ *
+ * When XFS_BMAPI_ENTIRE is set, or *got ends at or before obno, *got is
+ * copied whole.  Otherwise *bno is first raised to obno if it lies below it,
+ * and the mapping returned starts at *bno and runs to the end of *got or to
+ * @end, whichever comes first.
+ *
+ * In both cases a null startblock in *got is reported as DELAYSTARTBLOCK.
+ */
+STATIC void
+xfs_bmapi_trim_map(
+        struct xfs_bmbt_irec    *mval,
+        struct xfs_bmbt_irec    *got,
+        xfs_fileoff_t           *bno,
+        xfs_filblks_t           len,
+        xfs_fileoff_t           obno,
+        xfs_fileoff_t           end,
+        int                     n,
+        int                     flags)
+{
+        xfs_fileoff_t           lead;   /* blocks of got lying before *bno */
+        int                     want_whole;
+        want_whole = (flags & XFS_BMAPI_ENTIRE) ||
+                     got->br_startoff + got->br_blockcount <= obno;
+        if (want_whole) {
+                *mval = *got;
+                if (isnullstartblock(got->br_startblock))
+                        mval->br_startblock = DELAYSTARTBLOCK;
+                return;
+        }
+        if (*bno < obno)
+                *bno = obno;
+        ASSERT((*bno >= obno) || (n == 0));
+        ASSERT(*bno < end);
+        mval->br_startoff = *bno;
+        lead = *bno - got->br_startoff;
+        if (isnullstartblock(got->br_startblock))
+                mval->br_startblock = DELAYSTARTBLOCK;
+        else
+                mval->br_startblock = got->br_startblock + lead;
+        /*
+         * The length is the smaller of what is left of the request and
+         * what is left of the extent from *bno onwards.
+         */
+        mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno,
+                                              got->br_blockcount - lead);
+        mval->br_state = got->br_state;
+        ASSERT(mval->br_blockcount <= len);
+}
+/*
+ * Update and validate the extent map to return.
+ *
+ * *bno is advanced past the mapping at **map and *len is recomputed from
+ * @end.  The mapping is then either folded into the preceding array entry
+ * or kept as a new entry:
+ *
+ *  - same start offset as the previous entry: the previous entry takes over
+ *    the new block count and state;
+ *  - real blocks physically contiguous with a real previous entry (states
+ *    equal, or XFS_BMAPI_IGSTATE set): the previous entry is lengthened;
+ *  - delayed allocation logically contiguous with a delayed previous entry:
+ *    the previous entry is lengthened and takes over the state;
+ *  - otherwise *map and *n are advanced, except for a first mapping
+ *    (*n == 0) that ends at or before obno, which is left to be overwritten.
+ */
+STATIC void
+xfs_bmapi_update_map(
+        struct xfs_bmbt_irec    **map,
+        xfs_fileoff_t           *bno,
+        xfs_filblks_t           *len,
+        xfs_fileoff_t           obno,
+        xfs_fileoff_t           end,
+        int                     *n,
+        int                     flags)
+{
+        xfs_bmbt_irec_t *cur = *map;
+        xfs_bmbt_irec_t *last;          /* entry before cur, valid if *n > 0 */
+        int             advance = 1;    /* keep cur as a new array entry? */
+        ASSERT((flags & XFS_BMAPI_ENTIRE) ||
+               ((cur->br_startoff + cur->br_blockcount) <= end));
+        ASSERT((flags & XFS_BMAPI_ENTIRE) || (cur->br_blockcount <= *len) ||
+               (cur->br_startoff < obno));
+        *bno = cur->br_startoff + cur->br_blockcount;
+        *len = end - *bno;
+        if (*n > 0) {
+                last = cur - 1;
+                if (cur->br_startoff == last->br_startoff) {
+                        /* update previous map with new information */
+                        ASSERT(cur->br_startblock == last->br_startblock);
+                        ASSERT(cur->br_blockcount > last->br_blockcount);
+                        ASSERT(cur->br_state == last->br_state);
+                        last->br_blockcount = cur->br_blockcount;
+                        last->br_state = cur->br_state;
+                        advance = 0;
+                } else if (cur->br_startblock != DELAYSTARTBLOCK &&
+                           last->br_startblock != DELAYSTARTBLOCK &&
+                           last->br_startblock != HOLESTARTBLOCK &&
+                           cur->br_startblock == last->br_startblock +
+                                                 last->br_blockcount &&
+                           ((flags & XFS_BMAPI_IGSTATE) ||
+                                last->br_state == cur->br_state)) {
+                        /* real extent following straight on from a real one */
+                        ASSERT(cur->br_startoff ==
+                               last->br_startoff + last->br_blockcount);
+                        last->br_blockcount += cur->br_blockcount;
+                        advance = 0;
+                } else if (cur->br_startblock == DELAYSTARTBLOCK &&
+                           last->br_startblock == DELAYSTARTBLOCK &&
+                           cur->br_startoff ==
+                           last->br_startoff + last->br_blockcount) {
+                        /* delayed extent following straight on from another */
+                        last->br_blockcount += cur->br_blockcount;
+                        last->br_state = cur->br_state;
+                        advance = 0;
+                }
+        } else if ((*n == 0) &&
+                   ((cur->br_startoff + cur->br_blockcount) <= obno)) {
+                /* first mapping lies wholly before obno: reuse the slot */
+                advance = 0;
+        }
+        if (advance) {
+                cur++;
+                (*n)++;
+        }
+        *map = cur;
+}
+/*
+ * Map file blocks to filesystem blocks without allocation; unmapped ranges are returned as HOLESTARTBLOCK mappings.
+ */
+int                                     /* 0, -EFSCORRUPTED, -EIO or the xfs_iread_extents() error */
+xfs_bmapi_read(
+        struct xfs_inode        *ip,    /* incore inode */
+        xfs_fileoff_t           bno,    /* starting file offs. mapped */
+        xfs_filblks_t           len,    /* length to map in file */
+        struct xfs_bmbt_irec    *mval,  /* output: map values */
+        int                     *nmap,  /* i/o: mval size/count */
+        int                     flags)  /* XFS_BMAPI_... */
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        struct xfs_ifork        *ifp;   /* fork selected by whichfork */
+        struct xfs_bmbt_irec    got;    /* current file extent record */
+        struct xfs_bmbt_irec    prev;   /* previous file extent record */
+        xfs_fileoff_t           obno;   /* original value of bno */
+        xfs_fileoff_t           end;    /* end of mapped file region */
+        xfs_extnum_t            lastx;  /* extent index, advanced to fetch the next record */
+        int                     error;
+        int                     eof;    /* we've hit the end of extents */
+        int                     n = 0;  /* number of mval entries filled so far */
+        int                     whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+                                                XFS_ATTR_FORK : XFS_DATA_FORK;
+        ASSERT(*nmap >= 1);             /* caller must provide room for at least one mapping */
+        ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
+                           XFS_BMAPI_IGSTATE)));        /* no other flags are accepted */
+        ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
+        if (unlikely(XFS_TEST_ERROR(
+            (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+             XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), /* i.e. format is neither extents nor btree */
+             mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+                XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
+                return -EFSCORRUPTED;
+        }
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return -EIO;
+        XFS_STATS_INC(xs_blk_mapr);
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        if (!(ifp->if_flags & XFS_IFEXTENTS)) { /* XFS_IFEXTENTS clear: call xfs_iread_extents() before searching */
+                error = xfs_iread_extents(NULL, ip, whichfork);
+                if (error)
+                        return error;
+        }
+        xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); /* fills eof, lastx, got and prev for bno */
+        end = bno + len;                /* first offset past the requested range */
+        obno = bno;                     /* bno itself is advanced as mappings are produced */
+        while (bno < end && n < *nmap) {        /* until the range is covered or mval is full */
+                /* Reading past eof, act as though there's a hole up to end. */
+                if (eof)
+                        got.br_startoff = end;
+                if (got.br_startoff > bno) {
+                        /* Reading in a hole.  */
+                        mval->br_startoff = bno;
+                        mval->br_startblock = HOLESTARTBLOCK;
+                        mval->br_blockcount =
+                                XFS_FILBLKS_MIN(len, got.br_startoff - bno);    /* hole ends at got or at the end of the request */
+                        mval->br_state = XFS_EXT_NORM;
+                        bno += mval->br_blockcount;
+                        len -= mval->br_blockcount;
+                        mval++;
+                        n++;
+                        continue;       /* re-test the loop condition; got is handled on the next pass */
+                }
+                /* set up the extent map to return. */
+                xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+                xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);  /* may advance mval, bno, len and n */
+                /* If we're done, stop now. */
+                if (bno >= end || n >= *nmap)
+                        break;
+                /* Else go on to the next record. */
+                if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))   /* another record exists in the fork */
+                        xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
+                else
+                        eof = 1;        /* ran off the end of the extent records */
+        }
+        *nmap = n;                      /* report how many mappings were filled in */
+        return 0;
+}
+STATIC int      /* Reserve quota and free-space counters for a delayed allocation at aoff and insert it into the data fork; 0 or error. */
+xfs_bmapi_reserve_delalloc(
+        struct xfs_inode        *ip,    /* incore inode */
+        xfs_fileoff_t           aoff,   /* file offset to reserve at (local copy may be moved by extent size alignment) */
+        xfs_filblks_t           len,    /* requested length */
+        struct xfs_bmbt_irec    *got,   /* in: extent that limits the length when !eof; out: the delalloc record as stored */
+        struct xfs_bmbt_irec    *prev,  /* previous extent, passed to xfs_bmap_extsize_align() */
+        xfs_extnum_t            *lastx, /* i/o: extent index; indexes the resulting record afterwards */
+        int                     eof)    /* nonzero: got does not limit the reservation length */
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+        xfs_extlen_t            alen;   /* length actually reserved */
+        xfs_extlen_t            indlen; /* worst-case indirect blocks for alen */
+        char                    rt = XFS_IS_REALTIME_INODE(ip);
+        xfs_extlen_t            extsz;  /* extent size hint; reused below as the realtime extent count */
+        int                     error;
+        alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
+        if (!eof)
+                alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);  /* do not run into the next extent */
+        /* Figure out the extent size, adjust alen */
+        extsz = xfs_get_extsz_hint(ip);
+        if (extsz) {
+                /*
+                 * Make sure we don't exceed a single extent length when we
+                 * align the extent by reducing length we are going to
+                 * allocate by the maximum amount extent size alignment may
+                 * require.
+                 */
+                alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1));       /* NOTE(review): recomputed from len, so the !eof clamp above is discarded - confirm the align helper accounts for got */
+                error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
+                                               1, 0, &aoff, &alen);
+                ASSERT(!error);         /* error is overwritten below; only this ASSERT looks at it */
+        }
+        if (rt)
+                extsz = alen / mp->m_sb.sb_rextsize;    /* from here on extsz counts realtime extents */
+        /*
+         * Make a transaction-less quota reservation for delayed allocation
+         * blocks.  This number gets adjusted later.  On failure we return
+         * right away; nothing else has been reserved yet.
+         */
+        error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
+                        rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+        if (error)
+                return error;
+        /*
+         * Split changing sb for alen and indlen since they could be coming
+         * from different places.
+         */
+        indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
+        ASSERT(indlen > 0);
+        if (rt) {
+                error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
+                                          -((int64_t)extsz), 0);        /* realtime: take extsz from the FREXTENTS counter */
+        } else {
+                error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+                                                 -((int64_t)alen), 0);  /* otherwise: take alen from the FDBLOCKS counter */
+        }
+        if (error)
+                goto out_unreserve_quota;
+        error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+                                         -((int64_t)indlen), 0);        /* indlen always comes from FDBLOCKS */
+        if (error)
+                goto out_unreserve_blocks;
+        ip->i_delayed_blks += alen;
+        got->br_startoff = aoff;        /* build the new delalloc record in *got */
+        got->br_startblock = nullstartblock(indlen);    /* presumably encodes indlen in a null start block; see the isnullstartblock() ASSERT below */
+        got->br_blockcount = alen;
+        got->br_state = XFS_EXT_NORM;
+        xfs_bmap_add_extent_hole_delay(ip, lastx, got);
+        /*
+         * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
+         * might have merged it into one of the neighbouring ones.
+         */
+        xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
+        ASSERT(got->br_startoff <= aoff);       /* the stored record must cover [aoff, aoff + alen) */
+        ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
+        ASSERT(isnullstartblock(got->br_startblock));
+        ASSERT(got->br_state == XFS_EXT_NORM);
+        return 0;
+out_unreserve_blocks:   /* indlen reservation failed: give back the alen/extsz counter reservation */
+        if (rt)
+                xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0);
+        else
+                xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
+out_unreserve_quota:    /* give back the quota reservation made above, if quota is on */
+        if (XFS_IS_QUOTA_ON(mp))
+                xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
+                                XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+        return error;
+}
+/*
+ * Map file blocks to filesystem blocks, adding delayed allocations as needed.  Operates on the data fork only.
+ */
+int                                     /* 0 (also for a partial mapping) or an error if nothing was mapped */
+xfs_bmapi_delay(
+        struct xfs_inode        *ip,    /* incore inode */
+        xfs_fileoff_t           bno,    /* starting file offs. mapped */
+        xfs_filblks_t           len,    /* length to map in file */
+        struct xfs_bmbt_irec    *mval,  /* output: map values */
+        int                     *nmap,  /* i/o: mval size/count */
+        int                     flags)  /* XFS_BMAPI_... */
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+        struct xfs_bmbt_irec    got;    /* current file extent record */
+        struct xfs_bmbt_irec    prev;   /* previous file extent record */
+        xfs_fileoff_t           obno;   /* old block number (offset) */
+        xfs_fileoff_t           end;    /* end of mapped file region */
+        xfs_extnum_t            lastx;  /* last useful extent number */
+        int                     eof;    /* we've hit the end of extents */
+        int                     n = 0;  /* current extent index */
+        int                     error = 0;
+        ASSERT(*nmap >= 1);
+        ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+        ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));   /* XFS_BMAPI_ENTIRE is the only flag accepted */
+        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+        if (unlikely(XFS_TEST_ERROR(
+            (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+             XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),     /* i.e. format is neither extents nor btree */
+             mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+                XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
+                return -EFSCORRUPTED;
+        }
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return -EIO;
+        XFS_STATS_INC(xs_blk_mapw);
+        if (!(ifp->if_flags & XFS_IFEXTENTS)) { /* XFS_IFEXTENTS clear: call xfs_iread_extents() before searching */
+                error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+                if (error)
+                        return error;
+        }
+        xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
+        end = bno + len;                /* first offset past the requested range */
+        obno = bno;
+        while (bno < end && n < *nmap) {
+                if (eof || got.br_startoff > bno) {     /* bno is not covered by got: reserve a delalloc extent here */
+                        error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
+                                                           &prev, &lastx, eof);
+                        if (error) {
+                                if (n == 0) {   /* nothing mapped yet: report the failure */
+                                        *nmap = 0;
+                                        return error;
+                                }
+                                break;  /* otherwise return the mappings gathered so far; error is dropped */
+                        }
+                }
+                /* set up the extent map to return. */
+                xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
+                xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+                /* If we're done, stop now. */
+                if (bno >= end || n >= *nmap)
+                        break;
+                /* Else go on to the next record. */
+                prev = got;
+                if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))   /* another record exists in the fork */
+                        xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
+                else
+                        eof = 1;        /* ran off the end of the extent records */
+        }
+        *nmap = n;                      /* report how many mappings were filled in */
+        return 0;
+}
+static int      /* Allocate blocks for the range described by bma and insert the new extent into the fork; 0 or error. */
+xfs_bmapi_allocate(
+        struct xfs_bmalloca     *bma)   /* i/o: allocation arguments and state, updated in place */
+{
+        struct xfs_mount        *mp = bma->ip->i_mount;
+        int                     whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
+                                                XFS_ATTR_FORK : XFS_DATA_FORK;
+        struct xfs_ifork        *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+        int                     tmp_logflags = 0;       /* never written in this function */
+        int                     error;
+        ASSERT(bma->length > 0);
+        /*
+         * For the wasdelay case, we could also just allocate the stuff asked
+         * for in this bmap call but that wouldn't be as good.
+         */
+        if (bma->wasdel) {
+                bma->length = (xfs_extlen_t)bma->got.br_blockcount;     /* take the whole delalloc extent, not just the requested part */
+                bma->offset = bma->got.br_startoff;
+                if (bma->idx != NULLEXTNUM && bma->idx) {       /* a record exists before idx: reload prev from it */
+                        xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
+                                         &bma->prev);
+                }
+        } else {
+                bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
+                if (!bma->eof)
+                        bma->length = XFS_FILBLKS_MIN(bma->length,
+                                        bma->got.br_startoff - bma->offset);    /* do not run into the next extent */
+        }
+        /*
+         * Indicate if this is the first user data in the file, or just any
+         * user data.
+         */
+        if (!(bma->flags & XFS_BMAPI_METADATA)) {
+                bma->userdata = (bma->offset == 0) ?
+                        XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+        }
+        bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;        /* CONTIG: all of length or nothing */
+        /*
+         * Only want to do the alignment at the eof if it is userdata and
+         * allocation length is larger than a stripe unit.
+         */
+        if (mp->m_dalign && bma->length >= mp->m_dalign &&
+            !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
+                error = xfs_bmap_isaeof(bma, whichfork);
+                if (error)
+                        return error;
+        }
+        error = xfs_bmap_alloc(bma);
+        if (error)
+                return error;
+        if (bma->flist->xbf_low)
+                bma->minleft = 0;
+        if (bma->cur)
+                bma->cur->bc_private.b.firstblock = *bma->firstblock;   /* keep an existing cursor in step with firstblock */
+        if (bma->blkno == NULLFSBLOCK)
+                return 0;               /* nothing was allocated; not an error, xfs_bmapi_write() checks bma.blkno */
+        if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {       /* fork has a btree root but no cursor yet: create one */
+                bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
+                bma->cur->bc_private.b.firstblock = *bma->firstblock;
+                bma->cur->bc_private.b.flist = bma->flist;
+        }
+        /*
+         * Bump the number of extents we've allocated
+         * in this call.
+         */
+        bma->nallocs++;
+        if (bma->cur)
+                bma->cur->bc_private.b.flags =
+                        bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+        bma->got.br_startoff = bma->offset;     /* describe the newly allocated extent in bma->got */
+        bma->got.br_startblock = bma->blkno;
+        bma->got.br_blockcount = bma->length;
+        bma->got.br_state = XFS_EXT_NORM;
+        /*
+         * A wasdelay extent has been initialized, so shouldn't be flagged
+         * as unwritten.
+         */
+        if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
+            xfs_sb_version_hasextflgbit(&mp->m_sb))
+                bma->got.br_state = XFS_EXT_UNWRITTEN;
+        if (bma->wasdel)
+                error = xfs_bmap_add_extent_delay_real(bma);    /* replaces (part of) a delalloc record */
+        else
+                error = xfs_bmap_add_extent_hole_real(bma, whichfork);  /* fills a hole */
+        bma->logflags |= tmp_logflags;  /* tmp_logflags is still 0 here, so this is a no-op */
+        if (error)
+                return error;
+        /*
+         * Update our extent pointer, given that xfs_bmap_add_extent_delay_real
+         * or xfs_bmap_add_extent_hole_real might have merged it into one of
+         * the neighbouring ones.
+         */
+        xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+        ASSERT(bma->got.br_startoff <= bma->offset);    /* the stored record must cover the allocated range */
+        ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
+               bma->offset + bma->length);
+        ASSERT(bma->got.br_state == XFS_EXT_NORM ||
+               bma->got.br_state == XFS_EXT_UNWRITTEN);
+        return 0;
+}
+STATIC int      /* Flip mval between unwritten and normal state when flags ask for it; 0, -EAGAIN (mval shorter than len) or error. */
+xfs_bmapi_convert_unwritten(
+        struct xfs_bmalloca     *bma,   /* i/o: transaction, inode, cursor, idx and got state */
+        struct xfs_bmbt_irec    *mval,  /* i/o: mapping to convert; br_state is flipped in place */
+        xfs_filblks_t           len,    /* remaining length of the caller's request */
+        int                     flags)  /* XFS_BMAPI_... */
+{
+        int                     whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+                                                XFS_ATTR_FORK : XFS_DATA_FORK;
+        struct xfs_ifork        *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+        int                     tmp_logflags = 0;       /* log flags returned by the conversion call */
+        int                     error;
+        /* unwritten extent and preallocation requested: leave it unwritten */
+        if (mval->br_state == XFS_EXT_UNWRITTEN &&
+            (flags & XFS_BMAPI_PREALLOC))
+                return 0;
+        /* normal extent: only converted when both PREALLOC and CONVERT are set */
+        if (mval->br_state == XFS_EXT_NORM &&
+            (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) !=
+                        (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
+                return 0;
+        /*
+         * A conversion is needed: flip the extent state and update the fork.
+         */
+        ASSERT(mval->br_blockcount <= len);
+        if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {       /* fork has a btree root but no cursor yet: create one */
+                bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
+                                        bma->ip, whichfork);
+                bma->cur->bc_private.b.firstblock = *bma->firstblock;
+                bma->cur->bc_private.b.flist = bma->flist;
+        }
+        mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
+                                ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;     /* flipped before the call; stays flipped if the call fails */
+        error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
+                        &bma->cur, mval, bma->firstblock, bma->flist,
+                        &tmp_logflags);
+        bma->logflags |= tmp_logflags;  /* accumulated even when the call failed */
+        if (error)
+                return error;
+        /*
+         * Update our extent pointer, given that
+         * xfs_bmap_add_extent_unwritten_real might have merged it into one
+         * of the neighbouring ones.
+         */
+        xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+        /*
+         * We may have combined previously unwritten space with written space,
+         * so generate another request.
+         */
+        if (mval->br_blockcount < len)
+                return -EAGAIN;         /* xfs_bmapi_write() repeats its loop iteration on -EAGAIN */
+        return 0;
+}
+/*
+ * Map file blocks to filesystem blocks, and allocate blocks or convert the
+ * extent state if necessary.  Detailed behaviour is controlled by the flags
+ * parameter.  Only allocates blocks from a single allocation group, to avoid
+ * locking problems.
+ *
+ * The returned value in "firstblock" from the first call in a transaction
+ * must be remembered and presented to subsequent calls in "firstblock".
+ * An upper bound for the number of blocks to be allocated is supplied to
+ * the first call in "total"; if no allocation group has that many free
+ * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
+ */
+int                                     /* 0 or error; *nmap is updated only when the mapping loop completes */
+xfs_bmapi_write(
+        struct xfs_trans        *tp,            /* transaction pointer */
+        struct xfs_inode        *ip,            /* incore inode */
+        xfs_fileoff_t           bno,            /* starting file offs. mapped */
+        xfs_filblks_t           len,            /* length to map in file */
+        int                     flags,          /* XFS_BMAPI_... */
+        xfs_fsblock_t           *firstblock,    /* first allocated block
+                                                   controls a.g. for allocs */
+        xfs_extlen_t            total,          /* total blocks needed */
+        struct xfs_bmbt_irec    *mval,          /* output: map values */
+        int                     *nmap,          /* i/o: mval size/count */
+        struct xfs_bmap_free    *flist)         /* i/o: list extents to free */
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        struct xfs_ifork        *ifp;
+        struct xfs_bmalloca     bma = { NULL }; /* args for xfs_bmap_alloc */
+        xfs_fileoff_t           end;            /* end of mapped file region */
+        int                     eof;            /* after the end of extents */
+        int                     error;          /* error return */
+        int                     n;              /* current extent index */
+        xfs_fileoff_t           obno;           /* old block number (offset) */
+        int                     whichfork;      /* data or attr fork */
+        char                    inhole;         /* current location is hole in file */
+        char                    wasdelay;       /* old extent was delayed */
+#ifdef DEBUG    /* snapshot of the arguments, used only by xfs_bmap_validate_ret() at the end */
+        xfs_fileoff_t           orig_bno;       /* original block number value */
+        int                     orig_flags;     /* original flags arg value */
+        xfs_filblks_t           orig_len;       /* original value of len arg */
+        struct xfs_bmbt_irec    *orig_mval;     /* original value of mval */
+        int                     orig_nmap;      /* original value of *nmap */
+        orig_bno = bno;
+        orig_len = len;
+        orig_flags = flags;
+        orig_mval = mval;
+        orig_nmap = *nmap;
+#endif
+        whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+                XFS_ATTR_FORK : XFS_DATA_FORK;
+        ASSERT(*nmap >= 1);
+        ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+        ASSERT(!(flags & XFS_BMAPI_IGSTATE));   /* IGSTATE is not accepted on this path */
+        ASSERT(tp != NULL);
+        ASSERT(len > 0);
+        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
+        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+        if (unlikely(XFS_TEST_ERROR(
+            (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+             XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), /* i.e. format is neither extents nor btree */
+             mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+                XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
+                return -EFSCORRUPTED;
+        }
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return -EIO;
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        XFS_STATS_INC(xs_blk_mapw);
+        if (*firstblock == NULLFSBLOCK) {       /* caller passed no first block yet */
+                if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
+                        bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; /* btree root level plus one */
+                else
+                        bma.minleft = 1;
+        } else {
+                bma.minleft = 0;
+        }
+        if (!(ifp->if_flags & XFS_IFEXTENTS)) { /* XFS_IFEXTENTS clear: call xfs_iread_extents() before searching */
+                error = xfs_iread_extents(tp, ip, whichfork);
+                if (error)
+                        goto error0;
+        }
+        xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
+                                &bma.prev);
+        n = 0;
+        end = bno + len;                /* first offset past the requested range */
+        obno = bno;
+        bma.tp = tp;                    /* fields that stay constant for the whole call */
+        bma.ip = ip;
+        bma.total = total;
+        bma.userdata = 0;
+        bma.flist = flist;
+        bma.firstblock = firstblock;
+        while (bno < end && n < *nmap) {
+                inhole = eof || bma.got.br_startoff > bno;
+                wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
+                /*
+                 * First, deal with the hole before the allocated space
+                 * that we found, if any.
+                 */
+                if (inhole || wasdelay) {
+                        bma.eof = eof;  /* per-iteration allocation arguments */
+                        bma.conv = !!(flags & XFS_BMAPI_CONVERT);
+                        bma.wasdel = wasdelay;
+                        bma.offset = bno;
+                        bma.flags = flags;
+                        /*
+                         * There's a 32/64 bit type mismatch between the
+                         * allocation length request (which can be 64 bits in
+                         * length) and the bma length request, which is
+                         * xfs_extlen_t and therefore 32 bits. Hence we have to
+                         * check for 32-bit overflows and handle them here.
+                         */
+                        if (len > (xfs_filblks_t)MAXEXTLEN)
+                                bma.length = MAXEXTLEN;
+                        else
+                                bma.length = len;
+                        ASSERT(len > 0);
+                        ASSERT(bma.length > 0);
+                        error = xfs_bmapi_allocate(&bma);
+                        if (error)
+                                goto error0;
+                        if (bma.blkno == NULLFSBLOCK)
+                                break;  /* nothing was allocated: return the mappings gathered so far */
+                }
+                /* Deal with the allocated space we found.  */
+                xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno,
+                                                        end, n, flags);
+                /* Execute unwritten extent conversion if necessary */
+                error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
+                if (error == -EAGAIN)
+                        continue;       /* redo this iteration; bno, len and n have not been advanced */
+                if (error)
+                        goto error0;
+                /* update the extent map to return */
+                xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
+                /*
+                 * If we're done, stop now.  Stop when we've allocated
+                 * *nmap extents (at most XFS_BMAP_MAX_NMAP) no matter what.
+                 * Otherwise the transaction may get too big.
+                 */
+                if (bno >= end || n >= *nmap || bma.nallocs >= *nmap)
+                        break;
+                /* Else go on to the next record. */
+                bma.prev = bma.got;
+                if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {       /* another record exists in the fork */
+                        xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
+                                         &bma.got);
+                } else
+                        eof = 1;        /* ran off the end of the extent records */
+        }
+        *nmap = n;                      /* report how many mappings were filled in */
+        /*
+         * Transform from btree to extents, give it cur.
+         */
+        if (xfs_bmap_wants_extents(ip, whichfork)) {
+                int             tmp_logflags = 0;       /* log flags returned by the conversion */
+                ASSERT(bma.cur);
+                error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
+                        &tmp_logflags, whichfork);
+                bma.logflags |= tmp_logflags;
+                if (error)
+                        goto error0;
+        }
+        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
+               XFS_IFORK_NEXTENTS(ip, whichfork) >
+                XFS_IFORK_MAXEXT(ip, whichfork));
+        error = 0;
+error0: /* common exit: reached with error == 0 on success as well */
+        /*
+         * Log everything.  Do this after conversion, there's no point in
+         * logging the extent records if we've converted to btree format.
+         */
+        if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
+            XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+                bma.logflags &= ~xfs_ilog_fext(whichfork);      /* fork is no longer in extents format */
+        else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
+                 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+                bma.logflags &= ~xfs_ilog_fbroot(whichfork);    /* fork is no longer in btree format */
+        /*
+         * Log whatever the flags say, even if error.  Otherwise we might miss
+         * detecting a case where the data is changed, there's an error,
+         * and it's not logged so we don't shutdown when we should.
+         */
+        if (bma.logflags)
+                xfs_trans_log_inode(tp, ip, bma.logflags);
+        if (bma.cur) {
+                if (!error) {
+                        ASSERT(*firstblock == NULLFSBLOCK ||
+                               XFS_FSB_TO_AGNO(mp, *firstblock) ==
+                               XFS_FSB_TO_AGNO(mp,
+                                       bma.cur->bc_private.b.firstblock) ||
+                               (flist->xbf_low &&
+                                XFS_FSB_TO_AGNO(mp, *firstblock) <
+                                XFS_FSB_TO_AGNO(mp,
+                                        bma.cur->bc_private.b.firstblock)));
+                        *firstblock = bma.cur->bc_private.b.firstblock; /* hand the cursor's first block back to the caller */
+                }
+                xfs_btree_del_cursor(bma.cur,
+                        error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+        }
+        if (!error)     /* NOTE(review): orig_* exist only under DEBUG; presumably xfs_bmap_validate_ret() expands to nothing otherwise - confirm */
+                xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+                        orig_nmap, *nmap);
+        return error;
+}
+/*
+ * Called by xfs_bmapi to update file extent records and the btree
+ * after removing space (or undoing a delayed allocation).
+ */
+STATIC int                              /* error */
+xfs_bmap_del_extent(
+        xfs_inode_t             *ip,    /* incore inode pointer */
+        xfs_trans_t             *tp,    /* current transaction pointer */
+        xfs_extnum_t            *idx,   /* extent number to update/delete */
+        xfs_bmap_free_t         *flist, /* list of extents to be freed */
+        xfs_btree_cur_t         *cur,   /* if null, not a btree */
+        xfs_bmbt_irec_t         *del,   /* data to remove from extents */
+        int                     *logflagsp, /* inode logging flags */
+        int                     whichfork) /* data or attr fork */
+{
+        xfs_filblks_t           da_new; /* new delay-alloc indirect blocks */
+        xfs_filblks_t           da_old; /* old delay-alloc indirect blocks */
+        xfs_fsblock_t           del_endblock=0; /* first block past del */
+        xfs_fileoff_t           del_endoff;     /* first offset past del */
+        int                     delay;  /* current block is delayed allocated */
+        int                     do_fx;  /* free extent at end of routine */
+        xfs_bmbt_rec_host_t     *ep;    /* current extent entry pointer */
+        int                     error;  /* error return value */
+        int                     flags;  /* inode logging flags */
+        xfs_bmbt_irec_t         got;    /* current extent entry */
+        xfs_fileoff_t           got_endoff;     /* first offset past got */
+        int                     i;      /* temp state */
+        xfs_ifork_t             *ifp;   /* inode fork pointer */
+        xfs_mount_t             *mp;    /* mount structure */
+        xfs_filblks_t           nblks;  /* quota/sb block count */
+        xfs_bmbt_irec_t         new;    /* new record to be inserted */
+        /* REFERENCED */
+        uint                    qfield; /* quota field to update */
+        xfs_filblks_t           temp;   /* for indirect length calculations */
+        xfs_filblks_t           temp2;  /* for indirect length calculations */
+        int                     state = 0;
+        XFS_STATS_INC(xs_del_exlist);
+        if (whichfork == XFS_ATTR_FORK)
+                state |= BMAP_ATTRFORK;
+        mp = ip->i_mount;
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
+                (uint)sizeof(xfs_bmbt_rec_t)));
+        ASSERT(del->br_blockcount > 0);
+        ep = xfs_iext_get_ext(ifp, *idx);
+        xfs_bmbt_get_all(ep, &got);
+        ASSERT(got.br_startoff <= del->br_startoff);
+        del_endoff = del->br_startoff + del->br_blockcount;
+        got_endoff = got.br_startoff + got.br_blockcount;
+        ASSERT(got_endoff >= del_endoff);
+        delay = isnullstartblock(got.br_startblock);
+        ASSERT(isnullstartblock(del->br_startblock) == delay);
+        flags = 0;
+        qfield = 0;
+        error = 0;
+        /*
+         * If deleting a real allocation, must free up the disk space.
+         */
+        if (!delay) {
+                flags = XFS_ILOG_CORE;
+                /*
+                 * Realtime allocation.  Free it and record di_nblocks update.
+                 */
+                if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+                        xfs_fsblock_t   bno;
+                        xfs_filblks_t   len;
+                        ASSERT(do_mod(del->br_blockcount,
+                                      mp->m_sb.sb_rextsize) == 0);
+                        ASSERT(do_mod(del->br_startblock,
+                                      mp->m_sb.sb_rextsize) == 0);
+                        bno = del->br_startblock;
+                        len = del->br_blockcount;
+                        do_div(bno, mp->m_sb.sb_rextsize);
+                        do_div(len, mp->m_sb.sb_rextsize);
+                        error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+                        if (error)
+                                goto done;
+                        do_fx = 0;
+                        nblks = len * mp->m_sb.sb_rextsize;
+                        qfield = XFS_TRANS_DQ_RTBCOUNT;
+                }
+                /*
+                 * Ordinary allocation.
+                 */
+                else {
+                        do_fx = 1;
+                        nblks = del->br_blockcount;
+                        qfield = XFS_TRANS_DQ_BCOUNT;
+                }
+                /*
+                 * Set up del_endblock and cur for later.
+                 */
+                del_endblock = del->br_startblock + del->br_blockcount;
+                if (cur) {
+                        if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+                                        got.br_startblock, got.br_blockcount,
+                                        &i)))
+                                goto done;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                }
+                da_old = da_new = 0;
+        } else {
+                da_old = startblockval(got.br_startblock);
+                da_new = 0;
+                nblks = 0;
+                do_fx = 0;
+        }
+        /*
+         * Set flag value to use in switch statement.
+         * Left-contig is 2, right-contig is 1.
+         */
+        switch (((got.br_startoff == del->br_startoff) << 1) |
+                (got_endoff == del_endoff)) {
+        case 3:
+                /*
+                 * Matches the whole extent.  Delete the entry.
+                 */
+                xfs_iext_remove(ip, *idx, 1,
+                                whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
+                --*idx;
+                if (delay)
+                        break;
+                XFS_IFORK_NEXT_SET(ip, whichfork,
+                        XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+                flags |= XFS_ILOG_CORE;
+                if (!cur) {
+                        flags |= xfs_ilog_fext(whichfork);
+                        break;
+                }
+                if ((error = xfs_btree_delete(cur, &i)))
+                        goto done;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                break;
+        case 2:
+                /*
+                 * Deleting the first part of the extent.
+                 */
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                xfs_bmbt_set_startoff(ep, del_endoff);
+                temp = got.br_blockcount - del->br_blockcount;
+                xfs_bmbt_set_blockcount(ep, temp);
+                if (delay) {
+                        temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+                                da_old);
+                        xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+                        trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                        da_new = temp;
+                        break;
+                }
+                xfs_bmbt_set_startblock(ep, del_endblock);
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                if (!cur) {
+                        flags |= xfs_ilog_fext(whichfork);
+                        break;
+                }
+                if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
+                                got.br_blockcount - del->br_blockcount,
+                                got.br_state)))
+                        goto done;
+                break;
+        case 1:
+                /*
+                 * Deleting the last part of the extent.
+                 */
+                temp = got.br_blockcount - del->br_blockcount;
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(ep, temp);
+                if (delay) {
+                        temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+                                da_old);
+                        xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+                        trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                        da_new = temp;
+                        break;
+                }
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                if (!cur) {
+                        flags |= xfs_ilog_fext(whichfork);
+                        break;
+                }
+                if ((error = xfs_bmbt_update(cur, got.br_startoff,
+                                got.br_startblock,
+                                got.br_blockcount - del->br_blockcount,
+                                got.br_state)))
+                        goto done;
+                break;
+        case 0:
+                /*
+                 * Deleting the middle of the extent.
+                 */
+                temp = del->br_startoff - got.br_startoff;
+                trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+                xfs_bmbt_set_blockcount(ep, temp);
+                new.br_startoff = del_endoff;
+                temp2 = got_endoff - del_endoff;
+                new.br_blockcount = temp2;
+                new.br_state = got.br_state;
+                if (!delay) {
+                        new.br_startblock = del_endblock;
+                        flags |= XFS_ILOG_CORE;
+                        if (cur) {
+                                if ((error = xfs_bmbt_update(cur,
+                                                got.br_startoff,
+                                                got.br_startblock, temp,
+                                                got.br_state)))
+                                        goto done;
+                                if ((error = xfs_btree_increment(cur, 0, &i)))
+                                        goto done;
+                                cur->bc_rec.b = new;
+                                error = xfs_btree_insert(cur, &i);
+                                if (error && error != -ENOSPC)
+                                        goto done;
+                                /*
+                                 * If get no-space back from btree insert,
+                                 * it tried a split, and we have a zero
+                                 * block reservation.
+                                 * Fix up our state and return the error.
+                                 */
+                                if (error == -ENOSPC) {
+                                        /*
+                                         * Reset the cursor, don't trust
+                                         * it after any insert operation.
+                                         */
+                                        if ((error = xfs_bmbt_lookup_eq(cur,
+                                                        got.br_startoff,
+                                                        got.br_startblock,
+                                                        temp, &i)))
+                                                goto done;
+                                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                                        /*
+                                         * Update the btree record back
+                                         * to the original value.
+                                         */
+                                        if ((error = xfs_bmbt_update(cur,
+                                                        got.br_startoff,
+                                                        got.br_startblock,
+                                                        got.br_blockcount,
+                                                        got.br_state)))
+                                                goto done;
+                                        /*
+                                         * Reset the extent record back
+                                         * to the original value.
+                                         */
+                                        xfs_bmbt_set_blockcount(ep,
+                                                got.br_blockcount);
+                                        flags = 0;
+                                        error = -ENOSPC;
+                                        goto done;
+                                }
+                                XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                        } else
+                                flags |= xfs_ilog_fext(whichfork);
+                        XFS_IFORK_NEXT_SET(ip, whichfork,
+                                XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+                } else {
+                        ASSERT(whichfork == XFS_DATA_FORK);
+                        temp = xfs_bmap_worst_indlen(ip, temp);
+                        xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+                        temp2 = xfs_bmap_worst_indlen(ip, temp2);
+                        new.br_startblock = nullstartblock((int)temp2);
+                        da_new = temp + temp2;
+                        while (da_new > da_old) {
+                                if (temp) {
+                                        temp--;
+                                        da_new--;
+                                        xfs_bmbt_set_startblock(ep,
+                                                nullstartblock((int)temp));
+                                }
+                                if (da_new == da_old)
+                                        break;
+                                if (temp2) {
+                                        temp2--;
+                                        da_new--;
+                                        new.br_startblock =
+                                                nullstartblock((int)temp2);
+                                }
+                        }
+                }
+                trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+                xfs_iext_insert(ip, *idx + 1, 1, &new, state);
+                ++*idx;
+                break;
+        }
+        /*
+         * If we need to, add to list of extents to delete.
+         */
+        if (do_fx)
+                xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
+                        mp);
+        /*
+         * Adjust inode # blocks in the file.
+         */
+        if (nblks)
+                ip->i_d.di_nblocks -= nblks;
+        /*
+         * Adjust quota data.
+         */
+        if (qfield)
+                xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
+        /*
+         * Account for change in delayed indirect blocks.
+         * Nothing to do for disk quota accounting here.
+         */
+        ASSERT(da_old >= da_new);
+        if (da_old > da_new) {
+                xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+                        (int64_t)(da_old - da_new), 0);
+        }
+done:
+        *logflagsp = flags;
+        return error;
+}
+/*
+ * Unmap (remove) blocks from a file.
+ *
+ * The range [bno, bno + len) of the fork selected by flags is walked
+ * backwards, from its last block towards its first, and the part of each
+ * extent that overlaps the range is handed to xfs_bmap_del_extent().
+ * For realtime files only whole realtime extents are removed; pieces that
+ * are not aligned to sb_rextsize are either skipped or converted to
+ * unwritten instead.
+ *
+ * If nexts is nonzero then the number of extents to remove is limited to
+ * that value.  Once the extent walk completes, *done is set nonzero if the
+ * walk reached the start of the range (or ran out of extents) and zero if
+ * it stopped early; it is also set nonzero when the fork has no extents.
+ *
+ * Returns 0 on success or a negative errno: -EFSCORRUPTED if the fork is
+ * in neither extents nor btree format, -EIO after a forced shutdown,
+ * -ENOSPC if removing the middle of an extent would force a conversion to
+ * btree format while the transaction has no block reservation, or the
+ * error reported by one of the helpers called here.
+ */
+int                                             /* error */
+xfs_bunmapi(
+        xfs_trans_t             *tp,            /* transaction pointer */
+        struct xfs_inode        *ip,            /* incore inode */
+        xfs_fileoff_t           bno,            /* starting offset to unmap */
+        xfs_filblks_t           len,            /* length to unmap in file */
+        int                     flags,          /* misc flags */
+        xfs_extnum_t            nexts,          /* number of extents max */
+        xfs_fsblock_t           *firstblock,    /* first allocated block
+                                                   controls a.g. for allocs */
+        xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
+        int                     *done)          /* out: nonzero if walk finished */
+{
+        xfs_btree_cur_t         *cur;           /* bmap btree cursor */
+        xfs_bmbt_irec_t         del;            /* extent being deleted */
+        int                     eof;            /* is deleting at eof */
+        xfs_bmbt_rec_host_t     *ep;            /* extent record pointer */
+        int                     error;          /* error return value */
+        xfs_extnum_t            extno;          /* extent number in list */
+        xfs_bmbt_irec_t         got;            /* current extent record */
+        xfs_ifork_t             *ifp;           /* inode fork pointer */
+        int                     isrt;           /* freeing in rt area */
+        xfs_extnum_t            lastx;          /* last extent index used */
+        int                     logflags;       /* transaction logging flags */
+        xfs_extlen_t            mod;            /* rt extent offset */
+        xfs_mount_t             *mp;            /* mount structure */
+        xfs_extnum_t            nextents;       /* number of file extents */
+        xfs_bmbt_irec_t         prev;           /* previous extent record */
+        xfs_fileoff_t           start;          /* first file offset deleted */
+        int                     tmp_logflags;   /* partial logging flags */
+        int                     wasdel;         /* was a delayed alloc extent */
+        int                     whichfork;      /* data or attribute fork */
+        xfs_fsblock_t           sum;            /* end fsblock of del, for rt alignment */
+        trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
+        whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+                XFS_ATTR_FORK : XFS_DATA_FORK;
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        /*
+         * Only extents and btree format forks carry an extent list that
+         * can be unmapped from.
+         */
+        if (unlikely(
+            XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+            XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+                XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
+                                 ip->i_mount);
+                return -EFSCORRUPTED;
+        }
+        mp = ip->i_mount;
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return -EIO;
+        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+        ASSERT(len > 0);
+        ASSERT(nexts >= 0);
+        /*
+         * Bring the extent list into memory if it has not been read yet.
+         */
+        if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+            (error = xfs_iread_extents(tp, ip, whichfork)))
+                return error;
+        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+        if (nextents == 0) {
+                *done = 1;
+                return 0;
+        }
+        XFS_STATS_INC(xs_blk_unmap);
+        isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+        /*
+         * From here on bno is the last file offset still to be unmapped
+         * and start is the first one; the walk runs from bno down to start.
+         */
+        start = bno;
+        bno = start + len - 1;
+        ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+                &prev);
+        /*
+         * Check to see if the given block number is past the end of the
+         * file, back up to the last block if so...
+         */
+        if (eof) {
+                ep = xfs_iext_get_ext(ifp, --lastx);
+                xfs_bmbt_get_all(ep, &got);
+                bno = got.br_startoff + got.br_blockcount - 1;
+        }
+        logflags = 0;
+        if (ifp->if_flags & XFS_IFBROOT) {
+                ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+                cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+                cur->bc_private.b.firstblock = *firstblock;
+                cur->bc_private.b.flist = flist;
+                cur->bc_private.b.flags = 0;
+        } else
+                cur = NULL;
+        if (isrt) {
+                /*
+                 * Synchronize by locking the bitmap inode.
+                 */
+                xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+                xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+        }
+        extno = 0;
+        /*
+         * Walk the extents backwards.  bno becomes (xfs_fileoff_t)-1 once
+         * an extent starting at file offset 0 has been processed, which
+         * also terminates the walk.
+         */
+        while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
+               (nexts == 0 || extno < nexts)) {
+                /*
+                 * Is the found extent after a hole in which bno lives?
+                 * Just back up to the previous extent, if so.
+                 */
+                if (got.br_startoff > bno) {
+                        if (--lastx < 0)
+                                break;
+                        ep = xfs_iext_get_ext(ifp, lastx);
+                        xfs_bmbt_get_all(ep, &got);
+                }
+                /*
+                 * Is the last block of this extent before the range
+                 * we're supposed to delete?  If so, we're done.
+                 */
+                bno = XFS_FILEOFF_MIN(bno,
+                        got.br_startoff + got.br_blockcount - 1);
+                if (bno < start)
+                        break;
+                /*
+                 * Then deal with the (possibly delayed) allocated space
+                 * we found.  del starts as a copy of got and is trimmed
+                 * below to the part that lies inside [start, bno].
+                 */
+                ASSERT(ep != NULL);
+                del = got;
+                wasdel = isnullstartblock(del.br_startblock);
+                if (got.br_startoff < start) {
+                        del.br_startoff = start;
+                        del.br_blockcount -= start - got.br_startoff;
+                        if (!wasdel)
+                                del.br_startblock += start - got.br_startoff;
+                }
+                if (del.br_startoff + del.br_blockcount > bno + 1)
+                        del.br_blockcount = bno + 1 - del.br_startoff;
+                sum = del.br_startblock + del.br_blockcount;
+                if (isrt &&
+                    (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
+                        /*
+                         * Realtime extent not lined up at the end.
+                         * The extent could have been split into written
+                         * and unwritten pieces, or we could just be
+                         * unmapping part of it.  But we can't really
+                         * get rid of part of a realtime extent.
+                         */
+                        if (del.br_state == XFS_EXT_UNWRITTEN ||
+                            !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+                                /*
+                                 * This piece is unwritten, or we're not
+                                 * using unwritten extents.  Skip over it.
+                                 */
+                                ASSERT(bno >= mod);
+                                bno -= mod > del.br_blockcount ?
+                                        del.br_blockcount : mod;
+                                if (bno < got.br_startoff) {
+                                        if (--lastx >= 0)
+                                                xfs_bmbt_get_all(xfs_iext_get_ext(
+                                                        ifp, lastx), &got);
+                                }
+                                continue;
+                        }
+                        /*
+                         * It's written, turn it unwritten.
+                         * This is better than zeroing it.
+                         */
+                        ASSERT(del.br_state == XFS_EXT_NORM);
+                        ASSERT(xfs_trans_get_block_res(tp) > 0);
+                        /*
+                         * If this spans a realtime extent boundary,
+                         * chop it back to the start of the one we end at.
+                         */
+                        if (del.br_blockcount > mod) {
+                                del.br_startoff += del.br_blockcount - mod;
+                                del.br_startblock += del.br_blockcount - mod;
+                                del.br_blockcount = mod;
+                        }
+                        del.br_state = XFS_EXT_UNWRITTEN;
+                        error = xfs_bmap_add_extent_unwritten_real(tp, ip,
+                                        &lastx, &cur, &del, firstblock, flist,
+                                        &logflags);
+                        if (error)
+                                goto error0;
+                        goto nodelete;
+                }
+                if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) {
+                        /*
+                         * Realtime extent is lined up at the end but not
+                         * at the front.  We'll get rid of full extents if
+                         * we can.
+                         */
+                        mod = mp->m_sb.sb_rextsize - mod;
+                        if (del.br_blockcount > mod) {
+                                del.br_blockcount -= mod;
+                                del.br_startoff += mod;
+                                del.br_startblock += mod;
+                        } else if ((del.br_startoff == start &&
+                                    (del.br_state == XFS_EXT_UNWRITTEN ||
+                                     xfs_trans_get_block_res(tp) == 0)) ||
+                                   !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+                                /*
+                                 * Can't make it unwritten.  There isn't
+                                 * a full extent here so just skip it.
+                                 */
+                                ASSERT(bno >= del.br_blockcount);
+                                bno -= del.br_blockcount;
+                                if (got.br_startoff > bno) {
+                                        if (--lastx >= 0) {
+                                                ep = xfs_iext_get_ext(ifp,
+                                                                      lastx);
+                                                xfs_bmbt_get_all(ep, &got);
+                                        }
+                                }
+                                continue;
+                        } else if (del.br_state == XFS_EXT_UNWRITTEN) {
+                                /*
+                                 * This one is already unwritten.
+                                 * It must have a written left neighbor.
+                                 * Unwrite the killed part of that one and
+                                 * try again.
+                                 */
+                                ASSERT(lastx > 0);
+                                xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+                                                lastx - 1), &prev);
+                                ASSERT(prev.br_state == XFS_EXT_NORM);
+                                ASSERT(!isnullstartblock(prev.br_startblock));
+                                ASSERT(del.br_startblock ==
+                                       prev.br_startblock + prev.br_blockcount);
+                                if (prev.br_startoff < start) {
+                                        mod = start - prev.br_startoff;
+                                        prev.br_blockcount -= mod;
+                                        prev.br_startblock += mod;
+                                        prev.br_startoff = start;
+                                }
+                                prev.br_state = XFS_EXT_UNWRITTEN;
+                                lastx--;
+                                error = xfs_bmap_add_extent_unwritten_real(tp,
+                                                ip, &lastx, &cur, &prev,
+                                                firstblock, flist, &logflags);
+                                if (error)
+                                        goto error0;
+                                goto nodelete;
+                        } else {
+                                ASSERT(del.br_state == XFS_EXT_NORM);
+                                del.br_state = XFS_EXT_UNWRITTEN;
+                                error = xfs_bmap_add_extent_unwritten_real(tp,
+                                                ip, &lastx, &cur, &del,
+                                                firstblock, flist, &logflags);
+                                if (error)
+                                        goto error0;
+                                goto nodelete;
+                        }
+                }
+                if (wasdel) {
+                        ASSERT(startblockval(del.br_startblock) > 0);
+                        /* Update realtime/data freespace, unreserve quota */
+                        if (isrt) {
+                                xfs_filblks_t rtexts;
+                                /*
+                                 * sb_rextsize is a count of filesystem
+                                 * blocks, so the number of realtime extents
+                                 * handed back is the block count divided by
+                                 * it.  Converting to bytes first would
+                                 * inflate the free rt extent counter.
+                                 */
+                                rtexts = del.br_blockcount;
+                                do_div(rtexts, mp->m_sb.sb_rextsize);
+                                xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
+                                                (int64_t)rtexts, 0);
+                                (void)xfs_trans_reserve_quota_nblks(NULL,
+                                        ip, -((long)del.br_blockcount), 0,
+                                        XFS_QMOPT_RES_RTBLKS);
+                        } else {
+                                xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+                                                (int64_t)del.br_blockcount, 0);
+                                (void)xfs_trans_reserve_quota_nblks(NULL,
+                                        ip, -((long)del.br_blockcount), 0,
+                                        XFS_QMOPT_RES_REGBLKS);
+                        }
+                        ip->i_delayed_blks -= del.br_blockcount;
+                        if (cur)
+                                cur->bc_private.b.flags |=
+                                        XFS_BTCUR_BPRV_WASDEL;
+                } else if (cur)
+                        cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
+                /*
+                 * If it's the case where the directory code is running
+                 * with no block reservation, and the deleted block is in
+                 * the middle of its extent, and the resulting insert
+                 * of an extent would cause transformation to btree format,
+                 * then reject it.  The calling code will then swap
+                 * blocks around instead.
+                 * We have to do this now, rather than waiting for the
+                 * conversion to btree format, since the transaction
+                 * will be dirty.
+                 */
+                if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
+                    XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+                    XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
+                        XFS_IFORK_MAXEXT(ip, whichfork) &&
+                    del.br_startoff > got.br_startoff &&
+                    del.br_startoff + del.br_blockcount <
+                    got.br_startoff + got.br_blockcount) {
+                        error = -ENOSPC;
+                        goto error0;
+                }
+                error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
+                                &tmp_logflags, whichfork);
+                /*
+                 * Accumulate the log flags before checking the error so
+                 * that they are still logged on the failure path.
+                 */
+                logflags |= tmp_logflags;
+                if (error)
+                        goto error0;
+                bno = del.br_startoff - 1;
+nodelete:
+                /*
+                 * If not done go on to the next (previous) record.
+                 */
+                if (bno != (xfs_fileoff_t)-1 && bno >= start) {
+                        if (lastx >= 0) {
+                                ep = xfs_iext_get_ext(ifp, lastx);
+                                if (xfs_bmbt_get_startoff(ep) > bno) {
+                                        if (--lastx >= 0)
+                                                ep = xfs_iext_get_ext(ifp,
+                                                                      lastx);
+                                }
+                                xfs_bmbt_get_all(ep, &got);
+                        }
+                        extno++;
+                }
+        }
+        /*
+         * The walk is finished when bno has moved below start (or wrapped
+         * past offset zero) or when there are no more extents to look at.
+         */
+        *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
+        /*
+         * Convert to a btree if necessary.
+         */
+        if (xfs_bmap_needs_btree(ip, whichfork)) {
+                ASSERT(cur == NULL);
+                error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
+                        &cur, 0, &tmp_logflags, whichfork);
+                logflags |= tmp_logflags;
+                if (error)
+                        goto error0;
+        }
+        /*
+         * transform from btree to extents, give it cur
+         */
+        else if (xfs_bmap_wants_extents(ip, whichfork)) {
+                ASSERT(cur != NULL);
+                error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+                        whichfork);
+                logflags |= tmp_logflags;
+                if (error)
+                        goto error0;
+        }
+        /*
+         * transform from extents to local?
+         */
+        error = 0;
+error0:
+        /*
+         * Log everything.  Do this after conversion, there's no point in
+         * logging the extent records if we've converted to btree format.
+         */
+        if ((logflags & xfs_ilog_fext(whichfork)) &&
+            XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+                logflags &= ~xfs_ilog_fext(whichfork);
+        else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
+                 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+                logflags &= ~xfs_ilog_fbroot(whichfork);
+        /*
+         * Log inode even in the error case, if the transaction
+         * is dirty we'll need to shut down the filesystem.
+         */
+        if (logflags)
+                xfs_trans_log_inode(tp, ip, logflags);
+        if (cur) {
+                /*
+                 * On success hand the cursor's first block back to the
+                 * caller before the cursor is torn down.
+                 */
+                if (!error) {
+                        *firstblock = cur->bc_private.b.firstblock;
+                        cur->bc_private.b.allocated = 0;
+                }
+                xfs_btree_del_cursor(cur,
+                        error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+        }
+        return error;
+}
+/*
+ * Shift extent records to the left to cover a hole.
+ *
+ * The maximum number of extents to be shifted in a single operation
+ * is @num_exts, and @current_ext keeps track of the current extent
+ * index we have shifted. @offset_shift_fsb is the length by which each
+ * extent is shifted. If there is no hole to shift the extents
+ * into, this is considered an invalid operation and we abort immediately.
+ */
+int
+xfs_bmap_shift_extents(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *ip,
+        int                     *done,
+        xfs_fileoff_t           start_fsb,
+        xfs_fileoff_t           offset_shift_fsb,
+        xfs_extnum_t            *current_ext,
+        xfs_fsblock_t           *firstblock,
+        struct xfs_bmap_free    *flist,
+        int                     num_exts)
+{
+        struct xfs_btree_cur            *cur;
+        struct xfs_bmbt_rec_host        *gotp;
+        struct xfs_bmbt_irec            got;
+        struct xfs_bmbt_irec            left;
+        struct xfs_mount                *mp = ip->i_mount;
+        struct xfs_ifork                *ifp;
+        xfs_extnum_t                    nexts = 0;
+        xfs_fileoff_t                   startoff;
+        int                             error = 0;
+        int                             i;
+        int                             whichfork = XFS_DATA_FORK;
+        int                             logflags = 0;
+        int                             dirty_flags;
+        xfs_filblks_t                   blockcount = 0;
+        int                             total_extents;
+        if (unlikely(XFS_TEST_ERROR(
+            (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+             XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+             mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+                XFS_ERROR_REPORT("xfs_bmap_shift_extents",
+                                 XFS_ERRLEVEL_LOW, mp);
+                return -EFSCORRUPTED;
+        }
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return -EIO;
+        ASSERT(current_ext != NULL);
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+                /* Read in all the extents */
+                error = xfs_iread_extents(tp, ip, whichfork);
+                if (error)
+                        return error;
+        }
+        /*
+         * If *current_ext is 0, we would need to lookup the extent
+         * from where we would start shifting and store it in gotp.
+         */
+        if (!*current_ext) {
+                gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
+                /*
+                 * gotp can be null in 2 cases: 1) if there are no extents
+                 * or 2) start_fsb lies in a hole beyond which there are
+                 * no extents. Either way, we are done.
+                 */
+                if (!gotp) {
+                        *done = 1;
+                        return 0;
+                }
+        }
+        /*
+         * Work out which parts of the inode have to be logged once an
+         * extent has been changed: always the core, plus the data fork
+         * extents when there is no btree cursor.  These flags are only
+         * copied into logflags after a modification has really been made,
+         * so that failing before the first change (for instance -EINVAL
+         * from the hole check below) does not dirty the transaction.
+         */
+        dirty_flags = XFS_ILOG_CORE;
+        if (ifp->if_flags & XFS_IFBROOT) {
+                cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+                cur->bc_private.b.firstblock = *firstblock;
+                cur->bc_private.b.flist = flist;
+                cur->bc_private.b.flags = 0;
+        } else {
+                cur = NULL;
+                dirty_flags |= XFS_ILOG_DEXT;
+        }
+        /*
+         * There may be delalloc extents in the data fork before the range we
+         * are collapsing out, so we cannot
+         * use the count of real extents here. Instead we have to calculate it
+         * from the incore fork.
+         */
+        total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+        while (nexts++ < num_exts && *current_ext < total_extents) {
+                gotp = xfs_iext_get_ext(ifp, *current_ext);
+                xfs_bmbt_get_all(gotp, &got);
+                startoff = got.br_startoff - offset_shift_fsb;
+                /*
+                 * Before shifting extent into hole, make sure that the hole
+                 * is large enough to accommodate the shift.
+                 */
+                if (*current_ext) {
+                        xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+                                                *current_ext - 1), &left);
+                        if (startoff < left.br_startoff + left.br_blockcount)
+                                error = -EINVAL;
+                } else if (offset_shift_fsb > got.br_startoff) {
+                        /*
+                         * When the first extent is shifted, offset_shift_fsb
+                         * must not be larger than the starting offset of
+                         * the first extent.
+                         */
+                        error = -EINVAL;
+                }
+                if (error)
+                        goto del_cursor;
+                if (cur) {
+                        error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+                                                   got.br_startblock,
+                                                   got.br_blockcount,
+                                                   &i);
+                        if (error)
+                                goto del_cursor;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+                }
+                /* Check if we can merge 2 adjacent extents */
+                if (*current_ext &&
+                    left.br_startoff + left.br_blockcount == startoff &&
+                    left.br_startblock + left.br_blockcount ==
+                                got.br_startblock &&
+                    left.br_state == got.br_state &&
+                    left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
+                        blockcount = left.br_blockcount +
+                                got.br_blockcount;
+                        xfs_iext_remove(ip, *current_ext, 1, 0);
+                        /* The incore extent list has changed from here on */
+                        logflags = dirty_flags;
+                        if (cur) {
+                                error = xfs_btree_delete(cur, &i);
+                                if (error)
+                                        goto del_cursor;
+                                XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+                        }
+                        XFS_IFORK_NEXT_SET(ip, whichfork,
+                                XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+                        gotp = xfs_iext_get_ext(ifp, --*current_ext);
+                        xfs_bmbt_get_all(gotp, &got);
+                        /* Make cursor point to the extent we will update */
+                        if (cur) {
+                                error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+                                                           got.br_startblock,
+                                                           got.br_blockcount,
+                                                           &i);
+                                if (error)
+                                        goto del_cursor;
+                                XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+                        }
+                        xfs_bmbt_set_blockcount(gotp, blockcount);
+                        got.br_blockcount = blockcount;
+                } else {
+                        /* We have to update the startoff */
+                        xfs_bmbt_set_startoff(gotp, startoff);
+                        got.br_startoff = startoff;
+                }
+                /* An incore extent record was rewritten: inode is dirty */
+                logflags = dirty_flags;
+                if (cur) {
+                        error = xfs_bmbt_update(cur, got.br_startoff,
+                                                got.br_startblock,
+                                                got.br_blockcount,
+                                                got.br_state);
+                        if (error)
+                                goto del_cursor;
+                }
+                (*current_ext)++;
+                total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+        }
+        /* Check if we are done */
+        if (*current_ext == total_extents)
+                *done = 1;
+del_cursor:
+        if (cur)
+                xfs_btree_del_cursor(cur,
+                        error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+        /* Only log the inode if something was actually modified */
+        if (logflags)
+                xfs_trans_log_inode(tp, ip, logflags);
+        return error;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
new file mode 100644
index 000000000000..b879ca56a64c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BMAP_H__
+#define __XFS_BMAP_H__
+struct getbmap;
+struct xfs_bmbt_irec;
+struct xfs_ifork;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+extern kmem_zone_t      *xfs_bmap_free_item_zone;
+/*
+ * List of extents to be freed "later".
+ * The list is kept sorted on xbfi_startblock.
+ */
+typedef struct xfs_bmap_free_item
+{
+        xfs_fsblock_t           xbfi_startblock;/* starting fs block number */
+        xfs_extlen_t            xbfi_blockcount;/* number of blocks in extent */
+        struct xfs_bmap_free_item *xbfi_next;   /* link to next entry */
+} xfs_bmap_free_item_t;
+/*
+ * Header for free extent list.
+ *
+ * xbf_first points at the first xfs_bmap_free_item_t on the list and
+ * xbf_count holds the number of items on it.
+ *
+ * xbf_low is used by the allocator to activate the lowspace algorithm -
+ * when free space is running low the extent allocator may choose to
+ * allocate an extent from an AG without leaving sufficient space for
+ * a btree split when inserting the new extent.  In this case the allocator
+ * will enable the lowspace algorithm which is supposed to allow further
+ * allocations (such as btree splits and newroots) to allocate from
+ * sequential AGs.  In order to avoid locking AGs out of order the lowspace
+ * algorithm will start searching for free space from AG 0.  If the correct
+ * transaction reservations have been made then this algorithm will eventually
+ * find all the space it needs.
+ */
+typedef struct xfs_bmap_free
+{
+        xfs_bmap_free_item_t    *xbf_first;     /* list of to-be-free extents */
+        int                     xbf_count;      /* count of items on list */
+        int                     xbf_low;        /* alloc in low mode */
+} xfs_bmap_free_t;
+#define XFS_BMAP_MAX_NMAP       4       /* NOTE(review): presumably the largest nmap passed to xfs_bmapi_* - confirm */
+/*
+ * Flags for xfs_bmapi_*
+ *
+ * Every flag is a distinct bit, so flags can be OR-ed together.
+ * XFS_BMAPI_FLAGS further down pairs each flag with a printable name.
+ */
+#define XFS_BMAPI_ENTIRE        0x001   /* return entire extent, not trimmed */
+#define XFS_BMAPI_METADATA      0x002   /* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK      0x004   /* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC      0x008   /* preallocation op: unwritten space */
+#define XFS_BMAPI_IGSTATE       0x010   /* Ignore state - */
+                                        /* combine contig. space */
+#define XFS_BMAPI_CONTIG        0x020   /* must allocate only one extent */
+/*
+ * Unwritten extent conversion - this needs write cache flushing and no
+ * additional allocation alignments. When specified with XFS_BMAPI_PREALLOC
+ * it converts from written to unwritten, otherwise it converts from
+ * unwritten to written.
+ */
+#define XFS_BMAPI_CONVERT       0x040
+#define XFS_BMAPI_FLAGS \
+        { XFS_BMAPI_ENTIRE,     "ENTIRE" }, \
+        { XFS_BMAPI_METADATA,   "METADATA" }, \
+        { XFS_BMAPI_ATTRFORK,   "ATTRFORK" }, \
+        { XFS_BMAPI_PREALLOC,   "PREALLOC" }, \
+        { XFS_BMAPI_IGSTATE,    "IGSTATE" }, \
+        { XFS_BMAPI_CONTIG,     "CONTIG" }, \
+        { XFS_BMAPI_CONVERT,    "CONVERT" }
+/*
+ * Translate a fork selector into the matching xfs_bmapi_* flag:
+ * XFS_BMAPI_ATTRFORK for XFS_ATTR_FORK, 0 for anything else.
+ */
+static inline int xfs_bmapi_aflag(int w)
+{
+        if (w != XFS_ATTR_FORK)
+                return 0;
+        return XFS_BMAPI_ATTRFORK;
+}
+/*
+ * Special values for xfs_bmbt_irec_t br_startblock field.
+ *
+ * Both are small negative constants cast to xfs_fsblock_t.
+ * NOTE(review): going by the names, DELAYSTARTBLOCK marks a delayed
+ * allocation extent and HOLESTARTBLOCK marks a hole - confirm against
+ * the users of these values.
+ */
+#define DELAYSTARTBLOCK         ((xfs_fsblock_t)-1LL)
+#define HOLESTARTBLOCK          ((xfs_fsblock_t)-2LL)
+/*
+ * Reset a free extent list header to the empty state and store
+ * NULLFSBLOCK through fbp.
+ */
+static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
+{
+        flp->xbf_first = NULL;
+        flp->xbf_count = 0;
+        flp->xbf_low = 0;
+        *fbp = NULLFSBLOCK;
+}
+/*
+ * Flags for xfs_bmap_add_extent*.
+ *
+ * Each flag is a separate bit.  XFS_BMAP_EXT_FLAGS supplies printable
+ * names for the CONTIG, FILLING and ATTRFORK bits only; the DELAY and
+ * VALID bits have no name entry there.
+ */
+#define BMAP_LEFT_CONTIG        (1 << 0)
+#define BMAP_RIGHT_CONTIG       (1 << 1)
+#define BMAP_LEFT_FILLING       (1 << 2)
+#define BMAP_RIGHT_FILLING      (1 << 3)
+#define BMAP_LEFT_DELAY         (1 << 4)
+#define BMAP_RIGHT_DELAY        (1 << 5)
+#define BMAP_LEFT_VALID         (1 << 6)
+#define BMAP_RIGHT_VALID        (1 << 7)
+#define BMAP_ATTRFORK           (1 << 8)
+#define XFS_BMAP_EXT_FLAGS \
+        { BMAP_LEFT_CONTIG,     "LC" }, \
+        { BMAP_RIGHT_CONTIG,    "RC" }, \
+        { BMAP_LEFT_FILLING,    "LF" }, \
+        { BMAP_RIGHT_FILLING,   "RF" }, \
+        { BMAP_ATTRFORK,        "ATTR" }
+/*
+ * This macro is used to determine how many extents will be shifted
+ * in one write transaction. Shifting a single extent could require two
+ * splits: an extent move on the first and an extent merge on the second.
+ * So only one extent is shifted inside a write transaction at a time.
+ */
+#define XFS_BMAP_MAX_SHIFT_EXTENTS      1
+#ifdef DEBUG    /* extent list tracing is only declared in DEBUG builds */
+void    xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
+                int whichfork, unsigned long caller_ip);
+#define XFS_BMAP_TRACE_EXLIST(ip,c,w)   \
+        xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
+#else           /* !DEBUG: the trace macro expands to nothing */
+#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
+#endif          /* DEBUG */
+int     xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+void    xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
+void    xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
+                struct xfs_bmap_free *flist, struct xfs_mount *mp);
+void    xfs_bmap_cancel(struct xfs_bmap_free *flist);
+void    xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
+int     xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
+                xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
+int     xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
+                xfs_fileoff_t *last_block, int whichfork);
+int     xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
+                int whichfork);
+int     xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
+int     xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+                int whichfork);
+int     xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
+                xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+                int *nmap, int flags);
+int     xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
+                xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+                int *nmap, int flags);
+int     xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
+                xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+                xfs_fsblock_t *firstblock, xfs_extlen_t total,
+                struct xfs_bmbt_irec *mval, int *nmap,
+                struct xfs_bmap_free *flist);
+int     xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+                xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+                xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
+                struct xfs_bmap_free *flist, int *done);
+int     xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
+                xfs_extnum_t num);
+uint    xfs_default_attroffset(struct xfs_inode *ip);
+int     xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+                int *done, xfs_fileoff_t start_fsb,
+                xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
+                xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
+                int num_exts);
+#endif  /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
new file mode 100644
index 000000000000..a388de4ceaa1
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -0,0 +1,967 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+/*
+ * Map the extent flag of a record to an extent state: a set flag means
+ * XFS_EXT_UNWRITTEN, a clear flag means XFS_EXT_NORM.
+ */
+/* ARGSUSED */
+STATIC xfs_exntst_t
+xfs_extent_state(
+        xfs_filblks_t           blks,
+        int                     extent_flag)
+{
+        if (!extent_flag)
+                return XFS_EXT_NORM;
+        ASSERT(blks != 0);      /* saved for DMIG */
+        return XFS_EXT_UNWRITTEN;
+}
+/*
+ * Build the in-memory btree root block from the on-disk root: initialise
+ * the block header, carry over the level and record count, then copy the
+ * keys and the block pointers.
+ */
+void
+xfs_bmdr_to_bmbt(
+        struct xfs_inode        *ip,
+        xfs_bmdr_block_t        *dblock,
+        int                     dblocklen,
+        struct xfs_btree_block  *rblock,
+        int                     rblocklen)
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        xfs_bmbt_key_t          *src_keys;
+        xfs_bmbt_key_t          *dst_keys;
+        __be64                  *src_ptrs;
+        __be64                  *dst_ptrs;
+        int                     maxrecs;
+        int                     numrecs;
+        if (!xfs_sb_version_hascrc(&mp->m_sb))
+                xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+                                 XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+                                 XFS_BTREE_LONG_PTRS);
+        else
+                xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+                                 XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+                                 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+        rblock->bb_level = dblock->bb_level;
+        ASSERT(be16_to_cpu(rblock->bb_level) > 0);
+        rblock->bb_numrecs = dblock->bb_numrecs;
+        /* The on-disk pointer array is located via the root's capacity */
+        maxrecs = xfs_bmdr_maxrecs(dblocklen, 0);
+        src_keys = XFS_BMDR_KEY_ADDR(dblock, 1);
+        src_ptrs = XFS_BMDR_PTR_ADDR(dblock, 1, maxrecs);
+        dst_keys = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+        dst_ptrs = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+        /* Only the records actually in use are copied */
+        numrecs = be16_to_cpu(dblock->bb_numrecs);
+        memcpy(dst_keys, src_keys, sizeof(*src_keys) * numrecs);
+        memcpy(dst_ptrs, src_ptrs, sizeof(*src_ptrs) * numrecs);
+}
+/*
+ * Convert a compressed bmap extent record to an uncompressed form.
+ * This code must be in sync with the routines xfs_bmbt_get_startoff,
+ * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
+ *
+ * As decoded here, the two 64-bit words are laid out as follows:
+ *   l0, top BMBT_EXNTFLAG_BITLEN bits: extent flag (non-zero: unwritten)
+ *   l0, bits between flag and bit 9:   br_startoff
+ *   l0, low 9 bits:                    high part of br_startblock
+ *   l1, bits 21 and up:                low part of br_startblock
+ *   l1, low 21 bits:                   br_blockcount
+ *
+ * Without XFS_BIG_BLKNOS, non-DEBUG builds take the start block from l1
+ * alone.  DEBUG builds decode the full value and assert that it either
+ * has no bits set above bit 31 or satisfies isnulldstartblock().
+ */
+STATIC void
+__xfs_bmbt_get_all(
+                __uint64_t l0,
+                __uint64_t l1,
+                xfs_bmbt_irec_t *s)
+{
+        int     ext_flag;
+        xfs_exntst_t st;
+        ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
+        s->br_startoff = ((xfs_fileoff_t)l0 &
+                           xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+#if XFS_BIG_BLKNOS
+        s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
+                           (((xfs_fsblock_t)l1) >> 21);
+#else
+#ifdef DEBUG
+        {
+                xfs_dfsbno_t    b;
+                b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
+                    (((xfs_dfsbno_t)l1) >> 21);
+                ASSERT((b >> 32) == 0 || isnulldstartblock(b));
+                s->br_startblock = (xfs_fsblock_t)b;
+        }
+#else   /* !DEBUG */
+        s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
+#endif  /* DEBUG */
+#endif  /* XFS_BIG_BLKNOS */
+        s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
+        /* This is xfs_extent_state() in-line */
+        if (ext_flag) {
+                ASSERT(s->br_blockcount != 0);  /* saved for DMIG */
+                st = XFS_EXT_UNWRITTEN;
+        } else
+                st = XFS_EXT_NORM;
+        s->br_state = st;
+}
+/*
+ * Unpack an in memory bmap extent record into its uncompressed form.
+ */
+void
+xfs_bmbt_get_all(
+        xfs_bmbt_rec_host_t *r,
+        xfs_bmbt_irec_t *s)
+{
+        __uint64_t      word0 = r->l0;
+        __uint64_t      word1 = r->l1;
+        __xfs_bmbt_get_all(word0, word1, s);
+}
+/*
+ * Return the blockcount stored in the low 21 bits of the second word
+ * of an in memory bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_get_blockcount(
+        xfs_bmbt_rec_host_t     *r)
+{
+        xfs_filblks_t   len;
+        len = (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
+        return len;
+}
+/*
+ * Extract the startblock field from an in memory bmap extent record.
+ *
+ * With XFS_BIG_BLKNOS the value is assembled from the low 9 bits of l0
+ * (shifted up by 43) and the bits of l1 above the 21-bit blockcount.
+ * Without it, non-DEBUG builds use the l1 part only, while DEBUG builds
+ * assemble the full value and assert that it either has no bits set above
+ * bit 31 or satisfies isnulldstartblock().
+ */
+xfs_fsblock_t
+xfs_bmbt_get_startblock(
+        xfs_bmbt_rec_host_t     *r)
+{
+#if XFS_BIG_BLKNOS
+        return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
+               (((xfs_fsblock_t)r->l1) >> 21);
+#else
+#ifdef DEBUG
+        xfs_dfsbno_t    b;
+        b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
+            (((xfs_dfsbno_t)r->l1) >> 21);
+        ASSERT((b >> 32) == 0 || isnulldstartblock(b));
+        return (xfs_fsblock_t)b;
+#else   /* !DEBUG */
+        return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
+#endif  /* DEBUG */
+#endif  /* XFS_BIG_BLKNOS */
+}
+/*
+ * Return the startoff stored in the first word of an in memory bmap
+ * extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_get_startoff(
+        xfs_bmbt_rec_host_t     *r)
+{
+        xfs_fileoff_t   word0 = (xfs_fileoff_t)r->l0;
+        /* Mask off the extent flag bits, then drop the low 9 bits */
+        return (word0 & xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+}
+/*
+ * Extract the extent state from an in memory bmap extent record.
+ */
+xfs_exntst_t
+xfs_bmbt_get_state(
+        xfs_bmbt_rec_host_t     *r)
+{
+        int             flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN));
+        xfs_filblks_t   blks = xfs_bmbt_get_blockcount(r);
+        return xfs_extent_state(blks, flag);
+}
+/*
+ * Return the blockcount of an on disk bmap extent record: the low
+ * 21 bits of the second word after conversion to CPU byte order.
+ */
+xfs_filblks_t
+xfs_bmbt_disk_get_blockcount(
+        xfs_bmbt_rec_t  *r)
+{
+        xfs_filblks_t   len;
+        len = (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
+        return len;
+}
+/*
+ * Return the startoff of a disk format bmap extent record, decoded
+ * from the first word after conversion to CPU byte order.
+ */
+xfs_fileoff_t
+xfs_bmbt_disk_get_startoff(
+        xfs_bmbt_rec_t  *r)
+{
+        xfs_fileoff_t   word0 = (xfs_fileoff_t)be64_to_cpu(r->l0);
+        /* Mask off the extent flag bits, then drop the low 9 bits */
+        return (word0 & xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+}
+/*
+ * Set all the fields in a bmap extent record from the arguments.
+ *
+ * The extent flag goes into bit 63 of l0 (any state other than
+ * XFS_EXT_NORM sets it), startoff is stored shifted left by 9, startblock
+ * is split between l0 (its bits from 43 up) and l1 (shifted left by 21),
+ * and blockcount fills the low 21 bits of l1.
+ *
+ * Without XFS_BIG_BLKNOS nothing from startblock is stored in l0; for a
+ * null start block xfs_mask64lo(9) is OR-ed into l0 and xfs_mask64hi(11)
+ * into l1 instead.
+ */
+void
+xfs_bmbt_set_allf(
+        xfs_bmbt_rec_host_t     *r,
+        xfs_fileoff_t           startoff,
+        xfs_fsblock_t           startblock,
+        xfs_filblks_t           blockcount,
+        xfs_exntst_t            state)
+{
+        int             extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
+        ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
+        ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+        ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+#if XFS_BIG_BLKNOS
+        ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+        r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+                ((xfs_bmbt_rec_base_t)startoff << 9) |
+                ((xfs_bmbt_rec_base_t)startblock >> 43);
+        r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
+                ((xfs_bmbt_rec_base_t)blockcount &
+                (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+#else   /* !XFS_BIG_BLKNOS */
+        if (isnullstartblock(startblock)) {
+                r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+                        ((xfs_bmbt_rec_base_t)startoff << 9) |
+                         (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+                r->l1 = xfs_mask64hi(11) |
+                          ((xfs_bmbt_rec_base_t)startblock << 21) |
+                          ((xfs_bmbt_rec_base_t)blockcount &
+                           (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+        } else {
+                r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+                        ((xfs_bmbt_rec_base_t)startoff << 9);
+                r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
+                         ((xfs_bmbt_rec_base_t)blockcount &
+                         (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+        }
+#endif  /* XFS_BIG_BLKNOS */
+}
+/*
+ * Pack an uncompressed extent description into an in memory bmap
+ * extent record.
+ */
+void
+xfs_bmbt_set_all(
+        xfs_bmbt_rec_host_t *r,
+        xfs_bmbt_irec_t *s)
+{
+        xfs_fileoff_t   off = s->br_startoff;
+        xfs_fsblock_t   bno = s->br_startblock;
+        xfs_filblks_t   len = s->br_blockcount;
+        xfs_exntst_t    state = s->br_state;
+        xfs_bmbt_set_allf(r, off, bno, len, state);
+}
+/*
+ * Set all the fields in a disk format bmap extent record from the arguments.
+ *
+ * The two words are packed the same way as in xfs_bmbt_set_allf() and are
+ * then stored through cpu_to_be64().
+ */
+void
+xfs_bmbt_disk_set_allf(
+        xfs_bmbt_rec_t          *r,
+        xfs_fileoff_t           startoff,
+        xfs_fsblock_t           startblock,
+        xfs_filblks_t           blockcount,
+        xfs_exntst_t            state)
+{
+        int                     extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
+        ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
+        ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+        ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+#if XFS_BIG_BLKNOS
+        ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+        r->l0 = cpu_to_be64(
+                ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+                 ((xfs_bmbt_rec_base_t)startoff << 9) |
+                 ((xfs_bmbt_rec_base_t)startblock >> 43));
+        r->l1 = cpu_to_be64(
+                ((xfs_bmbt_rec_base_t)startblock << 21) |
+                 ((xfs_bmbt_rec_base_t)blockcount &
+                  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
+#else   /* !XFS_BIG_BLKNOS */
+        if (isnullstartblock(startblock)) {
+                r->l0 = cpu_to_be64(
+                        ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+                         ((xfs_bmbt_rec_base_t)startoff << 9) |
+                          (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
+                r->l1 = cpu_to_be64(xfs_mask64hi(11) |
+                          ((xfs_bmbt_rec_base_t)startblock << 21) |
+                          ((xfs_bmbt_rec_base_t)blockcount &
+                           (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
+        } else {
+                r->l0 = cpu_to_be64(
+                        ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+                         ((xfs_bmbt_rec_base_t)startoff << 9));
+                r->l1 = cpu_to_be64(
+                        ((xfs_bmbt_rec_base_t)startblock << 21) |
+                         ((xfs_bmbt_rec_base_t)blockcount &
+                          (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
+        }
+#endif  /* XFS_BIG_BLKNOS */
+}
+/*
+ * Pack an uncompressed extent description into a disk format bmap
+ * extent record.
+ */
+STATIC void
+xfs_bmbt_disk_set_all(
+        xfs_bmbt_rec_t  *r,
+        xfs_bmbt_irec_t *s)
+{
+        xfs_fileoff_t   off = s->br_startoff;
+        xfs_fsblock_t   bno = s->br_startblock;
+        xfs_filblks_t   len = s->br_blockcount;
+        xfs_exntst_t    state = s->br_state;
+        xfs_bmbt_disk_set_allf(r, off, bno, len, state);
+}
+/*
+ * Replace the blockcount (low 21 bits of l1) of a bmap extent record,
+ * leaving the upper 43 bits of l1 untouched.
+ */
+void
+xfs_bmbt_set_blockcount(
+        xfs_bmbt_rec_host_t *r,
+        xfs_filblks_t   v)
+{
+        xfs_bmbt_rec_base_t     kept;
+        xfs_bmbt_rec_base_t     count;
+        ASSERT((v & xfs_mask64hi(43)) == 0);
+        kept = r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43);
+        count = (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
+        r->l1 = kept | count;
+}
+/*
+ * Set the startblock field in a bmap extent record.
+ *
+ * With XFS_BIG_BLKNOS the upper 55 bits of l0 and the low 21 bits of l1
+ * are preserved and the new value is split across the remaining bits.
+ * Without it only l1 carries the value; the low 9 bits of l0 are all set
+ * (and xfs_mask64hi(11) is OR-ed into l1) for a null start block, and
+ * cleared otherwise.
+ */
+void
+xfs_bmbt_set_startblock(
+        xfs_bmbt_rec_host_t *r,
+        xfs_fsblock_t   v)
+{
+#if XFS_BIG_BLKNOS
+        ASSERT((v & xfs_mask64hi(12)) == 0);
+        r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
+                  (xfs_bmbt_rec_base_t)(v >> 43);
+        r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
+                  (xfs_bmbt_rec_base_t)(v << 21);
+#else   /* !XFS_BIG_BLKNOS */
+        if (isnullstartblock(v)) {
+                r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+                r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
+                          ((xfs_bmbt_rec_base_t)v << 21) |
+                          (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+        } else {
+                r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+                r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
+                          (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+        }
+#endif  /* XFS_BIG_BLKNOS */
+}
+/*
+ * Replace the startoff of a bmap extent record, keeping the top bit
+ * and the low 9 bits of l0 as they were.
+ */
+void
+xfs_bmbt_set_startoff(
+        xfs_bmbt_rec_host_t *r,
+        xfs_fileoff_t   v)
+{
+        xfs_bmbt_rec_base_t     top;
+        xfs_bmbt_rec_base_t     low;
+        ASSERT((v & xfs_mask64hi(9)) == 0);
+        top = r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1);
+        low = r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+        r->l0 = top | ((xfs_bmbt_rec_base_t)v << 9) | low;
+}
+/*
+ * Set the extent state of a bmap extent record: the extent flag bits
+ * of l0 are cleared for XFS_EXT_NORM and set for any other state.
+ */
+void
+xfs_bmbt_set_state(
+        xfs_bmbt_rec_host_t *r,
+        xfs_exntst_t    v)
+{
+        ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
+        if (v != XFS_EXT_NORM) {
+                r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
+                return;
+        }
+        r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
+}
+/*
+ * Fill the on-disk btree root from the in-memory root block: sanity check
+ * the in-memory header, carry over the level and record count, then copy
+ * the keys and the block pointers.
+ */
+void
+xfs_bmbt_to_bmdr(
+        struct xfs_mount        *mp,
+        struct xfs_btree_block  *rblock,
+        int                     rblocklen,
+        xfs_bmdr_block_t        *dblock,
+        int                     dblocklen)
+{
+        xfs_bmbt_key_t          *src_keys;
+        xfs_bmbt_key_t          *dst_keys;
+        __be64                  *src_ptrs;
+        __be64                  *dst_ptrs;
+        int                     maxrecs;
+        int                     numrecs;
+        if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+                ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
+        } else {
+                ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
+                ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
+                ASSERT(rblock->bb_u.l.bb_blkno ==
+                       cpu_to_be64(XFS_BUF_DADDR_NULL));
+        }
+        ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO));
+        ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO));
+        ASSERT(rblock->bb_level != 0);
+        dblock->bb_level = rblock->bb_level;
+        dblock->bb_numrecs = rblock->bb_numrecs;
+        /* The on-disk pointer array is located via the root's capacity */
+        maxrecs = xfs_bmdr_maxrecs(dblocklen, 0);
+        src_keys = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+        src_ptrs = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+        dst_keys = XFS_BMDR_KEY_ADDR(dblock, 1);
+        dst_ptrs = XFS_BMDR_PTR_ADDR(dblock, 1, maxrecs);
+        /* Only the records actually in use are copied */
+        numrecs = be16_to_cpu(dblock->bb_numrecs);
+        memcpy(dst_keys, src_keys, sizeof(*src_keys) * numrecs);
+        memcpy(dst_ptrs, src_ptrs, sizeof(*src_ptrs) * numrecs);
+}
+/*
+ * Scan num extent records starting at index idx and verify that none
+ * of them has a bit set in the extent flag field.  A record with a flag
+ * bit set trips an ASSERT on debug kernels, as this should not occur.
+ * Returns 1 if any flag bit was found, 0 otherwise.
+ */
+int
+xfs_check_nostate_extents(
+        xfs_ifork_t             *ifp,
+        xfs_extnum_t            idx,
+        xfs_extnum_t            num)
+{
+        xfs_bmbt_rec_host_t     *rec;
+        while (num > 0) {
+                rec = xfs_iext_get_ext(ifp, idx);
+                if (rec->l0 >> (64 - BMBT_EXNTFLAG_BITLEN)) {
+                        ASSERT(0);
+                        return 1;
+                }
+                idx++;
+                num--;
+        }
+        return 0;
+}
+/*
+ * Create a new bmap btree cursor for the same mount, transaction, inode
+ * and fork as an existing one.
+ */
+STATIC struct xfs_btree_cur *
+xfs_bmbt_dup_cursor(
+        struct xfs_btree_cur    *cur)
+{
+        struct xfs_btree_cur    *dup;
+        dup = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
+                        cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+        /*
+         * xfs_bmbt_init_cursor() does not fill in the firstblock, flist
+         * and flags values, so carry them over by hand.
+         */
+        dup->bc_private.b.flags = cur->bc_private.b.flags;
+        dup->bc_private.b.flist = cur->bc_private.b.flist;
+        dup->bc_private.b.firstblock = cur->bc_private.b.firstblock;
+        return dup;
+}
+/*
+ * Fold the allocation state of cursor src into cursor dst: dst takes
+ * over src's firstblock and gains src's allocated count, which is then
+ * reset to zero in src.
+ */
+STATIC void
+xfs_bmbt_update_cursor(
+        struct xfs_btree_cur    *src,
+        struct xfs_btree_cur    *dst)
+{
+        ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
+               (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
+        ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
+        dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
+        dst->bc_private.b.allocated += src->bc_private.b.allocated;
+        src->bc_private.b.allocated = 0;
+}
+/*
+ * Allocate a single filesystem block for the bmap btree.
+ *
+ * @cur:   cursor supplying the transaction, mount, inode and free list
+ * @start: used as the allocation target when the cursor has no firstblock
+ * @new:   on success, receives the new block number in big-endian form
+ * @stat:  set to 1 if a block was allocated, 0 if none could be found
+ *
+ * Returns 0 both when a block was allocated and when no block was found
+ * (check @stat), or the error from the failing step.  On success the
+ * cursor's firstblock/allocated fields, the inode's di_nblocks and the
+ * dquot block count are updated and the inode core is logged.
+ */
+STATIC int
+xfs_bmbt_alloc_block(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *start,
+        union xfs_btree_ptr     *new,
+        int                     *stat)
+{
+        xfs_alloc_arg_t         args;           /* block allocation args */
+        int                     error;          /* error return value */
+        memset(&args, 0, sizeof(args));
+        args.tp = cur->bc_tp;
+        args.mp = cur->bc_mp;
+        args.fsbno = cur->bc_private.b.firstblock;
+        args.firstblock = args.fsbno;
+        /* Pick the allocation type from the cursor's firstblock/lowspace state. */
+        if (args.fsbno == NULLFSBLOCK) {
+                args.fsbno = be64_to_cpu(start->l);
+                args.type = XFS_ALLOCTYPE_START_BNO;
+                /*
+                 * Make sure there is sufficient room left in the AG to
+                 * complete a full tree split for an extent insert.  If
+                 * we are converting the middle part of an extent then
+                 * we may need space for two tree splits.
+                 *
+                 * We are relying on the caller to make the correct block
+                 * reservation for this operation to succeed.  If the
+                 * reservation amount is insufficient then we may fail a
+                 * block allocation here and corrupt the filesystem.
+                 */
+                args.minleft = xfs_trans_get_block_res(args.tp);
+        } else if (cur->bc_private.b.flist->xbf_low) {
+                args.type = XFS_ALLOCTYPE_START_BNO;
+        } else {
+                args.type = XFS_ALLOCTYPE_NEAR_BNO;
+        }
+        /* exactly one block is requested */
+        args.minlen = args.maxlen = args.prod = 1;
+        args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+        /* Without the WASDEL flag, a zero block reservation means ENOSPC. */
+        if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
+                error = -ENOSPC;
+                goto error0;
+        }
+        error = xfs_alloc_vextent(&args);
+        if (error)
+                goto error0;
+        if (args.fsbno == NULLFSBLOCK && args.minleft) {
+                /*
+                 * Could not find an AG with enough free space to satisfy
+                 * a full btree split.  Try again without minleft and if
+                 * successful activate the lowspace algorithm.
+                 */
+                args.fsbno = 0;
+                args.type = XFS_ALLOCTYPE_FIRST_AG;
+                args.minleft = 0;
+                error = xfs_alloc_vextent(&args);
+                if (error)
+                        goto error0;
+                cur->bc_private.b.flist->xbf_low = 1;
+        }
+        /* Nothing could be allocated: report that through *stat, not as an error. */
+        if (args.fsbno == NULLFSBLOCK) {
+                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                *stat = 0;
+                return 0;
+        }
+        ASSERT(args.len == 1);
+        /* Record the new block in the cursor, the inode and the quota counts. */
+        cur->bc_private.b.firstblock = args.fsbno;
+        cur->bc_private.b.allocated++;
+        cur->bc_private.b.ip->i_d.di_nblocks++;
+        xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
+        xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
+                        XFS_TRANS_DQ_BCOUNT, 1L);
+        new->l = cpu_to_be64(args.fsbno);
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 1;
+        return 0;
+ error0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        return error;
+}
+/*
+ * Give back the btree block held in @bp: queue it on the cursor's free
+ * list, drop the inode and dquot block counts by one, log the inode core
+ * and invalidate the buffer in the transaction.  Always returns 0.
+ */
+STATIC int
+xfs_bmbt_free_block(
+        struct xfs_btree_cur    *cur,
+        struct xfs_buf          *bp)
+{
+        struct xfs_inode        *owner = cur->bc_private.b.ip;
+        xfs_fsblock_t           bno;
+        bno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
+        xfs_bmap_add_free(bno, 1, cur->bc_private.b.flist, cur->bc_mp);
+        owner->i_d.di_nblocks--;
+        xfs_trans_log_inode(cur->bc_tp, owner, XFS_ILOG_CORE);
+        xfs_trans_mod_dquot_byino(cur->bc_tp, owner, XFS_TRANS_DQ_BCOUNT, -1L);
+        xfs_trans_binval(cur->bc_tp, bp);
+        return 0;
+}
+/*
+ * Minimum record count for a block at @level.  Below the top level the
+ * per-mount m_bmap_dmnr value is used; at the top level (bc_nlevels - 1)
+ * it is half of what fits in the fork's if_broot_bytes.
+ */
+STATIC int
+xfs_bmbt_get_minrecs(
+        struct xfs_btree_cur    *cur,
+        int                     level)
+{
+        struct xfs_ifork        *ifp;
+        if (level != cur->bc_nlevels - 1)
+                return cur->bc_mp->m_bmap_dmnr[level != 0];
+        ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+        return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes,
+                                level == 0) / 2;
+}
+/*
+ * Maximum record count for a block at @level.  Below the top level the
+ * per-mount m_bmap_dmxr value is used; at the top level (bc_nlevels - 1)
+ * it is whatever fits in the fork's if_broot_bytes.
+ */
+int
+xfs_bmbt_get_maxrecs(
+        struct xfs_btree_cur    *cur,
+        int                     level)
+{
+        struct xfs_ifork        *ifp;
+        if (level != cur->bc_nlevels - 1)
+                return cur->bc_mp->m_bmap_dmxr[level != 0];
+        ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+        return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0);
+}
+/*
+ * Maximum number of records the on-disk format can hold at @level.
+ *
+ * Non-root levels give the same answer as xfs_bmbt_get_maxrecs.  For the
+ * root, the space available in the dinode fork is what counts, so that
+ * the in-memory buffer can be resized to match it; once it has been
+ * resized to the maximum, this and xfs_bmbt_get_maxrecs agree for the
+ * root as well.
+ */
+STATIC int
+xfs_bmbt_get_dmaxrecs(
+        struct xfs_btree_cur    *cur,
+        int                     level)
+{
+        if (level == cur->bc_nlevels - 1)
+                return xfs_bmdr_maxrecs(cur->bc_private.b.forksize,
+                                        level == 0);
+        return cur->bc_mp->m_bmap_dmxr[level != 0];
+}
+/*
+ * Fill in @key from @rec: the key is the record's start offset, stored
+ * big-endian.
+ */
+STATIC void
+xfs_bmbt_init_key_from_rec(
+        union xfs_btree_key     *key,
+        union xfs_btree_rec     *rec)
+{
+        xfs_fileoff_t           startoff;
+        startoff = xfs_bmbt_disk_get_startoff(&rec->bmbt);
+        key->bmbt.br_startoff = cpu_to_be64(startoff);
+}
+/*
+ * Fill in @rec from @key: the record gets the key's start offset, with
+ * start block and block count of zero and state XFS_EXT_NORM.
+ */
+STATIC void
+xfs_bmbt_init_rec_from_key(
+        union xfs_btree_key     *key,
+        union xfs_btree_rec     *rec)
+{
+        xfs_fileoff_t           startoff;
+        ASSERT(key->bmbt.br_startoff != 0);
+        startoff = be64_to_cpu(key->bmbt.br_startoff);
+        xfs_bmbt_disk_set_allf(&rec->bmbt, startoff, 0, 0, XFS_EXT_NORM);
+}
+/*
+ * Fill in the disk-format record @rec from the record held in the cursor
+ * (cur->bc_rec.b).
+ */
+STATIC void
+xfs_bmbt_init_rec_from_cur(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_rec     *rec)
+{
+        xfs_bmbt_rec_t          *disk_rec = &rec->bmbt;
+        xfs_bmbt_disk_set_all(disk_rec, &cur->bc_rec.b);
+}
+/*
+ * Initialise @ptr to zero; @cur is not consulted.
+ */
+STATIC void
+xfs_bmbt_init_ptr_from_cur(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *ptr)
+{
+        /* zero has the same representation in either byte order */
+        ptr->l = cpu_to_be64(0);
+}
+/*
+ * Return the start offset stored in @key minus the start offset of the
+ * record held in the cursor.
+ */
+STATIC __int64_t
+xfs_bmbt_key_diff(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_key     *key)
+{
+        __int64_t               key_off;
+        key_off = (__int64_t)be64_to_cpu(key->bmbt.br_startoff);
+        return key_off - cur->bc_rec.b.br_startoff;
+}
+/*
+ * A long-format sibling pointer is acceptable when it is non-zero and is
+ * either the NULLDFSBNO marker or passes XFS_FSB_SANITY_CHECK().
+ */
+static bool
+xfs_bmbt_sibling_ok(
+        struct xfs_mount        *mp,
+        __be64                  sib)
+{
+        if (!sib)
+                return false;
+        if (sib == cpu_to_be64(NULLDFSBNO))
+                return true;
+        return XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(sib));
+}
+/*
+ * Structural checks on a bmap btree block buffer: magic number, the
+ * CRC-format header fields when the CRC magic is present, level, record
+ * count and both sibling pointers.  Returns true if every check passes.
+ */
+static bool
+xfs_bmbt_verify(
+        struct xfs_buf          *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+        unsigned int            level;
+        if (block->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)) {
+                /*
+                 * CRC-format block: the superblock must have CRCs enabled
+                 * and the uuid and block number in the header must match.
+                 *
+                 * XXX: need a better way of verifying the owner here. Right
+                 * now just make sure there has been one set.
+                 */
+                if (!xfs_sb_version_hascrc(&mp->m_sb) ||
+                    !uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) ||
+                    be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn ||
+                    be64_to_cpu(block->bb_u.l.bb_owner) == 0)
+                        return false;
+        } else if (block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC)) {
+                /* neither of the two bmap btree magics */
+                return false;
+        }
+        /*
+         * numrecs and level verification.
+         *
+         * The fork this block belongs to is not known here, so the level is
+         * only bounded by the larger of the two per-fork maxima. Later
+         * checks will be more precise.
+         */
+        level = be16_to_cpu(block->bb_level);
+        if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
+                return false;
+        if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+                return false;
+        /* sibling pointer verification: left first, then right */
+        return xfs_bmbt_sibling_ok(mp, block->bb_u.l.bb_leftsib) &&
+               xfs_bmbt_sibling_ok(mp, block->bb_u.l.bb_rightsib);
+}
+/*
+ * Read-side verifier: check the CRC first, then the block structure, and
+ * flag the buffer with -EFSBADCRC or -EFSCORRUPTED respectively.  If the
+ * buffer carries an error afterwards, trace and report it.
+ */
+static void
+xfs_bmbt_read_verify(
+        struct xfs_buf  *bp)
+{
+        int             err = 0;
+        if (!xfs_btree_lblock_verify_crc(bp))
+                err = -EFSBADCRC;
+        else if (!xfs_bmbt_verify(bp))
+                err = -EFSCORRUPTED;
+        if (err)
+                xfs_buf_ioerror(bp, err);
+        if (!bp->b_error)
+                return;
+        trace_xfs_btree_corrupt(bp, _RET_IP_);
+        xfs_verifier_error(bp);
+}
+/*
+ * Write-side verifier: a block that passes xfs_bmbt_verify() gets its CRC
+ * recalculated; one that fails is traced, flagged -EFSCORRUPTED and
+ * reported, and no CRC is calculated.
+ */
+static void
+xfs_bmbt_write_verify(
+        struct xfs_buf  *bp)
+{
+        if (xfs_bmbt_verify(bp)) {
+                xfs_btree_lblock_calc_crc(bp);
+                return;
+        }
+        trace_xfs_btree_corrupt(bp, _RET_IP_);
+        xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        xfs_verifier_error(bp);
+}
+/* Buffer verifier callbacks for bmap btree blocks. */
+const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+        .verify_write   = xfs_bmbt_write_verify,
+        .verify_read    = xfs_bmbt_read_verify,
+};
+#if defined(DEBUG) || defined(XFS_WARN)
+/*
+ * Return non-zero when the start offset in @k1 is strictly below the one
+ * in @k2.
+ */
+STATIC int
+xfs_bmbt_keys_inorder(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_key     *k1,
+        union xfs_btree_key     *k2)
+{
+        return be64_to_cpu(k2->bmbt.br_startoff) >
+                be64_to_cpu(k1->bmbt.br_startoff);
+}
+/*
+ * Return non-zero when @r1's start offset plus its block count does not
+ * exceed @r2's start offset.
+ */
+STATIC int
+xfs_bmbt_recs_inorder(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_rec     *r1,
+        union xfs_btree_rec     *r2)
+{
+        xfs_fileoff_t           r1_off = xfs_bmbt_disk_get_startoff(&r1->bmbt);
+        xfs_filblks_t           r1_len = xfs_bmbt_disk_get_blockcount(&r1->bmbt);
+        xfs_fileoff_t           r2_off = xfs_bmbt_disk_get_startoff(&r2->bmbt);
+        return r1_off + r1_len <= r2_off;
+}
+#endif  /* DEBUG || XFS_WARN */
+/*
+ * Operations table installed in every bmap btree cursor by
+ * xfs_bmbt_init_cursor().
+ */
+static const struct xfs_btree_ops xfs_bmbt_ops = {
+        /* record and key sizes */
+        .key_len                = sizeof(xfs_bmbt_key_t),
+        .rec_len                = sizeof(xfs_bmbt_rec_t),
+        /* cursor handling */
+        .dup_cursor             = xfs_bmbt_dup_cursor,
+        .update_cursor          = xfs_bmbt_update_cursor,
+        /* block allocation and freeing */
+        .alloc_block            = xfs_bmbt_alloc_block,
+        .free_block             = xfs_bmbt_free_block,
+        /* per-level record limits */
+        .get_minrecs            = xfs_bmbt_get_minrecs,
+        .get_maxrecs            = xfs_bmbt_get_maxrecs,
+        .get_dmaxrecs           = xfs_bmbt_get_dmaxrecs,
+        /* key, record and pointer initialisation */
+        .init_key_from_rec      = xfs_bmbt_init_key_from_rec,
+        .init_rec_from_key      = xfs_bmbt_init_rec_from_key,
+        .init_rec_from_cur      = xfs_bmbt_init_rec_from_cur,
+        .init_ptr_from_cur      = xfs_bmbt_init_ptr_from_cur,
+        .key_diff               = xfs_bmbt_key_diff,
+        /* buffer verifiers */
+        .buf_ops                = &xfs_bmbt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+        /* ordering checks, only built for DEBUG or XFS_WARN */
+        .keys_inorder           = xfs_bmbt_keys_inorder,
+        .recs_inorder           = xfs_bmbt_recs_inorder,
+#endif
+};
+/*
+ * Allocate a zeroed cursor from xfs_btree_cur_zone and set it up for the
+ * bmap btree rooted in the given fork of @ip.
+ */
+struct xfs_btree_cur *                          /* new bmap btree cursor */
+xfs_bmbt_init_cursor(
+        struct xfs_mount        *mp,            /* file system mount point */
+        struct xfs_trans        *tp,            /* transaction pointer */
+        struct xfs_inode        *ip,            /* inode owning the btree */
+        int                     whichfork)      /* data or attr fork */
+{
+        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+        struct xfs_btree_cur    *cur;
+        cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+        /* generic btree cursor fields */
+        cur->bc_mp = mp;
+        cur->bc_tp = tp;
+        cur->bc_ops = &xfs_bmbt_ops;
+        cur->bc_btnum = XFS_BTNUM_BMAP;
+        cur->bc_blocklog = mp->m_sb.sb_blocklog;
+        /* the level stored in the fork's root block is zero-based */
+        cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+        cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
+        if (xfs_sb_version_hascrc(&mp->m_sb))
+                cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+        /* bmap-private fields */
+        cur->bc_private.b.ip = ip;
+        cur->bc_private.b.whichfork = whichfork;
+        cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
+        cur->bc_private.b.firstblock = NULLFSBLOCK;
+        cur->bc_private.b.flist = NULL;
+        cur->bc_private.b.flags = 0;
+        cur->bc_private.b.allocated = 0;
+        return cur;
+}
+/*
+ * Number of records that fit in a bmap btree block of @blocklen bytes
+ * once the block header is taken off: whole records for a leaf, key plus
+ * pointer pairs otherwise.
+ */
+int
+xfs_bmbt_maxrecs(
+        struct xfs_mount        *mp,
+        int                     blocklen,
+        int                     leaf)
+{
+        int                     space = blocklen - XFS_BMBT_BLOCK_LEN(mp);
+        return space / (leaf ? sizeof(xfs_bmbt_rec_t) :
+                        sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
+/*
+ * Number of records that fit in a bmap btree inode root of @blocklen
+ * bytes once the xfs_bmdr_block_t header is taken off: whole records for
+ * a leaf, key plus pointer pairs otherwise.
+ */
+int
+xfs_bmdr_maxrecs(
+        int                     blocklen,
+        int                     leaf)
+{
+        int                     space = blocklen - sizeof(xfs_bmdr_block_t);
+        return space / (leaf ? sizeof(xfs_bmdr_rec_t) :
+                        sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
+}
+/*
+ * Change the owner of a btree format fork of the inode passed in. Change it
+ * to the owner that is passed in so that we can change owners before or after
+ * we switch forks between inodes. The operation that the caller is doing will
+ * determine whether it needs to change owner before or after the switch.
+ *
+ * For demand paged transactional modification, the fork switch should be done
+ * after reading in all the blocks, modifying them and pinning them in the
+ * transaction. For modification when the buffers are already pinned in memory,
+ * the fork switch can be done before changing the owner as we won't need to
+ * validate the owner until the btree buffers are unpinned and writes can occur
+ * again.
+ *
+ * For recovery based ownership change, there is no transactional context and
+ * so a buffer list must be supplied so that we can record the buffers that we
+ * modified for the caller to issue IO on.
+ *
+ * Exactly one of @tp and @buffer_list must be supplied.  Returns the result
+ * of xfs_btree_change_owner().
+ */
+int
+xfs_bmbt_change_owner(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *ip,
+        int                     whichfork,
+        xfs_ino_t               new_owner,
+        struct list_head        *buffer_list)
+{
+        struct xfs_btree_cur    *cur;
+        int                     error;
+        ASSERT(tp || buffer_list);
+        ASSERT(!(tp && buffer_list));
+        if (whichfork == XFS_DATA_FORK)
+                ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
+        else
+                ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
+        /*
+         * xfs_bmbt_init_cursor() dereferences its allocation unconditionally
+         * and so never hands back NULL; there is no failure case to check.
+         */
+        cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+        error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+        /* tell the cursor teardown whether we are bailing out on an error */
+        xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+        return error;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
new file mode 100644
index 000000000000..819a8a4dee95
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BMAP_BTREE_H__
+#define __XFS_BMAP_BTREE_H__
+struct xfs_btree_cur;
+struct xfs_btree_block;
+struct xfs_mount;
+struct xfs_inode;
+struct xfs_trans;
+/*
+ * Extent state and extent format macros.
+ */
+/* Extent format for inode x, chosen by the superblock's extflgbit feature. */
+#define XFS_EXTFMT_INODE(x)     \
+        (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \
+                XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
+/* True when the extent record x is in the unwritten state. */
+#define ISUNWRITTEN(x)  ((x)->br_state == XFS_EXT_UNWRITTEN)
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+#define XFS_BMBT_BLOCK_LEN(mp) \
+        (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+                XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
+/*
+ * Address of the index'th record, key or pointer in a bmap btree block.
+ * index is 1-based.  Pointers are laid out after room for maxrecs keys.
+ */
+#define XFS_BMBT_REC_ADDR(mp, block, index) \
+        ((xfs_bmbt_rec_t *) \
+                ((char *)(block) + \
+                 XFS_BMBT_BLOCK_LEN(mp) + \
+                 ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
+#define XFS_BMBT_KEY_ADDR(mp, block, index) \
+        ((xfs_bmbt_key_t *) \
+                ((char *)(block) + \
+                 XFS_BMBT_BLOCK_LEN(mp) + \
+                 ((index) - 1) * sizeof(xfs_bmbt_key_t)))
+#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
+        ((xfs_bmbt_ptr_t *) \
+                ((char *)(block) + \
+                 XFS_BMBT_BLOCK_LEN(mp) + \
+                 (maxrecs) * sizeof(xfs_bmbt_key_t) + \
+                 ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
+/*
+ * The same three address calculations for a block that starts with a
+ * struct xfs_bmdr_block header instead.  index is 1-based here too.
+ */
+#define XFS_BMDR_REC_ADDR(block, index) \
+        ((xfs_bmdr_rec_t *) \
+                ((char *)(block) + \
+                 sizeof(struct xfs_bmdr_block) + \
+                 ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
+#define XFS_BMDR_KEY_ADDR(block, index) \
+        ((xfs_bmdr_key_t *) \
+                ((char *)(block) + \
+                 sizeof(struct xfs_bmdr_block) + \
+                 ((index) - 1) * sizeof(xfs_bmdr_key_t)))
+#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
+        ((xfs_bmdr_ptr_t *) \
+                ((char *)(block) + \
+                 sizeof(struct xfs_bmdr_block) + \
+                 (maxrecs) * sizeof(xfs_bmdr_key_t) + \
+                 ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
+/*
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
+        XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
+/* Bytes needed for a block header plus nrecs key/pointer pairs. */
+#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
+        (int)(XFS_BMBT_BLOCK_LEN(mp) + \
+               ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+#define XFS_BMAP_BROOT_SPACE(mp, bb) \
+        (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
+/*
+ * Bytes needed for an xfs_bmdr_block_t header plus nrecs key/pointer pairs.
+ * NOTE(review): the pair size uses the xfs_bmbt_* types rather than the
+ * xfs_bmdr_* ones; presumably they are the same size - confirm.
+ */
+#define XFS_BMDR_SPACE_CALC(nrecs) \
+        (int)(sizeof(xfs_bmdr_block_t) + \
+               ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+#define XFS_BMAP_BMDR_SPACE(bb) \
+        (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
+/*
+ * Maximum number of bmap btree levels.
+ */
+#define XFS_BM_MAXLEVELS(mp,w)          ((mp)->m_bm_maxlevels[(w)])
+/*
+ * Prototypes for xfs_bmap.c to call.
+ */
+extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
+                        struct xfs_btree_block *, int);
+/* Getters on xfs_bmbt_rec_host_t records. */
+extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
+extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
+extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
+extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
+extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
+/* Getters on xfs_bmbt_rec_t records. */
+extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
+extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
+/* Setters on xfs_bmbt_rec_host_t records. */
+extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
+extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
+                        xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
+extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v);
+extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
+extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
+extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
+/* Setter on an xfs_bmbt_rec_t record. */
+extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
+                        xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
+extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
+                        xfs_bmdr_block_t *, int);
+/* Record-count limits. */
+extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
+extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
+extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+/* Owner change and cursor construction. */
+extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
+                                 int whichfork, xfs_ino_t new_owner,
+                                 struct list_head *buffer_list);
+extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
+                struct xfs_trans *, struct xfs_inode *, int);
+#endif  /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
new file mode 100644
index 000000000000..ba35c9ccb8f9
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -0,0 +1,4069 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_buf_item.h"
+#include "xfs_btree.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_alloc.h"
+/*
+ * Cursor allocation zone.
+ */
+kmem_zone_t     *xfs_btree_cur_zone;
+/*
+ * Btree magic numbers.
+ *
+ * The first index selects the non-CRC (0) or CRC (1) set of magics, the
+ * second is the btree type number (bc_btnum).
+ */
+static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
+        { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
+          XFS_FIBT_MAGIC },
+        { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
+          XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
+};
+/*
+ * Magic number expected for blocks of the btree that cur walks.  Both
+ * uses of the argument are parenthesised so that any pointer-valued
+ * expression can be passed, not just a plain identifier.
+ */
+#define xfs_btree_magic(cur) \
+        xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][(cur)->bc_btnum]
+/*
+ * Check the header of a long-format btree block against what the cursor
+ * expects: uuid and block number on CRC filesystems, then magic, level,
+ * record count and both sibling pointers.  A failed check, or the
+ * XFS_TEST_ERROR() condition for XFS_ERRTAG_BTREE_CHECK_LBLOCK, is
+ * reported and turned into -EFSCORRUPTED.
+ */
+STATIC int                              /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lblock(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        struct xfs_btree_block  *block, /* btree long form block pointer */
+        int                     level,  /* level of the btree block */
+        struct xfs_buf          *bp)    /* buffer for block, if any */
+{
+        int                     lblock_ok = 1; /* block passes checks */
+        struct xfs_mount        *mp;    /* file system mount point */
+        mp = cur->bc_mp;
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                /* with no buffer, the block number must be XFS_BUF_DADDR_NULL */
+                lblock_ok = lblock_ok &&
+                        uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
+                        block->bb_u.l.bb_blkno == cpu_to_be64(
+                                bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+        }
+        /* each sibling must be non-zero and either NULLDFSBNO or sane */
+        lblock_ok = lblock_ok &&
+                be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+                be16_to_cpu(block->bb_level) == level &&
+                be16_to_cpu(block->bb_numrecs) <=
+                        cur->bc_ops->get_maxrecs(cur, level) &&
+                block->bb_u.l.bb_leftsib &&
+                (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
+                 XFS_FSB_SANITY_CHECK(mp,
+                        be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+                block->bb_u.l.bb_rightsib &&
+                (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
+                 XFS_FSB_SANITY_CHECK(mp,
+                        be64_to_cpu(block->bb_u.l.bb_rightsib)));
+        if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
+                        XFS_ERRTAG_BTREE_CHECK_LBLOCK,
+                        XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
+                /* the trace point needs a buffer; the error report does not */
+                if (bp)
+                        trace_xfs_btree_corrupt(bp, _RET_IP_);
+                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+                return -EFSCORRUPTED;
+        }
+        return 0;
+}
+/*
+ * Check the header of a short-format btree block against what the cursor
+ * expects: uuid and block number on CRC filesystems, then magic, level,
+ * record count and both sibling pointers.  Sibling pointers are bounded
+ * by the agf_length read from the AGF buffer held in the cursor.  A
+ * failed check, or the XFS_TEST_ERROR() condition for
+ * XFS_ERRTAG_BTREE_CHECK_SBLOCK, is reported and turned into
+ * -EFSCORRUPTED.
+ */
+STATIC int                              /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sblock(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        struct xfs_btree_block  *block, /* btree short form block pointer */
+        int                     level,  /* level of the btree block */
+        struct xfs_buf          *bp)    /* buffer containing block */
+{
+        struct xfs_mount        *mp;    /* file system mount point */
+        struct xfs_buf          *agbp;  /* buffer for ag. freespace struct */
+        struct xfs_agf          *agf;   /* ag. freespace structure */
+        xfs_agblock_t           agflen; /* native ag. freespace length */
+        int                     sblock_ok = 1; /* block passes checks */
+        mp = cur->bc_mp;
+        agbp = cur->bc_private.a.agbp;
+        agf = XFS_BUF_TO_AGF(agbp);
+        agflen = be32_to_cpu(agf->agf_length);
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                /* with no buffer, the block number must be XFS_BUF_DADDR_NULL */
+                sblock_ok = sblock_ok &&
+                        uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
+                        block->bb_u.s.bb_blkno == cpu_to_be64(
+                                bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+        }
+        /* each sibling must be non-zero and either NULLAGBLOCK or below agflen */
+        sblock_ok = sblock_ok &&
+                be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+                be16_to_cpu(block->bb_level) == level &&
+                be16_to_cpu(block->bb_numrecs) <=
+                        cur->bc_ops->get_maxrecs(cur, level) &&
+                (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+                 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
+                block->bb_u.s.bb_leftsib &&
+                (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+                 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
+                block->bb_u.s.bb_rightsib;
+        if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
+                        XFS_ERRTAG_BTREE_CHECK_SBLOCK,
+                        XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
+                /* the trace point needs a buffer; the error report does not */
+                if (bp)
+                        trace_xfs_btree_corrupt(bp, _RET_IP_);
+                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+                return -EFSCORRUPTED;
+        }
+        return 0;
+}
+/*
+ * Debug routine: check that block header is ok.
+ *
+ * Dispatches to the long- or short-format checker according to the
+ * cursor's XFS_BTREE_LONG_PTRS flag and returns that checker's result.
+ */
+int
+xfs_btree_check_block(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        struct xfs_btree_block  *block, /* generic btree block pointer */
+        int                     level,  /* level of the btree block */
+        struct xfs_buf          *bp)    /* buffer containing block, if any */
+{
+        if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
+                return xfs_btree_check_sblock(cur, block, level, bp);
+        return xfs_btree_check_lblock(cur, block, level, bp);
+}
+/*
+ * Check that (long) pointer is ok.
+ *
+ * The pointer is accepted only for a level above zero, when it is not
+ * NULLDFSBNO and when it passes XFS_FSB_SANITY_CHECK().
+ */
+int                                     /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_dfsbno_t            bno,    /* btree block disk address */
+        int                     level)  /* btree block level */
+{
+        /* NOTE(review): the macro presumably returns from here on failure. */
+        XFS_WANT_CORRUPTED_RETURN(
+                level > 0 &&
+                bno != NULLDFSBNO &&
+                XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
+        return 0;
+}
+#ifdef DEBUG
+/*
+ * Check that (short) pointer is ok.
+ *
+ * The pointer is accepted only for a level above zero, when it is neither
+ * NULLAGBLOCK nor zero and lies below the superblock's sb_agblocks.
+ */
+STATIC int                              /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sptr(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_agblock_t           bno,    /* btree block disk address */
+        int                     level)  /* btree block level */
+{
+        xfs_agblock_t           agblocks;
+        agblocks = cur->bc_mp->m_sb.sb_agblocks;
+        XFS_WANT_CORRUPTED_RETURN(
+                level > 0 &&
+                bno != NULLAGBLOCK &&
+                bno != 0 &&
+                bno < agblocks);
+        return 0;
+}
+/*
+ * Check that block ptr is ok.
+ *
+ * Picks element @index of the pointer array starting at @ptr, converts
+ * it from big-endian and hands it to the long or short pointer check
+ * according to the cursor's XFS_BTREE_LONG_PTRS flag.
+ */
+STATIC int                              /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_ptr(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        union xfs_btree_ptr     *ptr,   /* btree block disk address */
+        int                     index,  /* offset from ptr to check */
+        int                     level)  /* btree block level */
+{
+        if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
+                return xfs_btree_check_sptr(cur,
+                                be32_to_cpu((&ptr->s)[index]), level);
+        return xfs_btree_check_lptr(cur,
+                        be64_to_cpu((&ptr->l)[index]), level);
+}
+#endif
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * long-form btree header.
+ *
+ * Prior to calculating the CRC, pull the LSN out of the buffer log item and
+ * put it into the buffer so recovery knows what the last modification was
+ * that made it to disk.
+ *
+ * Does nothing unless the superblock has CRCs enabled.
+ */
+void
+xfs_btree_lblock_calc_crc(
+        struct xfs_buf          *bp)
+{
+        struct xfs_btree_block  *hdr = XFS_BUF_TO_BLOCK(bp);
+        struct xfs_buf_log_item *log_item = bp->b_fspriv;
+        if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) {
+                /* a buffer without a log item keeps its current LSN */
+                if (log_item)
+                        hdr->bb_u.l.bb_lsn =
+                                cpu_to_be64(log_item->bli_item.li_lsn);
+                xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+        }
+}
+/*
+ * Verify the checksum of a long-form btree block.  Without CRCs enabled
+ * in the superblock there is nothing to verify and the result is true.
+ */
+bool
+xfs_btree_lblock_verify_crc(
+        struct xfs_buf          *bp)
+{
+        if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+                return true;
+        return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+}
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * short-form btree header.
+ *
+ * Prior to calculating the CRC, pull the LSN out of the buffer log item and
+ * put it into the buffer so recovery knows what the last modification was
+ * that made it to disk.
+ *
+ * Does nothing unless the superblock has CRCs enabled.
+ */
+void
+xfs_btree_sblock_calc_crc(
+        struct xfs_buf          *bp)
+{
+        struct xfs_btree_block  *hdr = XFS_BUF_TO_BLOCK(bp);
+        struct xfs_buf_log_item *log_item = bp->b_fspriv;
+        if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) {
+                /* a buffer without a log item keeps its current LSN */
+                if (log_item)
+                        hdr->bb_u.s.bb_lsn =
+                                cpu_to_be64(log_item->bli_item.li_lsn);
+                xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+        }
+}
+/*
+ * Verify the checksum of a short-form btree block.  Without CRCs enabled
+ * in the superblock there is nothing to verify and the result is true.
+ */
+bool
+xfs_btree_sblock_verify_crc(
+        struct xfs_buf          *bp)
+{
+        if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+                return true;
+        return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+}
+/*
+ * Delete the btree cursor: release the buffers it holds and return the
+ * cursor to xfs_btree_cur_zone.
+ */
+void
+xfs_btree_del_cursor(
+        xfs_btree_cur_t *cur,           /* btree cursor */
+        int             error)          /* del because of error */
+{
+        int             lev;            /* btree level */
+        /*
+         * Release the buffers level by level.  In the normal case the
+         * first empty slot ends the walk.  After an error every slot in
+         * bc_bufs has to be inspected, because some of the btree code
+         * works from level n down to 0 and an error along the way leaves
+         * the lower entries uninitialized.
+         */
+        for (lev = 0; lev < cur->bc_nlevels; lev++) {
+                struct xfs_buf  *held = cur->bc_bufs[lev];
+                if (!held) {
+                        if (!error)
+                                break;
+                        continue;
+                }
+                xfs_trans_brelse(cur->bc_tp, held);
+        }
+        /*
+         * A bmap cursor must have had the accounting for its allocated
+         * indirect blocks dealt with before it can be freed.
+         */
+        ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
+               cur->bc_private.b.allocated == 0);
+        /* hand the cursor back to its zone */
+        kmem_zone_free(xfs_btree_cur_zone, cur);
+}
+/*
+ * Make a copy of a btree cursor.
+ *
+ * A new cursor is obtained from the btree's dup_cursor method, the
+ * current record is copied across, and every buffer held by the original
+ * is read again on behalf of the copy.  On success *ncur is the new
+ * cursor and 0 is returned.  If a buffer read fails the new cursor is
+ * deleted, *ncur is set to NULL and the error is returned.
+ */
+int                                     /* error */
+xfs_btree_dup_cursor(
+        xfs_btree_cur_t *cur,           /* input cursor */
+        xfs_btree_cur_t **ncur)         /* output cursor */
+{
+        xfs_mount_t     *mp = cur->bc_mp;       /* filesystem mount */
+        xfs_trans_t     *tp = cur->bc_tp;       /* transaction, can be NULL */
+        xfs_btree_cur_t *copy;          /* new cursor value */
+        xfs_buf_t       *bp;            /* btree block's buffer pointer */
+        int             error;          /* error return value */
+        int             level;          /* level number of btree block */
+        /* Allocate a cursor like the old one and copy its record. */
+        copy = cur->bc_ops->dup_cursor(cur);
+        copy->bc_rec = cur->bc_rec;
+        /*
+         * Copy the per-level state.  A level with no buffer in the
+         * original gets a NULL buffer in the copy; every other buffer is
+         * re-read at the same disk address.
+         */
+        for (level = 0; level < copy->bc_nlevels; level++) {
+                copy->bc_ptrs[level] = cur->bc_ptrs[level];
+                copy->bc_ra[level] = cur->bc_ra[level];
+                bp = cur->bc_bufs[level];
+                if (!bp) {
+                        copy->bc_bufs[level] = NULL;
+                        continue;
+                }
+                error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+                                           XFS_BUF_ADDR(bp), mp->m_bsize,
+                                           0, &bp,
+                                           cur->bc_ops->buf_ops);
+                if (error) {
+                        xfs_btree_del_cursor(copy, error);
+                        *ncur = NULL;
+                        return error;
+                }
+                copy->bc_bufs[level] = bp;
+        }
+        *ncur = copy;
+        return 0;
+}
+/*
+ * XFS btree block layout and addressing:
+ *
+ * There are two types of blocks in the btree: leaf and non-leaf blocks.
+ *
+ * A leaf block starts with a header, followed by records containing
+ * the values.  A non-leaf block also starts with the same header, and
+ * then first contains lookup keys followed by an equal number of pointers
+ * to the btree blocks at the previous level.
+ *
+ *              +--------+-------+-------+-------+-------+-------+-------+
+ * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
+ *              +--------+-------+-------+-------+-------+-------+-------+
+ *
+ *              +--------+-------+-------+-------+-------+-------+-------+
+ * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
+ *              +--------+-------+-------+-------+-------+-------+-------+
+ *
+ * The header is called struct xfs_btree_block for reasons better left unknown
+ * and comes in different versions for short (32bit) and long (64bit) block
+ * pointers.  The record and key structures are defined by the btree instances
+ * and opaque to the btree core.  The block pointers are simple disk endian
+ * integers, available in a short (32bit) and long (64bit) variant.
+ *
+ * The helpers below calculate the offset of a given record, key or pointer
+ * into a btree block (xfs_btree_*_offset) or return a pointer to the given
+ * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
+ * inside the btree block is done using indices starting at one, not zero!
+ */
+/*
+ * Return the size of the btree block header for this btree instance.
+ * The size depends on whether the cursor uses long pointers and on
+ * whether its blocks carry CRCs.
+ */
+static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
+{
+        if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) {
+                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+                        return XFS_BTREE_SBLOCK_CRC_LEN;
+                return XFS_BTREE_SBLOCK_LEN;
+        }
+        if (!(cur->bc_flags & XFS_BTREE_CRC_BLOCKS))
+                return XFS_BTREE_LBLOCK_LEN;
+        return XFS_BTREE_LBLOCK_CRC_LEN;
+}
+/*
+ * Return the size of a block pointer for this btree instance: 64 bits
+ * for long-pointer btrees, 32 bits otherwise.
+ */
+static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
+{
+        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+                return sizeof(__be64);
+        return sizeof(__be32);
+}
+/*
+ * Return the byte offset of the n-th record in a btree block.
+ * Records follow the block header and are indexed starting at one.
+ */
+STATIC size_t
+xfs_btree_rec_offset(
+        struct xfs_btree_cur    *cur,
+        int                     n)
+{
+        size_t                  off = xfs_btree_block_len(cur);
+        off += (n - 1) * cur->bc_ops->rec_len;
+        return off;
+}
+/*
+ * Return the byte offset of the n-th key in a btree block.
+ * Keys follow the block header and are indexed starting at one.
+ */
+STATIC size_t
+xfs_btree_key_offset(
+        struct xfs_btree_cur    *cur,
+        int                     n)
+{
+        size_t                  off = xfs_btree_block_len(cur);
+        off += (n - 1) * cur->bc_ops->key_len;
+        return off;
+}
+/*
+ * Return the byte offset of the n-th block pointer in a btree block.
+ * The pointer array sits after the header and after a key area sized
+ * for get_maxrecs() keys at this level; pointers are indexed from one.
+ */
+STATIC size_t
+xfs_btree_ptr_offset(
+        struct xfs_btree_cur    *cur,
+        int                     n,
+        int                     level)
+{
+        size_t                  off = xfs_btree_block_len(cur);
+        /* step over the key area */
+        off += cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len;
+        /* then index into the pointer array */
+        off += (n - 1) * xfs_btree_ptr_len(cur);
+        return off;
+}
+/*
+ * Return the address of the n-th record within the given btree block.
+ */
+STATIC union xfs_btree_rec *
+xfs_btree_rec_addr(
+        struct xfs_btree_cur    *cur,
+        int                     n,
+        struct xfs_btree_block  *block)
+{
+        char                    *base = (char *)block;
+        return (union xfs_btree_rec *)(base + xfs_btree_rec_offset(cur, n));
+}
+/*
+ * Return the address of the n-th key within the given btree block.
+ */
+STATIC union xfs_btree_key *
+xfs_btree_key_addr(
+        struct xfs_btree_cur    *cur,
+        int                     n,
+        struct xfs_btree_block  *block)
+{
+        char                    *base = (char *)block;
+        return (union xfs_btree_key *)(base + xfs_btree_key_offset(cur, n));
+}
+/*
+ * Return the address of the n-th block pointer within the given btree
+ * block.  The block's own level is used to locate the pointer array;
+ * the block is asserted not to be at level 0.
+ */
+STATIC union xfs_btree_ptr *
+xfs_btree_ptr_addr(
+        struct xfs_btree_cur    *cur,
+        int                     n,
+        struct xfs_btree_block  *block)
+{
+        int                     level = xfs_btree_get_level(block);
+        char                    *base = (char *)block;
+        ASSERT(block->bb_level != 0);
+        return (union xfs_btree_ptr *)
+                (base + xfs_btree_ptr_offset(cur, n, level));
+}
+/*
+ * Return the btree root block that lives in the inode.
+ *
+ * For now this btree implementation assumes such a root is always kept
+ * in the if_broot field of the inode fork selected by the cursor.
+ */
+STATIC struct xfs_btree_block *
+xfs_btree_get_iroot(
+        struct xfs_btree_cur    *cur)
+{
+        struct xfs_ifork        *ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+                                        cur->bc_private.b.whichfork);
+        return (struct xfs_btree_block *)ifp->if_broot;
+}
+/*
+ * Return the btree block the cursor refers to at the given level.
+ *
+ * For a root-in-inode btree the top level comes from the inode fork and
+ * *bpp is set to NULL; every other block comes from the cursor's buffer
+ * for that level, which is returned in *bpp.
+ */
+STATIC struct xfs_btree_block *         /* generic btree block pointer */
+xfs_btree_get_block(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        int                     level,  /* level in btree */
+        struct xfs_buf          **bpp)  /* buffer containing the block */
+{
+        if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
+            level != cur->bc_nlevels - 1) {
+                *bpp = cur->bc_bufs[level];
+                return XFS_BUF_TO_BLOCK(*bpp);
+        }
+        *bpp = NULL;
+        return xfs_btree_get_iroot(cur);
+}
+/*
+ * Get a buffer for the given filesystem block without reading its data.
+ * Long-form addressing: the block is named by a filesystem block number.
+ */
+xfs_buf_t *                             /* buffer for fsbno */
+xfs_btree_get_bufl(
+        xfs_mount_t     *mp,            /* file system mount point */
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_fsblock_t   fsbno,          /* file system block number */
+        uint            lock)           /* lock flags for get_buf */
+{
+        ASSERT(fsbno != NULLFSBLOCK);
+        /* convert to a real disk block address and get the buffer */
+        return xfs_trans_get_buf(tp, mp->m_ddev_targp,
+                                 XFS_FSB_TO_DADDR(mp, fsbno),
+                                 mp->m_bsize, lock);
+}
+/*
+ * Get a buffer for the given block without reading its data.
+ * Short-form addressing: the block is named by an allocation group
+ * number and a block number within that group.
+ */
+xfs_buf_t *                             /* buffer for agno/agbno */
+xfs_btree_get_bufs(
+        xfs_mount_t     *mp,            /* file system mount point */
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_agnumber_t  agno,           /* allocation group number */
+        xfs_agblock_t   agbno,          /* allocation group block number */
+        uint            lock)           /* lock flags for get_buf */
+{
+        ASSERT(agno != NULLAGNUMBER);
+        ASSERT(agbno != NULLAGBLOCK);
+        /* convert to a real disk block address and get the buffer */
+        return xfs_trans_get_buf(tp, mp->m_ddev_targp,
+                                 XFS_AGB_TO_DADDR(mp, agno, agbno),
+                                 mp->m_bsize, lock);
+}
+/*
+ * Report whether the cursor's block at the given level is the last one
+ * at that level, i.e. whether its right sibling pointer is null.
+ */
+int                                     /* 1=is last block, 0=not last block */
+xfs_btree_islastblock(
+        xfs_btree_cur_t         *cur,   /* btree cursor */
+        int                     level)  /* level to check */
+{
+        xfs_buf_t               *bp;    /* buffer containing block */
+        struct xfs_btree_block  *block; /* generic btree block pointer */
+        block = xfs_btree_get_block(cur, level, &bp);
+        xfs_btree_check_block(cur, block, level, bp);
+        if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
+                return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
+        return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO);
+}
+/*
+ * Point the cursor at the first record of its current block at the
+ * given level.  Other levels are left alone.  Fails when the block holds
+ * no records.
+ */
+STATIC int                              /* success=1, failure=0 */
+xfs_btree_firstrec(
+        xfs_btree_cur_t         *cur,   /* btree cursor */
+        int                     level)  /* level to change */
+{
+        xfs_buf_t               *bp;    /* buffer containing block */
+        struct xfs_btree_block  *block; /* generic btree block pointer */
+        /* Look up and check the block for this level. */
+        block = xfs_btree_get_block(cur, level, &bp);
+        xfs_btree_check_block(cur, block, level, bp);
+        if (block->bb_numrecs) {
+                /* Index 1 is the first record/key. */
+                cur->bc_ptrs[level] = 1;
+                return 1;
+        }
+        /* Empty block: there is no such record. */
+        return 0;
+}
+/*
+ * Point the cursor at the last record of its current block at the given
+ * level.  Other levels are left alone.  Fails when the block holds no
+ * records.
+ */
+STATIC int                              /* success=1, failure=0 */
+xfs_btree_lastrec(
+        xfs_btree_cur_t         *cur,   /* btree cursor */
+        int                     level)  /* level to change */
+{
+        xfs_buf_t               *bp;    /* buffer containing block */
+        struct xfs_btree_block  *block; /* generic btree block pointer */
+        /* Look up and check the block for this level. */
+        block = xfs_btree_get_block(cur, level, &bp);
+        xfs_btree_check_block(cur, block, level, bp);
+        if (block->bb_numrecs) {
+                /* Index numrecs is the last record/key. */
+                cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
+                return 1;
+        }
+        /* Empty block: there is no such record. */
+        return 0;
+}
+/*
+ * Work out the first and last byte offsets covered by a set of fields.
+ *
+ * Bit i of the fields mask selects the field whose starting offset is
+ * offsets[i]; offsets[i + 1] - 1 is taken as that field's last byte.
+ * The lowest set bit gives *first and the highest set bit below nbits
+ * gives *last.
+ */
+void
+xfs_btree_offsets(
+        __int64_t       fields,         /* bitmask of fields */
+        const short     *offsets,       /* table of field offsets */
+        int             nbits,          /* number of bits to inspect */
+        int             *first,         /* output: first byte offset */
+        int             *last)          /* output: last byte offset */
+{
+        __int64_t       mask;           /* mask for current bit number */
+        int             bit;            /* current bit number */
+        ASSERT(fields != 0);
+        /* Scan upwards for the lowest set bit: the first byte offset. */
+        bit = 0;
+        mask = 1LL;
+        while (!(mask & fields)) {
+                bit++;
+                mask <<= 1;
+        }
+        *first = offsets[bit];
+        /* Scan downwards for the highest set bit: the last byte offset. */
+        bit = nbits - 1;
+        mask = 1LL << bit;
+        while (!(mask & fields)) {
+                bit--;
+                mask >>= 1;
+        }
+        *last = offsets[bit + 1] - 1;
+}
+/*
+ * Read in the buffer for the given filesystem block.
+ * Long-form addressing.
+ *
+ * On success the buffer (which may be NULL) is returned in *bpp, after
+ * its reference count value has been set to refval when it is not NULL.
+ * On failure the error is returned and *bpp is not written.
+ */
+int
+xfs_btree_read_bufl(
+        struct xfs_mount        *mp,            /* file system mount point */
+        struct xfs_trans        *tp,            /* transaction pointer */
+        xfs_fsblock_t           fsbno,          /* file system block number */
+        uint                    lock,           /* lock flags for read_buf */
+        struct xfs_buf          **bpp,          /* buffer for fsbno */
+        int                     refval,         /* ref count value for buffer */
+        const struct xfs_buf_ops *ops)
+{
+        struct xfs_buf          *bp;            /* return value */
+        xfs_daddr_t             daddr;          /* real disk block address */
+        int                     error;
+        ASSERT(fsbno != NULLFSBLOCK);
+        daddr = XFS_FSB_TO_DADDR(mp, fsbno);
+        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, daddr,
+                                   mp->m_bsize, lock, &bp, ops);
+        if (!error) {
+                if (bp)
+                        xfs_buf_set_ref(bp, refval);
+                *bpp = bp;
+        }
+        return error;
+}
+/*
+ * Start read-ahead of count filesystem blocks beginning at fsbno.
+ * Does not wait for the I/O and does not return a buffer.
+ * Long-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufl(
+        struct xfs_mount        *mp,            /* file system mount point */
+        xfs_fsblock_t           fsbno,          /* file system block number */
+        xfs_extlen_t            count,          /* count of filesystem blocks */
+        const struct xfs_buf_ops *ops)
+{
+        ASSERT(fsbno != NULLFSBLOCK);
+        xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsbno),
+                          mp->m_bsize * count, ops);
+}
+/*
+ * Start read-ahead of count filesystem blocks beginning at agno/agbno.
+ * Does not wait for the I/O and does not return a buffer.
+ * Short-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufs(
+        struct xfs_mount        *mp,            /* file system mount point */
+        xfs_agnumber_t          agno,           /* allocation group number */
+        xfs_agblock_t           agbno,          /* allocation group block number */
+        xfs_extlen_t            count,          /* count of filesystem blocks */
+        const struct xfs_buf_ops *ops)
+{
+        ASSERT(agno != NULLAGNUMBER);
+        ASSERT(agbno != NULLAGBLOCK);
+        xfs_buf_readahead(mp->m_ddev_targp,
+                          XFS_AGB_TO_DADDR(mp, agno, agbno),
+                          mp->m_bsize * count, ops);
+}
+/*
+ * Issue read-ahead for the left and/or right sibling of a long-form
+ * btree block, as selected by the XFS_BTCUR_{LEFT,RIGHT}RA bits in lr.
+ * Null siblings are skipped.  Returns the number of read-aheads issued.
+ */
+STATIC int
+xfs_btree_readahead_lblock(
+        struct xfs_btree_cur    *cur,
+        int                     lr,
+        struct xfs_btree_block  *block)
+{
+        xfs_dfsbno_t            sib;
+        int                     issued = 0;
+        if (lr & XFS_BTCUR_LEFTRA) {
+                sib = be64_to_cpu(block->bb_u.l.bb_leftsib);
+                if (sib != NULLDFSBNO) {
+                        xfs_btree_reada_bufl(cur->bc_mp, sib, 1,
+                                             cur->bc_ops->buf_ops);
+                        issued++;
+                }
+        }
+        if (lr & XFS_BTCUR_RIGHTRA) {
+                sib = be64_to_cpu(block->bb_u.l.bb_rightsib);
+                if (sib != NULLDFSBNO) {
+                        xfs_btree_reada_bufl(cur->bc_mp, sib, 1,
+                                             cur->bc_ops->buf_ops);
+                        issued++;
+                }
+        }
+        return issued;
+}
+/*
+ * Issue read-ahead for the left and/or right sibling of a short-form
+ * btree block, as selected by the XFS_BTCUR_{LEFT,RIGHT}RA bits in lr.
+ * Siblings are in the cursor's allocation group; null siblings are
+ * skipped.  Returns the number of read-aheads issued.
+ */
+STATIC int
+xfs_btree_readahead_sblock(
+        struct xfs_btree_cur    *cur,
+        int                     lr,
+        struct xfs_btree_block *block)
+{
+        xfs_agblock_t           sib;
+        int                     issued = 0;
+        if (lr & XFS_BTCUR_LEFTRA) {
+                sib = be32_to_cpu(block->bb_u.s.bb_leftsib);
+                if (sib != NULLAGBLOCK) {
+                        xfs_btree_reada_bufs(cur->bc_mp,
+                                             cur->bc_private.a.agno,
+                                             sib, 1, cur->bc_ops->buf_ops);
+                        issued++;
+                }
+        }
+        if (lr & XFS_BTCUR_RIGHTRA) {
+                sib = be32_to_cpu(block->bb_u.s.bb_rightsib);
+                if (sib != NULLAGBLOCK) {
+                        xfs_btree_reada_bufs(cur->bc_mp,
+                                             cur->bc_private.a.agno,
+                                             sib, 1, cur->bc_ops->buf_ops);
+                        issued++;
+                }
+        }
+        return issued;
+}
+/*
+ * Read ahead the sibling btree blocks at the given level.
+ * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ *
+ * The requested bits are recorded in bc_ra for the level, and a request
+ * whose bits are all recorded already does nothing.  Returns the number
+ * of read-aheads issued.
+ */
+STATIC int
+xfs_btree_readahead(
+        struct xfs_btree_cur    *cur,           /* btree cursor */
+        int                     lev,            /* level in btree */
+        int                     lr)             /* left/right bits */
+{
+        struct xfs_btree_block  *block;
+        /*
+         * A root block stored in the inode needs no readahead.
+         */
+        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+            (lev == cur->bc_nlevels - 1))
+                return 0;
+        /* Every requested direction has been recorded already. */
+        if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+                return 0;
+        cur->bc_ra[lev] |= lr;
+        block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+        if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
+                return xfs_btree_readahead_sblock(cur, lr, block);
+        return xfs_btree_readahead_lblock(cur, lr, block);
+}
+/*
+ * Convert a btree block pointer into a disk address.
+ *
+ * Long pointers hold a filesystem block number.  Short pointers hold a
+ * block number within the allocation group recorded in the cursor.
+ * The pointer is asserted not to be null.
+ */
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *ptr)
+{
+        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+                ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
+                return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+        }
+        ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+        ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
+        return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+                                be32_to_cpu(ptr->s));
+}
+/*
+ * Start read-ahead of @count btree blocks at the location named by @ptr.
+ *
+ * Long and short form btrees need no separate handling here because the
+ * pointer can be converted directly into a disk address.
+ */
+STATIC void
+xfs_btree_readahead_ptr(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *ptr,
+        xfs_extlen_t            count)
+{
+        struct xfs_mount        *mp = cur->bc_mp;
+        xfs_daddr_t             daddr = xfs_btree_ptr_to_daddr(cur, ptr);
+        xfs_buf_readahead(mp->m_ddev_targp, daddr, mp->m_bsize * count,
+                          cur->bc_ops->buf_ops);
+}
+/*
+ * Install bp as the cursor's buffer for level "lev", releasing any
+ * buffer that was there before.
+ *
+ * The level's readahead state is reset, and the readahead bit is set
+ * for any direction in which the new block has no sibling.
+ */
+STATIC void
+xfs_btree_setbuf(
+        xfs_btree_cur_t         *cur,   /* btree cursor */
+        int                     lev,    /* level in btree */
+        xfs_buf_t               *bp)    /* new buffer to set */
+{
+        struct xfs_btree_block  *b;     /* btree block */
+        int                     no_left;        /* left sibling is null */
+        int                     no_right;       /* right sibling is null */
+        if (cur->bc_bufs[lev])
+                xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
+        cur->bc_bufs[lev] = bp;
+        cur->bc_ra[lev] = 0;
+        b = XFS_BUF_TO_BLOCK(bp);
+        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+                no_left = b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO);
+                no_right = b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO);
+        } else {
+                no_left = b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK);
+                no_right = b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
+        }
+        if (no_left)
+                cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+        if (no_right)
+                cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+}
+/*
+ * Return non-zero if the block pointer holds the null value for the
+ * cursor's pointer size.
+ */
+STATIC int
+xfs_btree_ptr_is_null(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *ptr)
+{
+        if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
+                return ptr->s == cpu_to_be32(NULLAGBLOCK);
+        return ptr->l == cpu_to_be64(NULLDFSBNO);
+}
+/*
+ * Store the null value for the cursor's pointer size in the block
+ * pointer.
+ */
+STATIC void
+xfs_btree_set_ptr_null(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *ptr)
+{
+        if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) {
+                ptr->s = cpu_to_be32(NULLAGBLOCK);
+                return;
+        }
+        ptr->l = cpu_to_be64(NULLDFSBNO);
+}
+/*
+ * Get/set/init sibling pointers
+ */
+/*
+ * Copy the left or right sibling pointer of a block into *ptr.
+ * lr must be XFS_BB_LEFTSIB or XFS_BB_RIGHTSIB.
+ */
+STATIC void
+xfs_btree_get_sibling(
+        struct xfs_btree_cur    *cur,
+        struct xfs_btree_block  *block,
+        union xfs_btree_ptr     *ptr,
+        int                     lr)
+{
+        int                     long_ptrs = cur->bc_flags & XFS_BTREE_LONG_PTRS;
+        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+        if (lr == XFS_BB_RIGHTSIB) {
+                if (long_ptrs)
+                        ptr->l = block->bb_u.l.bb_rightsib;
+                else
+                        ptr->s = block->bb_u.s.bb_rightsib;
+                return;
+        }
+        if (long_ptrs)
+                ptr->l = block->bb_u.l.bb_leftsib;
+        else
+                ptr->s = block->bb_u.s.bb_leftsib;
+}
+/*
+ * Store *ptr as the left or right sibling pointer of a block.
+ * lr must be XFS_BB_LEFTSIB or XFS_BB_RIGHTSIB.
+ */
+STATIC void
+xfs_btree_set_sibling(
+        struct xfs_btree_cur    *cur,
+        struct xfs_btree_block  *block,
+        union xfs_btree_ptr     *ptr,
+        int                     lr)
+{
+        int                     long_ptrs = cur->bc_flags & XFS_BTREE_LONG_PTRS;
+        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+        if (lr == XFS_BB_RIGHTSIB) {
+                if (long_ptrs)
+                        block->bb_u.l.bb_rightsib = ptr->l;
+                else
+                        block->bb_u.s.bb_rightsib = ptr->s;
+                return;
+        }
+        if (long_ptrs)
+                block->bb_u.l.bb_leftsib = ptr->l;
+        else
+                block->bb_u.s.bb_leftsib = ptr->s;
+}
+/*
+ * Initialise the header of a btree block.
+ *
+ * The magic number, level and record count are always set, and both
+ * sibling pointers start out null.  XFS_BTREE_LONG_PTRS in flags selects
+ * the long-form header layout.  When XFS_BTREE_CRC_BLOCKS is set the
+ * block number, owner, filesystem UUID and a zero LSN are filled in too.
+ */
+void
+xfs_btree_init_block_int(
+        struct xfs_mount        *mp,
+        struct xfs_btree_block  *buf,
+        xfs_daddr_t             blkno,
+        __u32                   magic,
+        __u16                   level,
+        __u16                   numrecs,
+        __u64                   owner,
+        unsigned int            flags)
+{
+        buf->bb_magic = cpu_to_be32(magic);
+        buf->bb_level = cpu_to_be16(level);
+        buf->bb_numrecs = cpu_to_be16(numrecs);
+        if (!(flags & XFS_BTREE_LONG_PTRS)) {
+                buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+                buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+                if (!(flags & XFS_BTREE_CRC_BLOCKS))
+                        return;
+                buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
+                /* owner is a 32 bit value on short blocks */
+                buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner);
+                uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+                buf->bb_u.s.bb_lsn = 0;
+                return;
+        }
+        buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+        buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+        if (!(flags & XFS_BTREE_CRC_BLOCKS))
+                return;
+        buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
+        buf->bb_u.l.bb_owner = cpu_to_be64(owner);
+        uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
+        buf->bb_u.l.bb_pad = 0;
+        buf->bb_u.l.bb_lsn = 0;
+}
+/*
+ * Initialise the btree block header held in a buffer.  The block's own
+ * disk address is taken from the buffer (b_bn).
+ */
+void
+xfs_btree_init_block(
+        struct xfs_mount *mp,
+        struct xfs_buf  *bp,
+        __u32           magic,
+        __u16           level,
+        __u16           numrecs,
+        __u64           owner,
+        unsigned int    flags)
+{
+        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+        xfs_btree_init_block_int(mp, block, bp->b_bn, magic, level,
+                                 numrecs, owner, flags);
+}
+/*
+ * Initialise the btree block header held in a buffer, taking the magic
+ * number, owner and flags from the cursor.
+ */
+STATIC void
+xfs_btree_init_block_cur(
+        struct xfs_btree_cur    *cur,
+        struct xfs_buf          *bp,
+        int                     level,
+        int                     numrecs)
+{
+        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+        __u64                   owner;
+        /*
+         * The owner can be pulled from the cursor for now because the
+         * different owners line up with the pointer size of the btree:
+         * the inode number for long pointers, the allocation group
+         * number for short ones.  This may change in future, but is
+         * safe for the current users of the generic btree code.
+         */
+        if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
+                owner = cur->bc_private.a.agno;
+        else
+                owner = cur->bc_private.b.ip->i_ino;
+        xfs_btree_init_block_int(cur->bc_mp, block, bp->b_bn,
+                                 xfs_btree_magic(cur), level, numrecs,
+                                 owner, cur->bc_flags);
+}
+/*
+ * Return 1 if updates to the last record in this block need tracking:
+ * the block is at level 0, the cursor has XFS_BTREE_LASTREC_UPDATE set,
+ * and the block has no right sibling.  Otherwise return 0.  The decision
+ * will be further refined in the update_lastrec method.
+ */
+STATIC int
+xfs_btree_is_lastrec(
+        struct xfs_btree_cur    *cur,
+        struct xfs_btree_block  *block,
+        int                     level)
+{
+        union xfs_btree_ptr     rightsib;
+        if (level > 0 || !(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+                return 0;
+        xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB);
+        return xfs_btree_ptr_is_null(cur, &rightsib) ? 1 : 0;
+}
+/*
+ * Fill in *ptr with a btree block pointer that refers to the block held
+ * in bp, derived from the buffer's disk address: a filesystem block
+ * number for long pointers, an allocation group block number otherwise.
+ */
+STATIC void
+xfs_btree_buf_to_ptr(
+        struct xfs_btree_cur    *cur,
+        struct xfs_buf          *bp,
+        union xfs_btree_ptr     *ptr)
+{
+        if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) {
+                ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
+                                        XFS_BUF_ADDR(bp)));
+                return;
+        }
+        ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+                                XFS_BUF_ADDR(bp)));
+}
+/*
+ * Set the reference value of a buffer according to the type of btree
+ * the cursor belongs to.  An unknown btree type trips an assert and
+ * leaves the buffer untouched.
+ */
+STATIC void
+xfs_btree_set_refs(
+        struct xfs_btree_cur    *cur,
+        struct xfs_buf          *bp)
+{
+        int                     ref;
+        switch (cur->bc_btnum) {
+        case XFS_BTNUM_BNO:
+        case XFS_BTNUM_CNT:
+                ref = XFS_ALLOC_BTREE_REF;
+                break;
+        case XFS_BTNUM_INO:
+        case XFS_BTNUM_FINO:
+                ref = XFS_INO_BTREE_REF;
+                break;
+        case XFS_BTNUM_BMAP:
+                ref = XFS_BMAP_BTREE_REF;
+                break;
+        default:
+                ASSERT(0);
+                return;
+        }
+        xfs_buf_set_ref(bp, ref);
+}
+/*
+ * Get a buffer for the block at the given ptr, without reading it, and
+ * return both the buffer and the btree block inside it.  The buffer's
+ * ops are set from the cursor.  Returns -ENOMEM, with *bpp NULL, if no
+ * buffer could be obtained.
+ */
+STATIC int
+xfs_btree_get_buf_block(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *ptr,
+        int                     flags,
+        struct xfs_btree_block  **block,
+        struct xfs_buf          **bpp)
+{
+        struct xfs_mount        *mp = cur->bc_mp;
+        struct xfs_buf          *bp;
+        xfs_daddr_t             daddr;
+        /* need to sort out how callers deal with failures first */
+        ASSERT(!(flags & XBF_TRYLOCK));
+        daddr = xfs_btree_ptr_to_daddr(cur, ptr);
+        bp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, daddr,
+                               mp->m_bsize, flags);
+        *bpp = bp;
+        if (!bp)
+                return -ENOMEM;
+        bp->b_ops = cur->bc_ops->buf_ops;
+        *block = XFS_BUF_TO_BLOCK(bp);
+        return 0;
+}
+/*
+ * Read in the buffer for the block at the given ptr.  On success the
+ * buffer's reference value is set for this btree type, and the buffer
+ * and the btree block inside it are returned.  On failure the error
+ * from the read is returned and *block is not written.
+ */
+STATIC int
+xfs_btree_read_buf_block(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *ptr,
+        int                     flags,
+        struct xfs_btree_block  **block,
+        struct xfs_buf          **bpp)
+{
+        struct xfs_mount        *mp = cur->bc_mp;
+        xfs_daddr_t             daddr;
+        int                     error;
+        /* need to sort out how callers deal with failures first */
+        ASSERT(!(flags & XBF_TRYLOCK));
+        daddr = xfs_btree_ptr_to_daddr(cur, ptr);
+        error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, daddr,
+                                   mp->m_bsize, flags, bpp,
+                                   cur->bc_ops->buf_ops);
+        if (!error) {
+                xfs_btree_set_refs(cur, *bpp);
+                *block = XFS_BUF_TO_BLOCK(*bpp);
+        }
+        return error;
+}
+/*
+ * Copy numkeys keys from src_key to dst_key.  The two areas are copied
+ * with memcpy, so they must not overlap.
+ */
+STATIC void
+xfs_btree_copy_keys(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_key     *dst_key,
+        union xfs_btree_key     *src_key,
+        int                     numkeys)
+{
+        size_t                  nbytes;
+        ASSERT(numkeys >= 0);
+        nbytes = numkeys * cur->bc_ops->key_len;
+        memcpy(dst_key, src_key, nbytes);
+}
+/*
+ * Copy numrecs records from src_rec to dst_rec.  The two areas are
+ * copied with memcpy, so they must not overlap.
+ */
+STATIC void
+xfs_btree_copy_recs(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_rec     *dst_rec,
+        union xfs_btree_rec     *src_rec,
+        int                     numrecs)
+{
+        size_t                  nbytes;
+        ASSERT(numrecs >= 0);
+        nbytes = numrecs * cur->bc_ops->rec_len;
+        memcpy(dst_rec, src_rec, nbytes);
+}
+/*
+ * Copy numptrs block pointers from src_ptr to dst_ptr.  The two areas
+ * are copied with memcpy, so they must not overlap.
+ */
+STATIC void
+xfs_btree_copy_ptrs(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *dst_ptr,
+        union xfs_btree_ptr     *src_ptr,
+        int                     numptrs)
+{
+        size_t                  nbytes;
+        ASSERT(numptrs >= 0);
+        nbytes = numptrs * xfs_btree_ptr_len(cur);
+        memcpy(dst_ptr, src_ptr, nbytes);
+}
+/*
+ * Move numkeys keys, starting at key, by one key position within a
+ * single btree block.  dir is 1 to move towards higher indices and -1
+ * to move towards lower ones.
+ */
+STATIC void
+xfs_btree_shift_keys(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_key     *key,
+        int                     dir,
+        int                     numkeys)
+{
+        char                    *dst_key = (char *)key +
+                                        (dir * cur->bc_ops->key_len);
+        ASSERT(numkeys >= 0);
+        ASSERT(dir == 1 || dir == -1);
+        /* source and destination overlap, so memmove is required */
+        memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
+}
+/*
+ * Move numrecs records, starting at rec, by one record position within
+ * a single btree block.  dir is 1 to move towards higher indices and -1
+ * to move towards lower ones.
+ */
+STATIC void
+xfs_btree_shift_recs(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_rec     *rec,
+        int                     dir,
+        int                     numrecs)
+{
+        char                    *dst_rec = (char *)rec +
+                                        (dir * cur->bc_ops->rec_len);
+        ASSERT(numrecs >= 0);
+        ASSERT(dir == 1 || dir == -1);
+        /* source and destination overlap, so memmove is required */
+        memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
+}
+/*
+ * Move numptrs block pointers, starting at ptr, by exactly one pointer slot
+ * within the same btree block: dir == 1 moves them one slot up, dir == -1
+ * one slot down.  The ranges overlap, hence memmove().
+ */
+STATIC void
+xfs_btree_shift_ptrs(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *ptr,
+        int                     dir,
+        int                     numptrs)
+{
+        char                    *src = (char *)ptr;
+        char                    *dst;
+        ASSERT(numptrs >= 0);
+        ASSERT(dir == 1 || dir == -1);
+        /* The destination sits one pointer length before or after the source. */
+        dst = src + (dir * xfs_btree_ptr_len(cur));
+        memmove(dst, src, numptrs * xfs_btree_ptr_len(cur));
+}
+/*
+ * Log the keys numbered first..last (inclusive) of a btree block in the
+ * cursor's transaction.  With a buffer, the byte range covering those keys
+ * is logged; without one, the inode held in the cursor is logged instead.
+ */
+STATIC void
+xfs_btree_log_keys(
+        struct xfs_btree_cur    *cur,
+        struct xfs_buf          *bp,
+        int                     first,
+        int                     last)
+{
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+        if (!bp) {
+                /* No buffer: log the inode using the flag for its fork. */
+                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+                                xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+        } else {
+                /*
+                 * The range ends one byte before the start of the key
+                 * that follows "last".
+                 */
+                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+                xfs_trans_log_buf(cur->bc_tp, bp,
+                                  xfs_btree_key_offset(cur, first),
+                                  xfs_btree_key_offset(cur, last + 1) - 1);
+        }
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+/*
+ * Log the records numbered first..last (inclusive) of the btree block held
+ * in bp in the cursor's transaction.  Unlike the key and pointer variants,
+ * this function always logs through the buffer.
+ */
+void
+xfs_btree_log_recs(
+        struct xfs_btree_cur    *cur,
+        struct xfs_buf          *bp,
+        int                     first,
+        int                     last)
+{
+        size_t                  first_byte;     /* first byte offset logged */
+        size_t                  last_byte;      /* last byte offset logged */
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+        xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+        /* The range ends one byte before the record that follows "last". */
+        first_byte = xfs_btree_rec_offset(cur, first);
+        last_byte = xfs_btree_rec_offset(cur, last + 1) - 1;
+        xfs_trans_log_buf(cur->bc_tp, bp, first_byte, last_byte);
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+/*
+ * Log the block pointers numbered first..last (inclusive) of a nonleaf
+ * btree block in the cursor's transaction.  With a buffer, the byte range
+ * covering those pointers is logged, using the level stored in the block
+ * to locate them; without one, the inode held in the cursor is logged.
+ */
+STATIC void
+xfs_btree_log_ptrs(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        struct xfs_buf          *bp,    /* buffer containing btree block */
+        int                     first,  /* index of first pointer to log */
+        int                     last)   /* index of last pointer to log */
+{
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+        if (!bp) {
+                /* No buffer: log the inode using the flag for its fork. */
+                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+                        xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+        } else {
+                struct xfs_btree_block  *blk = XFS_BUF_TO_BLOCK(bp);
+                int                     blk_level = xfs_btree_get_level(blk);
+                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+                xfs_trans_log_buf(cur->bc_tp, bp,
+                                xfs_btree_ptr_offset(cur, first, blk_level),
+                                xfs_btree_ptr_offset(cur, last + 1,
+                                                     blk_level) - 1);
+        }
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+/*
+ * Log header fields of a btree block in the cursor's transaction.
+ *
+ * "fields" is a mask of XFS_BB_... bits.  With a buffer, the mask is turned
+ * into a first/last byte range by xfs_btree_offsets() using the offset
+ * table that matches the cursor's pointer size, and that range is logged.
+ * Without a buffer, the inode held in the cursor is logged instead.
+ */
+void
+xfs_btree_log_block(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        struct xfs_buf          *bp,    /* buffer containing btree block */
+        int                     fields) /* mask of fields: XFS_BB_... */
+{
+        /* Header field offsets for short-pointer blocks; last entry is the length. */
+        static const short      soffsets[] = {
+                offsetof(struct xfs_btree_block, bb_magic),
+                offsetof(struct xfs_btree_block, bb_level),
+                offsetof(struct xfs_btree_block, bb_numrecs),
+                offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
+                offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
+                offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
+                offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
+                offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
+                offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
+                offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
+                XFS_BTREE_SBLOCK_CRC_LEN
+        };
+        /* Header field offsets for long-pointer blocks; last entry is the length. */
+        static const short      loffsets[] = {
+                offsetof(struct xfs_btree_block, bb_magic),
+                offsetof(struct xfs_btree_block, bb_level),
+                offsetof(struct xfs_btree_block, bb_numrecs),
+                offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
+                offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
+                offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
+                offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
+                offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
+                offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
+                offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
+                offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
+                XFS_BTREE_LBLOCK_CRC_LEN
+        };
+        int                     first;  /* first byte offset logged */
+        int                     last;   /* last byte offset logged */
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
+        if (!bp) {
+                /* No buffer: log the inode using the flag for its fork. */
+                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+                        xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+        } else {
+                const short     *offsets;
+                int             nbits = XFS_BB_NUM_BITS;
+                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+                        /*
+                         * The CRC itself is never logged when a btree block
+                         * is updated; it is recreated during log recovery.
+                         * The log buffers carry their own checksums, so
+                         * this is safe and it saves logging a crc update in
+                         * many places.
+                         */
+                        nbits = XFS_BB_NUM_BITS_CRC;
+                        if (fields == XFS_BB_ALL_BITS)
+                                fields = XFS_BB_ALL_BITS_CRC;
+                }
+                if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+                        offsets = loffsets;
+                else
+                        offsets = soffsets;
+                xfs_btree_offsets(fields, offsets, nbits, &first, &last);
+                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+                xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+        }
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ *
+ * Returns 0 with *stat set to 1 when the cursor now points at the next
+ * entry.  Returns 0 with *stat set to 0 when there is nothing to the right:
+ * either the block has a null right sibling, or the upward walk ran off the
+ * top of a tree whose root is in an inode.  In the *stat == 0 cases
+ * cur->bc_ptrs[level] has already been advanced past the last entry.
+ * Returns a nonzero error (without writing *stat) if a block check or a
+ * block read fails, or -EFSCORRUPTED if the walk ran off the root of a tree
+ * whose root is not in an inode.
+ */
+int                                             /* error */
+xfs_btree_increment(
+        struct xfs_btree_cur    *cur,
+        int                     level,
+        int                     *stat)          /* success/failure */
+{
+        struct xfs_btree_block  *block;
+        union xfs_btree_ptr     ptr;            /* right sibling of the starting block */
+        struct xfs_buf          *bp;
+        int                     error;          /* error return value */
+        int                     lev;            /* level currently being examined */
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_TRACE_ARGI(cur, level);
+        ASSERT(level < cur->bc_nlevels);
+        /* Read-ahead to the right at this level. */
+        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+        /* Get a pointer to the btree block. */
+        block = xfs_btree_get_block(cur, level, &bp);
+#ifdef DEBUG
+        error = xfs_btree_check_block(cur, block, level, bp);
+        if (error)
+                goto error0;
+#endif
+        /* We're done if we remain in the block after the increment. */
+        if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
+                goto out1;
+        /* Fail if we just went off the right edge of the tree. */
+        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+        if (xfs_btree_ptr_is_null(cur, &ptr))
+                goto out0;
+        XFS_BTREE_STATS_INC(cur, increment);
+        /*
+         * March up the tree incrementing pointers.
+         * Stop when we don't go off the right edge of a block.
+         */
+        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+                block = xfs_btree_get_block(cur, lev, &bp);
+#ifdef DEBUG
+                error = xfs_btree_check_block(cur, block, lev, bp);
+                if (error)
+                        goto error0;
+#endif
+                if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
+                        break;
+                /* Read-ahead the right block for the next loop. */
+                xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+        }
+        /*
+         * If we went off the root then we are either seriously
+         * confused or have the tree root in an inode.
+         */
+        if (lev == cur->bc_nlevels) {
+                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+                        goto out0;
+                ASSERT(0);
+                error = -EFSCORRUPTED;
+                goto error0;
+        }
+        ASSERT(lev < cur->bc_nlevels);
+        /*
+         * Now walk back down the tree, fixing up the cursor's buffer
+         * pointers and key numbers.  At each step the child block named by
+         * the pointer at the current index is read and installed in the
+         * cursor one level down, with its index reset to the first entry.
+         */
+        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+                union xfs_btree_ptr     *ptrp;
+                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+                --lev;
+                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+                if (error)
+                        goto error0;
+                xfs_btree_setbuf(cur, lev, bp);
+                cur->bc_ptrs[lev] = 1;  /* leftmost entry of the new block */
+        }
+out1:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 1;
+        return 0;
+out0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 0;
+        return 0;
+error0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        return error;
+}
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ *
+ * Returns 0 with *stat set to 1 when the cursor now points at the previous
+ * entry.  Returns 0 with *stat set to 0 when there is nothing to the left:
+ * either the block has a null left sibling, or the upward walk ran off the
+ * top of a tree whose root is in an inode.  In the *stat == 0 cases
+ * cur->bc_ptrs[level] has already been decremented to zero.
+ * Returns a nonzero error (without writing *stat) if a block check or a
+ * block read fails, or -EFSCORRUPTED if the walk ran off the root of a tree
+ * whose root is not in an inode.
+ */
+int                                             /* error */
+xfs_btree_decrement(
+        struct xfs_btree_cur    *cur,
+        int                     level,
+        int                     *stat)          /* success/failure */
+{
+        struct xfs_btree_block  *block;
+        xfs_buf_t               *bp;
+        int                     error;          /* error return value */
+        int                     lev;            /* level currently being examined */
+        union xfs_btree_ptr     ptr;            /* left sibling of the starting block */
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_TRACE_ARGI(cur, level);
+        ASSERT(level < cur->bc_nlevels);
+        /* Read-ahead to the left at this level. */
+        xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+        /* We're done if we remain in the block after the decrement. */
+        if (--cur->bc_ptrs[level] > 0)
+                goto out1;
+        /* Get a pointer to the btree block. */
+        block = xfs_btree_get_block(cur, level, &bp);
+#ifdef DEBUG
+        error = xfs_btree_check_block(cur, block, level, bp);
+        if (error)
+                goto error0;
+#endif
+        /* Fail if we just went off the left edge of the tree. */
+        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+        if (xfs_btree_ptr_is_null(cur, &ptr))
+                goto out0;
+        XFS_BTREE_STATS_INC(cur, decrement);
+        /*
+         * March up the tree decrementing pointers.
+         * Stop when we don't go off the left edge of a block.
+         */
+        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+                if (--cur->bc_ptrs[lev] > 0)
+                        break;
+                /* Read-ahead the left block for the next loop. */
+                xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+        }
+        /*
+         * If we went off the root then we are either seriously
+         * confused or the root of the tree is in an inode.
+         */
+        if (lev == cur->bc_nlevels) {
+                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+                        goto out0;
+                ASSERT(0);
+                error = -EFSCORRUPTED;
+                goto error0;
+        }
+        ASSERT(lev < cur->bc_nlevels);
+        /*
+         * Now walk back down the tree, fixing up the cursor's buffer
+         * pointers and key numbers.  At each step the child block named by
+         * the pointer at the current index is read and installed in the
+         * cursor one level down, with its index set to the last entry.
+         */
+        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+                union xfs_btree_ptr     *ptrp;
+                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+                --lev;
+                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+                if (error)
+                        goto error0;
+                xfs_btree_setbuf(cur, lev, bp);
+                cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);       /* rightmost entry */
+        }
+out1:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 1;
+        return 0;
+out0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 0;
+        return 0;
+error0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        return error;
+}
+/*
+ * Find the btree block that a lookup has to search at the given level and
+ * return it in *blkp.  Returns 0 on success or the error from reading the
+ * block.
+ */
+STATIC int
+xfs_btree_lookup_get_block(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        int                     level,  /* level in the btree */
+        union xfs_btree_ptr     *pp,    /* ptr to btree block */
+        struct xfs_btree_block  **blkp) /* return btree block */
+{
+        struct xfs_buf          *bp;    /* buffer pointer for btree block */
+        int                     error;
+        int                     at_iroot;
+        /* The root block of an inode-rooted btree is not read via a buffer. */
+        at_iroot = (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+                   level == cur->bc_nlevels - 1;
+        if (at_iroot) {
+                *blkp = xfs_btree_get_iroot(cur);
+                return 0;
+        }
+        /*
+         * If the cursor does not already hold a buffer at this level for
+         * the disk address we are looking for, read the block and install
+         * the new buffer in the cursor.
+         */
+        bp = cur->bc_bufs[level];
+        if (!bp || XFS_BUF_ADDR(bp) != xfs_btree_ptr_to_daddr(cur, pp)) {
+                error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
+                if (error)
+                        return error;
+                xfs_btree_setbuf(cur, level, bp);
+                return 0;
+        }
+        /* Otherwise the buffer already in the cursor can be re-used. */
+        *blkp = XFS_BUF_TO_BLOCK(bp);
+        return 0;
+}
+/*
+ * Return the key to compare against for entry keyno of block.
+ *
+ * Above level 0 this is the address of the key stored in the block.  At
+ * level 0 there is no stored key, so one is built from the record into the
+ * caller-supplied storage kp, and kp is returned.
+ */
+STATIC union xfs_btree_key *
+xfs_lookup_get_search_key(
+        struct xfs_btree_cur    *cur,
+        int                     level,
+        int                     keyno,
+        struct xfs_btree_block  *block,
+        union xfs_btree_key     *kp)
+{
+        union xfs_btree_rec     *recp;
+        if (level != 0)
+                return xfs_btree_key_addr(cur, keyno, block);
+        recp = xfs_btree_rec_addr(cur, keyno, block);
+        cur->bc_ops->init_key_from_rec(kp, recp);
+        return kp;
+}
+/*
+ * Lookup the record.  The cursor is made to point to it, based on dir.
+ * stat is set to 0 if can't find any such record, 1 for success.
+ *
+ * The search starts at the block named by the ops' init_ptr_from_cur() and
+ * binary-searches one block per level down to level 0, storing the chosen
+ * entry number for each level in cur->bc_ptrs[].  The value being searched
+ * for is not a parameter: key_diff() is handed only the cursor and the
+ * candidate key, so the target is presumably stored in the cursor by the
+ * caller.
+ *
+ * Returns 0 when the search completed (check *stat for the outcome) or a
+ * nonzero error.  An empty block is only accepted as the sole leaf of a
+ * single-level tree; anywhere else it is reported as -EFSCORRUPTED.
+ */
+int                                     /* error */
+xfs_btree_lookup(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_lookup_t            dir,    /* <=, ==, or >= */
+        int                     *stat)  /* success/failure */
+{
+        struct xfs_btree_block  *block; /* current btree block */
+        __int64_t               diff;   /* difference for the current key */
+        int                     error;  /* error return value */
+        int                     keyno;  /* current key number */
+        int                     level;  /* level in the btree */
+        union xfs_btree_ptr     *pp;    /* ptr to btree block */
+        union xfs_btree_ptr     ptr;    /* ptr to btree block */
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_TRACE_ARGI(cur, dir);
+        XFS_BTREE_STATS_INC(cur, lookup);
+        block = NULL;
+        keyno = 0;
+        /* initialise start pointer from cursor */
+        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+        pp = &ptr;
+        /*
+         * Iterate over each level in the btree, starting at the root.
+         * For each level above the leaves, find the key we need, based
+         * on the lookup record, then follow the corresponding block
+         * pointer down to the next level.
+         */
+        for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+                /* Get the block we need to do the lookup on. */
+                error = xfs_btree_lookup_get_block(cur, level, pp, &block);
+                if (error)
+                        goto error0;
+                if (diff == 0) {
+                        /*
+                         * If we already had a key match at a higher level, we
+                         * know we need to use the first entry in this block.
+                         */
+                        keyno = 1;
+                } else {
+                        /* Otherwise search this block. Do a binary search. */
+                        int     high;   /* high entry number */
+                        int     low;    /* low entry number */
+                        /* Set low and high entry numbers, 1-based. */
+                        low = 1;
+                        high = xfs_btree_get_numrecs(block);
+                        if (!high) {
+                                /*
+                                 * Block is empty.  That is only valid for
+                                 * the sole leaf of a single-level tree.
+                                 * Anywhere else the cursor levels below
+                                 * this one would be left unset, so fail the
+                                 * lookup instead of relying on an ASSERT
+                                 * alone.
+                                 */
+                                if (level != 0 || cur->bc_nlevels != 1) {
+                                        ASSERT(0);
+                                        error = -EFSCORRUPTED;
+                                        goto error0;
+                                }
+                                /* 0 for an LE search, 1 otherwise. */
+                                cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+                                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                                *stat = 0;
+                                return 0;
+                        }
+                        /* Binary search the block. */
+                        while (low <= high) {
+                                union xfs_btree_key     key;
+                                union xfs_btree_key     *kp;
+                                XFS_BTREE_STATS_INC(cur, compare);
+                                /* keyno is average of low and high. */
+                                keyno = (low + high) >> 1;
+                                /* Get current search key */
+                                kp = xfs_lookup_get_search_key(cur, level,
+                                                keyno, block, &key);
+                                /*
+                                 * Compute difference to get next direction:
+                                 *  - less than, move right
+                                 *  - greater than, move left
+                                 *  - equal, we're done
+                                 */
+                                diff = cur->bc_ops->key_diff(cur, kp);
+                                if (diff < 0)
+                                        low = keyno + 1;
+                                else if (diff > 0)
+                                        high = keyno - 1;
+                                else
+                                        break;
+                        }
+                }
+                /*
+                 * If there are more levels, set up for the next level
+                 * by getting the block number and filling in the cursor.
+                 */
+                if (level > 0) {
+                        /*
+                         * If we moved left, need the previous key number,
+                         * unless there isn't one.
+                         */
+                        if (diff > 0 && --keyno < 1)
+                                keyno = 1;
+                        pp = xfs_btree_ptr_addr(cur, keyno, block);
+#ifdef DEBUG
+                        error = xfs_btree_check_ptr(cur, pp, 0, level);
+                        if (error)
+                                goto error0;
+#endif
+                        cur->bc_ptrs[level] = keyno;
+                }
+        }
+        /* Done with the search. See if we need to adjust the results. */
+        if (dir != XFS_LOOKUP_LE && diff < 0) {
+                keyno++;
+                /*
+                 * If ge search and we went off the end of the block, but it's
+                 * not the last block, we're in the wrong block.
+                 */
+                xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+                if (dir == XFS_LOOKUP_GE &&
+                    keyno > xfs_btree_get_numrecs(block) &&
+                    !xfs_btree_ptr_is_null(cur, &ptr)) {
+                        int     i;
+                        /* Step the cursor into the right sibling. */
+                        cur->bc_ptrs[0] = keyno;
+                        error = xfs_btree_increment(cur, 0, &i);
+                        if (error)
+                                goto error0;
+                        XFS_WANT_CORRUPTED_RETURN(i == 1);
+                        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                        *stat = 1;
+                        return 0;
+                }
+        } else if (dir == XFS_LOOKUP_LE && diff > 0)
+                keyno--;
+        cur->bc_ptrs[0] = keyno;
+        /* Return if we succeeded or not. */
+        if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
+                *stat = 0;
+        else if (dir != XFS_LOOKUP_EQ || diff == 0)
+                *stat = 1;
+        else
+                *stat = 0;
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        return 0;
+error0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        return error;
+}
+/*
+ * Update keys at all levels from here to the root along the cursor's path.
+ *
+ * Starting at "level", the key at the cursor's index in each block is
+ * overwritten with *keyp and logged.  The climb continues only while the
+ * index just updated was 1, and never goes past the top level.  Returns 0,
+ * or, on DEBUG builds, the error from a failed block check.
+ */
+STATIC int
+xfs_btree_updkey(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_key     *keyp,
+        int                     level)
+{
+        struct xfs_btree_block  *blk;
+        struct xfs_buf          *blk_bp;
+        union xfs_btree_key     *dst;
+        int                     idx = 1;
+#ifdef DEBUG
+        int                     error;
+#endif
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
+        ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
+        /*
+         * Climb toward the root, replacing the key at each level.  Once
+         * the cursor is not at the first entry of a block, the levels
+         * above are left alone.
+         */
+        while (idx == 1 && level < cur->bc_nlevels) {
+                blk = xfs_btree_get_block(cur, level, &blk_bp);
+#ifdef DEBUG
+                error = xfs_btree_check_block(cur, blk, level, blk_bp);
+                if (error) {
+                        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+                        return error;
+                }
+#endif
+                idx = cur->bc_ptrs[level];
+                dst = xfs_btree_key_addr(cur, idx, blk);
+                xfs_btree_copy_keys(cur, dst, keyp, 1);
+                xfs_btree_log_keys(cur, blk_bp, idx, idx);
+                level++;
+        }
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        return 0;
+}
+/*
+ * Overwrite the leaf record the cursor points at (cur->bc_ptrs[0]) with
+ * *rec and log the change.  Returns 0 on success, or the error reported by
+ * the DEBUG-only block check or by the parent key update.
+ */
+int
+xfs_btree_update(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_rec     *rec)
+{
+        struct xfs_btree_block  *leaf;
+        struct xfs_buf          *leaf_bp;
+        union xfs_btree_rec     *slot;
+        union xfs_btree_key     key;
+        int                     idx;
+        int                     error;
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_TRACE_ARGR(cur, rec);
+        /* Fetch the leaf block the cursor currently sits on. */
+        leaf = xfs_btree_get_block(cur, 0, &leaf_bp);
+#ifdef DEBUG
+        error = xfs_btree_check_block(cur, leaf, 0, leaf_bp);
+        if (error)
+                goto error0;
+#endif
+        /* Locate the slot, store the new contents and log that one record. */
+        idx = cur->bc_ptrs[0];
+        slot = xfs_btree_rec_addr(cur, idx, leaf);
+        xfs_btree_copy_recs(cur, slot, rec, 1);
+        xfs_btree_log_recs(cur, leaf_bp, idx, idx);
+        /*
+         * When the last record in the tree is being tracked and this leaf
+         * is at the far right edge of the tree, let the btree ops know.
+         */
+        if (xfs_btree_is_lastrec(cur, leaf, 0))
+                cur->bc_ops->update_lastrec(cur, leaf, rec,
+                                            idx, LASTREC_UPDATE);
+        /* Only a change to the first record affects the parent's key. */
+        if (idx != 1)
+                goto done;
+        cur->bc_ops->init_key_from_rec(&key, rec);
+        error = xfs_btree_updkey(cur, &key, 1);
+        if (error)
+                goto error0;
+done:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        return 0;
+error0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        return error;
+}
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ *
+ * The first entry of the cursor's block at "level" (the "right" block) is
+ * appended to its left sibling, the remaining entries of the right block
+ * are slid down by one, both blocks are logged, and the new first key of
+ * the right block is pushed up to its parents.
+ *
+ * Returns 0 with *stat set to 1 when an entry was moved.  Returns 0 with
+ * *stat set to 0 when nothing was done: the block is the root of an
+ * inode-rooted tree, it has no left sibling, the cursor points at the first
+ * entry (the one that would move), or the left sibling is already full.
+ * Returns a nonzero error, without writing *stat, on failure.
+ */
+STATIC int                                      /* error */
+xfs_btree_lshift(
+        struct xfs_btree_cur    *cur,
+        int                     level,
+        int                     *stat)          /* success/failure */
+{
+        union xfs_btree_key     key;            /* btree key */
+        struct xfs_buf          *lbp;           /* left buffer pointer */
+        struct xfs_btree_block  *left;          /* left btree block */
+        int                     lrecs;          /* left record count */
+        struct xfs_buf          *rbp;           /* right buffer pointer */
+        struct xfs_btree_block  *right;         /* right btree block */
+        int                     rrecs;          /* right record count */
+        union xfs_btree_ptr     lptr;           /* left btree pointer */
+        union xfs_btree_key     *rkp = NULL;    /* right btree key */
+        union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
+        union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
+        int                     error;          /* error return value */
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_TRACE_ARGI(cur, level);
+        /* The root block of an inode-rooted tree is never shifted. */
+        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+            level == cur->bc_nlevels - 1)
+                goto out0;
+        /* Set up variables for this block as "right". */
+        right = xfs_btree_get_block(cur, level, &rbp);
+#ifdef DEBUG
+        error = xfs_btree_check_block(cur, right, level, rbp);
+        if (error)
+                goto error0;
+#endif
+        /* If we've got no left sibling then we can't shift an entry left. */
+        xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+        if (xfs_btree_ptr_is_null(cur, &lptr))
+                goto out0;
+        /*
+         * If the cursor entry is the one that would be moved, don't
+         * do it... it's too complicated.
+         */
+        if (cur->bc_ptrs[level] <= 1)
+                goto out0;
+        /* Set up the left neighbor as "left". */
+        error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+        if (error)
+                goto error0;
+        /* If it's full, it can't take another entry. */
+        lrecs = xfs_btree_get_numrecs(left);
+        if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
+                goto out0;
+        rrecs = xfs_btree_get_numrecs(right);
+        /*
+         * We add one entry to the left side and remove one for the right side.
+         * Account for it here, the changes will be updated on disk and logged
+         * later.
+         */
+        lrecs++;
+        rrecs--;
+        XFS_BTREE_STATS_INC(cur, lshift);
+        XFS_BTREE_STATS_ADD(cur, moves, 1);
+        /*
+         * If non-leaf, copy a key and a ptr to the left block.
+         * Log the changes to the left block.
+         */
+        if (level > 0) {
+                /* It's a non-leaf.  Move keys and pointers. */
+                union xfs_btree_key     *lkp;   /* left btree key */
+                union xfs_btree_ptr     *lpp;   /* left address pointer */
+                lkp = xfs_btree_key_addr(cur, lrecs, left);
+                rkp = xfs_btree_key_addr(cur, 1, right);
+                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+                rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+                error = xfs_btree_check_ptr(cur, rpp, 0, level);
+                if (error)
+                        goto error0;
+#endif
+                xfs_btree_copy_keys(cur, lkp, rkp, 1);
+                xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
+                xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
+                xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
+                ASSERT(cur->bc_ops->keys_inorder(cur,
+                        xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
+        } else {
+                /* It's a leaf.  Move records.  */
+                union xfs_btree_rec     *lrp;   /* left record pointer */
+                lrp = xfs_btree_rec_addr(cur, lrecs, left);
+                rrp = xfs_btree_rec_addr(cur, 1, right);
+                xfs_btree_copy_recs(cur, lrp, rrp, 1);
+                xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
+                ASSERT(cur->bc_ops->recs_inorder(cur,
+                        xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
+        }
+        /* Store the adjusted record counts in both blocks and log them. */
+        xfs_btree_set_numrecs(left, lrecs);
+        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+        xfs_btree_set_numrecs(right, rrecs);
+        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+        /*
+         * Slide the contents of right down one entry.
+         */
+        XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
+        if (level > 0) {
+                /* It's a nonleaf. operate on keys and ptrs */
+#ifdef DEBUG
+                int                     i;              /* loop index */
+                for (i = 0; i < rrecs; i++) {
+                        error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
+                        if (error)
+                                goto error0;
+                }
+#endif
+                xfs_btree_shift_keys(cur,
+                                xfs_btree_key_addr(cur, 2, right),
+                                -1, rrecs);
+                xfs_btree_shift_ptrs(cur,
+                                xfs_btree_ptr_addr(cur, 2, right),
+                                -1, rrecs);
+                xfs_btree_log_keys(cur, rbp, 1, rrecs);
+                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+        } else {
+                /* It's a leaf. operate on records */
+                xfs_btree_shift_recs(cur,
+                        xfs_btree_rec_addr(cur, 2, right),
+                        -1, rrecs);
+                xfs_btree_log_recs(cur, rbp, 1, rrecs);
+                /*
+                 * If it's the first record in the block, we'll need a key
+                 * structure to pass up to the next level (updkey).
+                 */
+                cur->bc_ops->init_key_from_rec(&key,
+                        xfs_btree_rec_addr(cur, 1, right));
+                rkp = &key;
+        }
+        /* Update the parent key values of right. */
+        error = xfs_btree_updkey(cur, rkp, level + 1);
+        if (error)
+                goto error0;
+        /* Slide the cursor value left one. */
+        cur->bc_ptrs[level]--;
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 1;
+        return 0;
+out0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 0;
+        return 0;
+error0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        return error;
+}
+/*
+ * Move 1 record right from cur/level if possible.
+ *
+ * The last entry of the block at cur/level ("left") is moved into the first
+ * slot of its right sibling ("right"), and the parent key of the right
+ * sibling is updated through a temporary duplicate of the cursor.  This
+ * function does not adjust cur->bc_ptrs itself.
+ *
+ * *stat is set to 1 if an entry was moved.  It is set to 0, with nothing
+ * changed, when the level is the root of an inode-rooted btree, when there
+ * is no right sibling, when the cursor points at or past the last entry of
+ * left (the one that would move), or when the right sibling is full.
+ * Returns 0 in all of those cases and an error code otherwise; *stat is not
+ * written on error.
+ */
+STATIC int                                      /* error */
+xfs_btree_rshift(
+        struct xfs_btree_cur    *cur,
+        int                     level,
+        int                     *stat)          /* success/failure */
+{
+        union xfs_btree_key     key;            /* btree key */
+        struct xfs_buf          *lbp;           /* left buffer pointer */
+        struct xfs_btree_block  *left;          /* left btree block */
+        struct xfs_buf          *rbp;           /* right buffer pointer */
+        struct xfs_btree_block  *right;         /* right btree block */
+        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
+        union xfs_btree_ptr     rptr;           /* right block pointer */
+        union xfs_btree_key     *rkp;           /* right btree key */
+        int                     rrecs;          /* right record count */
+        int                     lrecs;          /* left record count */
+        int                     error;          /* error return value */
+        int                     i;              /* loop counter */
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_TRACE_ARGI(cur, level);
+        /* The root of an inode-rooted btree is never shifted. */
+        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+            (level == cur->bc_nlevels - 1))
+                goto out0;
+        /* Set up variables for this block as "left". */
+        left = xfs_btree_get_block(cur, level, &lbp);
+#ifdef DEBUG
+        error = xfs_btree_check_block(cur, left, level, lbp);
+        if (error)
+                goto error0;
+#endif
+        /* If we've got no right sibling then we can't shift an entry right. */
+        xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+        if (xfs_btree_ptr_is_null(cur, &rptr))
+                goto out0;
+        /*
+         * If the cursor entry is the one that would be moved, don't
+         * do it... it's too complicated.
+         */
+        lrecs = xfs_btree_get_numrecs(left);
+        if (cur->bc_ptrs[level] >= lrecs)
+                goto out0;
+        /* Set up the right neighbor as "right". */
+        error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+        if (error)
+                goto error0;
+        /* If it's full, it can't take another entry. */
+        rrecs = xfs_btree_get_numrecs(right);
+        if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
+                goto out0;
+        XFS_BTREE_STATS_INC(cur, rshift);
+        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+        /*
+         * Make a hole at the start of the right neighbor block, then
+         * copy the last left block entry to the hole.
+         */
+        if (level > 0) {
+                /* It's a nonleaf. make a hole in the keys and ptrs */
+                union xfs_btree_key     *lkp;
+                union xfs_btree_ptr     *lpp;
+                union xfs_btree_ptr     *rpp;
+                lkp = xfs_btree_key_addr(cur, lrecs, left);
+                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+                rkp = xfs_btree_key_addr(cur, 1, right);
+                rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+                for (i = rrecs - 1; i >= 0; i--) {
+                        error = xfs_btree_check_ptr(cur, rpp, i, level);
+                        if (error)
+                                goto error0;
+                }
+#endif
+                xfs_btree_shift_keys(cur, rkp, 1, rrecs);
+                xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
+#ifdef DEBUG
+                error = xfs_btree_check_ptr(cur, lpp, 0, level);
+                if (error)
+                        goto error0;
+#endif
+                /* Now put the new data in, and log it. */
+                xfs_btree_copy_keys(cur, rkp, lkp, 1);
+                xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
+                xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
+                xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
+                ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
+                        xfs_btree_key_addr(cur, 2, right)));
+        } else {
+                /* It's a leaf. make a hole in the records */
+                union xfs_btree_rec     *lrp;
+                union xfs_btree_rec     *rrp;
+                lrp = xfs_btree_rec_addr(cur, lrecs, left);
+                rrp = xfs_btree_rec_addr(cur, 1, right);
+                xfs_btree_shift_recs(cur, rrp, 1, rrecs);
+                /* Now put the new data in, and log it. */
+                xfs_btree_copy_recs(cur, rrp, lrp, 1);
+                xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
+                /* Leaves hold no keys: build one from the new first record. */
+                cur->bc_ops->init_key_from_rec(&key, rrp);
+                rkp = &key;
+                ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
+                        xfs_btree_rec_addr(cur, 2, right)));
+        }
+        /*
+         * Decrement and log left's numrecs, bump and log right's numrecs.
+         */
+        xfs_btree_set_numrecs(left, --lrecs);
+        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+        xfs_btree_set_numrecs(right, ++rrecs);
+        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+        /*
+         * Using a temporary cursor, update the parent key values of the
+         * block on the right.
+         */
+        error = xfs_btree_dup_cursor(cur, &tcur);
+        if (error)
+                goto error0;
+        i = xfs_btree_lastrec(tcur, level);
+        /*
+         * tcur exists from here on, so every failure has to leave through
+         * error1, which deletes it; error0 would leak the cursor.
+         */
+        XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
+        error = xfs_btree_increment(tcur, level, &i);
+        if (error)
+                goto error1;
+        error = xfs_btree_updkey(tcur, rkp, level + 1);
+        if (error)
+                goto error1;
+        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 1;
+        return 0;
+out0:
+        /* Nothing was shifted; not an error. */
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 0;
+        return 0;
+error0:
+        /* Failure before the temporary cursor was created. */
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        return error;
+error1:
+        /* Failure while the temporary cursor is live: tear it down. */
+        XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
+        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+        return error;
+}
+/*
+ * Split cur/level block in half.
+ * Return new block number and the key to its first
+ * record (to be inserted into parent).
+ *
+ * The block at cur/level stays as "left"; a newly allocated block becomes
+ * its right sibling "right" and receives the upper part of left's entries.
+ *
+ * On success *stat is 1, *ptrp is the new block's pointer, *key is the key
+ * of the first entry now in the new block, and, if there is a level above
+ * this one, *curp is a duplicate of cur whose index at level + 1 has been
+ * advanced by one.  If the alloc_block op reports that no block could be
+ * allocated, *stat is 0 and 0 is returned.  Otherwise an error code is
+ * returned.
+ */
+STATIC int                                      /* error */
+__xfs_btree_split(
+        struct xfs_btree_cur    *cur,
+        int                     level,
+        union xfs_btree_ptr     *ptrp,
+        union xfs_btree_key     *key,
+        struct xfs_btree_cur    **curp,
+        int                     *stat)          /* success/failure */
+{
+        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
+        struct xfs_buf          *lbp;           /* left buffer pointer */
+        struct xfs_btree_block  *left;          /* left btree block */
+        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
+        struct xfs_buf          *rbp;           /* right buffer pointer */
+        struct xfs_btree_block  *right;         /* right btree block */
+        union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
+        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
+        struct xfs_btree_block  *rrblock;       /* right-right btree block */
+        int                     lrecs;          /* left record count */
+        int                     rrecs;          /* right record count */
+        int                     src_index;      /* first left entry to move */
+        int                     error;          /* error return value */
+#ifdef DEBUG
+        int                     i;
+#endif
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
+        XFS_BTREE_STATS_INC(cur, split);
+        /* Set up left block (current one). */
+        left = xfs_btree_get_block(cur, level, &lbp);
+#ifdef DEBUG
+        error = xfs_btree_check_block(cur, left, level, lbp);
+        if (error)
+                goto error0;
+#endif
+        xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+        /* Allocate the new block. If we can't do it, we're toast. Give up. */
+        error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
+        if (error)
+                goto error0;
+        if (*stat == 0)
+                goto out0;
+        XFS_BTREE_STATS_INC(cur, alloc);
+        /* Set up the new block as "right". */
+        error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
+        if (error)
+                goto error0;
+        /* Fill in the btree header for the new right block. */
+        xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
+        /*
+         * Split the entries between the old and the new block evenly.
+         * Make sure that if there's an odd number of entries now, that
+         * each new block will have the same number of entries.
+         */
+        lrecs = xfs_btree_get_numrecs(left);
+        rrecs = lrecs / 2;
+        if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
+                rrecs++;
+        src_index = (lrecs - rrecs + 1);
+        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+        /*
+         * Copy btree block entries from the left block over to the
+         * new block, the right. Update the right block and log the
+         * changes.
+         */
+        if (level > 0) {
+                /* It's a non-leaf.  Move keys and pointers. */
+                union xfs_btree_key     *lkp;   /* left btree key */
+                union xfs_btree_ptr     *lpp;   /* left address pointer */
+                union xfs_btree_key     *rkp;   /* right btree key */
+                union xfs_btree_ptr     *rpp;   /* right address pointer */
+                lkp = xfs_btree_key_addr(cur, src_index, left);
+                lpp = xfs_btree_ptr_addr(cur, src_index, left);
+                rkp = xfs_btree_key_addr(cur, 1, right);
+                rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+                /*
+                 * Check the rrecs pointers that are about to move.  lpp
+                 * already points at entry src_index, so the index passed
+                 * here is a 0-based offset from it.  Starting the loop at
+                 * src_index (which is never smaller than rrecs) would mean
+                 * the check never runs.
+                 */
+                for (i = 0; i < rrecs; i++) {
+                        error = xfs_btree_check_ptr(cur, lpp, i, level);
+                        if (error)
+                                goto error0;
+                }
+#endif
+                xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
+                xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
+                xfs_btree_log_keys(cur, rbp, 1, rrecs);
+                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+                /* Grab the keys to the entries moved to the right block */
+                xfs_btree_copy_keys(cur, key, rkp, 1);
+        } else {
+                /* It's a leaf.  Move records.  */
+                union xfs_btree_rec     *lrp;   /* left record pointer */
+                union xfs_btree_rec     *rrp;   /* right record pointer */
+                lrp = xfs_btree_rec_addr(cur, src_index, left);
+                rrp = xfs_btree_rec_addr(cur, 1, right);
+                xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
+                xfs_btree_log_recs(cur, rbp, 1, rrecs);
+                /* The caller's key is built from right's first record. */
+                cur->bc_ops->init_key_from_rec(key,
+                        xfs_btree_rec_addr(cur, 1, right));
+        }
+        /*
+         * Link the new block in between left and left's old right sibling
+         * (rrptr, which may be null), and adjust numrecs in both halves.
+         */
+        xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
+        xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
+        xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+        xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+        lrecs -= rrecs;
+        xfs_btree_set_numrecs(left, lrecs);
+        xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
+        xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
+        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+        /*
+         * If there's a block to the new block's right, make that block
+         * point back to right instead of to left.
+         */
+        if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
+                error = xfs_btree_read_buf_block(cur, &rrptr,
+                                                        0, &rrblock, &rrbp);
+                if (error)
+                        goto error0;
+                xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
+                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+        }
+        /*
+         * If the cursor is really in the right block, move it there.
+         * If it's just pointing past the last entry in left, then we'll
+         * insert there, so don't change anything in that case.
+         */
+        if (cur->bc_ptrs[level] > lrecs + 1) {
+                xfs_btree_setbuf(cur, level, rbp);
+                cur->bc_ptrs[level] -= lrecs;
+        }
+        /*
+         * If there are more levels, we'll need another cursor which refers
+         * the right block, no matter where this cursor was.
+         */
+        if (level + 1 < cur->bc_nlevels) {
+                error = xfs_btree_dup_cursor(cur, curp);
+                if (error)
+                        goto error0;
+                (*curp)->bc_ptrs[level + 1]++;
+        }
+        *ptrp = rptr;
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 1;
+        return 0;
+out0:
+        /* No block could be allocated; not an error. */
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 0;
+        return 0;
+error0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        return error;
+}
+struct xfs_btree_split_args {  /* one deferred __xfs_btree_split() call */
+        struct xfs_btree_cur    *cur;           /* forwarded to __xfs_btree_split */
+        int                     level;          /* forwarded to __xfs_btree_split */
+        union xfs_btree_ptr     *ptrp;          /* forwarded to __xfs_btree_split */
+        union xfs_btree_key     *key;           /* forwarded to __xfs_btree_split */
+        struct xfs_btree_cur    **curp;         /* forwarded to __xfs_btree_split */
+        int                     *stat;          /* success/failure */
+        int                     result;         /* __xfs_btree_split return value, set by the worker */
+        bool                    kswapd; /* allocation in kswapd context */
+        struct completion       *done;          /* completed by the worker after result is stored */
+        struct work_struct      work;           /* work item queued by xfs_btree_split */
+};
+/*
+ * Work function behind xfs_btree_split(): runs the real split on behalf of
+ * the thread that queued the work item, stores the return value in the
+ * argument structure and then signals the completion.
+ */
+static void
+xfs_btree_split_worker(
+        struct work_struct      *work)
+{
+        struct xfs_btree_split_args     *split;
+        unsigned long           saved_flags;
+        unsigned long           nested_flags;
+
+        split = container_of(work, struct xfs_btree_split_args, work);
+        /*
+         * The split runs in transaction context.  When the submitter was
+         * kswapd, that state is taken on as well for the duration of the
+         * split, so that the work does not block waiting on memory reclaim.
+         */
+        nested_flags = split->kswapd ?
+                (PF_FSTRANS | PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD) :
+                PF_FSTRANS;
+        current_set_flags_nested(&saved_flags, nested_flags);
+        split->result = __xfs_btree_split(split->cur, split->level,
+                                          split->ptrp, split->key,
+                                          split->curp, split->stat);
+        /* Only locals are used after this point. */
+        complete(split->done);
+        current_restore_flags_nested(&saved_flags, nested_flags);
+}
+/*
+ * Split entry point.  BMBT splits tend to arrive with very little stack
+ * left, so they are handed to a worker thread that has plenty of stack and
+ * the caller waits for the result.  Every other btree type calls the split
+ * code directly and so avoids the context switch.
+ */
+STATIC int                                      /* error */
+xfs_btree_split(
+        struct xfs_btree_cur    *cur,
+        int                     level,
+        union xfs_btree_ptr     *ptrp,
+        union xfs_btree_key     *key,
+        struct xfs_btree_cur    **curp,
+        int                     *stat)          /* success/failure */
+{
+        struct xfs_btree_split_args     args;
+        DECLARE_COMPLETION_ONSTACK(done);
+
+        if (cur->bc_btnum == XFS_BTNUM_BMAP) {
+                /* Package the call for the worker. */
+                args.kswapd = current_is_kswapd();
+                args.done = &done;
+                args.stat = stat;
+                args.curp = curp;
+                args.key = key;
+                args.ptrp = ptrp;
+                args.level = level;
+                args.cur = cur;
+
+                /* Queue it, then wait for the worker to report back. */
+                INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
+                queue_work(xfs_alloc_wq, &args.work);
+                wait_for_completion(&done);
+                destroy_work_on_stack(&args.work);
+                return args.result;
+        }
+
+        return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
+}
+/*
+ * Copy the old inode root contents into a real block and make the
+ * broot point to it.
+ *
+ * A new block is obtained through the cursor's alloc_block op.  The inode
+ * root's block header, keys and pointers are copied into it, and the inode
+ * root is then rewritten one level higher with a single entry whose pointer
+ * is the new block.  The cursor gains a level, and its old root level is
+ * switched to the new block's buffer.
+ *
+ * On success *stat is 1, and XFS_ILOG_CORE plus the xfs_ilog_fbroot() flag
+ * for the cursor's fork are OR-ed into *logflags.  If alloc_block leaves
+ * *stat at 0, the function returns 0 before touching the tree.  Returns an
+ * error code on failure.
+ */
+int                                             /* error */
+xfs_btree_new_iroot(
+        struct xfs_btree_cur    *cur,           /* btree cursor */
+        int                     *logflags,      /* logging flags for inode */
+        int                     *stat)          /* return status - 0 fail */
+{
+        struct xfs_buf          *cbp;           /* buffer for cblock */
+        struct xfs_btree_block  *block;         /* btree block */
+        struct xfs_btree_block  *cblock;        /* child btree block */
+        union xfs_btree_key     *ckp;           /* child key pointer */
+        union xfs_btree_ptr     *cpp;           /* child ptr pointer */
+        union xfs_btree_key     *kp;            /* pointer to btree key */
+        union xfs_btree_ptr     *pp;            /* pointer to block addr */
+        union xfs_btree_ptr     nptr;           /* new block addr */
+        int                     level;          /* btree level */
+        int                     error;          /* error return code */
+#ifdef DEBUG
+        int                     i;              /* loop counter */
+#endif
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_STATS_INC(cur, newroot);
+        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+        level = cur->bc_nlevels - 1;    /* top level before the tree grows */
+        block = xfs_btree_get_iroot(cur);
+        pp = xfs_btree_ptr_addr(cur, 1, block);
+        /*
+         * Allocate the new block. If we can't do it, we're toast. Give up.
+         * The first pointer of the inode root is what gets passed to
+         * alloc_block alongside the output pointer.
+         */
+        error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
+        if (error)
+                goto error0;
+        if (*stat == 0) {
+                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                return 0;
+        }
+        XFS_BTREE_STATS_INC(cur, alloc);
+        /* Copy the root into a real block. */
+        error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
+        if (error)
+                goto error0;
+        /*
+         * we can't just memcpy() the root in for CRC enabled btree blocks.
+         * In that case have to also ensure the blkno remains correct
+         *
+         * Only the block header (xfs_btree_block_len() bytes) is copied
+         * here; the keys and pointers are copied separately further down.
+         */
+        memcpy(cblock, block, xfs_btree_block_len(cur));
+        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+                if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+                        cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
+                else
+                        cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
+        }
+        be16_add_cpu(&block->bb_level, 1);      /* inode root moves up a level */
+        xfs_btree_set_numrecs(block, 1);        /* and keeps a single entry */
+        cur->bc_nlevels++;
+        cur->bc_ptrs[level + 1] = 1;            /* cursor index at the new top level */
+        kp = xfs_btree_key_addr(cur, 1, block);
+        ckp = xfs_btree_key_addr(cur, 1, cblock);
+        xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
+        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+        for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
+                error = xfs_btree_check_ptr(cur, pp, i, level);
+                if (error)
+                        goto error0;
+        }
+#endif
+        xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
+#ifdef DEBUG
+        error = xfs_btree_check_ptr(cur, &nptr, 0, level);
+        if (error)
+                goto error0;
+#endif
+        xfs_btree_copy_ptrs(cur, pp, &nptr, 1); /* root's only pointer: the new block */
+        xfs_iroot_realloc(cur->bc_private.b.ip, /* NOTE(review): presumably resizes the incore root down to one entry - confirm */
+                          1 - xfs_btree_get_numrecs(cblock),
+                          cur->bc_private.b.whichfork);
+        xfs_btree_setbuf(cur, level, cbp);
+        /*
+         * Do all this logging at the end so that
+         * the root is at the right level.
+         */
+        xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+        xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+        xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+        *logflags |=
+                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
+        *stat = 1;
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        return 0;
+error0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        return error;
+}
+/*
+ * Allocate a new root block, fill it in.
+ *
+ * The new root is initialised with two entries: one for the block the
+ * cursor holds at the current top level and one for that block's sibling.
+ * The cursor is extended by one level and pointed at whichever of the two
+ * entries covers the block it was on.
+ *
+ * *stat is set to 1 on success and to 0 (with a 0 return) when the
+ * alloc_block op reports that no block could be allocated.  Returns an
+ * error code on failure.
+ */
+STATIC int                              /* error */
+xfs_btree_new_root(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        int                     *stat)  /* success/failure */
+{
+        struct xfs_btree_block  *block; /* one half of the old root block */
+        struct xfs_buf          *bp;    /* buffer containing block */
+        int                     error;  /* error return value */
+        struct xfs_buf          *lbp;   /* left buffer pointer */
+        struct xfs_btree_block  *left;  /* left btree block */
+        struct xfs_buf          *nbp;   /* new (root) buffer */
+        struct xfs_btree_block  *new;   /* new (root) btree block */
+        int                     nptr;   /* new value for key index, 1 or 2 */
+        struct xfs_buf          *rbp;   /* right buffer pointer */
+        struct xfs_btree_block  *right; /* right btree block */
+        union xfs_btree_ptr     rptr;   /* alloc start point first, later the right block ptr */
+        union xfs_btree_ptr     lptr;   /* new root block ptr first, later the left block ptr */
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_STATS_INC(cur, newroot);
+        /* initialise our start point from the cursor */
+        cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+        /* Allocate the new block. If we can't do it, we're toast. Give up. */
+        error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
+        if (error)
+                goto error0;
+        if (*stat == 0)
+                goto out0;
+        XFS_BTREE_STATS_INC(cur, alloc);
+        /* Set up the new block. */
+        error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
+        if (error)
+                goto error0;
+        /* Set the root in the holding structure  increasing the level by 1. */
+        cur->bc_ops->set_root(cur, &lptr, 1);
+        /*
+         * At the previous root level there are now two blocks: the old root,
+         * and the new block generated when it was split.  We don't know which
+         * one the cursor is pointing at, so we set up variables "left" and
+         * "right" for each case.
+         *
+         * A non-null right sibling means the cursor's block is the left one;
+         * otherwise it is the right one and the left sibling is read in.
+         * From here on lptr and rptr are overwritten with the pointers of
+         * those two blocks.
+         */
+        block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
+#ifdef DEBUG
+        error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
+        if (error)
+                goto error0;
+#endif
+        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+                /* Our block is left, pick up the right block. */
+                lbp = bp;
+                xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+                left = block;
+                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+                if (error)
+                        goto error0;
+                bp = rbp;
+                nptr = 1;
+        } else {
+                /* Our block is right, pick up the left block. */
+                rbp = bp;
+                xfs_btree_buf_to_ptr(cur, rbp, &rptr);
+                right = block;
+                xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+                if (error)
+                        goto error0;
+                bp = lbp;
+                nptr = 2;
+        }
+        /* Fill in the new block's btree header and log it. */
+        xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
+        xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
+        ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
+                        !xfs_btree_ptr_is_null(cur, &rptr));
+        /*
+         * Fill in the key data in the new root.  Non-leaf children supply
+         * their first key directly; leaf children have their first record
+         * converted to a key.
+         */
+        if (xfs_btree_get_level(left) > 0) {
+                xfs_btree_copy_keys(cur,
+                                xfs_btree_key_addr(cur, 1, new),
+                                xfs_btree_key_addr(cur, 1, left), 1);
+                xfs_btree_copy_keys(cur,
+                                xfs_btree_key_addr(cur, 2, new),
+                                xfs_btree_key_addr(cur, 1, right), 1);
+        } else {
+                cur->bc_ops->init_key_from_rec(
+                                xfs_btree_key_addr(cur, 1, new),
+                                xfs_btree_rec_addr(cur, 1, left));
+                cur->bc_ops->init_key_from_rec(
+                                xfs_btree_key_addr(cur, 2, new),
+                                xfs_btree_rec_addr(cur, 1, right));
+        }
+        xfs_btree_log_keys(cur, nbp, 1, 2);
+        /* Fill in the pointer data in the new root. */
+        xfs_btree_copy_ptrs(cur,
+                xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
+        xfs_btree_copy_ptrs(cur,
+                xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
+        xfs_btree_log_ptrs(cur, nbp, 1, 2);
+        /* Fix up the cursor. */
+        xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+        cur->bc_ptrs[cur->bc_nlevels] = nptr;
+        cur->bc_nlevels++;
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 1;
+        return 0;
+error0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        return error;
+out0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 0;
+        return 0;
+}
+/*
+ * Make room for one more entry in the full block at cur/level.
+ *
+ * At the root level of an inode-rooted btree the incore root is grown by
+ * one entry while it holds fewer than get_dmaxrecs() entries; otherwise it
+ * is replaced through xfs_btree_new_iroot() and the inode is logged.
+ * For every other block the following are tried in order: shift an entry
+ * to the right sibling, shift an entry to the left sibling, split the
+ * block.
+ *
+ * *stat is set to 1 when room was made and to 0 when it could not be.
+ * After a left shift both *oindex and *index are reset to the cursor's
+ * index at this level.  After a split *index is reset to the cursor's
+ * index, *nptr and *ncur are filled in by xfs_btree_split(), and *nrec is
+ * built from the key of the new block.  Returns 0 or an error code.
+ */
+STATIC int
+xfs_btree_make_block_unfull(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        int                     level,  /* btree level */
+        int                     numrecs,/* # of recs in block */
+        int                     *oindex,/* old tree index */
+        int                     *index, /* new tree index */
+        union xfs_btree_ptr     *nptr,  /* new btree ptr */
+        struct xfs_btree_cur    **ncur, /* new btree cursor */
+        union xfs_btree_rec     *nrec,  /* new record */
+        int                     *stat)
+{
+        union xfs_btree_key     key;    /* new btree key value */
+        int                     error = 0;
+        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+            level == cur->bc_nlevels - 1) {
+                struct xfs_inode *ip = cur->bc_private.b.ip;
+                if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
+                        /* A root block that can be made bigger. */
+                        xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+                        /*
+                         * Report success explicitly: the caller tests
+                         * *stat right after this function returns, and
+                         * nothing else on this path writes it.
+                         */
+                        *stat = 1;
+                } else {
+                        /* A root block that needs replacing */
+                        int     logflags = 0;
+                        error = xfs_btree_new_iroot(cur, &logflags, stat);
+                        if (error || *stat == 0)
+                                return error;
+                        xfs_trans_log_inode(cur->bc_tp, ip, logflags);
+                }
+                return 0;
+        }
+        /* First, try shifting an entry to the right neighbor. */
+        error = xfs_btree_rshift(cur, level, stat);
+        if (error || *stat)
+                return error;
+        /* Next, try shifting an entry to the left neighbor. */
+        error = xfs_btree_lshift(cur, level, stat);
+        if (error)
+                return error;
+        if (*stat) {
+                /* Pick up the cursor's index at this level after the shift. */
+                *oindex = *index = cur->bc_ptrs[level];
+                return 0;
+        }
+        /*
+         * Next, try splitting the current block in half.
+         *
+         * If this works we have to re-set our variables because we
+         * could be in a different block now.
+         */
+        error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
+        if (error || *stat == 0)
+                return error;
+        *index = cur->bc_ptrs[level];
+        /* Hand the new block's key back to the caller in record form. */
+        cur->bc_ops->init_rec_from_key(&key, nrec);
+        return 0;
+}
+/*
+ * Insert one record/level.  Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ */
+STATIC int
+xfs_btree_insrec(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        int                     level,  /* level to insert record at */
+        union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
+        union xfs_btree_rec     *recp,  /* i/o: record data inserted */
+        struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
+        int                     *stat)  /* success/failure */
+{
+        struct xfs_btree_block  *block; /* btree block */
+        struct xfs_buf          *bp;    /* buffer for block */
+        union xfs_btree_key     key;    /* btree key */
+        union xfs_btree_ptr     nptr;   /* new block ptr */
+        struct xfs_btree_cur    *ncur;  /* new btree cursor */
+        union xfs_btree_rec     nrec;   /* new record count */
+        int                     optr;   /* old key/record index */
+        int                     ptr;    /* key/record index */
+        int                     numrecs;/* number of records */
+        int                     error;  /* error return value */
+#ifdef DEBUG
+        int                     i;
+#endif
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
+        ncur = NULL;
+        /*
+         * If we have an external root pointer, and we've made it to the
+         * root level, allocate a new root block and we're done.
+         */
+        if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+            (level >= cur->bc_nlevels)) {
+                error = xfs_btree_new_root(cur, stat);
+                xfs_btree_set_ptr_null(cur, ptrp);
+                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                return error;
+        }
+        /* If we're off the left edge, return failure. */
+        ptr = cur->bc_ptrs[level];
+        if (ptr == 0) {
+                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                *stat = 0;
+                return 0;
+        }
+        /* Make a key out of the record data to be inserted, and save it. */
+        cur->bc_ops->init_key_from_rec(&key, recp);
+        optr = ptr;
+        XFS_BTREE_STATS_INC(cur, insrec);
+        /* Get pointers to the btree buffer and block. */
+        block = xfs_btree_get_block(cur, level, &bp);
+        numrecs = xfs_btree_get_numrecs(block);
+#ifdef DEBUG
+        error = xfs_btree_check_block(cur, block, level, bp);
+        if (error)
+                goto error0;
+        /* Check that the new entry is being inserted in the right place. */
+        if (ptr <= numrecs) {
+                if (level == 0) {
+                        ASSERT(cur->bc_ops->recs_inorder(cur, recp,
+                                xfs_btree_rec_addr(cur, ptr, block)));
+                } else {
+                        ASSERT(cur->bc_ops->keys_inorder(cur, &key,
+                                xfs_btree_key_addr(cur, ptr, block)));
+                }
+        }
+#endif
+        /*
+         * If the block is full, we can't insert the new entry until we
+         * make the block un-full.
+         */
+        xfs_btree_set_ptr_null(cur, &nptr);
+        if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
+                error = xfs_btree_make_block_unfull(cur, level, numrecs,
+                                        &optr, &ptr, &nptr, &ncur, &nrec, stat);
+                if (error || *stat == 0)
+                        goto error0;
+        }
+        /*
+         * The current block may have changed if the block was
+         * previously full and we have just made space in it.
+         */
+        block = xfs_btree_get_block(cur, level, &bp);
+        numrecs = xfs_btree_get_numrecs(block);
+#ifdef DEBUG
+        error = xfs_btree_check_block(cur, block, level, bp);
+        if (error)
+                return error;
+#endif
+        /*
+         * At this point we know there's room for our new entry in the block
+         * we're pointing at.
+         */
+        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
+        if (level > 0) {
+                /* It's a nonleaf. make a hole in the keys and ptrs */
+                union xfs_btree_key     *kp;
+                union xfs_btree_ptr     *pp;
+                kp = xfs_btree_key_addr(cur, ptr, block);
+                pp = xfs_btree_ptr_addr(cur, ptr, block);
+#ifdef DEBUG
+                for (i = numrecs - ptr; i >= 0; i--) {
+                        error = xfs_btree_check_ptr(cur, pp, i, level);
+                        if (error)
+                                return error;
+                }
+#endif
+                xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
+                xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
+#ifdef DEBUG
+                error = xfs_btree_check_ptr(cur, ptrp, 0, level);
+                if (error)
+                        goto error0;
+#endif
+                /* Now put the new data in, bump numrecs and log it. */
+                xfs_btree_copy_keys(cur, kp, &key, 1);
+                xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
+                numrecs++;
+                xfs_btree_set_numrecs(block, numrecs);
+                xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
+                xfs_btree_log_keys(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+                if (ptr < numrecs) {
+                        ASSERT(cur->bc_ops->keys_inorder(cur, kp,
+                                xfs_btree_key_addr(cur, ptr + 1, block)));
+                }
+#endif
+        } else {
+                /* It's a leaf. make a hole in the records */
+                union xfs_btree_rec             *rp;
+                rp = xfs_btree_rec_addr(cur, ptr, block);
+                xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
+                /* Now put the new data in, bump numrecs and log it. */
+                xfs_btree_copy_recs(cur, rp, recp, 1);
+                xfs_btree_set_numrecs(block, ++numrecs);
+                xfs_btree_log_recs(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+                if (ptr < numrecs) {
+                        ASSERT(cur->bc_ops->recs_inorder(cur, rp,
+                                xfs_btree_rec_addr(cur, ptr + 1, block)));
+                }
+#endif
+        }
+        /* Log the new number of records in the btree header. */
+        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+        /* If we inserted at the start of a block, update the parents' keys. */
+        if (optr == 1) {
+                error = xfs_btree_updkey(cur, &key, level + 1);
+                if (error)
+                        goto error0;
+        }
+        /*
+         * If we are tracking the last record in the tree and
+         * we are at the far right edge of the tree, update it.
+         */
+        if (xfs_btree_is_lastrec(cur, block, level)) {
+                cur->bc_ops->update_lastrec(cur, block, recp,
+                                            ptr, LASTREC_INSREC);
+        }
+        /*
+         * Return the new block number, if any.
+         * If there is one, give back a record value and a cursor too.
+         */
+        *ptrp = nptr;
+        if (!xfs_btree_ptr_is_null(cur, &nptr)) {
+                *recp = nrec;
+                *curp = ncur;
+        }
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 1;
+        return 0;
+error0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        return error;
+}
+/*
+ * Insert the record at the point referenced by cur.
+ *
+ * The record to insert is built from the cursor by ->init_rec_from_cur().
+ * Insertion starts at the leaf level (level 0) and moves up one level for
+ * every call to xfs_btree_insrec() that hands back a non-null block
+ * pointer, i.e. for as long as the level just processed was split.
+ *
+ * On success 0 is returned and *stat holds the result of the last
+ * xfs_btree_insrec() call.  On failure the error is returned and *stat is
+ * left untouched.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor.  All callers of this function should assume that the cursor is
+ * no longer valid and revalidate it.
+ */
+int
+xfs_btree_insert(
+        struct xfs_btree_cur    *cur,
+        int                     *stat)
+{
+        int                     error;  /* error return value */
+        int                     i;      /* result value, 0 for failure */
+        int                     level;  /* current level number in btree */
+        union xfs_btree_ptr     nptr;   /* new block number (split result) */
+        struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
+        struct xfs_btree_cur    *pcur;  /* previous level's cursor */
+        union xfs_btree_rec     rec;    /* record to insert */
+        level = 0;
+        ncur = NULL;
+        pcur = cur;
+        xfs_btree_set_ptr_null(cur, &nptr);
+        cur->bc_ops->init_rec_from_cur(cur, &rec);
+        /*
+         * Loop going up the tree, starting at the leaf level.
+         * Stop when we don't get a split block, that must mean that
+         * the insert is finished with this level.
+         */
+        do {
+                /*
+                 * Insert nrec/nptr into this level of the tree.
+                 * Note if we fail, nptr will be null.
+                 */
+                error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
+                if (error)
+                        goto error0;
+                /*
+                 * A "could not insert" result is treated as corruption.
+                 * Bail out through error0 so that a temporary cursor left
+                 * over from an earlier split is released on this path too.
+                 */
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                level++;
+                /*
+                 * See if the cursor we just used is trash.
+                 * Can't trash the caller's cursor, but otherwise we should
+                 * if ncur is a new cursor or we're about to be done.
+                 */
+                if (pcur != cur &&
+                    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
+                        /* Save the state from the cursor before we trash it */
+                        if (cur->bc_ops->update_cursor)
+                                cur->bc_ops->update_cursor(pcur, cur);
+                        cur->bc_nlevels = pcur->bc_nlevels;
+                        xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+                }
+                /* If we got a new cursor, switch to it. */
+                if (ncur) {
+                        pcur = ncur;
+                        ncur = NULL;
+                }
+        } while (!xfs_btree_ptr_is_null(cur, &nptr));
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = i;
+        return 0;
+error0:
+        /*
+         * pcur is either the caller's cursor, which we must not free, or a
+         * temporary one handed back by a split, which nobody else will free.
+         */
+        if (pcur != cur)
+                xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        return error;
+}
+/*
+ * Try to merge a non-leaf block back into the inode root.
+ *
+ * Note: the killroot name comes from the fact that we're effectively
+ * killing the old root block.  But because we can't just delete the
+ * inode we have to copy the single block it was pointing to into the
+ * inode.
+ *
+ * Nothing is done, and 0 is returned, when the root sits directly above
+ * the leaf level, when the root has more than one child, or when the
+ * child's entries do not fit into the inode root.
+ *
+ * Returns 0 on success or when no merge was attempted, otherwise the
+ * error from freeing the child block (or, on DEBUG builds, from checking
+ * the child's block pointers).
+ */
+STATIC int
+xfs_btree_kill_iroot(
+        struct xfs_btree_cur    *cur)
+{
+        int                     whichfork = cur->bc_private.b.whichfork;
+        struct xfs_inode        *ip = cur->bc_private.b.ip;
+        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+        struct xfs_btree_block  *block;         /* root block in the inode */
+        struct xfs_btree_block  *cblock;        /* the root's only child */
+        union xfs_btree_key     *kp;
+        union xfs_btree_key     *ckp;
+        union xfs_btree_ptr     *pp;
+        union xfs_btree_ptr     *cpp;
+        struct xfs_buf          *cbp;           /* buffer holding cblock */
+        int                     level;          /* level of the root block */
+        int                     index;          /* change in root record count */
+        int                     numrecs;        /* entries in the child block */
+        int                     error;
+#ifdef DEBUG
+        union xfs_btree_ptr     ptr;
+        int                     i;
+#endif
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+        ASSERT(cur->bc_nlevels > 1);
+        /*
+         * Don't handle the case where the root would have to become a
+         * leaf.  We're just going to turn the thing back into extents
+         * anyway.
+         */
+        level = cur->bc_nlevels - 1;
+        if (level == 1)
+                goto out0;
+        /*
+         * Give up if the root has multiple children.
+         */
+        block = xfs_btree_get_iroot(cur);
+        if (xfs_btree_get_numrecs(block) != 1)
+                goto out0;
+        cblock = xfs_btree_get_block(cur, level - 1, &cbp);
+        numrecs = xfs_btree_get_numrecs(cblock);
+        /*
+         * Only do this if the next level will fit.
+         * Then the data must be copied up to the inode,
+         * instead of freeing the root you free the next level.
+         */
+        if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
+                goto out0;
+        XFS_BTREE_STATS_INC(cur, killroot);
+#ifdef DEBUG
+        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+#endif
+        /*
+         * Resize the in-inode root by the difference between the child's
+         * entry count and what the root currently holds; the root pointer
+         * has to be re-read from the fork after a reallocation.
+         */
+        index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
+        if (index) {
+                xfs_iroot_realloc(ip, index, whichfork);
+                block = ifp->if_broot;
+        }
+        be16_add_cpu(&block->bb_numrecs, index);
+        ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+        /* Pull the child's keys and pointers up into the root. */
+        kp = xfs_btree_key_addr(cur, 1, block);
+        ckp = xfs_btree_key_addr(cur, 1, cblock);
+        xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+        pp = xfs_btree_ptr_addr(cur, 1, block);
+        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+        for (i = 0; i < numrecs; i++) {
+                error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
+                if (error) {
+                        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+                        return error;
+                }
+        }
+#endif
+        xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+        /*
+         * Free the now redundant child block.  Do not ignore a failure
+         * here; hand it back to the caller like xfs_btree_kill_root() does.
+         */
+        error = cur->bc_ops->free_block(cur, cbp);
+        if (error) {
+                XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+                return error;
+        }
+        XFS_BTREE_STATS_INC(cur, free);
+        /* The tree is one level shorter now; fix up cursor, root and log it. */
+        cur->bc_bufs[level - 1] = NULL;
+        be16_add_cpu(&block->bb_level, -1);
+        xfs_trans_log_inode(cur->bc_tp, ip,
+                XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork));
+        cur->bc_nlevels--;
+out0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        return 0;
+}
+/*
+ * Kill the current root node, and replace it with its only child node.
+ *
+ * The root pointer is switched to newroot (dropping the tree height by
+ * one) before the old root block in bp is freed.  Only when the free
+ * succeeds is the cursor's state for this level cleared and its level
+ * count reduced.
+ *
+ * Returns 0 on success, or the error from ->free_block().
+ */
+STATIC int
+xfs_btree_kill_root(
+        struct xfs_btree_cur    *cur,
+        struct xfs_buf          *bp,
+        int                     level,
+        union xfs_btree_ptr     *newroot)
+{
+        int                     error;
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_STATS_INC(cur, killroot);
+        /* Point the tree at the new root first, one level lower. */
+        cur->bc_ops->set_root(cur, newroot, -1);
+        /* Then release the block that used to be the root. */
+        error = cur->bc_ops->free_block(cur, bp);
+        if (error) {
+                XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        } else {
+                XFS_BTREE_STATS_INC(cur, free);
+                /* The cursor no longer has anything at the old root level. */
+                cur->bc_bufs[level] = NULL;
+                cur->bc_ra[level] = 0;
+                cur->bc_nlevels--;
+                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        }
+        return error;
+}
+/*
+ * Step the cursor back by one entry at the given level, unless that level
+ * is the leaf level (level 0), in which case the cursor is left alone.
+ *
+ * Returns the error from xfs_btree_decrement() if it fails; otherwise
+ * sets *stat to 1 and returns 0.  The decrement's own result value is
+ * not examined.
+ */
+STATIC int
+xfs_btree_dec_cursor(
+        struct xfs_btree_cur    *cur,
+        int                     level,
+        int                     *stat)
+{
+        int                     moved;          /* decrement result, unused */
+        int                     error = 0;
+        if (level > 0)
+                error = xfs_btree_decrement(cur, level, &moved);
+        if (error)
+                return error;
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = 1;
+        return 0;
+}
+/*
+ * Single level of the btree record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * Return 0 for error, 1 for done, 2 to go on to the next level.
+ */
+STATIC int                                      /* error */
+xfs_btree_delrec(
+        struct xfs_btree_cur    *cur,           /* btree cursor */
+        int                     level,          /* level removing record from */
+        int                     *stat)          /* fail/done/go-on */
+{
+        struct xfs_btree_block  *block;         /* btree block */
+        union xfs_btree_ptr     cptr;           /* current block ptr */
+        struct xfs_buf          *bp;            /* buffer for block */
+        int                     error;          /* error return value */
+        int                     i;              /* loop counter */
+        union xfs_btree_key     key;            /* storage for keyp */
+        union xfs_btree_key     *keyp = &key;   /* passed to the next level */
+        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
+        struct xfs_buf          *lbp;           /* left buffer pointer */
+        struct xfs_btree_block  *left;          /* left btree block */
+        int                     lrecs = 0;      /* left record count */
+        int                     ptr;            /* key/record index */
+        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
+        struct xfs_buf          *rbp;           /* right buffer pointer */
+        struct xfs_btree_block  *right;         /* right btree block */
+        struct xfs_btree_block  *rrblock;       /* right-right btree block */
+        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
+        int                     rrecs = 0;      /* right record count */
+        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
+        int                     numrecs;        /* temporary numrec count */
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        XFS_BTREE_TRACE_ARGI(cur, level);
+        tcur = NULL;
+        /* Get the index of the entry being deleted, check for nothing there. */
+        ptr = cur->bc_ptrs[level];
+        if (ptr == 0) {
+                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                *stat = 0;
+                return 0;
+        }
+        /* Get the buffer & block containing the record or key/ptr. */
+        block = xfs_btree_get_block(cur, level, &bp);
+        numrecs = xfs_btree_get_numrecs(block);
+#ifdef DEBUG
+        error = xfs_btree_check_block(cur, block, level, bp);
+        if (error)
+                goto error0;
+#endif
+        /* Fail if we're off the end of the block. */
+        if (ptr > numrecs) {
+                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                *stat = 0;
+                return 0;
+        }
+        XFS_BTREE_STATS_INC(cur, delrec);
+        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
+        /* Excise the entries being deleted. */
+        if (level > 0) {
+                /* It's a nonleaf. operate on keys and ptrs */
+                union xfs_btree_key     *lkp;
+                union xfs_btree_ptr     *lpp;
+                lkp = xfs_btree_key_addr(cur, ptr + 1, block);
+                lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
+#ifdef DEBUG
+                for (i = 0; i < numrecs - ptr; i++) {
+                        error = xfs_btree_check_ptr(cur, lpp, i, level);
+                        if (error)
+                                goto error0;
+                }
+#endif
+                if (ptr < numrecs) {
+                        xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
+                        xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
+                        xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
+                        xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
+                }
+                /*
+                 * If it's the first record in the block, we'll need to pass a
+                 * key up to the next level (updkey).
+                 */
+                if (ptr == 1)
+                        keyp = xfs_btree_key_addr(cur, 1, block);
+        } else {
+                /* It's a leaf. operate on records */
+                if (ptr < numrecs) {
+                        xfs_btree_shift_recs(cur,
+                                xfs_btree_rec_addr(cur, ptr + 1, block),
+                                -1, numrecs - ptr);
+                        xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
+                }
+                /*
+                 * If it's the first record in the block, we'll need a key
+                 * structure to pass up to the next level (updkey).
+                 */
+                if (ptr == 1) {
+                        cur->bc_ops->init_key_from_rec(&key,
+                                        xfs_btree_rec_addr(cur, 1, block));
+                        keyp = &key;
+                }
+        }
+        /*
+         * Decrement and log the number of entries in the block.
+         */
+        xfs_btree_set_numrecs(block, --numrecs);
+        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+        /*
+         * If we are tracking the last record in the tree and
+         * we are at the far right edge of the tree, update it.
+         */
+        if (xfs_btree_is_lastrec(cur, block, level)) {
+                cur->bc_ops->update_lastrec(cur, block, NULL,
+                                            ptr, LASTREC_DELREC);
+        }
+        /*
+         * We're at the root level.  First, shrink the root block in-memory.
+         * Try to get rid of the next level down.  If we can't then there's
+         * nothing left to do.
+         */
+        if (level == cur->bc_nlevels - 1) {
+                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+                        xfs_iroot_realloc(cur->bc_private.b.ip, -1,
+                                          cur->bc_private.b.whichfork);
+                        error = xfs_btree_kill_iroot(cur);
+                        if (error)
+                                goto error0;
+                        error = xfs_btree_dec_cursor(cur, level, stat);
+                        if (error)
+                                goto error0;
+                        *stat = 1;
+                        return 0;
+                }
+                /*
+                 * If this is the root level, and there's only one entry left,
+                 * and it's NOT the leaf level, then we can get rid of this
+                 * level.
+                 */
+                if (numrecs == 1 && level > 0) {
+                        union xfs_btree_ptr     *pp;
+                        /*
+                         * pp is still set to the first pointer in the block.
+                         * Make it the new root of the btree.
+                         */
+                        pp = xfs_btree_ptr_addr(cur, 1, block);
+                        error = xfs_btree_kill_root(cur, bp, level, pp);
+                        if (error)
+                                goto error0;
+                } else if (level > 0) {
+                        error = xfs_btree_dec_cursor(cur, level, stat);
+                        if (error)
+                                goto error0;
+                }
+                *stat = 1;
+                return 0;
+        }
+        /*
+         * If we deleted the leftmost entry in the block, update the
+         * key values above us in the tree.
+         */
+        if (ptr == 1) {
+                error = xfs_btree_updkey(cur, keyp, level + 1);
+                if (error)
+                        goto error0;
+        }
+        /*
+         * If the number of records remaining in the block is at least
+         * the minimum, we're done.
+         */
+        if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
+                error = xfs_btree_dec_cursor(cur, level, stat);
+                if (error)
+                        goto error0;
+                return 0;
+        }
+        /*
+         * Otherwise, we have to move some records around to keep the
+         * tree balanced.  Look at the left and right sibling blocks to
+         * see if we can re-balance by moving only one record.
+         */
+        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+        xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
+        if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+                /*
+                 * One child of root, need to get a chance to copy its contents
+                 * into the root and delete it. Can't go up to next level,
+                 * there's nothing to delete there.
+                 */
+                if (xfs_btree_ptr_is_null(cur, &rptr) &&
+                    xfs_btree_ptr_is_null(cur, &lptr) &&
+                    level == cur->bc_nlevels - 2) {
+                        error = xfs_btree_kill_iroot(cur);
+                        if (!error)
+                                error = xfs_btree_dec_cursor(cur, level, stat);
+                        if (error)
+                                goto error0;
+                        return 0;
+                }
+        }
+        ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
+               !xfs_btree_ptr_is_null(cur, &lptr));
+        /*
+         * Duplicate the cursor so our btree manipulations here won't
+         * disrupt the next level up.
+         */
+        error = xfs_btree_dup_cursor(cur, &tcur);
+        if (error)
+                goto error0;
+        /*
+         * If there's a right sibling, see if it's ok to shift an entry
+         * out of it.
+         */
+        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+                /*
+                 * Move the temp cursor to the last entry in the next block.
+                 * Actually any entry but the first would suffice.
+                 */
+                i = xfs_btree_lastrec(tcur, level);
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                error = xfs_btree_increment(tcur, level, &i);
+                if (error)
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                i = xfs_btree_lastrec(tcur, level);
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                /* Grab a pointer to the block. */
+                right = xfs_btree_get_block(tcur, level, &rbp);
+#ifdef DEBUG
+                error = xfs_btree_check_block(tcur, right, level, rbp);
+                if (error)
+                        goto error0;
+#endif
+                /* Grab the current block number, for future use. */
+                xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
+                /*
+                 * If right block is full enough so that removing one entry
+                 * won't make it too empty, and left-shifting an entry out
+                 * of right to us works, we're done.
+                 */
+                if (xfs_btree_get_numrecs(right) - 1 >=
+                    cur->bc_ops->get_minrecs(tcur, level)) {
+                        error = xfs_btree_lshift(tcur, level, &i);
+                        if (error)
+                                goto error0;
+                        if (i) {
+                                ASSERT(xfs_btree_get_numrecs(block) >=
+                                       cur->bc_ops->get_minrecs(tcur, level));
+                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+                                tcur = NULL;
+                                error = xfs_btree_dec_cursor(cur, level, stat);
+                                if (error)
+                                        goto error0;
+                                return 0;
+                        }
+                }
+                /*
+                 * Otherwise, grab the number of records in right for
+                 * future reference, and fix up the temp cursor to point
+                 * to our block again (last record).
+                 */
+                rrecs = xfs_btree_get_numrecs(right);
+                if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+                        i = xfs_btree_firstrec(tcur, level);
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                        error = xfs_btree_decrement(tcur, level, &i);
+                        if (error)
+                                goto error0;
+                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                }
+        }
+        /*
+         * If there's a left sibling, see if it's ok to shift an entry
+         * out of it.
+         */
+        if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+                /*
+                 * Move the temp cursor to the first entry in the
+                 * previous block.
+                 */
+                i = xfs_btree_firstrec(tcur, level);
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                error = xfs_btree_decrement(tcur, level, &i);
+                if (error)
+                        goto error0;
+                i = xfs_btree_firstrec(tcur, level);
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                /* Grab a pointer to the block. */
+                left = xfs_btree_get_block(tcur, level, &lbp);
+#ifdef DEBUG
+                error = xfs_btree_check_block(cur, left, level, lbp);
+                if (error)
+                        goto error0;
+#endif
+                /* Grab the current block number, for future use. */
+                xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
+                /*
+                 * If left block is full enough so that removing one entry
+                 * won't make it too empty, and right-shifting an entry out
+                 * of left to us works, we're done.
+                 */
+                if (xfs_btree_get_numrecs(left) - 1 >=
+                    cur->bc_ops->get_minrecs(tcur, level)) {
+                        error = xfs_btree_rshift(tcur, level, &i);
+                        if (error)
+                                goto error0;
+                        if (i) {
+                                ASSERT(xfs_btree_get_numrecs(block) >=
+                                       cur->bc_ops->get_minrecs(tcur, level));
+                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+                                tcur = NULL;
+                                if (level == 0)
+                                        cur->bc_ptrs[0]++;
+                                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                                *stat = 1;
+                                return 0;
+                        }
+                }
+                /*
+                 * Otherwise, grab the number of records in left for
+                 * future reference.
+                 */
+                lrecs = xfs_btree_get_numrecs(left);
+        }
+        /* Delete the temp cursor, we're done with it. */
+        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+        tcur = NULL;
+        /* If here, we need to do a join to keep the tree balanced. */
+        ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
+        if (!xfs_btree_ptr_is_null(cur, &lptr) &&
+            lrecs + xfs_btree_get_numrecs(block) <=
+                        cur->bc_ops->get_maxrecs(cur, level)) {
+                /*
+                 * Set "right" to be the starting block,
+                 * "left" to be the left neighbor.
+                 */
+                rptr = cptr;
+                right = block;
+                rbp = bp;
+                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
+                if (error)
+                        goto error0;
+        /*
+         * If that won't work, see if we can join with the right neighbor block.
+         */
+        } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
+                   rrecs + xfs_btree_get_numrecs(block) <=
+                        cur->bc_ops->get_maxrecs(cur, level)) {
+                /*
+                 * Set "left" to be the starting block,
+                 * "right" to be the right neighbor.
+                 */
+                lptr = cptr;
+                left = block;
+                lbp = bp;
+                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
+                if (error)
+                        goto error0;
+        /*
+         * Otherwise, we can't fix the imbalance.
+         * Just return.  This is probably a logic error, but it's not fatal.
+         */
+        } else {
+                error = xfs_btree_dec_cursor(cur, level, stat);
+                if (error)
+                        goto error0;
+                return 0;
+        }
+        rrecs = xfs_btree_get_numrecs(right);
+        lrecs = xfs_btree_get_numrecs(left);
+        /*
+         * We're now going to join "left" and "right" by moving all the stuff
+         * in "right" to "left" and deleting "right".
+         */
+        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+        if (level > 0) {
+                /* It's a non-leaf.  Move keys and pointers. */
+                union xfs_btree_key     *lkp;   /* left btree key */
+                union xfs_btree_ptr     *lpp;   /* left address pointer */
+                union xfs_btree_key     *rkp;   /* right btree key */
+                union xfs_btree_ptr     *rpp;   /* right address pointer */
+                lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
+                lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
+                rkp = xfs_btree_key_addr(cur, 1, right);
+                rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+                for (i = 1; i < rrecs; i++) {
+                        error = xfs_btree_check_ptr(cur, rpp, i, level);
+                        if (error)
+                                goto error0;
+                }
+#endif
+                xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
+                xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
+                xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
+                xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
+        } else {
+                /* It's a leaf.  Move records.  */
+                union xfs_btree_rec     *lrp;   /* left record pointer */
+                union xfs_btree_rec     *rrp;   /* right record pointer */
+                lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
+                rrp = xfs_btree_rec_addr(cur, 1, right);
+                xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
+                xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
+        }
+        XFS_BTREE_STATS_INC(cur, join);
+        /*
+         * Fix up the number of records and right block pointer in the
+         * surviving block, and log it.
+         */
+        xfs_btree_set_numrecs(left, lrecs + rrecs);
+        xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
+        xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+        /* If there is a right sibling, point it to the remaining block. */
+        xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+        if (!xfs_btree_ptr_is_null(cur, &cptr)) {
+                error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
+                if (error)
+                        goto error0;
+                xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
+                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+        }
+        /* Free the deleted block. */
+        error = cur->bc_ops->free_block(cur, rbp);
+        if (error)
+                goto error0;
+        XFS_BTREE_STATS_INC(cur, free);
+        /*
+         * If we joined with the left neighbor, set the buffer in the
+         * cursor to the left block, and fix up the index.
+         */
+        if (bp != lbp) {
+                cur->bc_bufs[level] = lbp;
+                cur->bc_ptrs[level] += lrecs;
+                cur->bc_ra[level] = 0;
+        }
+        /*
+         * If we joined with the right neighbor and there's a level above
+         * us, increment the cursor at that level.
+         */
+        else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
+                   (level + 1 < cur->bc_nlevels)) {
+                error = xfs_btree_increment(cur, level + 1, &i);
+                if (error)
+                        goto error0;
+        }
+        /*
+         * Readjust the ptr at this level if it's not a leaf, since it's
+         * still pointing at the deletion point, which makes the cursor
+         * inconsistent.  If this makes the ptr 0, the caller fixes it up.
+         * We can't use decrement because it would change the next level up.
+         */
+        if (level > 0)
+                cur->bc_ptrs[level]--;
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        /* Return value means the next level up has something to do. */
+        *stat = 2;
+        return 0;
+error0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        if (tcur)
+                xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+        return error;
+}
+/*
+ * Remove the record the cursor currently points at.
+ *
+ * Deletion starts at the leaf (level 0) and keeps moving up one level for
+ * as long as xfs_btree_delrec() reports status 2, meaning a join was done
+ * and the next level up has something to do.  On return the cursor refers
+ * to the place where the record was (where one could be inserted).
+ *
+ * *stat receives the last status produced by the helpers.  The function
+ * returns 0, or the error handed back by a helper.
+ */
+int                                     /* error */
+xfs_btree_delete(
+        struct xfs_btree_cur    *cur,
+        int                     *stat)  /* success/failure */
+{
+        int                     error;  /* error return value */
+        int                     level;  /* btree level being worked on */
+        int                     i;      /* status from the helpers */
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        /*
+         * Climb from the leaf level; a status of 2 means the level above
+         * still needs its entry removed, anything else ends the climb.
+         */
+        level = 0;
+        i = 2;
+        do {
+                error = xfs_btree_delrec(cur, level, &i);
+                if (error)
+                        goto error0;
+                level++;
+        } while (i == 2);
+        if (i == 0) {
+                /*
+                 * Find the lowest non-leaf level whose cursor index was
+                 * left at 0 and step the cursor back there.
+                 */
+                level = 1;
+                while (level < cur->bc_nlevels && cur->bc_ptrs[level] != 0)
+                        level++;
+                if (level < cur->bc_nlevels) {
+                        error = xfs_btree_decrement(cur, level, &i);
+                        if (error)
+                                goto error0;
+                }
+        }
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        *stat = i;
+        return 0;
+error0:
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+        return error;
+}
+/*
+ * Hand back a pointer to the record at the cursor's leaf-level position.
+ *
+ * When the level-0 cursor index lies within 1..numrecs of the leaf block,
+ * *recp is pointed at that record and *stat is set to 1.  Otherwise *stat
+ * is set to 0 and *recp is left untouched.  Returns 0, or (DEBUG builds
+ * only) the error from xfs_btree_check_block().
+ */
+int                                     /* error */
+xfs_btree_get_rec(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        union xfs_btree_rec     **recp, /* output: btree record */
+        int                     *stat)  /* output: success/failure */
+{
+        struct xfs_btree_block  *leaf;  /* level-0 block under the cursor */
+        struct xfs_buf          *bp;    /* buffer pointer */
+        int                     idx;    /* record number */
+#ifdef DEBUG
+        int                     error;  /* error return value */
+#endif
+        idx = cur->bc_ptrs[0];
+        leaf = xfs_btree_get_block(cur, 0, &bp);
+#ifdef DEBUG
+        error = xfs_btree_check_block(cur, leaf, 0, bp);
+        if (error)
+                return error;
+#endif
+        /*
+         * Only an index inside the block yields a record; off either end
+         * is reported as failure through *stat.
+         */
+        if (idx <= xfs_btree_get_numrecs(leaf) && idx > 0) {
+                *recp = xfs_btree_rec_addr(cur, idx, leaf);
+                *stat = 1;
+        } else {
+                *stat = 0;
+        }
+        return 0;
+}
+/*
+ * Change the owner of a btree.
+ *
+ * The mechanism we use here is ordered buffer logging. Because we don't know
+ * how many buffers we are going to need to modify, we don't really want to
+ * have to make transaction reservations for the worst case of every buffer in
+ * a full size btree, as that may be more space than we can fit in the log.
+ *
+ * The walk uses the sibling pointers: all the blocks of one level are visited
+ * from left to right in a single pass, then the walk drops to the next level
+ * and does the same. Readahead is issued on the sibling pointers to get IO
+ * moving early, though for slow disks this is unlikely to make much
+ * difference as the CPU work per block is relatively small.
+ *
+ * For each btree block that we load, modify the owner, set the buffer as an
+ * ordered buffer and log it. The changed region has to be marked dirty so
+ * that if the buffer is relogged in a subsequent transaction the change made
+ * here as an ordered buffer is correctly relogged in that transaction. If
+ * there is no transaction (recovery context), just queue the modified buffer
+ * as a delayed write buffer so that transaction recovery completion writes
+ * the change to disk.
+ *
+ * This helper handles the block the cursor holds at @level and then moves
+ * the cursor to that block's right sibling. It returns -ENOENT when there is
+ * no right sibling, otherwise the result of looking up the sibling.
+ */
+static int
+xfs_btree_block_change_owner(
+        struct xfs_btree_cur    *cur,
+        int                     level,
+        __uint64_t              new_owner,
+        struct list_head        *buffer_list)
+{
+        struct xfs_btree_block  *blk;
+        struct xfs_buf          *bp;
+        union xfs_btree_ptr     next;
+        /* start readahead of the right sibling */
+        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+        /* stamp the new owner, in the header layout this btree uses */
+        blk = xfs_btree_get_block(cur, level, &bp);
+        if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
+                blk->bb_u.s.bb_owner = cpu_to_be32(new_owner);
+        else
+                blk->bb_u.l.bb_owner = cpu_to_be64(new_owner);
+        /*
+         * A root block hosted in an inode may come without a buffer. There
+         * is nothing to log then: the information is already held in the
+         * inode and discarded when the root block is formatted into the
+         * on-disk inode fork. The in-memory copy was still updated above so
+         * everything stays consistent.
+         */
+        if (!bp) {
+                ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+                ASSERT(level == cur->bc_nlevels - 1);
+        } else if (!cur->bc_tp) {
+                xfs_buf_delwri_queue(bp, buffer_list);
+        } else {
+                xfs_trans_ordered_buf(cur->bc_tp, bp);
+                xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+        }
+        /* move on to the right hand sibling, if the level has one left */
+        xfs_btree_get_sibling(cur, blk, &next, XFS_BB_RIGHTSIB);
+        if (xfs_btree_ptr_is_null(cur, &next))
+                return -ENOENT;
+        return xfs_btree_lookup_get_block(cur, level, &next, &blk);
+}
+/*
+ * Set the owner of every block in the btree to @new_owner.
+ *
+ * Levels are processed from the top (bc_nlevels - 1) down to the leaves.
+ * For each level the leftmost block is looked up, then
+ * xfs_btree_block_change_owner() is called repeatedly; it walks right along
+ * the sibling chain until it returns -ENOENT, which marks the end of the
+ * level.  Any other error is returned to the caller.  @buffer_list is
+ * passed through unchanged to xfs_btree_block_change_owner().
+ */
+int
+xfs_btree_change_owner(
+        struct xfs_btree_cur    *cur,
+        __uint64_t              new_owner,
+        struct list_head        *buffer_list)
+{
+        union xfs_btree_ptr     first;  /* leftmost block of the level */
+        int                     level;
+        struct xfs_btree_block  *block = NULL;
+        int                     error = 0;
+        cur->bc_ops->init_ptr_from_cur(cur, &first);
+        level = cur->bc_nlevels - 1;
+        while (level >= 0) {
+                /* load the left hand block of this level */
+                error = xfs_btree_lookup_get_block(cur, level, &first, &block);
+                if (error)
+                        return error;
+                if (level > 0) {
+                        union xfs_btree_ptr     *child;
+                        /*
+                         * The first pointer in this block leads to the
+                         * leftmost block one level down: read it ahead and
+                         * remember it for the next pass of the loop.
+                         */
+                        child = xfs_btree_ptr_addr(cur, 1, block);
+                        xfs_btree_readahead_ptr(cur, child, 1);
+                        first = *child;
+                }
+                /* visit every block in the level, left to right */
+                for (;;) {
+                        error = xfs_btree_block_change_owner(cur, level,
+                                                             new_owner,
+                                                             buffer_list);
+                        if (error)
+                                break;
+                }
+                /* -ENOENT only means the level ran out of siblings */
+                if (error != -ENOENT)
+                        return error;
+                level--;
+        }
+        return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
new file mode 100644
index 000000000000..a04b69422f67
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BTREE_H__
+#define __XFS_BTREE_H__
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+extern kmem_zone_t      *xfs_btree_cur_zone;
+/*
+ * Generic key, ptr and record wrapper structures.
+ *
+ * These are disk format structures, and are converted where necessary
+ * by the btree specific code that needs to interpret them.
+ *
+ * Each union overlays the per-btree variants so that the generic btree
+ * code can hand keys, pointers and records around by a single type.
+ */
+/* Block pointer: 32-bit short form or 64-bit long form, both big-endian. */
+union xfs_btree_ptr {
+        __be32                  s;      /* short form ptr */
+        __be64                  l;      /* long form ptr */
+};
+/* Key, one variant per btree type. */
+union xfs_btree_key {
+        xfs_bmbt_key_t          bmbt;
+        xfs_bmdr_key_t          bmbr;   /* bmbt root block */
+        xfs_alloc_key_t         alloc;
+        xfs_inobt_key_t         inobt;
+};
+/* Record, one variant per btree type. */
+union xfs_btree_rec {
+        xfs_bmbt_rec_t          bmbt;
+        xfs_bmdr_rec_t          bmbr;   /* bmbt root block */
+        xfs_alloc_rec_t         alloc;
+        xfs_inobt_rec_t         inobt;
+};
+/*
+ * This nonsense is to make -wlint happy.
+ *
+ * Each macro is the like-named constant with an "i" suffix, cast to the
+ * xfs_lookup_t or xfs_btnum_t type.
+ */
+#define XFS_LOOKUP_EQ   ((xfs_lookup_t)XFS_LOOKUP_EQi)
+#define XFS_LOOKUP_LE   ((xfs_lookup_t)XFS_LOOKUP_LEi)
+#define XFS_LOOKUP_GE   ((xfs_lookup_t)XFS_LOOKUP_GEi)
+#define XFS_BTNUM_BNO   ((xfs_btnum_t)XFS_BTNUM_BNOi)
+#define XFS_BTNUM_CNT   ((xfs_btnum_t)XFS_BTNUM_CNTi)
+#define XFS_BTNUM_BMAP  ((xfs_btnum_t)XFS_BTNUM_BMAPi)
+#define XFS_BTNUM_INO   ((xfs_btnum_t)XFS_BTNUM_INOi)
+#define XFS_BTNUM_FINO  ((xfs_btnum_t)XFS_BTNUM_FINOi)
+/*
+ * For logging record fields.
+ *
+ * One bit per btree block header field; the bits are OR-ed together and
+ * passed as the fields argument of xfs_btree_log_block().
+ */
+#define XFS_BB_MAGIC            (1 << 0)
+#define XFS_BB_LEVEL            (1 << 1)
+#define XFS_BB_NUMRECS          (1 << 2)
+#define XFS_BB_LEFTSIB          (1 << 3)
+#define XFS_BB_RIGHTSIB         (1 << 4)
+#define XFS_BB_BLKNO            (1 << 5)
+#define XFS_BB_LSN              (1 << 6)
+#define XFS_BB_UUID             (1 << 7)
+#define XFS_BB_OWNER            (1 << 8)
+/* The first five bits (MAGIC through RIGHTSIB). */
+#define XFS_BB_NUM_BITS         5
+#define XFS_BB_ALL_BITS         ((1 << XFS_BB_NUM_BITS) - 1)
+/* All nine bits (MAGIC through OWNER). */
+#define XFS_BB_NUM_BITS_CRC     9
+#define XFS_BB_ALL_BITS_CRC     ((1 << XFS_BB_NUM_BITS_CRC) - 1)
+/*
+ * Generic stats interface
+ *
+ * XFS_BTREE_STATS_INC(cur, stat) and XFS_BTREE_STATS_ADD(cur, stat, val)
+ * select a per-btree-type counter from the cursor's bc_btnum and forward to
+ * XFS_STATS_INC / XFS_STATS_ADD with the token-pasted counter name
+ * xs_<type>_2_<stat>.
+ *
+ * The cursor argument is parenthesized before it is dereferenced so that
+ * any pointer expression can be passed safely.
+ */
+#define __XFS_BTREE_STATS_INC(type, stat) \
+        XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
+#define XFS_BTREE_STATS_INC(cur, stat)  \
+do {    \
+        switch ((cur)->bc_btnum) {  \
+        case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break;   \
+        case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break;   \
+        case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break;  \
+        case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break;    \
+        case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break;  \
+        case XFS_BTNUM_MAX: ASSERT(0); /* listed only to placate gcc */ ; break; \
+        }       \
+} while (0)
+#define __XFS_BTREE_STATS_ADD(type, stat, val) \
+        XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
+#define XFS_BTREE_STATS_ADD(cur, stat, val)  \
+do {    \
+        switch ((cur)->bc_btnum) {  \
+        case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
+        case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
+        case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
+        case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
+        case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \
+        case XFS_BTNUM_MAX: ASSERT(0); /* listed only to placate gcc */ ; break; \
+        }       \
+} while (0)
+#define XFS_BTREE_MAXLEVELS     8       /* max of all btrees */
+/*
+ * Per-btree-type operations vector.
+ *
+ * The generic btree code reaches type-specific behaviour through the
+ * cursor's bc_ops pointer, e.g. cur->bc_ops->free_block(cur, bp) or
+ * cur->bc_ops->init_ptr_from_cur(cur, &ptr).
+ */
+struct xfs_btree_ops {
+        /* size of the key and record structures */
+        size_t  key_len;
+        size_t  rec_len;
+        /* cursor operations */
+        struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
+        void    (*update_cursor)(struct xfs_btree_cur *src,
+                                 struct xfs_btree_cur *dst);
+        /* update btree root pointer */
+        void    (*set_root)(struct xfs_btree_cur *cur,
+                            union xfs_btree_ptr *nptr, int level_change);
+        /* block allocation / freeing */
+        int     (*alloc_block)(struct xfs_btree_cur *cur,
+                               union xfs_btree_ptr *start_bno,
+                               union xfs_btree_ptr *new_bno,
+                               int *stat);
+        int     (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+        /* update last record information */
+        void    (*update_lastrec)(struct xfs_btree_cur *cur,
+                                  struct xfs_btree_block *block,
+                                  union xfs_btree_rec *rec,
+                                  int ptr, int reason);
+        /* records in block/level */
+        int     (*get_minrecs)(struct xfs_btree_cur *cur, int level);
+        int     (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
+        /* records on disk.  Matter for the root in inode case. */
+        int     (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
+        /* init values of btree structures */
+        void    (*init_key_from_rec)(union xfs_btree_key *key,
+                                     union xfs_btree_rec *rec);
+        void    (*init_rec_from_key)(union xfs_btree_key *key,
+                                     union xfs_btree_rec *rec);
+        void    (*init_rec_from_cur)(struct xfs_btree_cur *cur,
+                                     union xfs_btree_rec *rec);
+        void    (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
+                                     union xfs_btree_ptr *ptr);
+        /* difference between key value and cursor value */
+        __int64_t (*key_diff)(struct xfs_btree_cur *cur,
+                              union xfs_btree_key *key);
+        /* NOTE(review): presumably the buffer ops for this btree's blocks - confirm */
+        const struct xfs_buf_ops        *buf_ops;
+        /* ordering checks, only present in DEBUG or XFS_WARN builds */
+#if defined(DEBUG) || defined(XFS_WARN)
+        /* check that k1 is lower than k2 */
+        int     (*keys_inorder)(struct xfs_btree_cur *cur,
+                                union xfs_btree_key *k1,
+                                union xfs_btree_key *k2);
+        /* check that r1 is lower than r2 */
+        int     (*recs_inorder)(struct xfs_btree_cur *cur,
+                                union xfs_btree_rec *r1,
+                                union xfs_btree_rec *r2);
+#endif
+};
+/*
+ * Reasons for the update_lastrec method to be called.
+ * These are the values of that method's "reason" argument.
+ */
+#define LASTREC_UPDATE  0
+#define LASTREC_INSREC  1
+#define LASTREC_DELREC  2
+/*
+ * Btree cursor structure.
+ * This collects all information needed by the btree code in one place.
+ *
+ * bc_bufs, bc_ptrs and bc_ra are per-level arrays indexed by btree level,
+ * with level 0 being the leaf level.  bc_private holds the data that is
+ * specific to the kind of btree the cursor walks.
+ */
+typedef struct xfs_btree_cur
+{
+        struct xfs_trans        *bc_tp; /* transaction we're in, if any */
+        struct xfs_mount        *bc_mp; /* file system mount struct */
+        const struct xfs_btree_ops *bc_ops;     /* per-btree-type methods */
+        uint                    bc_flags; /* btree features - below */
+        union {
+                xfs_alloc_rec_incore_t  a;      /* alloc record form */
+                xfs_bmbt_irec_t         b;      /* bmbt record form */
+                xfs_inobt_rec_incore_t  i;      /* inobt record form */
+        }               bc_rec;         /* current insert/search record value */
+        struct xfs_buf  *bc_bufs[XFS_BTREE_MAXLEVELS];  /* buf ptr per level */
+        int             bc_ptrs[XFS_BTREE_MAXLEVELS];   /* key/record # */
+        __uint8_t       bc_ra[XFS_BTREE_MAXLEVELS];     /* readahead bits */
+#define XFS_BTCUR_LEFTRA        1       /* left sibling has been read-ahead */
+#define XFS_BTCUR_RIGHTRA       2       /* right sibling has been read-ahead */
+        __uint8_t       bc_nlevels;     /* number of levels in the tree */
+        __uint8_t       bc_blocklog;    /* log2(blocksize) of btree blocks */
+        xfs_btnum_t     bc_btnum;       /* identifies which btree type */
+        union {
+                struct {                        /* needed for BNO, CNT, INO */
+                        struct xfs_buf  *agbp;  /* agf/agi buffer pointer */
+                        xfs_agnumber_t  agno;   /* ag number */
+                } a;
+                struct {                        /* needed for BMAP */
+                        struct xfs_inode *ip;   /* pointer to our inode */
+                        struct xfs_bmap_free *flist;    /* list to free after */
+                        xfs_fsblock_t   firstblock;     /* 1st blk allocated */
+                        int             allocated;      /* count of alloced */
+                        short           forksize;       /* fork's inode space */
+                        char            whichfork;      /* data or attr fork */
+                        char            flags;          /* flags */
+#define XFS_BTCUR_BPRV_WASDEL   1                       /* was delayed */
+                } b;
+        }               bc_private;     /* per-btree type data */
+} xfs_btree_cur_t;
+/* cursor flags (bits of xfs_btree_cur.bc_flags) */
+#define XFS_BTREE_LONG_PTRS             (1<<0)  /* pointers are 64bits long */
+#define XFS_BTREE_ROOT_IN_INODE         (1<<1)  /* root may be variable size */
+#define XFS_BTREE_LASTREC_UPDATE        (1<<2)  /* track last rec externally */
+#define XFS_BTREE_CRC_BLOCKS            (1<<3)  /* uses extended btree blocks */
+/* values for the error argument of xfs_btree_del_cursor() */
+#define XFS_BTREE_NOERROR       0
+#define XFS_BTREE_ERROR         1
+/*
+ * Convert from buffer to btree block header.
+ * The block header is the buffer's b_addr, viewed as a btree block.
+ */
+#define XFS_BUF_TO_BLOCK(bp)    ((struct xfs_btree_block *)((bp)->b_addr))
+/*
+ * Check that block header is ok.
+ * Callers treat a nonzero return value as an error.
+ */
+int
+xfs_btree_check_block(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        struct xfs_btree_block  *block, /* generic btree block pointer */
+        int                     level,  /* level of the btree block */
+        struct xfs_buf          *bp);   /* buffer containing block, if any */
+/*
+ * Check that (long) pointer is ok.
+ */
+int                                     /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_dfsbno_t            ptr,    /* btree block disk address */
+        int                     level); /* btree block level */
+/*
+ * Delete the btree cursor.
+ * The error argument takes XFS_BTREE_ERROR or XFS_BTREE_NOERROR.
+ */
+void
+xfs_btree_del_cursor(
+        xfs_btree_cur_t         *cur,   /* btree cursor */
+        int                     error); /* del because of error */
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int                                     /* error */
+xfs_btree_dup_cursor(
+        xfs_btree_cur_t         *cur,   /* input cursor */
+        xfs_btree_cur_t         **ncur);/* output cursor */
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Long-form addressing.
+ */
+struct xfs_buf *                                /* buffer for fsbno */
+xfs_btree_get_bufl(
+        struct xfs_mount        *mp,    /* file system mount point */
+        struct xfs_trans        *tp,    /* transaction pointer */
+        xfs_fsblock_t           fsbno,  /* file system block number */
+        uint                    lock);  /* lock flags for get_buf */
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Short-form addressing.
+ */
+struct xfs_buf *                                /* buffer for agno/agbno */
+xfs_btree_get_bufs(
+        struct xfs_mount        *mp,    /* file system mount point */
+        struct xfs_trans        *tp,    /* transaction pointer */
+        xfs_agnumber_t          agno,   /* allocation group number */
+        xfs_agblock_t           agbno,  /* allocation group block number */
+        uint                    lock);  /* lock flags for get_buf */
+/*
+ * Check for the cursor referring to the last block at the given level.
+ */
+int                                     /* 1=is last block, 0=not last block */
+xfs_btree_islastblock(
+        xfs_btree_cur_t         *cur,   /* btree cursor */
+        int                     level); /* level to check */
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+        __int64_t               fields, /* bitmask of fields */
+        const short             *offsets,/* table of field offsets */
+        int                     nbits,  /* number of bits to inspect */
+        int                     *first, /* output: first byte offset */
+        int                     *last); /* output: last byte offset */
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int                                     /* error */
+xfs_btree_read_bufl(
+        struct xfs_mount        *mp,    /* file system mount point */
+        struct xfs_trans        *tp,    /* transaction pointer */
+        xfs_fsblock_t           fsbno,  /* file system block number */
+        uint                    lock,   /* lock flags for read_buf */
+        struct xfs_buf          **bpp,  /* buffer for fsbno */
+        int                     refval, /* ref count value for buffer */
+        const struct xfs_buf_ops *ops); /* buffer ops to use for the read */
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+void                                    /* nothing is returned */
+xfs_btree_reada_bufl(
+        struct xfs_mount        *mp,    /* file system mount point */
+        xfs_fsblock_t           fsbno,  /* file system block number */
+        xfs_extlen_t            count,  /* count of filesystem blocks */
+        const struct xfs_buf_ops *ops); /* buffer ops to use for the read */
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+void                                    /* nothing is returned */
+xfs_btree_reada_bufs(
+        struct xfs_mount        *mp,    /* file system mount point */
+        xfs_agnumber_t          agno,   /* allocation group number */
+        xfs_agblock_t           agbno,  /* allocation group block number */
+        xfs_extlen_t            count,  /* count of filesystem blocks */
+        const struct xfs_buf_ops *ops); /* buffer ops to use for the read */
+/*
+ * Initialise a new btree block header
+ *
+ * xfs_btree_init_block() takes the buffer holding the block, while
+ * xfs_btree_init_block_int() takes the block header and its disk address
+ * directly.
+ * NOTE(review): flags presumably carries XFS_BTREE_* cursor flags - confirm.
+ */
+void
+xfs_btree_init_block(
+        struct xfs_mount *mp,
+        struct xfs_buf  *bp,
+        __u32           magic,
+        __u16           level,
+        __u16           numrecs,
+        __u64           owner,
+        unsigned int    flags);
+void
+xfs_btree_init_block_int(
+        struct xfs_mount        *mp,
+        struct xfs_btree_block  *buf,
+        xfs_daddr_t             blkno,
+        __u32                   magic,
+        __u16                   level,
+        __u16                   numrecs,
+        __u64                   owner,
+        unsigned int            flags);
+/*
+ * Common btree core entry points.
+ *
+ * All return an int error code.  Where the last parameter is an int
+ * pointer it is a status output, as in xfs_btree_delete() and
+ * xfs_btree_get_rec().
+ */
+int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
+int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
+int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
+int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
+int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
+int xfs_btree_insert(struct xfs_btree_cur *, int *);
+int xfs_btree_delete(struct xfs_btree_cur *, int *);
+int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
+int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
+                           struct list_head *buffer_list);
+/*
+ * btree block CRC helpers
+ * (lblock / sblock variants; presumably long- and short-form blocks)
+ */
+void xfs_btree_lblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
+void xfs_btree_sblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
+/*
+ * Internal btree helpers also used by xfs_bmap.c.
+ */
+/* log the header fields selected by the XFS_BB_* bit mask */
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
+/* log a range of records, given as first and last record index */
+void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
+/*
+ * Helpers.
+ */
+/* Read the block's record count, converting it from big-endian. */
+static inline int
+xfs_btree_get_numrecs(
+        struct xfs_btree_block  *block)
+{
+        int                     numrecs = be16_to_cpu(block->bb_numrecs);
+        return numrecs;
+}
+/* Store a new record count in the block header, in big-endian form. */
+static inline void
+xfs_btree_set_numrecs(
+        struct xfs_btree_block  *block,
+        __uint16_t              numrecs)
+{
+        __be16                  ondisk = cpu_to_be16(numrecs);
+        block->bb_numrecs = ondisk;
+}
+/* Read the block's level, converting it from big-endian. */
+static inline int
+xfs_btree_get_level(
+        struct xfs_btree_block  *block)
+{
+        int                     level = be16_to_cpu(block->bb_level);
+        return level;
+}
+/*
+ * Min and max functions for extlen, agblock, fileoff, and filblks types.
+ * Each one wraps min_t()/max_t() with the type named in the macro.
+ */
+#define XFS_EXTLEN_MIN(a,b)     min_t(xfs_extlen_t, (a), (b))
+#define XFS_EXTLEN_MAX(a,b)     max_t(xfs_extlen_t, (a), (b))
+#define XFS_AGBLOCK_MIN(a,b)    min_t(xfs_agblock_t, (a), (b))
+#define XFS_AGBLOCK_MAX(a,b)    max_t(xfs_agblock_t, (a), (b))
+#define XFS_FILEOFF_MIN(a,b)    min_t(xfs_fileoff_t, (a), (b))
+#define XFS_FILEOFF_MAX(a,b)    max_t(xfs_fileoff_t, (a), (b))
+#define XFS_FILBLKS_MIN(a,b)    min_t(xfs_filblks_t, (a), (b))
+#define XFS_FILBLKS_MAX(a,b)    max_t(xfs_filblks_t, (a), (b))
+/*
+ * True when the filesystem block number fsb decodes to an AG number below
+ * sb_agcount and an AG block number below sb_agblocks.  The mp argument is
+ * parenthesized before it is dereferenced so that any pointer expression
+ * can be passed safely.
+ */
+#define XFS_FSB_SANITY_CHECK(mp,fsb)    \
+        (XFS_FSB_TO_AGNO(mp, fsb) < (mp)->m_sb.sb_agcount && \
+                XFS_FSB_TO_AGBNO(mp, fsb) < (mp)->m_sb.sb_agblocks)
+/*
+ * Trace hooks.  Currently not implemented as they need to be ported
+ * over to the generic tracing functionality, which is some effort.
+ * Every macro below therefore expands to nothing.
+ *
+ * c = btree cursor
+ * i,j = integer (32 bit)
+ * b = btree block buffer (xfs_buf_t)
+ * p = btree ptr
+ * r = btree record
+ * k = btree key
+ * t = trace point, e.g. XBT_ENTRY, XBT_EXIT or XBT_ERROR
+ */
+#define XFS_BTREE_TRACE_ARGBI(c, b, i)
+#define XFS_BTREE_TRACE_ARGBII(c, b, i, j)
+#define XFS_BTREE_TRACE_ARGI(c, i)
+#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
+#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
+#define XFS_BTREE_TRACE_ARGIK(c, i, k)
+#define XFS_BTREE_TRACE_ARGR(c, r)
+#define XFS_BTREE_TRACE_CURSOR(c, t)
+#endif  /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
new file mode 100644
index 000000000000..fad1676ad8cd
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_cksum.h
@@ -0,0 +1,63 @@
+#ifndef _XFS_CKSUM_H
+#define _XFS_CKSUM_H 1
+#define XFS_CRC_SEED    (~(__uint32_t)0)
+/*
+ * Compute the intermediate checksum of a buffer that carries its own CRC
+ * field.  cksum_offset is the byte offset of that 32bit field within the
+ * buffer; the field is checksummed as if it held zero, so the value stored
+ * there does not affect the result.
+ */
+static inline __uint32_t
+xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+        size_t          tail = cksum_offset + sizeof(__be32);
+        __uint32_t      zero = 0;
+        __uint32_t      crc;
+        /* Everything in front of the checksum field. */
+        crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
+        /* A zero word in place of the checksum field. */
+        crc = crc32c(crc, &zero, sizeof(__u32));
+        /* Everything behind the checksum field. */
+        return crc32c(crc, buffer + tail, length - tail);
+}
+/*
+ * Turn the intermediate checksum into the final on-disk value.
+ *
+ * The CRC32c calculation uses LE format even on BE machines, but returns
+ * the result in host endian format.  It is therefore converted back to LE
+ * so that it is consistent on disk, and the result is bitwise inverted.
+ */
+static inline __le32
+xfs_end_cksum(__uint32_t crc)
+{
+        __le32  le_crc = cpu_to_le32(crc);
+        return ~le_crc;
+}
+/*
+ * Compute the checksum of a buffer and store it in the buffer's checksum
+ * field, located cksum_offset bytes from the start.
+ */
+static inline void
+xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+        __le32  *field = (__le32 *)(buffer + cksum_offset);
+        __le32  cksum;
+        cksum = xfs_end_cksum(xfs_start_cksum(buffer, length, cksum_offset));
+        *field = cksum;
+}
+/*
+ * Check a buffer against its stored checksum.  Returns nonzero when the
+ * value in the checksum field equals the freshly computed one, and 0 when
+ * it does not.
+ */
+static inline int
+xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+        __le32  expected;
+        __le32  stored;
+        expected = xfs_end_cksum(xfs_start_cksum(buffer, length, cksum_offset));
+        stored = *(__le32 *)(buffer + cksum_offset);
+        return stored == expected;
+}
+#endif /* _XFS_CKSUM_H */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
new file mode 100644
index 000000000000..8d809873525b
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -0,0 +1,2665 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+/*
+ * xfs_da_btree.c
+ *
+ * Routines to implement directories as Btrees of hashed names.
+ */
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_da3_root_split(xfs_da_state_t *state,
+                                            xfs_da_state_blk_t *existing_root,
+                                            xfs_da_state_blk_t *new_child);
+STATIC int xfs_da3_node_split(xfs_da_state_t *state,
+                                            xfs_da_state_blk_t *existing_blk,
+                                            xfs_da_state_blk_t *split_blk,
+                                            xfs_da_state_blk_t *blk_to_add,
+                                            int treelevel,
+                                            int *result);
+STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
+                                         xfs_da_state_blk_t *node_blk_1,
+                                         xfs_da_state_blk_t *node_blk_2);
+STATIC void xfs_da3_node_add(xfs_da_state_t *state,
+                                   xfs_da_state_blk_t *old_node_blk,
+                                   xfs_da_state_blk_t *new_node_blk);
+/*
+ * Routines used for shrinking the Btree.
+ */
+STATIC int xfs_da3_root_join(xfs_da_state_t *state,
+                                           xfs_da_state_blk_t *root_blk);
+STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
+STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
+                                              xfs_da_state_blk_t *drop_blk);
+STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
+                                         xfs_da_state_blk_t *src_node_blk,
+                                         xfs_da_state_blk_t *dst_node_blk);
+/*
+ * Utility routines.
+ */
+STATIC int      xfs_da3_blk_unlink(xfs_da_state_t *state,
+                                  xfs_da_state_blk_t *drop_blk,
+                                  xfs_da_state_blk_t *save_blk);
+kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
+/*
+ * Allocate a da-state structure from xfs_da_state_zone via
+ * kmem_zone_zalloc().  These structures are large, which is why they are
+ * not placed on the stack.
+ */
+xfs_da_state_t *
+xfs_da_state_alloc(void)
+{
+        xfs_da_state_t  *state;
+        state = kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
+        return state;
+}
+/*
+ * Forget every buffer pointer recorded in the altpath of a da-state
+ * structure and mark the altpath as holding no active blocks.
+ */
+STATIC void
+xfs_da_state_kill_altpath(xfs_da_state_t *state)
+{
+        int     level = 0;
+        while (level < state->altpath.active) {
+                state->altpath.blk[level].bp = NULL;
+                level++;
+        }
+        state->altpath.active = 0;
+}
+/*
+ * Release a da-state structure back to xfs_da_state_zone.
+ * The altpath is cleared first; DEBUG builds additionally zero the whole
+ * structure before it is freed.
+ */
+void
+xfs_da_state_free(xfs_da_state_t *state)
+{
+        xfs_da_state_kill_altpath(state);
+#ifdef DEBUG
+        memset(state, 0, sizeof(*state));
+#endif
+        kmem_zone_free(xfs_da_state_zone, state);
+}
+/*
+ * Sanity check the header of a da btree node buffer.
+ *
+ * Returns true when the magic number matches the filesystem format (plus,
+ * on CRC-enabled filesystems, the UUID and block number stored in the
+ * header), the level lies in 1..XFS_DA_NODE_MAXDEPTH, and the entry count is
+ * non-zero and fits at least one of the directory and attribute node
+ * geometries.  Returns false otherwise.
+ */
+static bool
+xfs_da3_node_verify(
+        struct xfs_buf          *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        struct xfs_da_intnode   *hdr = bp->b_addr;
+        struct xfs_da3_icnode_hdr ichdr;
+        const struct xfs_dir_ops *ops = xfs_dir_get_ops(mp, NULL);
+        ops->node_hdr_from_disk(&ichdr, hdr);
+        if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+                if (ichdr.magic != XFS_DA_NODE_MAGIC)
+                        return false;
+        } else {
+                struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+                if (ichdr.magic != XFS_DA3_NODE_MAGIC)
+                        return false;
+                if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+                        return false;
+                if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+                        return false;
+        }
+        /* The level must lie in 1..XFS_DA_NODE_MAXDEPTH. */
+        if (ichdr.level == 0 || ichdr.level > XFS_DA_NODE_MAXDEPTH)
+                return false;
+        if (ichdr.count == 0)
+                return false;
+        /*
+         * We don't know whether the node belongs to an attribute or a
+         * directory tree, so only fail if the count is outside both bounds.
+         */
+        if (ichdr.count > mp->m_dir_geo->node_ents &&
+            ichdr.count > mp->m_attr_geo->node_ents)
+                return false;
+        /* XXX: hash order check? */
+        return true;
+}
+/*
+ * Write verifier for da btree node buffers.
+ *
+ * A buffer that fails xfs_da3_node_verify() is flagged with -EFSCORRUPTED
+ * and reported.  Otherwise, on CRC-enabled filesystems only, the LSN of the
+ * attached buffer log item (if there is one) is stamped into the header and
+ * the checksum is recalculated.
+ */
+static void
+xfs_da3_node_write_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        struct xfs_buf_log_item *bip = bp->b_fspriv;
+        struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+        if (xfs_da3_node_verify(bp)) {
+                if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                        if (bip)
+                                hdr3->info.lsn =
+                                        cpu_to_be64(bip->bli_item.li_lsn);
+                        xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
+                }
+                return;
+        }
+        xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        xfs_verifier_error(bp);
+}
+/*
+ * Read verifier for da btree node buffers.
+ *
+ * leaf/node format detection on trees is sketchy, so a node read can be done
+ * on leaf level blocks when detection identifies the tree as a node format
+ * tree incorrectly. In this case, we need to swap the verifier to match the
+ * correct format of the block being read.
+ *
+ * Dispatch is on the magic number found in the block:
+ *  - v3 node: the checksum is verified first (-EFSBADCRC on mismatch), then
+ *    the block is checked like a v2 node;
+ *  - v2 node: xfs_da3_node_verify() must pass (-EFSCORRUPTED otherwise);
+ *  - attr leaf / dir leafn: bp->b_ops is switched to the matching ops and
+ *    that verifier is run instead;
+ *  - anything else: the block is flagged -EFSCORRUPTED.
+ * Every failure detected here ends in xfs_verifier_error().
+ */
+static void
+xfs_da3_node_read_verify(
+        struct xfs_buf          *bp)
+{
+        struct xfs_da_blkinfo   *info = bp->b_addr;
+        switch (be16_to_cpu(info->magic)) {
+                case XFS_DA3_NODE_MAGIC:
+                        if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
+                                xfs_buf_ioerror(bp, -EFSBADCRC);
+                                break;
+                        }
+                        /* fall through */
+                case XFS_DA_NODE_MAGIC:
+                        if (!xfs_da3_node_verify(bp)) {
+                                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                                break;
+                        }
+                        return;
+                case XFS_ATTR_LEAF_MAGIC:
+                case XFS_ATTR3_LEAF_MAGIC:
+                        bp->b_ops = &xfs_attr3_leaf_buf_ops;
+                        bp->b_ops->verify_read(bp);
+                        return;
+                case XFS_DIR2_LEAFN_MAGIC:
+                case XFS_DIR3_LEAFN_MAGIC:
+                        bp->b_ops = &xfs_dir3_leafn_buf_ops;
+                        bp->b_ops->verify_read(bp);
+                        return;
+                default:
+                        /*
+                         * An unrecognised magic number is corruption; record
+                         * the error on the buffer so the read fails instead
+                         * of only being reported.
+                         */
+                        xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                        break;
+        }
+        /* corrupt block */
+        xfs_verifier_error(bp);
+}
+/*
+ * Buffer verifier operations for da btree node buffers: the read and write
+ * verifiers defined in this file.
+ */
+const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+        .verify_read = xfs_da3_node_read_verify,
+        .verify_write = xfs_da3_node_write_verify,
+};
+/*
+ * Read a da btree block through xfs_da_read_buf() using the node verifier.
+ *
+ * When the read succeeds inside a transaction and a buffer was returned, the
+ * buffer type recorded in the transaction is set from the magic number found
+ * in the block (da node, attr leaf or dir leafn), because the node read
+ * verifier accepts all three.  An unexpected magic number trips an ASSERT
+ * and records type 0.
+ *
+ * Returns the result of xfs_da_read_buf().
+ */
+int
+xfs_da3_node_read(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *dp,
+        xfs_dablk_t             bno,
+        xfs_daddr_t             mappedbno,
+        struct xfs_buf          **bpp,
+        int                     which_fork)
+{
+        int                     err;
+        err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+                                        which_fork, &xfs_da3_node_buf_ops);
+        /*
+         * Only look at the block when a buffer was actually handed back.
+         * NOTE(review): presumably xfs_da_read_buf() can return 0 with
+         * *bpp == NULL (e.g. when the caller tolerates holes) - confirm.
+         */
+        if (!err && tp && *bpp) {
+                struct xfs_da_blkinfo   *info = (*bpp)->b_addr;
+                int                     type;
+                switch (be16_to_cpu(info->magic)) {
+                case XFS_DA_NODE_MAGIC:
+                case XFS_DA3_NODE_MAGIC:
+                        type = XFS_BLFT_DA_NODE_BUF;
+                        break;
+                case XFS_ATTR_LEAF_MAGIC:
+                case XFS_ATTR3_LEAF_MAGIC:
+                        type = XFS_BLFT_ATTR_LEAF_BUF;
+                        break;
+                case XFS_DIR2_LEAFN_MAGIC:
+                case XFS_DIR3_LEAFN_MAGIC:
+                        type = XFS_BLFT_DIR_LEAFN_BUF;
+                        break;
+                default:
+                        type = 0;
+                        ASSERT(0);
+                        break;
+                }
+                xfs_trans_buf_set_type(tp, *bpp, type);
+        }
+        return err;
+}
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+/*
+ * Create the initial contents of an intermediate node.
+ *
+ * A buffer for block blkno of the given fork is obtained, tagged with the
+ * node verifier ops and buffer type, and given an otherwise empty header
+ * carrying the requested level.  On CRC-enabled filesystems the v3 magic is
+ * used and the block number, owner inode and UUID are filled in as well.
+ * The header is logged and the buffer is returned through bpp.
+ *
+ * Returns 0 on success or the error from xfs_da_get_buf().
+ */
+int
+xfs_da3_node_create(
+        struct xfs_da_args      *args,
+        xfs_dablk_t             blkno,
+        int                     level,
+        struct xfs_buf          **bpp,
+        int                     whichfork)
+{
+        struct xfs_da3_icnode_hdr ichdr = {0};
+        struct xfs_inode        *dp = args->dp;
+        struct xfs_trans        *tp = args->trans;
+        struct xfs_mount        *mp = tp->t_mountp;
+        struct xfs_da_intnode   *node;
+        struct xfs_buf          *bp;
+        int                     error;
+        trace_xfs_da_node_create(args);
+        ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
+        error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
+        if (error)
+                return error;
+        bp->b_ops = &xfs_da3_node_buf_ops;
+        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+        node = bp->b_addr;
+        /* Start from the v2 magic; CRC filesystems override it below. */
+        ichdr.level = level;
+        ichdr.magic = XFS_DA_NODE_MAGIC;
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+                ichdr.magic = XFS_DA3_NODE_MAGIC;
+                hdr3->info.blkno = cpu_to_be64(bp->b_bn);
+                hdr3->info.owner = cpu_to_be64(dp->i_ino);
+                uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
+        }
+        dp->d_ops->node_hdr_to_disk(node, &ichdr);
+        xfs_trans_log_buf(tp, bp,
+                XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+        *bpp = bp;
+        return 0;
+}
+/*
+ * Split a leaf node, rebalance, then possibly split
+ * intermediate nodes, rebalance, etc.
+ *
+ * The walk starts at the deepest block in state->path and moves towards the
+ * root, using the matching slot of state->altpath for each new block.  While
+ * a block from the level below is still waiting to be inserted (addblk is
+ * non-NULL) the next level up is processed; if one is still pending after
+ * the top of the path has been handled, the root itself is split and the
+ * sibling pointers of the blocks next to the relocated old root are fixed.
+ *
+ * Returns 0 on success or the error reported by one of the split helpers.
+ */
+int                                                     /* error */
+xfs_da3_split(
+        struct xfs_da_state     *state)
+{
+        struct xfs_da_state_blk *oldblk;
+        struct xfs_da_state_blk *newblk;
+        struct xfs_da_state_blk *addblk;
+        struct xfs_da_intnode   *node;
+        struct xfs_buf          *bp;
+        int                     max;
+        int                     action = 0;
+        int                     error;
+        int                     i;
+        trace_xfs_da_split(state->args);
+        /*
+         * Walk back up the tree splitting/inserting/adjusting as necessary.
+         * If we need to insert and there isn't room, split the node, then
+         * decide which fragment to insert the new block from below into.
+         * Note that we may split the root this way, but we need more fixup.
+         */
+        max = state->path.active - 1;
+        ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
+        ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
+               state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+        addblk = &state->path.blk[max];         /* initial dummy value */
+        /* Each pass also drops one level from state->path.active. */
+        for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
+                oldblk = &state->path.blk[i];
+                newblk = &state->altpath.blk[i];
+                /*
+                 * If a leaf node then
+                 *     Allocate a new leaf node, then rebalance across them.
+                 * else if an intermediate node then
+                 *     We split on the last layer, must we split the node?
+                 */
+                switch (oldblk->magic) {
+                case XFS_ATTR_LEAF_MAGIC:
+                        error = xfs_attr3_leaf_split(state, oldblk, newblk);
+                        /* -ENOSPC is handled below by a second split. */
+                        if ((error != 0) && (error != -ENOSPC)) {
+                                return error;   /* GROT: attr is inconsistent */
+                        }
+                        if (!error) {
+                                addblk = newblk;
+                                break;
+                        }
+                        /*
+                         * Entry wouldn't fit, split the leaf again.
+                         */
+                        state->extravalid = 1;
+                        if (state->inleaf) {
+                                state->extraafter = 0;  /* before newblk */
+                                trace_xfs_attr_leaf_split_before(state->args);
+                                error = xfs_attr3_leaf_split(state, oldblk,
+                                                            &state->extrablk);
+                        } else {
+                                state->extraafter = 1;  /* after newblk */
+                                trace_xfs_attr_leaf_split_after(state->args);
+                                error = xfs_attr3_leaf_split(state, newblk,
+                                                            &state->extrablk);
+                        }
+                        if (error)
+                                return error;   /* GROT: attr inconsistent */
+                        addblk = newblk;
+                        break;
+                case XFS_DIR2_LEAFN_MAGIC:
+                        error = xfs_dir2_leafn_split(state, oldblk, newblk);
+                        if (error)
+                                return error;
+                        addblk = newblk;
+                        break;
+                case XFS_DA_NODE_MAGIC:
+                        error = xfs_da3_node_split(state, oldblk, newblk, addblk,
+                                                         max - i, &action);
+                        /* The buffer pointer of the block from below is dropped
+                         * whether or not the node split succeeded. */
+                        addblk->bp = NULL;
+                        if (error)
+                                return error;   /* GROT: dir is inconsistent */
+                        /*
+                         * Record the newly split block for the next time thru?
+                         */
+                        if (action)
+                                addblk = newblk;
+                        else
+                                addblk = NULL;
+                        break;
+                }
+                /*
+                 * Update the btree to show the new hashval for this child.
+                 */
+                xfs_da3_fixhashpath(state, &state->path);
+        }
+        /* Nothing left to insert: the split was absorbed below the root. */
+        if (!addblk)
+                return 0;
+        /*
+         * Split the root node.
+         */
+        ASSERT(state->path.active == 0);
+        oldblk = &state->path.blk[0];
+        error = xfs_da3_root_split(state, oldblk, addblk);
+        if (error) {
+                addblk->bp = NULL;
+                return error;   /* GROT: dir is inconsistent */
+        }
+        /*
+         * Update pointers to the node which used to be block 0 and
+         * just got bumped because of the addition of a new root node.
+         * There might be three blocks involved if a double split occurred,
+         * and the original block 0 could be at any position in the list.
+         *
+         * Note: the magic numbers and sibling pointers are in the same
+         * physical place for both v2 and v3 headers (by design). Hence it
+         * doesn't matter which version of the xfs_da_intnode structure we use
+         * here as the result will be the same using either structure.
+         *
+         * NOTE(review): when the sibling is not addblk the code falls back to
+         * state->extrablk.bp guarded only by an ASSERT on state->extravalid -
+         * confirm that no other sibling is possible here.
+         */
+        node = oldblk->bp->b_addr;
+        if (node->hdr.info.forw) {
+                if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
+                        bp = addblk->bp;
+                } else {
+                        ASSERT(state->extravalid);
+                        bp = state->extrablk.bp;
+                }
+                /* Point the forward sibling's back link at the moved block. */
+                node = bp->b_addr;
+                node->hdr.info.back = cpu_to_be32(oldblk->blkno);
+                xfs_trans_log_buf(state->args->trans, bp,
+                    XFS_DA_LOGRANGE(node, &node->hdr.info,
+                    sizeof(node->hdr.info)));
+        }
+        node = oldblk->bp->b_addr;
+        if (node->hdr.info.back) {
+                if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
+                        bp = addblk->bp;
+                } else {
+                        ASSERT(state->extravalid);
+                        bp = state->extrablk.bp;
+                }
+                /* Point the backward sibling's forw link at the moved block. */
+                node = bp->b_addr;
+                node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
+                xfs_trans_log_buf(state->args->trans, bp,
+                    XFS_DA_LOGRANGE(node, &node->hdr.info,
+                    sizeof(node->hdr.info)));
+        }
+        addblk->bp = NULL;
+        return 0;
+}
+/*
+ * Split the root.  We have to create a new root and point to the two
+ * parts (the split old root) that we just created.  Copy block zero to
+ * the EOF, extending the inode in process.
+ *
+ * The current root contents (blk1) are copied into a newly grown block and a
+ * new root node, one level higher, is created at the fixed root position
+ * (args->geo->leafblk for the data fork, block 0 otherwise) with two
+ * entries: the relocated blk1 and blk2.
+ *
+ * On success blk1->bp and blk1->blkno describe the relocated copy and 0 is
+ * returned; otherwise the error from xfs_da_grow_inode(), xfs_da_get_buf()
+ * or xfs_da3_node_create() is returned.
+ */
+STATIC int                                              /* error */
+xfs_da3_root_split(
+        struct xfs_da_state     *state,
+        struct xfs_da_state_blk *blk1,
+        struct xfs_da_state_blk *blk2)
+{
+        struct xfs_da_intnode   *node;
+        struct xfs_da_intnode   *oldroot;
+        struct xfs_da_node_entry *btree;
+        struct xfs_da3_icnode_hdr nodehdr;
+        struct xfs_da_args      *args;
+        struct xfs_buf          *bp;
+        struct xfs_inode        *dp;
+        struct xfs_trans        *tp;
+        struct xfs_dir2_leaf    *leaf;
+        xfs_dablk_t             blkno;
+        int                     level;
+        int                     error;
+        int                     size;
+        trace_xfs_da_root_split(state->args);
+        /*
+         * Copy the existing (incorrect) block from the root node position
+         * to a free space somewhere.
+         */
+        args = state->args;
+        error = xfs_da_grow_inode(args, &blkno);
+        if (error)
+                return error;
+        dp = args->dp;
+        tp = args->trans;
+        error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
+        if (error)
+                return error;
+        node = bp->b_addr;
+        oldroot = blk1->bp->b_addr;
+        if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+            oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+                /* Named so that it does not shadow the new root's nodehdr. */
+                struct xfs_da3_icnode_hdr icnodehdr;
+                dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot);
+                btree = dp->d_ops->node_tree_p(oldroot);
+                /* Bytes in use: header plus every valid entry. */
+                size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
+                level = icnodehdr.level;
+                /*
+                 * we are about to copy oldroot to bp, so set up the type
+                 * of bp while we know exactly what it will be.
+                 */
+                xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+        } else {
+                struct xfs_dir3_icleaf_hdr leafhdr;
+                struct xfs_dir2_leaf_entry *ents;
+                leaf = (xfs_dir2_leaf_t *)oldroot;
+                dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+                ents = dp->d_ops->leaf_ents_p(leaf);
+                ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+                       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+                size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
+                level = 0;
+                /*
+                 * we are about to copy oldroot to bp, so set up the type
+                 * of bp while we know exactly what it will be.
+                 */
+                xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+        }
+        /*
+         * we can copy most of the information in the node from one block to
+         * another, but for CRC enabled headers we have to make sure that the
+         * block specific identifiers are kept intact. We update the buffer
+         * directly for this.
+         */
+        memcpy(node, oldroot, size);
+        if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+            oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+                struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
+                node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
+        }
+        xfs_trans_log_buf(tp, bp, 0, size - 1);
+        /* The copy inherits the verifier and buffer type of the old root. */
+        bp->b_ops = blk1->bp->b_ops;
+        xfs_trans_buf_copy_type(bp, blk1->bp);
+        blk1->bp = bp;
+        blk1->blkno = blkno;
+        /*
+         * Set up the new root node.
+         */
+        error = xfs_da3_node_create(args,
+                (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
+                level + 1, &bp, args->whichfork);
+        if (error)
+                return error;
+        node = bp->b_addr;
+        dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+        btree = dp->d_ops->node_tree_p(node);
+        btree[0].hashval = cpu_to_be32(blk1->hashval);
+        btree[0].before = cpu_to_be32(blk1->blkno);
+        btree[1].hashval = cpu_to_be32(blk2->hashval);
+        btree[1].before = cpu_to_be32(blk2->blkno);
+        nodehdr.count = 2;
+        dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+#ifdef DEBUG
+        if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+            oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+                ASSERT(blk1->blkno >= args->geo->leafblk &&
+                       blk1->blkno < args->geo->freeblk);
+                ASSERT(blk2->blkno >= args->geo->leafblk &&
+                       blk2->blkno < args->geo->freeblk);
+        }
+#endif
+        /* Header is already logged by xfs_da_node_create */
+        xfs_trans_log_buf(tp, bp,
+                XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
+        return 0;
+}
+/*
+ * Split the node, rebalance, then add the new entry.
+ *
+ * oldblk is the intermediate node that must take an entry for addblk (and,
+ * after a double split in the attr fork, also for state->extrablk).  If the
+ * node cannot hold the additional entries, a new node is created in newblk
+ * at tree level treelevel, entries are rebalanced between the two and the
+ * new node is linked in as a sibling.
+ *
+ * *result is set to 1 when a new node was created and to 0 otherwise.
+ * Returns 0 on success or the error from xfs_da_grow_inode(),
+ * xfs_da3_node_create() or xfs_da3_blk_link().
+ */
+STATIC int                                              /* error */
+xfs_da3_node_split(
+        struct xfs_da_state     *state,
+        struct xfs_da_state_blk *oldblk,
+        struct xfs_da_state_blk *newblk,
+        struct xfs_da_state_blk *addblk,
+        int                     treelevel,
+        int                     *result)
+{
+        struct xfs_da_intnode   *node;
+        struct xfs_da3_icnode_hdr nodehdr;
+        xfs_dablk_t             blkno;
+        int                     newcount;
+        int                     error;
+        int                     useextra;
+        struct xfs_inode        *dp = state->args->dp;
+        trace_xfs_da_node_split(state->args);
+        node = oldblk->bp->b_addr;
+        dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+        /*
+         * With V2 dirs the extra block is data or freespace.
+         */
+        useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
+        /* One entry for addblk, plus one for the extra block if in use. */
+        newcount = 1 + useextra;
+        /*
+         * Do we have to split the node?
+         */
+        if (nodehdr.count + newcount > state->args->geo->node_ents) {
+                /*
+                 * Allocate a new node, add to the doubly linked chain of
+                 * nodes, then move some of our excess entries into it.
+                 */
+                error = xfs_da_grow_inode(state->args, &blkno);
+                if (error)
+                        return error;   /* GROT: dir is inconsistent */
+                error = xfs_da3_node_create(state->args, blkno, treelevel,
+                                           &newblk->bp, state->args->whichfork);
+                if (error)
+                        return error;   /* GROT: dir is inconsistent */
+                newblk->blkno = blkno;
+                newblk->magic = XFS_DA_NODE_MAGIC;
+                xfs_da3_node_rebalance(state, oldblk, newblk);
+                error = xfs_da3_blk_link(state, oldblk, newblk);
+                if (error)
+                        return error;
+                *result = 1;
+        } else {
+                *result = 0;
+        }
+        /*
+         * Insert the new entry(s) into the correct block
+         * (updating last hashval in the process).
+         *
+         * xfs_da3_node_add() inserts BEFORE the given index,
+         * and as a result of using node_lookup_int() we always
+         * point to a valid entry (not after one), but a split
+         * operation always results in a new block whose hashvals
+         * FOLLOW the current block.
+         *
+         * If we had double-split op below us, then add the extra block too.
+         */
+        /* Re-read the header: a rebalance above may have changed the count. */
+        node = oldblk->bp->b_addr;
+        dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+        /*
+         * xfs_da3_node_rebalance() pushes oldblk->index past the entry count
+         * when the insertion point moved into the new block, which selects
+         * the else branch below.
+         */
+        if (oldblk->index <= nodehdr.count) {
+                oldblk->index++;
+                xfs_da3_node_add(state, oldblk, addblk);
+                if (useextra) {
+                        if (state->extraafter)
+                                oldblk->index++;
+                        xfs_da3_node_add(state, oldblk, &state->extrablk);
+                        state->extravalid = 0;
+                }
+        } else {
+                newblk->index++;
+                xfs_da3_node_add(state, newblk, addblk);
+                if (useextra) {
+                        if (state->extraafter)
+                                newblk->index++;
+                        xfs_da3_node_add(state, newblk, &state->extrablk);
+                        state->extravalid = 0;
+                }
+        }
+        return 0;
+}
+/*
+ * Balance the btree elements between two intermediate nodes,
+ * usually one full and one empty.
+ *
+ * NOTE: if blk2 is empty, then it will get the upper half of blk1.
+ *
+ * Half of the difference in entry counts is moved from the fuller node to
+ * the other one, the changed regions are logged, blk1->hashval and
+ * blk2->hashval are refreshed from the last entry of each node, and the
+ * insertion index is moved from blk1 to blk2 when it now falls beyond the
+ * entries left in blk1.  Nothing is done when the counts differ by less
+ * than two.
+ */
+STATIC void
+xfs_da3_node_rebalance(
+        struct xfs_da_state     *state,
+        struct xfs_da_state_blk *blk1,
+        struct xfs_da_state_blk *blk2)
+{
+        struct xfs_da_intnode   *node1;
+        struct xfs_da_intnode   *node2;
+        struct xfs_da_intnode   *tmpnode;
+        struct xfs_da_node_entry *btree1;
+        struct xfs_da_node_entry *btree2;
+        struct xfs_da_node_entry *btree_s;
+        struct xfs_da_node_entry *btree_d;
+        struct xfs_da3_icnode_hdr nodehdr1;
+        struct xfs_da3_icnode_hdr nodehdr2;
+        struct xfs_trans        *tp;
+        int                     count;
+        int                     tmp;
+        int                     swap = 0;
+        struct xfs_inode        *dp = state->args->dp;
+        trace_xfs_da_node_rebalance(state->args);
+        node1 = blk1->bp->b_addr;
+        node2 = blk2->bp->b_addr;
+        dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+        dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+        btree1 = dp->d_ops->node_tree_p(node1);
+        btree2 = dp->d_ops->node_tree_p(node2);
+        /*
+         * Figure out how many entries need to move, and in which direction.
+         * Swap the nodes around if that makes it simpler.
+         *
+         * The swap happens only when both nodes hold entries and node2's
+         * first or last hashval is lower than node1's, so that afterwards
+         * node1 is the node with the lower hash values.
+         */
+        if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
+            ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+             (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
+                        be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
+                tmpnode = node1;
+                node1 = node2;
+                node2 = tmpnode;
+                dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+                dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+                btree1 = dp->d_ops->node_tree_p(node1);
+                btree2 = dp->d_ops->node_tree_p(node2);
+                swap = 1;
+        }
+        /* Positive: move node1 -> node2; negative: move node2 -> node1. */
+        count = (nodehdr1.count - nodehdr2.count) / 2;
+        if (count == 0)
+                return;
+        tp = state->args->trans;
+        /*
+         * Two cases: high-to-low and low-to-high.
+         */
+        if (count > 0) {
+                /*
+                 * Move elements in node2 up to make a hole.
+                 */
+                tmp = nodehdr2.count;
+                if (tmp > 0) {
+                        tmp *= (uint)sizeof(xfs_da_node_entry_t);
+                        btree_s = &btree2[0];
+                        btree_d = &btree2[count];
+                        memmove(btree_d, btree_s, tmp);
+                }
+                /*
+                 * Move the req'd B-tree elements from high in node1 to
+                 * low in node2.
+                 */
+                nodehdr2.count += count;
+                tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+                btree_s = &btree1[nodehdr1.count - count];
+                btree_d = &btree2[0];
+                memcpy(btree_d, btree_s, tmp);
+                nodehdr1.count -= count;
+        } else {
+                /*
+                 * Move the req'd B-tree elements from low in node2 to
+                 * high in node1.
+                 */
+                count = -count;
+                tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+                btree_s = &btree2[0];
+                btree_d = &btree1[nodehdr1.count];
+                memcpy(btree_d, btree_s, tmp);
+                nodehdr1.count += count;
+                /* Log the entries just appended to node1. */
+                xfs_trans_log_buf(tp, blk1->bp,
+                        XFS_DA_LOGRANGE(node1, btree_d, tmp));
+                /*
+                 * Move elements in node2 down to fill the hole.
+                 */
+                tmp  = nodehdr2.count - count;
+                tmp *= (uint)sizeof(xfs_da_node_entry_t);
+                btree_s = &btree2[count];
+                btree_d = &btree2[0];
+                memmove(btree_d, btree_s, tmp);
+                nodehdr2.count -= count;
+        }
+        /*
+         * Log header of node 1 and all current bits of node 2.
+         *
+         * NOTE(review): after a swap node1/node2 refer to the buffers of
+         * blk2/blk1 respectively, yet the logging calls here and above still
+         * pair blk1->bp with node1 ranges and blk2->bp with node2 ranges -
+         * confirm this is intended.
+         */
+        dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
+        xfs_trans_log_buf(tp, blk1->bp,
+                XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
+        dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
+        xfs_trans_log_buf(tp, blk2->bp,
+                XFS_DA_LOGRANGE(node2, &node2->hdr,
+                                dp->d_ops->node_hdr_size +
+                                (sizeof(btree2[0]) * nodehdr2.count)));
+        /*
+         * Record the last hashval from each block for upward propagation.
+         * (note: don't use the swapped node pointers)
+         */
+        if (swap) {
+                node1 = blk1->bp->b_addr;
+                node2 = blk2->bp->b_addr;
+                dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+                dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+                btree1 = dp->d_ops->node_tree_p(node1);
+                btree2 = dp->d_ops->node_tree_p(node2);
+        }
+        blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
+        blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
+        /*
+         * Adjust the expected index for insertion.
+         */
+        if (blk1->index >= nodehdr1.count) {
+                blk2->index = blk1->index - nodehdr1.count;
+                blk1->index = nodehdr1.count + 1;       /* make it invalid */
+        }
+}
+/*
+ * Add a new entry to an intermediate node.
+ *
+ * The entry describing newblk (its last hashval and its block number) is
+ * stored at slot oldblk->index of the node in oldblk->bp.  Any entries at
+ * or above that slot are first moved up by one.  The touched entries and
+ * the node header are logged, and oldblk->hashval is reloaded from the
+ * node's last entry so the caller can propagate it upwards.
+ */
+STATIC void
+xfs_da3_node_add(
+        struct xfs_da_state     *state,
+        struct xfs_da_state_blk *oldblk,
+        struct xfs_da_state_blk *newblk)
+{
+        struct xfs_inode        *dp = state->args->dp;
+        struct xfs_da_args      *args = state->args;
+        struct xfs_da3_icnode_hdr nodehdr;
+        struct xfs_da_intnode   *node;
+        struct xfs_da_node_entry *entries;
+        struct xfs_da_node_entry *slot;
+        int                     shifted = 0;    /* bytes moved to open the slot */
+        trace_xfs_da_node_add(args);
+        node = oldblk->bp->b_addr;
+        dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+        entries = dp->d_ops->node_tree_p(node);
+        ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
+        ASSERT(newblk->blkno != 0);
+        if (state->args->whichfork == XFS_DATA_FORK)
+                ASSERT(newblk->blkno >= state->args->geo->leafblk &&
+                       newblk->blkno < state->args->geo->freeblk);
+        slot = &entries[oldblk->index];
+        /*
+         * Open up a hole at the insertion point unless we are appending
+         * after the current last entry.
+         */
+        if (oldblk->index < nodehdr.count) {
+                shifted = (nodehdr.count - oldblk->index) *
+                                (uint)sizeof(*entries);
+                memmove(slot + 1, slot, shifted);
+        }
+        slot->hashval = cpu_to_be32(newblk->hashval);
+        slot->before = cpu_to_be32(newblk->blkno);
+        /*
+         * Log the new entry together with everything that was moved.
+         */
+        xfs_trans_log_buf(args->trans, oldblk->bp,
+                XFS_DA_LOGRANGE(node, slot, shifted + sizeof(*slot)));
+        nodehdr.count++;
+        dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+        xfs_trans_log_buf(args->trans, oldblk->bp,
+                XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+        /*
+         * Hand the node's last hash value back for upward propagation.
+         */
+        oldblk->hashval = be32_to_cpu(entries[nodehdr.count - 1].hashval);
+}
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+/*
+ * Deallocate an empty leaf node, remove it from its parent,
+ * possibly deallocating that block, etc...
+ *
+ * The walk starts at the deepest block recorded in state->path (which
+ * must be an attr leaf or a dir leafn block) and moves up one level per
+ * iteration.  At each level the matching *_toosmall() helper decides what
+ * to do; when it reports action 0 the function returns 0 straight away.
+ * Otherwise the block is merged into the one recorded in state->altpath,
+ * unlinked from its siblings and freed.  If the walk reaches the top, the
+ * entry is removed from the root and xfs_da3_root_join() is called.
+ *
+ * Returns 0, or the first error reported by one of the helpers.
+ */
+int
+xfs_da3_join(
+        struct xfs_da_state     *state)
+{
+        struct xfs_da_state_blk *drop_blk;
+        struct xfs_da_state_blk *save_blk;
+        int                     action = 0;
+        int                     error;
+        trace_xfs_da_join(state->args);
+        drop_blk = &state->path.blk[ state->path.active-1 ];
+        save_blk = &state->altpath.blk[ state->path.active-1 ];
+        ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
+        ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
+               drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+        /*
+         * Walk back up the tree joining/deallocating as necessary.
+         * When we stop dropping blocks, break out.
+         * Each pass moves drop_blk and save_blk one slot towards the root
+         * and shortens state->path by one level.
+         */
+        for (  ; state->path.active >= 2; drop_blk--, save_blk--,
+                 state->path.active--) {
+                /*
+                 * See if we can combine the block with a neighbor.
+                 *   (action == 0) => no options, just leave
+                 *   (action == 1) => coalesce, then unlink
+                 *   (action == 2) => block empty, unlink it
+                 * Note that every non-zero action takes the same path
+                 * below: the unbalance routine is called, then the block
+                 * is unlinked and freed.
+                 */
+                switch (drop_blk->magic) {
+                case XFS_ATTR_LEAF_MAGIC:
+                        error = xfs_attr3_leaf_toosmall(state, &action);
+                        if (error)
+                                return error;
+                        if (action == 0)
+                                return 0;
+                        xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
+                        break;
+                case XFS_DIR2_LEAFN_MAGIC:
+                        error = xfs_dir2_leafn_toosmall(state, &action);
+                        if (error)
+                                return error;
+                        if (action == 0)
+                                return 0;
+                        xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
+                        break;
+                case XFS_DA_NODE_MAGIC:
+                        /*
+                         * Remove the offending node, fixup hashvals,
+                         * check for a toosmall neighbor.
+                         */
+                        xfs_da3_node_remove(state, drop_blk);
+                        xfs_da3_fixhashpath(state, &state->path);
+                        error = xfs_da3_node_toosmall(state, &action);
+                        if (error)
+                                return error;
+                        if (action == 0)
+                                return 0;
+                        xfs_da3_node_unbalance(state, drop_blk, save_blk);
+                        break;
+                }
+                xfs_da3_fixhashpath(state, &state->altpath);
+                error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
+                xfs_da_state_kill_altpath(state);       /* runs even if the unlink failed */
+                if (error)
+                        return error;
+                error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
+                                                         drop_blk->bp);
+                drop_blk->bp = NULL;    /* cleared whether or not the shrink succeeded */
+                if (error)
+                        return error;
+        }
+        /*
+         * We joined all the way to the top.  If it turns out that
+         * we only have one entry in the root, make the child block
+         * the new root.
+         */
+        xfs_da3_node_remove(state, drop_blk);
+        xfs_da3_fixhashpath(state, &state->path);
+        error = xfs_da3_root_join(state, &state->path.blk[0]);
+        return error;
+}
+#ifdef  DEBUG
+/*
+ * Debug-only sanity check of a block that is the sole child of a root at
+ * the given level: below a level 1 root it must carry one of the leaf
+ * magic numbers, below any other level one of the node magic numbers, and
+ * in both cases it must have no forward or backward sibling.
+ */
+static void
+xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
+{
+        __be16  magic = blkinfo->magic;
+        switch (level) {
+        case 1:
+                ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+                       magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+                       magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+                       magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+                break;
+        default:
+                ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+                       magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+                break;
+        }
+        ASSERT(!blkinfo->forw);
+        ASSERT(!blkinfo->back);
+}
+#else   /* !DEBUG */
+/* Without DEBUG the check compiles away to nothing. */
+#define xfs_da_blkinfo_onlychild_validate(blkinfo, level)
+#endif  /* !DEBUG */
+/*
+ * Collapse a root node that is down to a single entry.
+ *
+ * The contents of the root's only child are copied over the root block's
+ * buffer and the child block is then freed.  A root that still has more
+ * than one entry is left alone.  Returns 0, or an error from reading the
+ * child block or from shrinking the inode.
+ */
+STATIC int
+xfs_da3_root_join(
+        struct xfs_da_state     *state,
+        struct xfs_da_state_blk *root_blk)
+{
+        struct xfs_inode        *dp = state->args->dp;
+        struct xfs_da_args      *args;
+        struct xfs_da_intnode   *oldroot;
+        struct xfs_da3_icnode_hdr oldroothdr;
+        struct xfs_da_node_entry *entries;
+        struct xfs_buf          *root_bp;
+        struct xfs_buf          *bp;
+        xfs_dablk_t             child;
+        int                     error;
+        trace_xfs_da_root_join(state->args);
+        ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
+        args = state->args;
+        root_bp = root_blk->bp;
+        oldroot = root_bp->b_addr;
+        dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
+        ASSERT(oldroothdr.forw == 0);
+        ASSERT(oldroothdr.back == 0);
+        /*
+         * Nothing to do while the root still has several children.
+         */
+        if (oldroothdr.count > 1)
+                return 0;
+        /*
+         * Fetch the one remaining child; its bytes are about to replace
+         * the root block's contents.
+         */
+        entries = dp->d_ops->node_tree_p(oldroot);
+        child = be32_to_cpu(entries->before);
+        ASSERT(child != 0);
+        error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
+                                  args->whichfork);
+        if (error)
+                return error;
+        xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
+        /*
+         * The child may be a leaf (when only one leaf block is left in the
+         * tree), so the root buffer can change type here: take over the
+         * child's b_ops and transaction buffer type along with its data.
+         * For dir3/da3 blocks the block number stored in the header must
+         * then be rewritten to the root buffer's own block number.
+         */
+        memcpy(root_bp->b_addr, bp->b_addr, args->geo->blksize);
+        root_bp->b_ops = bp->b_ops;
+        xfs_trans_buf_copy_type(root_bp, bp);
+        if (oldroothdr.magic == XFS_DA3_NODE_MAGIC)
+                ((struct xfs_da3_blkinfo *)root_bp->b_addr)->blkno =
+                                        cpu_to_be64(root_bp->b_bn);
+        xfs_trans_log_buf(args->trans, root_bp, 0, args->geo->blksize - 1);
+        return xfs_da_shrink_inode(args, child, bp);
+}
+/*
+ * Check a node block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ *
+ * The verdict is stored in *action; the function's own return value is
+ * 0, or an error from reading a sibling or from shifting a path.
+ * If the current block is over 50% full, don't try to join it,
+ * *action = 0.
+ * If the block is empty, fill in the state structure and set *action = 2.
+ * If it can be collapsed, fill in the state structure and set *action = 1.
+ * If nothing can be done, *action = 0.
+ */
+STATIC int
+xfs_da3_node_toosmall(
+        struct xfs_da_state     *state,
+        int                     *action)
+{
+        struct xfs_da_intnode   *node;
+        struct xfs_da_state_blk *blk;
+        struct xfs_da_blkinfo   *info;
+        xfs_dablk_t             blkno;
+        struct xfs_buf          *bp;
+        struct xfs_da3_icnode_hdr nodehdr;
+        int                     count;
+        int                     forward;
+        int                     error;
+        int                     retval;
+        int                     i;
+        struct xfs_inode        *dp = state->args->dp;
+        trace_xfs_da_node_toosmall(state->args);
+        /*
+         * Check for the degenerate case of the block being over 50% full.
+         * If so, it's not worth even looking to see if we might be able
+         * to coalesce with a sibling.
+         */
+        blk = &state->path.blk[ state->path.active-1 ];
+        info = blk->bp->b_addr;
+        node = (xfs_da_intnode_t *)info;
+        dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+        if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
+                *action = 0;    /* blk over 50%, don't try to join */
+                return 0;       /* blk over 50%, don't try to join */
+        }
+        /*
+         * Check for the degenerate case of the block being empty.
+         * If the block is empty, we'll simply delete it, no need to
+         * coalesce it with a sibling block.  We choose (arbitrarily)
+         * to merge with the forward block unless it is NULL.
+         */
+        if (nodehdr.count == 0) {
+                /*
+                 * Make altpath point to the block we want to keep and
+                 * path point to the block we want to drop (this one).
+                 * A non-zero retval from the shift leaves *action at 0.
+                 */
+                forward = (info->forw != 0);
+                memcpy(&state->altpath, &state->path, sizeof(state->path));
+                error = xfs_da3_path_shift(state, &state->altpath, forward,
+                                                 0, &retval);
+                if (error)
+                        return error;
+                if (retval) {
+                        *action = 0;
+                } else {
+                        *action = 2;
+                }
+                return 0;
+        }
+        /*
+         * Examine each sibling block to see if we can coalesce with
+         * at least 25% free space to spare.  We need to figure out
+         * whether to merge with the forward or the backward block.
+         * We prefer coalescing with the lower numbered sibling so as
+         * to shrink a directory over time.
+         * count ends up as the number of entries a sibling may hold for
+         * the merged node to stay within 75% of node_ents.
+         */
+        count  = state->args->geo->node_ents;
+        count -= state->args->geo->node_ents >> 2;
+        count -= nodehdr.count;
+        /* start with smaller blk num */
+        forward = nodehdr.forw < nodehdr.back;
+        for (i = 0; i < 2; forward = !forward, i++) {
+                struct xfs_da3_icnode_hdr thdr;
+                if (forward)
+                        blkno = nodehdr.forw;
+                else
+                        blkno = nodehdr.back;
+                if (blkno == 0)
+                        continue;       /* no sibling in this direction */
+                error = xfs_da3_node_read(state->args->trans, dp,
+                                        blkno, -1, &bp, state->args->whichfork);
+                if (error)
+                        return error;
+                node = bp->b_addr;
+                dp->d_ops->node_hdr_from_disk(&thdr, node);
+                xfs_trans_brelse(state->args->trans, bp);
+                if (count - thdr.count >= 0)
+                        break;  /* fits with at least 25% to spare */
+        }
+        if (i >= 2) {   /* neither direction produced a usable sibling */
+                *action = 0;
+                return 0;
+        }
+        /*
+         * Make altpath point to the block we want to keep (the lower
+         * numbered block) and path point to the block we want to drop.
+         * Here blkno and forward still describe the sibling chosen by
+         * the loop above.
+         */
+        memcpy(&state->altpath, &state->path, sizeof(state->path));
+        if (blkno < blk->blkno) {
+                error = xfs_da3_path_shift(state, &state->altpath, forward,
+                                                 0, &retval);
+        } else {
+                error = xfs_da3_path_shift(state, &state->path, forward,
+                                                 0, &retval);
+        }
+        if (error)
+                return error;
+        if (retval) {
+                *action = 0;
+                return 0;
+        }
+        *action = 1;
+        return 0;
+}
+/*
+ * Pick up the last hashvalue from an intermediate node.
+ *
+ * Returns the hashval of the node's final entry, or 0 when the node has
+ * no entries.  If count is not NULL it receives the node's entry count.
+ */
+STATIC uint
+xfs_da3_node_lasthash(
+        struct xfs_inode        *dp,
+        struct xfs_buf          *bp,
+        int                     *count)
+{
+        struct xfs_da_intnode    *node = bp->b_addr;
+        struct xfs_da3_icnode_hdr hdr;
+        struct xfs_da_node_entry *last;
+        dp->d_ops->node_hdr_from_disk(&hdr, node);
+        if (count != NULL)
+                *count = hdr.count;
+        if (hdr.count == 0)
+                return 0;
+        last = dp->d_ops->node_tree_p(node);
+        last += hdr.count - 1;
+        return be32_to_cpu(last->hashval);
+}
+/*
+ * Walk back up the tree adjusting hash values as necessary,
+ * when we stop making changes, return.
+ *
+ * The last hashval of the deepest block in the path is written into the
+ * parent entry that refers to it (and logged); the parent's own last
+ * hashval is then pushed one level further up, and so on.  The walk ends
+ * at the first parent entry that already holds the right value.  If the
+ * deepest block has no entries nothing is changed.
+ */
+void
+xfs_da3_fixhashpath(
+        struct xfs_da_state     *state,
+        struct xfs_da_state_path *path)
+{
+        struct xfs_inode        *dp = state->args->dp;
+        struct xfs_da_state_blk *blk;
+        xfs_dahash_t            lasthash = 0;
+        int                     level;
+        /*
+         * Only the lasthash helpers below overwrite count; the non-zero
+         * default means a block type the switch does not handle still
+         * proceeds to the upward walk.
+         */
+        int                     count = 1;
+        trace_xfs_da_fixhashpath(state->args);
+        level = path->active - 1;
+        blk = &path->blk[level];
+        switch (blk->magic) {
+        case XFS_ATTR_LEAF_MAGIC:
+                lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
+                break;
+        case XFS_DIR2_LEAFN_MAGIC:
+                lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count);
+                break;
+        case XFS_DA_NODE_MAGIC:
+                lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
+                break;
+        }
+        if (count == 0)
+                return;         /* empty block: no hashval to propagate */
+        while (--level >= 0) {
+                struct xfs_da3_icnode_hdr nodehdr;
+                struct xfs_da_intnode   *node;
+                struct xfs_da_node_entry *btree;
+                struct xfs_da_node_entry *parent_ent;
+                blk = &path->blk[level];
+                node = blk->bp->b_addr;
+                dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+                btree = dp->d_ops->node_tree_p(node);
+                parent_ent = &btree[blk->index];
+                if (be32_to_cpu(parent_ent->hashval) == lasthash)
+                        return; /* already correct from here on up */
+                blk->hashval = lasthash;
+                parent_ent->hashval = cpu_to_be32(lasthash);
+                xfs_trans_log_buf(state->args->trans, blk->bp,
+                                  XFS_DA_LOGRANGE(node, parent_ent,
+                                                  sizeof(*parent_ent)));
+                /* the value to push into the next level up */
+                lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+        }
+}
+/*
+ * Remove an entry from an intermediate node.
+ *
+ * The entry at drop_blk->index is deleted from the node in drop_blk->bp:
+ * later entries are moved down by one, the vacated last slot is zeroed,
+ * the entry count is decremented, and the changed ranges plus the header
+ * are logged.  drop_blk->hashval is set to the hashval of the node's new
+ * last entry, or to 0 if the node has become empty.
+ */
+STATIC void
+xfs_da3_node_remove(
+        struct xfs_da_state     *state,
+        struct xfs_da_state_blk *drop_blk)
+{
+        struct xfs_da_intnode   *node;
+        struct xfs_da3_icnode_hdr nodehdr;
+        struct xfs_da_node_entry *btree;
+        int                     index;
+        int                     tmp;
+        struct xfs_inode        *dp = state->args->dp;
+        trace_xfs_da_node_remove(state->args);
+        node = drop_blk->bp->b_addr;
+        dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+        ASSERT(drop_blk->index < nodehdr.count);
+        ASSERT(drop_blk->index >= 0);
+        /*
+         * Copy over the offending entry, or just zero it out.
+         */
+        index = drop_blk->index;
+        btree = dp->d_ops->node_tree_p(node);
+        if (index < nodehdr.count - 1) {
+                tmp  = nodehdr.count - index - 1;
+                tmp *= (uint)sizeof(xfs_da_node_entry_t);
+                memmove(&btree[index], &btree[index + 1], tmp);
+                xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+                    XFS_DA_LOGRANGE(node, &btree[index], tmp));
+                /* from here on index names the now-unused last slot */
+                index = nodehdr.count - 1;
+        }
+        memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
+        xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+            XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
+        nodehdr.count -= 1;
+        dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+        xfs_trans_log_buf(state->args->trans, drop_blk->bp,
+            XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+        /*
+         * Copy the last hash value from the block to propagate upwards.
+         * At this point index equals the new entry count.  If the entry
+         * just removed was the only one, the node is empty and index - 1
+         * would address an element in front of the entry array, so report
+         * 0 instead, which is what xfs_da3_node_lasthash() returns for an
+         * empty node.
+         */
+        if (nodehdr.count > 0)
+                drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
+        else
+                drop_blk->hashval = 0;
+}
+/*
+ * Unbalance the elements between two intermediate nodes,
+ * move all Btree elements from one node into another.
+ *
+ * Every entry of drop_blk's node is copied into save_blk's node, in front
+ * of the existing entries when the dropped node holds the lower hashvals
+ * and behind them otherwise.  The affected entry range and the header of
+ * the surviving node are logged, and save_blk->hashval is set to the
+ * surviving node's last hashval.
+ */
+STATIC void
+xfs_da3_node_unbalance(
+        struct xfs_da_state     *state,
+        struct xfs_da_state_blk *drop_blk,
+        struct xfs_da_state_blk *save_blk)
+{
+        struct xfs_inode        *dp = state->args->dp;
+        struct xfs_trans        *tp;
+        struct xfs_da_intnode   *drop_node;
+        struct xfs_da_intnode   *save_node;
+        struct xfs_da3_icnode_hdr drop_hdr;
+        struct xfs_da3_icnode_hdr save_hdr;
+        struct xfs_da_node_entry *drop_btree;
+        struct xfs_da_node_entry *save_btree;
+        int                     drop_is_lower;
+        int                     sindex;         /* first slot filled in save */
+        int                     nbytes;
+        trace_xfs_da_node_unbalance(state->args);
+        drop_node = drop_blk->bp->b_addr;
+        save_node = save_blk->bp->b_addr;
+        dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
+        dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
+        drop_btree = dp->d_ops->node_tree_p(drop_node);
+        save_btree = dp->d_ops->node_tree_p(save_node);
+        tp = state->args->trans;
+        /*
+         * The dying block counts as "lower" if either its first or its
+         * last hashval is below the corresponding one in the survivor.
+         */
+        drop_is_lower =
+                (be32_to_cpu(drop_btree[0].hashval) <
+                        be32_to_cpu(save_btree[0].hashval)) ||
+                (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
+                        be32_to_cpu(save_btree[save_hdr.count - 1].hashval));
+        if (drop_is_lower) {
+                /*
+                 * Slide the survivor's entries up to make a hole at the
+                 * front, and log the whole combined range.
+                 */
+                /* XXX: check this - is memmove dst correct? */
+                nbytes = save_hdr.count * sizeof(xfs_da_node_entry_t);
+                memmove(&save_btree[drop_hdr.count], &save_btree[0], nbytes);
+                sindex = 0;
+                xfs_trans_log_buf(tp, save_blk->bp,
+                        XFS_DA_LOGRANGE(save_node, &save_btree[0],
+                                (save_hdr.count + drop_hdr.count) *
+                                                sizeof(xfs_da_node_entry_t)));
+        } else {
+                /*
+                 * Append after the survivor's entries; only the appended
+                 * range needs logging.
+                 */
+                sindex = save_hdr.count;
+                xfs_trans_log_buf(tp, save_blk->bp,
+                        XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
+                                drop_hdr.count * sizeof(xfs_da_node_entry_t)));
+        }
+        /*
+         * Bring the dying block's entries across and account for them in
+         * the survivor's header.
+         */
+        nbytes = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
+        memcpy(&save_btree[sindex], &drop_btree[0], nbytes);
+        save_hdr.count += drop_hdr.count;
+        dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
+        xfs_trans_log_buf(tp, save_blk->bp,
+                XFS_DA_LOGRANGE(save_node, &save_node->hdr,
+                                dp->d_ops->node_hdr_size));
+        /*
+         * Remember the survivor's last hashval for upward propagation.
+         */
+        save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
+}
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+/*
+ * Walk down the Btree looking for a particular filename, filling
+ * in the state structure as we go.
+ *
+ * We will set the state structure to point to each of the elements
+ * in each of the nodes where either the hashval is or should be.
+ *
+ * We support duplicate hashval's so for each entry in the current
+ * node that could contain the desired hashval, descend.  This is a
+ * pruned depth-first tree search.
+ *
+ * The outcome of the leaf-level lookup is stored in *result.  The
+ * function itself returns 0, an error from reading a block or shifting
+ * the path, or -EFSCORRUPTED if the walk ends on something that is not a
+ * leaf block (including running out of path slots before reaching one).
+ */
+int                                                     /* error */
+xfs_da3_node_lookup_int(
+        struct xfs_da_state     *state,
+        int                     *result)
+{
+        struct xfs_da_state_blk *blk;
+        struct xfs_da_blkinfo   *curr;
+        struct xfs_da_intnode   *node;
+        struct xfs_da_node_entry *btree;
+        struct xfs_da3_icnode_hdr nodehdr;
+        struct xfs_da_args      *args;
+        xfs_dablk_t             blkno;
+        xfs_dahash_t            hashval;
+        xfs_dahash_t            btreehashval;
+        int                     probe;
+        int                     span;
+        int                     max;
+        int                     error;
+        int                     retval;
+        struct xfs_inode        *dp = state->args->dp;
+        args = state->args;
+        /*
+         * Descend thru the B-tree searching each level for the right
+         * node to use, until the right hashval is found.
+         */
+        blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0;
+        for (blk = &state->path.blk[0], state->path.active = 1;
+                         state->path.active <= XFS_DA_NODE_MAXDEPTH;
+                         blk++, state->path.active++) {
+                /*
+                 * Read the next node down in the tree.
+                 */
+                blk->blkno = blkno;
+                error = xfs_da3_node_read(args->trans, args->dp, blkno,
+                                        -1, &blk->bp, args->whichfork);
+                if (error) {
+                        /* this slot was not filled in; drop it again */
+                        blk->blkno = 0;
+                        state->path.active--;
+                        return error;
+                }
+                curr = blk->bp->b_addr;
+                blk->magic = be16_to_cpu(curr->magic);
+                /*
+                 * Leaf blocks end the descent.  Both on-disk magic
+                 * variants are recorded under a single in-core value.
+                 */
+                if (blk->magic == XFS_ATTR_LEAF_MAGIC ||
+                    blk->magic == XFS_ATTR3_LEAF_MAGIC) {
+                        blk->magic = XFS_ATTR_LEAF_MAGIC;
+                        blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+                        break;
+                }
+                if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+                    blk->magic == XFS_DIR3_LEAFN_MAGIC) {
+                        blk->magic = XFS_DIR2_LEAFN_MAGIC;
+                        blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+                                                               blk->bp, NULL);
+                        break;
+                }
+                blk->magic = XFS_DA_NODE_MAGIC;
+                /*
+                 * Search an intermediate node for a match.
+                 */
+                node = blk->bp->b_addr;
+                dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+                btree = dp->d_ops->node_tree_p(node);
+                max = nodehdr.count;
+                blk->hashval = be32_to_cpu(btree[max - 1].hashval);
+                /*
+                 * Binary search.  (note: small blocks will skip loop)
+                 */
+                probe = span = max / 2;
+                hashval = args->hashval;
+                while (span > 4) {
+                        span /= 2;
+                        btreehashval = be32_to_cpu(btree[probe].hashval);
+                        if (btreehashval < hashval)
+                                probe += span;
+                        else if (btreehashval > hashval)
+                                probe -= span;
+                        else
+                                break;
+                }
+                ASSERT((probe >= 0) && (probe < max));
+                ASSERT((span <= 4) ||
+                        (be32_to_cpu(btree[probe].hashval) == hashval));
+                /*
+                 * Since we may have duplicate hashval's, find the first
+                 * matching hashval in the node.
+                 */
+                while (probe > 0 &&
+                       be32_to_cpu(btree[probe].hashval) >= hashval) {
+                        probe--;
+                }
+                while (probe < max &&
+                       be32_to_cpu(btree[probe].hashval) < hashval) {
+                        probe++;
+                }
+                /*
+                 * Pick the right block to descend on.
+                 */
+                if (probe == max) {
+                        blk->index = max - 1;
+                        blkno = be32_to_cpu(btree[max - 1].before);
+                } else {
+                        blk->index = probe;
+                        blkno = be32_to_cpu(btree[probe].before);
+                }
+        }
+        /*
+         * If the loop above ended on its depth limit rather than on a
+         * leaf, blk has been advanced to a slot this walk never filled in
+         * and path.active is one more than the number of slots used.
+         * Put path.active back to the number of blocks actually read and
+         * report the tree as corrupt instead of examining that slot.
+         */
+        if (state->path.active > XFS_DA_NODE_MAXDEPTH) {
+                state->path.active = XFS_DA_NODE_MAXDEPTH;
+                return -EFSCORRUPTED;
+        }
+        /*
+         * A leaf block that ends in the hashval that we are interested in
+         * (final hashval == search hashval) means that the next block may
+         * contain more entries with the same hashval, shift upward to the
+         * next leaf and keep searching.
+         */
+        for (;;) {
+                if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
+                        retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
+                                                        &blk->index, state);
+                } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+                        retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
+                        blk->index = args->index;
+                        args->blkno = blk->blkno;
+                } else {
+                        ASSERT(0);
+                        return -EFSCORRUPTED;
+                }
+                if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
+                    (blk->hashval == args->hashval)) {
+                        error = xfs_da3_path_shift(state, &state->path, 1, 1,
+                                                         &retval);
+                        if (error)
+                                return error;
+                        if (retval == 0) {
+                                continue;
+                        } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+                                /* path_shift() gives ENOENT */
+                                retval = -ENOATTR;
+                        }
+                }
+                break;
+        }
+        *result = retval;
+        return 0;
+}
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+/*
+ * Compare two intermediate nodes for "order".
+ *
+ * Returns 1 when both nodes have entries and either the first or the
+ * last hashval of node2 is lower than the corresponding hashval of
+ * node1; returns 0 otherwise.
+ */
+STATIC int
+xfs_da3_node_order(
+        struct xfs_inode *dp,
+        struct xfs_buf  *node1_bp,
+        struct xfs_buf  *node2_bp)
+{
+        struct xfs_da_intnode   *node1 = node1_bp->b_addr;
+        struct xfs_da_intnode   *node2 = node2_bp->b_addr;
+        struct xfs_da3_icnode_hdr node1hdr;
+        struct xfs_da3_icnode_hdr node2hdr;
+        struct xfs_da_node_entry *btree1;
+        struct xfs_da_node_entry *btree2;
+        dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
+        dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
+        btree1 = dp->d_ops->node_tree_p(node1);
+        btree2 = dp->d_ops->node_tree_p(node2);
+        /* An empty node on either side can never be "out of order". */
+        if (node1hdr.count == 0 || node2hdr.count == 0)
+                return 0;
+        if (be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval))
+                return 1;
+        if (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
+            be32_to_cpu(btree1[node1hdr.count - 1].hashval))
+                return 1;
+        return 0;
+}
+/*
+ * Splice new_blk into the doubly linked sibling chain next to old_blk.
+ *
+ * The ordering helper for the block type (attr leaf, dir leafn or da node)
+ * decides whether the new block is linked in before or after the existing
+ * one.  The neighbour on the far side of the insertion point, if any, is
+ * read in and re-pointed at the new block, and every modified block header
+ * is logged with xfs_trans_log_buf().
+ *
+ * Returns 0 on success, or the error from reading the neighbouring block.
+ */
+int                                                     /* error */
+xfs_da3_blk_link(
+        struct xfs_da_state     *state,
+        struct xfs_da_state_blk *old_blk,
+        struct xfs_da_state_blk *new_blk)
+{
+        struct xfs_da_args      *args = state->args;
+        struct xfs_inode        *dp = args->dp;
+        struct xfs_da_blkinfo   *cur_hdr;
+        struct xfs_da_blkinfo   *ins_hdr;
+        struct xfs_da_blkinfo   *sib_hdr;
+        struct xfs_buf          *sib_bp;
+        /* Last byte offset of the blkinfo header, for logging. */
+        unsigned int            hdr_last = sizeof(struct xfs_da_blkinfo) - 1;
+        int                     new_first = 0;
+        int                     error;
+        ASSERT(args != NULL);
+        cur_hdr = old_blk->bp->b_addr;
+        ins_hdr = new_blk->bp->b_addr;
+        ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
+               old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+               old_blk->magic == XFS_ATTR_LEAF_MAGIC);
+        /*
+         * Ask the block-type specific helper whether the new block has to
+         * be linked in ahead of the existing one.
+         */
+        if (old_blk->magic == XFS_ATTR_LEAF_MAGIC)
+                new_first = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
+        else if (old_blk->magic == XFS_DIR2_LEAFN_MAGIC)
+                new_first = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
+        else if (old_blk->magic == XFS_DA_NODE_MAGIC)
+                new_first = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
+        if (!new_first) {
+                /*
+                 * The new block follows the existing block; the old
+                 * forward sibling (if any) must now point back at it.
+                 */
+                trace_xfs_da_link_after(args);
+                ins_hdr->forw = cur_hdr->forw;
+                ins_hdr->back = cpu_to_be32(old_blk->blkno);
+                if (cur_hdr->forw) {
+                        error = xfs_da3_node_read(args->trans, dp,
+                                                be32_to_cpu(cur_hdr->forw),
+                                                -1, &sib_bp, args->whichfork);
+                        if (error)
+                                return error;
+                        ASSERT(sib_bp != NULL);
+                        sib_hdr = sib_bp->b_addr;
+                        ASSERT(sib_hdr->magic == cur_hdr->magic);
+                        ASSERT(be32_to_cpu(sib_hdr->back) == old_blk->blkno);
+                        sib_hdr->back = cpu_to_be32(new_blk->blkno);
+                        xfs_trans_log_buf(args->trans, sib_bp, 0, hdr_last);
+                }
+                cur_hdr->forw = cpu_to_be32(new_blk->blkno);
+        } else {
+                /*
+                 * The new block precedes the existing block; the old
+                 * backward sibling (if any) must now point forward at it.
+                 */
+                trace_xfs_da_link_before(args);
+                ins_hdr->forw = cpu_to_be32(old_blk->blkno);
+                ins_hdr->back = cur_hdr->back;
+                if (cur_hdr->back) {
+                        error = xfs_da3_node_read(args->trans, dp,
+                                                be32_to_cpu(cur_hdr->back),
+                                                -1, &sib_bp, args->whichfork);
+                        if (error)
+                                return error;
+                        ASSERT(sib_bp != NULL);
+                        sib_hdr = sib_bp->b_addr;
+                        ASSERT(sib_hdr->magic == cur_hdr->magic);
+                        ASSERT(be32_to_cpu(sib_hdr->forw) == old_blk->blkno);
+                        sib_hdr->forw = cpu_to_be32(new_blk->blkno);
+                        xfs_trans_log_buf(args->trans, sib_bp, 0, hdr_last);
+                }
+                cur_hdr->back = cpu_to_be32(new_blk->blkno);
+        }
+        /* Both the existing and the new block header were modified. */
+        xfs_trans_log_buf(args->trans, old_blk->bp, 0, hdr_last);
+        xfs_trans_log_buf(args->trans, new_blk->bp, 0, hdr_last);
+        return 0;
+}
+/*
+ * Take drop_blk out of the doubly linked sibling chain.
+ *
+ * save_blk is the neighbour that stays; it inherits drop_blk's link on the
+ * side facing away from save_blk.  The block on that far side, if there is
+ * one, is read in and re-pointed at save_blk.  The modified headers are
+ * logged with xfs_trans_log_buf(); drop_blk itself is not modified here.
+ *
+ * Returns 0 on success, or the error from reading the far-side block.
+ */
+STATIC int                                              /* error */
+xfs_da3_blk_unlink(
+        struct xfs_da_state     *state,
+        struct xfs_da_state_blk *drop_blk,
+        struct xfs_da_state_blk *save_blk)
+{
+        struct xfs_da_args      *args = state->args;
+        struct xfs_da_blkinfo   *gone_hdr;
+        struct xfs_da_blkinfo   *keep_hdr;
+        struct xfs_da_blkinfo   *far_hdr;
+        struct xfs_buf          *far_bp;
+        int                     error;
+        ASSERT(args != NULL);
+        keep_hdr = save_blk->bp->b_addr;
+        gone_hdr = drop_blk->bp->b_addr;
+        ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
+               save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+               save_blk->magic == XFS_ATTR_LEAF_MAGIC);
+        ASSERT(save_blk->magic == drop_blk->magic);
+        ASSERT((be32_to_cpu(keep_hdr->forw) == drop_blk->blkno) ||
+               (be32_to_cpu(keep_hdr->back) == drop_blk->blkno));
+        ASSERT((be32_to_cpu(gone_hdr->forw) == save_blk->blkno) ||
+               (be32_to_cpu(gone_hdr->back) == save_blk->blkno));
+        if (be32_to_cpu(keep_hdr->back) != drop_blk->blkno) {
+                /*
+                 * The dropped block is the forward sibling of the saved
+                 * block: bypass it in the forward direction.
+                 */
+                trace_xfs_da_unlink_forward(args);
+                keep_hdr->forw = gone_hdr->forw;
+                if (gone_hdr->forw) {
+                        error = xfs_da3_node_read(args->trans, args->dp,
+                                                be32_to_cpu(gone_hdr->forw),
+                                                -1, &far_bp, args->whichfork);
+                        if (error)
+                                return error;
+                        ASSERT(far_bp != NULL);
+                        far_hdr = far_bp->b_addr;
+                        ASSERT(far_hdr->magic == keep_hdr->magic);
+                        ASSERT(be32_to_cpu(far_hdr->back) == drop_blk->blkno);
+                        far_hdr->back = cpu_to_be32(save_blk->blkno);
+                        xfs_trans_log_buf(args->trans, far_bp, 0,
+                                          sizeof(*far_hdr) - 1);
+                }
+        } else {
+                /*
+                 * The dropped block is the backward sibling of the saved
+                 * block: bypass it in the backward direction.
+                 */
+                trace_xfs_da_unlink_back(args);
+                keep_hdr->back = gone_hdr->back;
+                if (gone_hdr->back) {
+                        error = xfs_da3_node_read(args->trans, args->dp,
+                                                be32_to_cpu(gone_hdr->back),
+                                                -1, &far_bp, args->whichfork);
+                        if (error)
+                                return error;
+                        ASSERT(far_bp != NULL);
+                        far_hdr = far_bp->b_addr;
+                        ASSERT(far_hdr->magic == keep_hdr->magic);
+                        ASSERT(be32_to_cpu(far_hdr->forw) == drop_blk->blkno);
+                        far_hdr->forw = cpu_to_be32(save_blk->blkno);
+                        xfs_trans_log_buf(args->trans, far_bp, 0,
+                                          sizeof(*far_hdr) - 1);
+                }
+        }
+        /* The saved block's header changed in either case. */
+        xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*keep_hdr) - 1);
+        return 0;
+}
+/*
+ * Move a path "forward" or "!forward" one block at the current level.
+ *
+ * This routine will adjust a "path" to point to the next block
+ * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
+ * Btree, including updating pointers to the intermediate nodes between
+ * the new bottom and the root.
+ *
+ * If "release" is nonzero, each buffer that is replaced in the path is
+ * handed to xfs_trans_brelse() once its replacement has been read.
+ *
+ * Returns 0 with *result == 0 when the path was shifted, 0 with
+ * *result == -ENOENT when there is no sibling in the requested direction,
+ * or the error from reading a block.  On a read error the path slot being
+ * processed still holds its previous, unreleased buffer and block number;
+ * *result is not written in that case.
+ */
+int                                                     /* error */
+xfs_da3_path_shift(
+        struct xfs_da_state     *state,
+        struct xfs_da_state_path *path,
+        int                     forward,
+        int                     release,
+        int                     *result)
+{
+        struct xfs_da_state_blk *blk = NULL;
+        struct xfs_da_blkinfo   *info;
+        struct xfs_da_intnode   *node;
+        struct xfs_da_args      *args;
+        struct xfs_da_node_entry *btree;
+        struct xfs_da3_icnode_hdr nodehdr;
+        struct xfs_buf          *bp;
+        xfs_dablk_t             blkno = 0;
+        int                     level;
+        int                     error;
+        struct xfs_inode        *dp = state->args->dp;
+        trace_xfs_da_path_shift(state->args);
+        /*
+         * Roll up the Btree looking for the first block where our
+         * current index is not at the edge of the block.  Note that
+         * we skip the bottom layer because we want the sibling block.
+         */
+        args = state->args;
+        ASSERT(args != NULL);
+        ASSERT(path != NULL);
+        ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+        level = (path->active-1) - 1;   /* skip bottom layer in path */
+        /*
+         * Compute blk inside the loop so that a single-level path
+         * (level == -1) never forms a pointer before the start of
+         * path->blk[].
+         */
+        for (; level >= 0; level--) {
+                blk = &path->blk[level];
+                node = blk->bp->b_addr;
+                dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+                btree = dp->d_ops->node_tree_p(node);
+                if (forward && (blk->index < nodehdr.count - 1)) {
+                        blk->index++;
+                        blkno = be32_to_cpu(btree[blk->index].before);
+                        break;
+                } else if (!forward && (blk->index > 0)) {
+                        blk->index--;
+                        blkno = be32_to_cpu(btree[blk->index].before);
+                        break;
+                }
+        }
+        if (level < 0) {
+                *result = -ENOENT;      /* we're out of our tree */
+                ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+                return 0;
+        }
+        /*
+         * Roll down the edge of the subtree until we reach the
+         * same depth we were at originally.
+         */
+        for (blk++, level++; level < path->active; blk++, level++) {
+                /*
+                 * Read the next child block into a local buffer first, so
+                 * that a failed read leaves this path slot untouched.
+                 */
+                error = xfs_da3_node_read(args->trans, dp, blkno, -1,
+                                        &bp, args->whichfork);
+                if (error)
+                        return error;
+                /*
+                 * Release the old block (if it's dirty, trans won't
+                 * actually let go) and only then install the new buffer
+                 * and block number in the path.
+                 */
+                if (release)
+                        xfs_trans_brelse(args->trans, blk->bp);
+                blk->blkno = blkno;
+                blk->bp = bp;
+                info = blk->bp->b_addr;
+                ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+                       info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+                       info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+                       info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+                       info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+                       info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+                /*
+                 * Note: we flatten the magic number to a single type so we
+                 * don't have to compare against crc/non-crc types elsewhere.
+                 */
+                switch (be16_to_cpu(info->magic)) {
+                case XFS_DA_NODE_MAGIC:
+                case XFS_DA3_NODE_MAGIC:
+                        blk->magic = XFS_DA_NODE_MAGIC;
+                        node = (xfs_da_intnode_t *)info;
+                        dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+                        btree = dp->d_ops->node_tree_p(node);
+                        blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+                        /*
+                         * Enter the child at the edge nearest the block we
+                         * came from: first entry when moving forward, last
+                         * entry when moving backward.
+                         */
+                        if (forward)
+                                blk->index = 0;
+                        else
+                                blk->index = nodehdr.count - 1;
+                        blkno = be32_to_cpu(btree[blk->index].before);
+                        break;
+                case XFS_ATTR_LEAF_MAGIC:
+                case XFS_ATTR3_LEAF_MAGIC:
+                        blk->magic = XFS_ATTR_LEAF_MAGIC;
+                        ASSERT(level == path->active-1);
+                        blk->index = 0;
+                        blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+                        break;
+                case XFS_DIR2_LEAFN_MAGIC:
+                case XFS_DIR3_LEAFN_MAGIC:
+                        blk->magic = XFS_DIR2_LEAFN_MAGIC;
+                        ASSERT(level == path->active-1);
+                        blk->index = 0;
+                        blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+                                                               blk->bp, NULL);
+                        break;
+                default:
+                        ASSERT(0);
+                        break;
+                }
+        }
+        *result = 0;
+        return 0;
+}
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+/*
+ * Simple rotate-and-XOR hash over a byte string.
+ *
+ * Conceptually each byte is folded in by rotating the running hash left by
+ * 7 bits and XORing the byte in.  The main loop handles four bytes per
+ * iteration (one 28-bit rotate plus four pre-shifted bytes); the one to
+ * three bytes left over are folded in the same way afterwards.
+ */
+xfs_dahash_t
+xfs_da_hashname(const __uint8_t *name, int namelen)
+{
+        xfs_dahash_t hash = 0;
+        /* Four bytes per step while at least four remain. */
+        while (namelen >= 4) {
+                hash = rol32(hash, 7 * 4) ^
+                       (name[0] << 21) ^ (name[1] << 14) ^
+                       (name[2] << 7) ^ (name[3] << 0);
+                name += 4;
+                namelen -= 4;
+        }
+        /* Fold in whatever is left (at most three bytes). */
+        if (namelen == 3)
+                return rol32(hash, 7 * 3) ^
+                       (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0);
+        if (namelen == 2)
+                return rol32(hash, 7 * 2) ^ (name[0] << 7) ^ (name[1] << 0);
+        if (namelen == 1)
+                return rol32(hash, 7 * 1) ^ (name[0] << 0);
+        /* Nothing left over. */
+        return hash;
+}
+/*
+ * Compare the name held in args against the supplied name.  The result is
+ * XFS_CMP_EXACT only when the lengths are equal and all bytes match;
+ * anything else is XFS_CMP_DIFFERENT.
+ */
+enum xfs_dacmp
+xfs_da_compname(
+        struct xfs_da_args *args,
+        const unsigned char *name,
+        int             len)
+{
+        /* A length mismatch settles it without looking at the bytes. */
+        if (args->namelen != len)
+                return XFS_CMP_DIFFERENT;
+        if (memcmp(args->name, name, len) != 0)
+                return XFS_CMP_DIFFERENT;
+        return XFS_CMP_EXACT;
+}
+/*
+ * Default hashname operation: hash the bytes of an xfs_name with
+ * xfs_da_hashname().
+ */
+static xfs_dahash_t
+xfs_default_hashname(
+        struct xfs_name *name)
+{
+        xfs_dahash_t    hash;
+        hash = xfs_da_hashname(name->name, name->len);
+        return hash;
+}
+/*
+ * Default name operations table: byte-exact name comparison together with
+ * the rotate-and-XOR name hash.
+ */
+const struct xfs_nameops xfs_default_nameops = {
+        .compname       = xfs_da_compname,
+        .hashname       = xfs_default_hashname,
+};
+/*
+ * Allocate and map "count" filesystem blocks in fork args->whichfork of
+ * inode args->dp.
+ *
+ * *bno is passed to xfs_bmap_first_unused() and on return holds the file
+ * offset at which the blocks were mapped (presumably the input value is the
+ * lower bound for the search -- confirm against xfs_bmap_first_unused()).
+ *
+ * A single contiguous extent is tried first.  If that yields no mapping and
+ * more than one block was requested, the allocation is retried without the
+ * CONTIG flag, collecting extents until the range is covered or no more
+ * mappings are returned.
+ *
+ * Returns 0 on success, -ENOSPC if the mappings obtained do not exactly
+ * cover [*bno, *bno + count), or the error from the bmap calls.  On success
+ * args->total is reduced by the growth in the inode's di_nblocks.
+ */
+int
+xfs_da_grow_inode_int(
+        struct xfs_da_args      *args,
+        xfs_fileoff_t           *bno,
+        int                     count)
+{
+        struct xfs_trans        *tp = args->trans;
+        struct xfs_inode        *dp = args->dp;
+        int                     w = args->whichfork;
+        /* Block count before allocation, used to account for the growth. */
+        xfs_drfsbno_t           nblks = dp->i_d.di_nblocks;
+        struct xfs_bmbt_irec    map, *mapp;
+        int                     nmap, error, got, i, mapi;
+        /*
+         * Find a spot in the file space to put the new block.
+         */
+        error = xfs_bmap_first_unused(tp, dp, count, bno, w);
+        if (error)
+                return error;
+        /*
+         * Try mapping it in one filesystem block.
+         */
+        nmap = 1;
+        ASSERT(args->firstblock != NULL);
+        error = xfs_bmapi_write(tp, dp, *bno, count,
+                        xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
+                        args->firstblock, args->total, &map, &nmap,
+                        args->flist);
+        if (error)
+                return error;
+        ASSERT(nmap <= 1);
+        if (nmap == 1) {
+                /* Got it in one extent; use the on-stack record. */
+                mapp = &map;
+                mapi = 1;
+        } else if (nmap == 0 && count > 1) {
+                xfs_fileoff_t           b;
+                int                     c;
+                /*
+                 * If we didn't get it and the block might work if fragmented,
+                 * try without the CONTIG flag.  Loop until we get it all.
+                 */
+                mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+                /*
+                 * b is the next file offset still to be mapped; mapi counts
+                 * the extent records filled in so far.
+                 */
+                for (b = *bno, mapi = 0; b < *bno + count; ) {
+                        nmap = MIN(XFS_BMAP_MAX_NMAP, count);
+                        /* Blocks remaining in the requested range. */
+                        c = (int)(*bno + count - b);
+                        error = xfs_bmapi_write(tp, dp, b, c,
+                                        xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+                                        args->firstblock, args->total,
+                                        &mapp[mapi], &nmap, args->flist);
+                        if (error)
+                                goto out_free_map;
+                        /* No mapping returned: stop; the check below fails. */
+                        if (nmap < 1)
+                                break;
+                        mapi += nmap;
+                        /* Continue right after the last extent obtained. */
+                        b = mapp[mapi - 1].br_startoff +
+                            mapp[mapi - 1].br_blockcount;
+                }
+        } else {
+                /* Nothing mapped and no fragmented retry possible. */
+                mapi = 0;
+                mapp = NULL;
+        }
+        /*
+         * Count the blocks we got, make sure it matches the total.
+         */
+        for (i = 0, got = 0; i < mapi; i++)
+                got += mapp[i].br_blockcount;
+        /*
+         * With mapi == 0, got is 0; for a nonzero count the first test is
+         * then true and short-circuits before mapp[] is dereferenced.
+         */
+        if (got != count || mapp[0].br_startoff != *bno ||
+            mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
+            *bno + count) {
+                error = -ENOSPC;
+                goto out_free_map;
+        }
+        /* account for newly allocated blocks in reserved blocks total */
+        args->total -= dp->i_d.di_nblocks - nblks;
+out_free_map:
+        /* Only the fragmented path (or the NULL case) is not on-stack. */
+        if (mapp != &map)
+                kmem_free(mapp);
+        return error;
+}
+/*
+ * Grow the inode by one directory/attribute btree block.
+ *
+ * The search for space starts at args->geo->leafblk and the block is
+ * args->geo->fsbcount filesystem blocks long.  On success the new block
+ * number is stored in *new_blkno; on failure *new_blkno is left untouched
+ * and the error is returned.
+ */
+int
+xfs_da_grow_inode(
+        struct xfs_da_args      *args,
+        xfs_dablk_t             *new_blkno)
+{
+        xfs_fileoff_t           off;
+        int                     error;
+        trace_xfs_da_grow_inode(args);
+        off = args->geo->leafblk;
+        error = xfs_da_grow_inode_int(args, &off, args->geo->fsbcount);
+        if (error)
+                return error;
+        *new_blkno = (xfs_dablk_t)off;
+        return 0;
+}
+/*
+ * Ick.  We need to always be able to remove a btree block, even
+ * if there's no space reservation because the filesystem is full.
+ * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
+ * It swaps the target block with the last block in the file.  The
+ * last block in the file can always be removed since it can't cause
+ * a bmap btree split to do that.
+ *
+ * On entry *dead_blknop / *dead_bufp identify the block the caller wants
+ * to remove.  The contents of the last btree block are copied over it, and
+ * the siblings and the parent that pointed at the last block are re-pointed
+ * at the dead block's number.  On success *dead_blknop / *dead_bufp are
+ * replaced with the last block, which the caller can then remove instead.
+ *
+ * Returns 0 on success, -EFSCORRUPTED if the on-disk linkage is not as
+ * expected, or the error from a block read.  On the error paths reached
+ * through "done" the last-block buffer is released and the out parameters
+ * are left unchanged.
+ */
+STATIC int
+xfs_da3_swap_lastblock(
+        struct xfs_da_args      *args,
+        xfs_dablk_t             *dead_blknop,
+        struct xfs_buf          **dead_bufp)
+{
+        struct xfs_da_blkinfo   *dead_info;
+        struct xfs_da_blkinfo   *sib_info;
+        struct xfs_da_intnode   *par_node;
+        struct xfs_da_intnode   *dead_node;
+        struct xfs_dir2_leaf    *dead_leaf2;
+        struct xfs_da_node_entry *btree;
+        struct xfs_da3_icnode_hdr par_hdr;
+        struct xfs_inode        *dp;
+        struct xfs_trans        *tp;
+        struct xfs_mount        *mp;
+        struct xfs_buf          *dead_buf;
+        struct xfs_buf          *last_buf;
+        struct xfs_buf          *sib_buf;
+        struct xfs_buf          *par_buf;
+        xfs_dahash_t            dead_hash;
+        xfs_fileoff_t           lastoff;
+        xfs_dablk_t             dead_blkno;
+        xfs_dablk_t             last_blkno;
+        xfs_dablk_t             sib_blkno;
+        xfs_dablk_t             par_blkno;
+        int                     error;
+        int                     w;
+        int                     entno;
+        int                     level;
+        int                     dead_level;
+        trace_xfs_da_swap_lastblock(args);
+        dead_buf = *dead_bufp;
+        dead_blkno = *dead_blknop;
+        tp = args->trans;
+        dp = args->dp;
+        w = args->whichfork;
+        ASSERT(w == XFS_DATA_FORK);
+        mp = dp->i_mount;
+        /*
+         * Find the end of the last mapped extent below the free-index
+         * block offset; that is where the btree space currently ends.
+         */
+        lastoff = args->geo->freeblk;
+        error = xfs_bmap_last_before(tp, dp, &lastoff, w);
+        if (error)
+                return error;
+        if (unlikely(lastoff == 0)) {
+                XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
+                                 mp);
+                return -EFSCORRUPTED;
+        }
+        /*
+         * Read the last block in the btree space.
+         */
+        last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
+        error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
+        if (error)
+                return error;
+        /*
+         * Copy the last block into the dead buffer and log it.
+         */
+        memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+        xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
+        dead_info = dead_buf->b_addr;
+        /*
+         * Get values from the moved block.
+         * dead_level is the moved block's level in the tree (0 for a leaf)
+         * and dead_hash is the hashval of its last entry; both are used
+         * below to locate its parent.
+         */
+        if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+            dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+                struct xfs_dir3_icleaf_hdr leafhdr;
+                struct xfs_dir2_leaf_entry *ents;
+                dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+                dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
+                ents = dp->d_ops->leaf_ents_p(dead_leaf2);
+                dead_level = 0;
+                dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
+        } else {
+                struct xfs_da3_icnode_hdr deadhdr;
+                dead_node = (xfs_da_intnode_t *)dead_info;
+                dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
+                btree = dp->d_ops->node_tree_p(dead_node);
+                dead_level = deadhdr.level;
+                dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
+        }
+        /* NULL means "nothing for the done: path to release". */
+        sib_buf = par_buf = NULL;
+        /*
+         * If the moved block has a left sibling, fix up the pointers.
+         */
+        if ((sib_blkno = be32_to_cpu(dead_info->back))) {
+                error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+                if (error)
+                        goto done;
+                sib_info = sib_buf->b_addr;
+                /* The sibling must still point at the old location. */
+                if (unlikely(
+                    be32_to_cpu(sib_info->forw) != last_blkno ||
+                    sib_info->magic != dead_info->magic)) {
+                        XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
+                                         XFS_ERRLEVEL_LOW, mp);
+                        error = -EFSCORRUPTED;
+                        goto done;
+                }
+                sib_info->forw = cpu_to_be32(dead_blkno);
+                xfs_trans_log_buf(tp, sib_buf,
+                        XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
+                                        sizeof(sib_info->forw)));
+                /* Logged now; keep the done: path from releasing it. */
+                sib_buf = NULL;
+        }
+        /*
+         * If the moved block has a right sibling, fix up the pointers.
+         */
+        if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
+                error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+                if (error)
+                        goto done;
+                sib_info = sib_buf->b_addr;
+                /* The sibling must still point at the old location. */
+                if (unlikely(
+                       be32_to_cpu(sib_info->back) != last_blkno ||
+                       sib_info->magic != dead_info->magic)) {
+                        XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
+                                         XFS_ERRLEVEL_LOW, mp);
+                        error = -EFSCORRUPTED;
+                        goto done;
+                }
+                sib_info->back = cpu_to_be32(dead_blkno);
+                xfs_trans_log_buf(tp, sib_buf,
+                        XFS_DA_LOGRANGE(sib_info, &sib_info->back,
+                                        sizeof(sib_info->back)));
+                /* Logged now; keep the done: path from releasing it. */
+                sib_buf = NULL;
+        }
+        /* Start the descent at the block at offset geo->leafblk. */
+        par_blkno = args->geo->leafblk;
+        level = -1;
+        /*
+         * Walk down the tree looking for the parent of the moved block.
+         */
+        for (;;) {
+                error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+                if (error)
+                        goto done;
+                par_node = par_buf->b_addr;
+                dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
+                /* Each step down must decrease the level by exactly one. */
+                if (level >= 0 && level != par_hdr.level + 1) {
+                        XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
+                                         XFS_ERRLEVEL_LOW, mp);
+                        error = -EFSCORRUPTED;
+                        goto done;
+                }
+                level = par_hdr.level;
+                btree = dp->d_ops->node_tree_p(par_node);
+                /* Skip entries whose hashval is below the moved block's. */
+                for (entno = 0;
+                     entno < par_hdr.count &&
+                     be32_to_cpu(btree[entno].hashval) < dead_hash;
+                     entno++)
+                        continue;
+                if (entno == par_hdr.count) {
+                        XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
+                                         XFS_ERRLEVEL_LOW, mp);
+                        error = -EFSCORRUPTED;
+                        goto done;
+                }
+                par_blkno = be32_to_cpu(btree[entno].before);
+                /* Stop one level above the moved block, buffer still held. */
+                if (level == dead_level + 1)
+                        break;
+                xfs_trans_brelse(tp, par_buf);
+                par_buf = NULL;
+        }
+        /*
+         * We're in the right parent block.
+         * Look for the right entry.
+         * Entries are scanned for a child pointer equal to the old location;
+         * if this node has none, the search continues in its forward
+         * sibling, which must be at the same level.
+         */
+        for (;;) {
+                for (;
+                     entno < par_hdr.count &&
+                     be32_to_cpu(btree[entno].before) != last_blkno;
+                     entno++)
+                        continue;
+                if (entno < par_hdr.count)
+                        break;
+                par_blkno = par_hdr.forw;
+                xfs_trans_brelse(tp, par_buf);
+                par_buf = NULL;
+                /* Ran off the end of the level without finding it. */
+                if (unlikely(par_blkno == 0)) {
+                        XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
+                                         XFS_ERRLEVEL_LOW, mp);
+                        error = -EFSCORRUPTED;
+                        goto done;
+                }
+                error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+                if (error)
+                        goto done;
+                par_node = par_buf->b_addr;
+                dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
+                if (par_hdr.level != level) {
+                        XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
+                                         XFS_ERRLEVEL_LOW, mp);
+                        error = -EFSCORRUPTED;
+                        goto done;
+                }
+                btree = dp->d_ops->node_tree_p(par_node);
+                entno = 0;
+        }
+        /*
+         * Update the parent entry pointing to the moved block.
+         */
+        btree[entno].before = cpu_to_be32(dead_blkno);
+        xfs_trans_log_buf(tp, par_buf,
+                XFS_DA_LOGRANGE(par_node, &btree[entno].before,
+                                sizeof(btree[entno].before)));
+        /* Hand the (now redundant) last block back as the one to remove. */
+        *dead_blknop = last_blkno;
+        *dead_bufp = last_buf;
+        return 0;
+done:
+        /* Error exit: release whatever is still held and unlogged. */
+        if (par_buf)
+                xfs_trans_brelse(tp, par_buf);
+        if (sib_buf)
+                xfs_trans_brelse(tp, sib_buf);
+        xfs_trans_brelse(tp, last_buf);
+        return error;
+}
+/*
+ * Remove a btree block from a directory or attribute.
+ *
+ * Unmaps the args->geo->fsbcount filesystem blocks at dead_blkno from fork
+ * args->whichfork.  If the unmap fails with -ENOSPC on the data fork, the
+ * last block of the btree space is swapped into the dead block's place and
+ * the unmap is retried on that last block instead.
+ *
+ * The buffer of whichever block ends up being the dead one is invalidated
+ * with xfs_trans_binval() on every exit, success or failure.
+ *
+ * Returns 0 on success or the error from the unmap / swap.
+ */
+int
+xfs_da_shrink_inode(
+        xfs_da_args_t   *args,
+        xfs_dablk_t     dead_blkno,
+        struct xfs_buf  *dead_buf)
+{
+        xfs_inode_t *dp;
+        int done, error, w, count;
+        xfs_trans_t *tp;
+        trace_xfs_da_shrink_inode(args);
+        dp = args->dp;
+        w = args->whichfork;
+        tp = args->trans;
+        count = args->geo->fsbcount;
+        for (;;) {
+                /*
+                 * Remove extents.  If we get ENOSPC for a dir we have to move
+                 * the last block to the place we want to kill.
+                 */
+                error = xfs_bunmapi(tp, dp, dead_blkno, count,
+                                    xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+                                    0, args->firstblock, args->flist, &done);
+                if (error == -ENOSPC) {
+                        /* The swap is only done for the data fork. */
+                        if (w != XFS_DATA_FORK)
+                                break;
+                        /*
+                         * On success dead_blkno/dead_buf now name the last
+                         * block; loop round and unmap that one instead.
+                         */
+                        error = xfs_da3_swap_lastblock(args, &dead_blkno,
+                                                      &dead_buf);
+                        if (error)
+                                break;
+                } else {
+                        break;
+                }
+        }
+        xfs_trans_binval(tp, dead_buf);
+        return error;
+}
+/*
+ * Check that the nmap extent records in mapp[] describe the file range
+ * [bno, bno + count) completely: no record may be a hole or a delayed
+ * allocation, each record must start where the previous one ended (the
+ * first at bno), and together they must end exactly at bno + count.
+ *
+ * Returns nonzero if the range is covered, 0 otherwise.
+ */
+STATIC int
+xfs_da_map_covers_blocks(
+        int             nmap,
+        xfs_bmbt_irec_t *mapp,
+        xfs_dablk_t     bno,
+        int             count)
+{
+        xfs_fileoff_t   next = bno;     /* offset the next record must have */
+        int             idx = 0;
+        while (idx < nmap) {
+                xfs_bmbt_irec_t *rec = &mapp[idx];
+                /* Unallocated space cannot back a btree block. */
+                if (rec->br_startblock == HOLESTARTBLOCK)
+                        return 0;
+                if (rec->br_startblock == DELAYSTARTBLOCK)
+                        return 0;
+                /* A gap or overlap in file offsets breaks contiguity. */
+                if (rec->br_startoff != next)
+                        return 0;
+                next += rec->br_blockcount;
+                idx++;
+        }
+        return next == bno + count;
+}
+/*
+ * Translate an array of extent records (struct xfs_bmbt_irec) into buffer
+ * map entries (struct xfs_buf_map).
+ *
+ * On entry *mapp must point at one caller-provided xfs_buf_map and *nmaps
+ * must be 1.  With a single extent record that caller-provided map is
+ * filled in.  With more than one record a zeroed array of nirecs maps is
+ * allocated, *mapp is redirected to it and all entries are filled in.
+ * In both cases *nmaps is set to nirecs.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation returned NULL.
+ */
+static int
+xfs_buf_map_from_irec(
+        struct xfs_mount        *mp,
+        struct xfs_buf_map      **mapp,
+        int                     *nmaps,
+        struct xfs_bmbt_irec    *irecs,
+        int                     nirecs)
+{
+        struct xfs_buf_map      *out;
+        struct xfs_bmbt_irec    *rec;
+        int                     idx;
+        ASSERT(*nmaps == 1);
+        ASSERT(nirecs >= 1);
+        if (nirecs > 1) {
+                /* The caller's single map is too small; allocate an array. */
+                out = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
+                                  KM_SLEEP | KM_NOFS);
+                if (!out)
+                        return -ENOMEM;
+                *mapp = out;
+        } else {
+                out = *mapp;
+        }
+        *nmaps = nirecs;
+        /* Convert each extent from filesystem blocks to disk units. */
+        for (idx = 0, rec = irecs; idx < nirecs; idx++, rec++) {
+                ASSERT(rec->br_startblock != DELAYSTARTBLOCK &&
+                       rec->br_startblock != HOLESTARTBLOCK);
+                out[idx].bm_bn = XFS_FSB_TO_DADDR(mp, rec->br_startblock);
+                out[idx].bm_len = XFS_FSB_TO_BB(mp, rec->br_blockcount);
+        }
+        return 0;
+}
+/*
+ * Map the block we are given ready for reading. There are three possible return
+ * values:
+ *      -1 - will be returned if we land in a hole and mappedbno == -2 so the
+ *           caller knows not to execute a subsequent read.
+ *       0 - if we mapped the block successfully
+ *      <0 - negative error number (e.g. -EFSCORRUPTED) if there was an error.
+ */
+static int
+xfs_dabuf_map(
+        struct xfs_inode        *dp,
+        xfs_dablk_t             bno,
+        xfs_daddr_t             mappedbno,
+        int                     whichfork,
+        struct xfs_buf_map      **map,
+        int                     *nmaps)
+{
+        struct xfs_mount        *mp = dp->i_mount;
+        int                     nfsb;
+        int                     error = 0;
+        struct xfs_bmbt_irec    irec;
+        struct xfs_bmbt_irec    *irecs = &irec;
+        int                     nirecs;
+        ASSERT(map && *map);
+        ASSERT(*nmaps == 1);
+        if (whichfork == XFS_DATA_FORK)
+                nfsb = mp->m_dir_geo->fsbcount;
+        else
+                nfsb = mp->m_attr_geo->fsbcount;
+        /*
+         * Caller doesn't have a mapping.  -2 means don't complain
+         * if we land in a hole.
+         */
+        if (mappedbno == -1 || mappedbno == -2) {
+                /*
+                 * Optimize the one-block case.
+                 */
+                if (nfsb != 1)
+                        irecs = kmem_zalloc(sizeof(irec) * nfsb,
+                                            KM_SLEEP | KM_NOFS);
+                nirecs = nfsb;
+                error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
+                                       &nirecs, xfs_bmapi_aflag(whichfork));
+                if (error)
+                        goto out;
+        } else {
+                irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
+                irecs->br_startoff = (xfs_fileoff_t)bno;
+                irecs->br_blockcount = nfsb;
+                irecs->br_state = 0;
+                nirecs = 1;
+        }
+        if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
+                error = mappedbno == -2 ? -1 : -EFSCORRUPTED;
+                if (unlikely(error == -EFSCORRUPTED)) {
+                        if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+                                int i;
+                                xfs_alert(mp, "%s: bno %lld dir: inode %lld",
+                                        __func__, (long long)bno,
+                                        (long long)dp->i_ino);
+                                for (i = 0; i < *nmaps; i++) {
+                                        xfs_alert(mp,
+"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
+                                                i,
+                                                (long long)irecs[i].br_startoff,
+                                                (long long)irecs[i].br_startblock,
+                                                (long long)irecs[i].br_blockcount,
+                                                irecs[i].br_state);
+                                }
+                        }
+                        XFS_ERROR_REPORT("xfs_da_do_buf(1)",
+                                         XFS_ERRLEVEL_LOW, mp);
+                }
+                goto out;
+        }
+        error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
+out:
+        if (irecs != &irec)
+                kmem_free(irecs);
+        return error;
+}
+/*
+ * Get a buffer for the dir/attr block.
+ *
+ * *bpp is cleared up front and only set once a buffer without an error has
+ * been obtained.  A return of -1 from xfs_dabuf_map() (hole) is reported as
+ * success with *bpp left NULL.  Returns 0, the mapping error, the buffer's
+ * b_error, or -EIO if no buffer could be obtained.
+ */
+int
+xfs_da_get_buf(
+        struct xfs_trans        *trans,
+        struct xfs_inode        *dp,
+        xfs_dablk_t             bno,
+        xfs_daddr_t             mappedbno,
+        struct xfs_buf          **bpp,
+        int                     whichfork)
+{
+        struct xfs_buf          *bp;
+        struct xfs_buf_map      map;
+        struct xfs_buf_map      *mapp;
+        int                     nmap;
+        int                     error;
+        *bpp = NULL;
+        mapp = &map;
+        nmap = 1;
+        error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
+                                &mapp, &nmap);
+        if (error) {
+                /* mapping a hole is not an error, but we don't continue */
+                if (error == -1)
+                        error = 0;
+                goto out_free;
+        }
+        bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
+                                    mapp, nmap, 0);
+        error = bp ? bp->b_error : -EIO;
+        if (error) {
+                /*
+                 * bp is NULL on the -EIO path; there is nothing to release
+                 * then, so never hand a NULL buffer to xfs_trans_brelse().
+                 */
+                if (bp)
+                        xfs_trans_brelse(trans, bp);
+                goto out_free;
+        }
+        *bpp = bp;
+out_free:
+        /* the mapping code may have replaced our on-stack map */
+        if (mapp != &map)
+                kmem_free(mapp);
+        return error;
+}
+/*
+ * Get a buffer for the dir/attr block, fill in the contents.
+ *
+ * *bpp is cleared up front and only set once the read has succeeded.  A
+ * return of -1 from xfs_dabuf_map() (hole) is reported as success with *bpp
+ * left NULL.  Otherwise returns 0 or the error from the mapping or the read.
+ */
+int
+xfs_da_read_buf(
+        struct xfs_trans        *trans,
+        struct xfs_inode        *dp,
+        xfs_dablk_t             bno,
+        xfs_daddr_t             mappedbno,
+        struct xfs_buf          **bpp,
+        int                     whichfork,
+        const struct xfs_buf_ops *ops)
+{
+        struct xfs_buf_map      onemap;
+        struct xfs_buf_map      *maps = &onemap;
+        struct xfs_buf          *bp;
+        int                     nmaps = 1;
+        int                     error;
+        *bpp = NULL;
+        error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, &maps, &nmaps);
+        if (error == -1) {
+                /* mapping a hole is not an error, but we don't continue */
+                error = 0;
+                goto out_free;
+        }
+        if (error)
+                goto out_free;
+        error = xfs_trans_read_buf_map(dp->i_mount, trans,
+                                        dp->i_mount->m_ddev_targp,
+                                        maps, nmaps, 0, &bp, ops);
+        if (error)
+                goto out_free;
+        /* pick the reference value by fork before handing the buffer back */
+        xfs_buf_set_ref(bp, whichfork == XFS_ATTR_FORK ?
+                                XFS_ATTR_BTREE_REF : XFS_DIR_BTREE_REF);
+        *bpp = bp;
+out_free:
+        /* the mapping code may have replaced our on-stack map */
+        if (maps != &onemap)
+                kmem_free(maps);
+        return error;
+}
+/*
+ * Readahead the dir/attr block.
+ *
+ * Returns the disk address of the first mapping when readahead was issued,
+ * -1 if the mapping failed, and the caller's @mappedbno unchanged when
+ * xfs_dabuf_map() reported a hole (its -1 return).
+ */
+xfs_daddr_t
+xfs_da_reada_buf(
+        struct xfs_inode        *dp,
+        xfs_dablk_t             bno,
+        xfs_daddr_t             mappedbno,
+        int                     whichfork,
+        const struct xfs_buf_ops *ops)
+{
+        struct xfs_buf_map      onemap;
+        struct xfs_buf_map      *maps = &onemap;
+        int                     nmaps = 1;
+        int                     error;
+        error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, &maps, &nmaps);
+        if (!error) {
+                mappedbno = maps[0].bm_bn;
+                xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, maps, nmaps,
+                                      ops);
+        } else if (error != -1) {
+                /* a real mapping failure; a hole (-1) is not an error */
+                mappedbno = -1;
+        }
+        if (maps != &onemap)
+                kmem_free(maps);
+        return mappedbno;
+}
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
new file mode 100644
index 000000000000..6e153e399a77
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DA_BTREE_H__
+#define __XFS_DA_BTREE_H__
+struct xfs_bmap_free;
+struct xfs_inode;
+struct xfs_trans;
+struct zone;
+struct xfs_dir_ops;
+/*
+ * Directory/attribute geometry information. There will be one of these for each
+ * data fork type, and it will be passed around via the xfs_da_args. Global
+ * structures will be attached to the xfs_mount.
+ *
+ * The first four fields describe the da block size in different units, the
+ * next two are derived limits, and the last three are da block numbers.
+ */
+struct xfs_da_geometry {
+        int             blksize;        /* da block size in bytes */
+        int             fsbcount;       /* da block size in filesystem blocks */
+        uint8_t         fsblog;         /* log2 of _filesystem_ block size */
+        uint8_t         blklog;         /* log2 of da block size */
+        uint            node_ents;      /* # of entries in a danode */
+        int             magicpct;       /* 37% of block size in bytes */
+        xfs_dablk_t     datablk;        /* blockno of dir data v2 */
+        xfs_dablk_t     leafblk;        /* blockno of leaf data v2 */
+        xfs_dablk_t     freeblk;        /* blockno of free data v2 */
+};
+/*========================================================================
+ * Btree searching and modification structure definitions.
+ *========================================================================*/
+/*
+ * Search comparison results
+ *
+ * No explicit values are given, so XFS_CMP_DIFFERENT is 0 and the other two
+ * results are non-zero.
+ */
+enum xfs_dacmp {
+        XFS_CMP_DIFFERENT,      /* names are completely different */
+        XFS_CMP_EXACT,          /* names are exactly the same */
+        XFS_CMP_CASE            /* names are same but differ in case */
+};
+/*
+ * Structure to ease passing around component names.
+ *
+ * Besides the name/value pair it carries the geometry, inode, transaction and
+ * bmap bookkeeping for the operation, plus two sets of block/index/remote
+ * value fields (the unsuffixed set and the "2" set).
+ */
+typedef struct xfs_da_args {
+        struct xfs_da_geometry *geo;    /* da block geometry */
+        const __uint8_t *name;          /* string (maybe not NUL terminated) */
+        int             namelen;        /* length of string (maybe no NUL) */
+        __uint8_t       filetype;       /* filetype of inode for directories */
+        __uint8_t       *value;         /* set of bytes (maybe contain NULs) */
+        int             valuelen;       /* length of value */
+        int             flags;          /* argument flags (eg: ATTR_NOCREATE) */
+        xfs_dahash_t    hashval;        /* hash value of name */
+        xfs_ino_t       inumber;        /* input/output inode number */
+        struct xfs_inode *dp;           /* directory inode to manipulate */
+        xfs_fsblock_t   *firstblock;    /* ptr to firstblock for bmap calls */
+        struct xfs_bmap_free *flist;    /* ptr to freelist for bmap_finish */
+        struct xfs_trans *trans;        /* current trans (changes over time) */
+        xfs_extlen_t    total;          /* total blocks needed, for 1st bmap */
+        int             whichfork;      /* data or attribute fork */
+        xfs_dablk_t     blkno;          /* blkno of attr leaf of interest */
+        int             index;          /* index of attr of interest in blk */
+        xfs_dablk_t     rmtblkno;       /* remote attr value starting blkno */
+        int             rmtblkcnt;      /* remote attr value block count */
+        int             rmtvaluelen;    /* remote attr value length in bytes */
+        xfs_dablk_t     blkno2;         /* blkno of 2nd attr leaf of interest */
+        int             index2;         /* index of 2nd attr in blk */
+        xfs_dablk_t     rmtblkno2;      /* remote attr value starting blkno */
+        int             rmtblkcnt2;     /* remote attr value block count */
+        int             rmtvaluelen2;   /* remote attr value length in bytes */
+        int             op_flags;       /* operation flags */
+        enum xfs_dacmp  cmpresult;      /* name compare result for lookups */
+} xfs_da_args_t;
+/*
+ * Operation flags:
+ *
+ * Each flag is a distinct single bit.  XFS_DA_OP_FLAGS expands to a
+ * comma-separated list of { flag, "name" } initialisers, one per flag, so it
+ * must be kept in step with the defines (presumably consumed by flag-printing
+ * code elsewhere - confirm against the users).
+ */
+#define XFS_DA_OP_JUSTCHECK     0x0001  /* check for ok with no space */
+#define XFS_DA_OP_RENAME        0x0002  /* this is an atomic rename op */
+#define XFS_DA_OP_ADDNAME       0x0004  /* this is an add operation */
+#define XFS_DA_OP_OKNOENT       0x0008  /* lookup/add op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP      0x0010  /* lookup to return CI name if found */
+#define XFS_DA_OP_FLAGS \
+        { XFS_DA_OP_JUSTCHECK,  "JUSTCHECK" }, \
+        { XFS_DA_OP_RENAME,     "RENAME" }, \
+        { XFS_DA_OP_ADDNAME,    "ADDNAME" }, \
+        { XFS_DA_OP_OKNOENT,    "OKNOENT" }, \
+        { XFS_DA_OP_CILOOKUP,   "CILOOKUP" }
+/*
+ * Storage for holding state during Btree searches and split/join ops.
+ *
+ * Only need space for 5 intermediate nodes.  With a minimum of 62-way
+ * fanout to the Btree, we can support over 900 million directory blocks,
+ * which is slightly more than enough.
+ *
+ * One xfs_da_state_blk describes a single block: its buffer, where it lives
+ * (both as a da block number and as a disk address), the index of interest
+ * within it, and its last hash value and magic number.
+ */
+typedef struct xfs_da_state_blk {
+        struct xfs_buf  *bp;            /* buffer containing block */
+        xfs_dablk_t     blkno;          /* filesystem blkno of buffer */
+        xfs_daddr_t     disk_blkno;     /* on-disk blkno (in BBs) of buffer */
+        int             index;          /* relevant index into block */
+        xfs_dahash_t    hashval;        /* last hash value in block */
+        int             magic;          /* blk's magic number, ie: blk type */
+} xfs_da_state_blk_t;
+/*
+ * A path through the Btree: up to XFS_DA_NODE_MAXDEPTH per-block states, of
+ * which the first "active" entries are in use.
+ */
+typedef struct xfs_da_state_path {
+        int                     active;         /* number of active levels */
+        xfs_da_state_blk_t      blk[XFS_DA_NODE_MAXDEPTH];
+} xfs_da_state_path_t;
+/*
+ * Complete state for one Btree operation: the arguments, the mount, the
+ * primary and alternate paths, and the bookkeeping for an extra block.
+ */
+typedef struct xfs_da_state {
+        xfs_da_args_t           *args;          /* filename arguments */
+        struct xfs_mount        *mp;            /* filesystem mount point */
+        xfs_da_state_path_t     path;           /* search/split paths */
+        xfs_da_state_path_t     altpath;        /* alternate path for join */
+        unsigned char           inleaf;         /* insert into 1->lf, 0->splf */
+        unsigned char           extravalid;     /* T/F: extrablk is in use */
+        unsigned char           extraafter;     /* T/F: extrablk is after new */
+        xfs_da_state_blk_t      extrablk;       /* for double-splits on leaves */
+                                                /* for dirv2 extrablk is data */
+} xfs_da_state_t;
+/*
+ * Utility macros to aid in logging changed structure fields.
+ *
+ * XFS_DA_LOGOFF gives the byte offset of ADDR from BASE.  XFS_DA_LOGRANGE
+ * expands to TWO comma-separated expressions - the offset of the first byte
+ * and the offset of the last byte (first + SIZE - 1) - so it is only usable
+ * where two consecutive arguments are expected.
+ */
+#define XFS_DA_LOGOFF(BASE, ADDR)       ((char *)(ADDR) - (char *)(BASE))
+#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE)       \
+                (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
+                (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
+/*
+ * Name ops for directory and/or attr name operations
+ *
+ * hashname produces the hash value for a name; compname compares the name
+ * carried in the xfs_da_args against the given name/length and reports the
+ * result as an enum xfs_dacmp.
+ */
+struct xfs_nameops {
+        xfs_dahash_t    (*hashname)(struct xfs_name *);
+        enum xfs_dacmp  (*compname)(struct xfs_da_args *,
+                                        const unsigned char *, int);
+};
+/*========================================================================
+ * Function prototypes.
+ *========================================================================*/
+/*
+ * Routines used for growing the Btree.
+ */
+int     xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
+                            int level, struct xfs_buf **bpp, int whichfork);
+int     xfs_da3_split(xfs_da_state_t *state);
+/*
+ * Routines used for shrinking the Btree.
+ */
+int     xfs_da3_join(xfs_da_state_t *state);
+/* @path_to_fix: the path whose hash values are to be fixed up */
+void    xfs_da3_fixhashpath(struct xfs_da_state *state,
+                            struct xfs_da_state_path *path_to_fix);
+/*
+ * Routines used for finding things in the Btree.
+ */
+int     xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
+int     xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+                                         int forward, int release, int *result);
+/*
+ * Utility routines.
+ */
+int     xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+                                       xfs_da_state_blk_t *new_blk);
+int     xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
+                         xfs_dablk_t bno, xfs_daddr_t mappedbno,
+                         struct xfs_buf **bpp, int which_fork);
+/*
+ * Utility routines.
+ */
+int     xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
+int     xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
+                              int count);
+int     xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+                              xfs_dablk_t bno, xfs_daddr_t mappedbno,
+                              struct xfs_buf **bp, int whichfork);
+int     xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+                               xfs_dablk_t bno, xfs_daddr_t mappedbno,
+                               struct xfs_buf **bpp, int whichfork,
+                               const struct xfs_buf_ops *ops);
+xfs_daddr_t     xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
+                                xfs_daddr_t mapped_bno, int whichfork,
+                                const struct xfs_buf_ops *ops);
+int     xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+                                          struct xfs_buf *dead_buf);
+uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
+enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
+                                const unsigned char *name, int len);
+xfs_da_state_t *xfs_da_state_alloc(void);
+void xfs_da_state_free(xfs_da_state_t *state);
+extern struct kmem_zone *xfs_da_state_zone;
+extern const struct xfs_nameops xfs_default_nameops;
+#endif  /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
new file mode 100644
index 000000000000..c9aee52a37e2
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+/*
+ * Shortform directory ops
+ */
+/*
+ * Size in bytes of a shortform entry with a @len byte name: the fixed part of
+ * the entry (namelen + offset), the name itself, and an inode number whose
+ * width is selected by hdr->i8count.
+ */
+static int
+xfs_dir2_sf_entsize(
+        struct xfs_dir2_sf_hdr  *hdr,
+        int                     len)
+{
+        int     inosize;
+        if (hdr->i8count)
+                inosize = sizeof(xfs_dir2_ino8_t);
+        else
+                inosize = sizeof(xfs_dir2_ino4_t);
+        return (int)sizeof(struct xfs_dir2_sf_entry) + len + inosize;
+}
+/* As xfs_dir2_sf_entsize(), plus room for one __uint8_t. */
+static int
+xfs_dir3_sf_entsize(
+        struct xfs_dir2_sf_hdr  *hdr,
+        int                     len)
+{
+        int     v2size = xfs_dir2_sf_entsize(hdr, len);
+        return v2size + sizeof(__uint8_t);
+}
+/* Step over @sfep using its own namelen and the dir2 entry size. */
+static struct xfs_dir2_sf_entry *
+xfs_dir2_sf_nextentry(
+        struct xfs_dir2_sf_hdr  *hdr,
+        struct xfs_dir2_sf_entry *sfep)
+{
+        char    *cur = (char *)sfep;
+        int     step = xfs_dir2_sf_entsize(hdr, sfep->namelen);
+        return (struct xfs_dir2_sf_entry *)(cur + step);
+}
+/* Step over @sfep using its own namelen and the dir3 entry size. */
+static struct xfs_dir2_sf_entry *
+xfs_dir3_sf_nextentry(
+        struct xfs_dir2_sf_hdr  *hdr,
+        struct xfs_dir2_sf_entry *sfep)
+{
+        char    *cur = (char *)sfep;
+        int     step = xfs_dir3_sf_entsize(hdr, sfep->namelen);
+        return (struct xfs_dir2_sf_entry *)(cur + step);
+}
+/*
+ * For filetype enabled shortform directories, the file type field is stored at
+ * the end of the name.  Because it's only a single byte, endian conversion is
+ * not necessary. For non-filetype enabled directories, the type is always
+ * unknown and we never store the value.
+ */
+/* Never looks at @sfep: the answer is always XFS_DIR3_FT_UNKNOWN. */
+static __uint8_t
+xfs_dir2_sfe_get_ftype(
+        struct xfs_dir2_sf_entry *sfep)
+{
+        const __uint8_t ftype = XFS_DIR3_FT_UNKNOWN;
+        return ftype;
+}
+/*
+ * Counterpart of xfs_dir2_sfe_get_ftype(): nothing is written to @sfep, the
+ * only effect is the ASSERT that @ftype is below XFS_DIR3_FT_MAX.
+ */
+static void
+xfs_dir2_sfe_put_ftype(
+        struct xfs_dir2_sf_entry *sfep,
+        __uint8_t               ftype)
+{
+        ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+/*
+ * Read the byte that follows the name; values at or above XFS_DIR3_FT_MAX are
+ * reported as XFS_DIR3_FT_UNKNOWN.
+ */
+static __uint8_t
+xfs_dir3_sfe_get_ftype(
+        struct xfs_dir2_sf_entry *sfep)
+{
+        __uint8_t       stored = sfep->name[sfep->namelen];
+        return stored < XFS_DIR3_FT_MAX ? stored : XFS_DIR3_FT_UNKNOWN;
+}
+/* Store @ftype in the byte that follows the name. */
+static void
+xfs_dir3_sfe_put_ftype(
+        struct xfs_dir2_sf_entry *sfep,
+        __uint8_t               ftype)
+{
+        int     pos = sfep->namelen;
+        ASSERT(ftype < XFS_DIR3_FT_MAX);
+        sfep->name[pos] = ftype;
+}
+/*
+ * Inode numbers in short-form directories can come in two versions,
+ * either 4 bytes or 8 bytes wide.  These helpers deal with the
+ * two forms transparently by looking at the header's i8count field.
+ *
+ * For 64-bit inode number the most significant byte must be zero.
+ */
+/*
+ * Decode an inode number from @from.  With hdr->i8count set the 8-byte form
+ * is read and its top byte masked off; otherwise the 4-byte form is read.
+ */
+static xfs_ino_t
+xfs_dir2_sf_get_ino(
+        struct xfs_dir2_sf_hdr  *hdr,
+        xfs_dir2_inou_t         *from)
+{
+        if (!hdr->i8count)
+                return get_unaligned_be32(&from->i4.i);
+        return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
+}
+/*
+ * Encode @ino into @to, in the 8-byte form when hdr->i8count is set and the
+ * 4-byte form otherwise.  The top byte of @ino must be zero.
+ */
+static void
+xfs_dir2_sf_put_ino(
+        struct xfs_dir2_sf_hdr  *hdr,
+        xfs_dir2_inou_t         *to,
+        xfs_ino_t               ino)
+{
+        ASSERT((ino & 0xff00000000000000ULL) == 0);
+        if (!hdr->i8count) {
+                put_unaligned_be32(ino, &to->i4.i);
+                return;
+        }
+        put_unaligned_be64(ino, &to->i8.i);
+}
+/* Decode the inode number held in the header's parent field. */
+static xfs_ino_t
+xfs_dir2_sf_get_parent_ino(
+        struct xfs_dir2_sf_hdr  *hdr)
+{
+        xfs_dir2_inou_t *parent = &hdr->parent;
+        return xfs_dir2_sf_get_ino(hdr, parent);
+}
+/* Encode @ino into the header's parent field. */
+static void
+xfs_dir2_sf_put_parent_ino(
+        struct xfs_dir2_sf_hdr  *hdr,
+        xfs_ino_t               ino)
+{
+        xfs_dir2_inou_t *parent = &hdr->parent;
+        xfs_dir2_sf_put_ino(hdr, parent, ino);
+}
+/*
+ * In short-form directory entries the inode numbers are stored at variable
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. Hence the inode numbers may only
+ * be accessed through the helpers below.
+ */
+/* dir2 layout: the inode number starts immediately after the name. */
+static xfs_ino_t
+xfs_dir2_sfe_get_ino(
+        struct xfs_dir2_sf_hdr  *hdr,
+        struct xfs_dir2_sf_entry *sfep)
+{
+        xfs_dir2_inou_t *inop;
+        inop = (xfs_dir2_inou_t *)&sfep->name[sfep->namelen];
+        return xfs_dir2_sf_get_ino(hdr, inop);
+}
+/* dir2 layout: write the inode number immediately after the name. */
+static void
+xfs_dir2_sfe_put_ino(
+        struct xfs_dir2_sf_hdr  *hdr,
+        struct xfs_dir2_sf_entry *sfep,
+        xfs_ino_t               ino)
+{
+        xfs_dir2_inou_t *inop;
+        inop = (xfs_dir2_inou_t *)&sfep->name[sfep->namelen];
+        xfs_dir2_sf_put_ino(hdr, inop, ino);
+}
+/* dir3 layout: the inode number starts one byte past the end of the name. */
+static xfs_ino_t
+xfs_dir3_sfe_get_ino(
+        struct xfs_dir2_sf_hdr  *hdr,
+        struct xfs_dir2_sf_entry *sfep)
+{
+        xfs_dir2_inou_t *inop;
+        inop = (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1];
+        return xfs_dir2_sf_get_ino(hdr, inop);
+}
+/* dir3 layout: write the inode number one byte past the end of the name. */
+static void
+xfs_dir3_sfe_put_ino(
+        struct xfs_dir2_sf_hdr  *hdr,
+        struct xfs_dir2_sf_entry *sfep,
+        xfs_ino_t               ino)
+{
+        xfs_dir2_inou_t *inop;
+        inop = (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1];
+        xfs_dir2_sf_put_ino(hdr, inop, ino);
+}
+/*
+ * Directory data block operations
+ */
+/*
+ * For special situations, the dirent size ends up fixed because we always know
+ * what the size of the entry is. That's true for the "." and "..", and
+ * therefore we know that they are a fixed size and hence their offsets are
+ * constant, as is the first entry.
+ *
+ * Hence, this calculation is written as a macro so that it can be evaluated at
+ * compile time and certain offsets can be calculated directly in the structure
+ * initialiser via the macro. There are two macros - one for dirents with ftype
+ * and one for dirents without - so there are no unresolvable conditionals in
+ * the calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a
+ * power of 2 and the compiler doesn't reject it (unlike roundup()).
+ */
+/*
+ * Entry size for an n-byte name without a file type: everything up to name[],
+ * the name, and one xfs_dir2_data_off_t, rounded up to XFS_DIR2_DATA_ALIGN.
+ */
+#define XFS_DIR2_DATA_ENTSIZE(n)                                        \
+        round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
+                 sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN)
+/* As above with one extra __uint8_t added before rounding. */
+#define XFS_DIR3_DATA_ENTSIZE(n)                                        \
+        round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
+                 sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)),      \
+                XFS_DIR2_DATA_ALIGN)
+/* Function form of XFS_DIR2_DATA_ENTSIZE() for an @n byte name. */
+static int
+xfs_dir2_data_entsize(
+        int                     n)
+{
+        int     size = XFS_DIR2_DATA_ENTSIZE(n);
+        return size;
+}
+/* Function form of XFS_DIR3_DATA_ENTSIZE() for an @n byte name. */
+static int
+xfs_dir3_data_entsize(
+        int                     n)
+{
+        int     size = XFS_DIR3_DATA_ENTSIZE(n);
+        return size;
+}
+/* Never looks at @dep: the answer is always XFS_DIR3_FT_UNKNOWN. */
+static __uint8_t
+xfs_dir2_data_get_ftype(
+        struct xfs_dir2_data_entry *dep)
+{
+        const __uint8_t ftype = XFS_DIR3_FT_UNKNOWN;
+        return ftype;
+}
+/*
+ * Counterpart of xfs_dir2_data_get_ftype(): nothing is written to @dep, the
+ * only effect is the ASSERT that @ftype is below XFS_DIR3_FT_MAX.
+ */
+static void
+xfs_dir2_data_put_ftype(
+        struct xfs_dir2_data_entry *dep,
+        __uint8_t               ftype)
+{
+        ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+/*
+ * Return the file type stored in the byte that follows the entry name.
+ *
+ * The value is read from the entry itself, so an out-of-range byte is
+ * reported as XFS_DIR3_FT_UNKNOWN instead of being asserted on; this matches
+ * xfs_dir3_sfe_get_ftype() and keeps debug builds from stopping on a bad
+ * byte that the code below already handles.
+ */
+static __uint8_t
+xfs_dir3_data_get_ftype(
+        struct xfs_dir2_data_entry *dep)
+{
+        __uint8_t       ftype = dep->name[dep->namelen];
+        if (ftype >= XFS_DIR3_FT_MAX)
+                return XFS_DIR3_FT_UNKNOWN;
+        return ftype;
+}
+/* Store @type in the byte that follows the (non-empty) entry name. */
+static void
+xfs_dir3_data_put_ftype(
+        struct xfs_dir2_data_entry *dep,
+        __uint8_t               type)
+{
+        int     pos;
+        ASSERT(type < XFS_DIR3_FT_MAX);
+        ASSERT(dep->namelen != 0);
+        pos = dep->namelen;
+        dep->name[pos] = type;
+}
+/*
+ * Pointer to an entry's tag word.
+ */
+/* The tag is the last __be16 of the dir2-sized entry. */
+static __be16 *
+xfs_dir2_data_entry_tag_p(
+        struct xfs_dir2_data_entry *dep)
+{
+        char    *p = (char *)dep;
+        p = p + xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16);
+        return (__be16 *)p;
+}
+/* The tag is the last __be16 of the dir3-sized entry. */
+static __be16 *
+xfs_dir3_data_entry_tag_p(
+        struct xfs_dir2_data_entry *dep)
+{
+        char    *p = (char *)dep;
+        p = p + xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16);
+        return (__be16 *)p;
+}
+/*
+ * location of . and .. in data space (always block 0)
+ */
+/* "." sits directly after the dir2 data header. */
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dot_entry_p(
+        struct xfs_dir2_data_hdr *hdr)
+{
+        char    *p = (char *)hdr;
+        p += sizeof(struct xfs_dir2_data_hdr);
+        return (struct xfs_dir2_data_entry *)p;
+}
+/* ".." follows the dir2 header and the one-character "." entry. */
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dotdot_entry_p(
+        struct xfs_dir2_data_hdr *hdr)
+{
+        char    *p = (char *)hdr;
+        p += sizeof(struct xfs_dir2_data_hdr);
+        p += XFS_DIR2_DATA_ENTSIZE(1);
+        return (struct xfs_dir2_data_entry *)p;
+}
+/* First entry after the dir2 header, "." (1 char) and ".." (2 chars). */
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_first_entry_p(
+        struct xfs_dir2_data_hdr *hdr)
+{
+        char    *p = (char *)hdr;
+        p += sizeof(struct xfs_dir2_data_hdr);
+        p += XFS_DIR2_DATA_ENTSIZE(1);
+        p += XFS_DIR2_DATA_ENTSIZE(2);
+        return (struct xfs_dir2_data_entry *)p;
+}
+/* ".." with the dir2 header size but dir3 (ftype) entry sizes. */
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_dotdot_entry_p(
+        struct xfs_dir2_data_hdr *hdr)
+{
+        char    *p = (char *)hdr;
+        p += sizeof(struct xfs_dir2_data_hdr);
+        p += XFS_DIR3_DATA_ENTSIZE(1);
+        return (struct xfs_dir2_data_entry *)p;
+}
+/* First entry with the dir2 header size but dir3 (ftype) entry sizes. */
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_first_entry_p(
+        struct xfs_dir2_data_hdr *hdr)
+{
+        char    *p = (char *)hdr;
+        p += sizeof(struct xfs_dir2_data_hdr);
+        p += XFS_DIR3_DATA_ENTSIZE(1);
+        p += XFS_DIR3_DATA_ENTSIZE(2);
+        return (struct xfs_dir2_data_entry *)p;
+}
+/* "." sits directly after the dir3 data header. */
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dot_entry_p(
+        struct xfs_dir2_data_hdr *hdr)
+{
+        char    *p = (char *)hdr;
+        p += sizeof(struct xfs_dir3_data_hdr);
+        return (struct xfs_dir2_data_entry *)p;
+}
+/* ".." follows the dir3 header and the one-character "." entry. */
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dotdot_entry_p(
+        struct xfs_dir2_data_hdr *hdr)
+{
+        char    *p = (char *)hdr;
+        p += sizeof(struct xfs_dir3_data_hdr);
+        p += XFS_DIR3_DATA_ENTSIZE(1);
+        return (struct xfs_dir2_data_entry *)p;
+}
+/* First entry after the dir3 header, "." (1 char) and ".." (2 chars). */
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_first_entry_p(
+        struct xfs_dir2_data_hdr *hdr)
+{
+        char    *p = (char *)hdr;
+        p += sizeof(struct xfs_dir3_data_hdr);
+        p += XFS_DIR3_DATA_ENTSIZE(1);
+        p += XFS_DIR3_DATA_ENTSIZE(2);
+        return (struct xfs_dir2_data_entry *)p;
+}
+/* The bestfree table of a dir2 data header. */
+static struct xfs_dir2_data_free *
+xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+        struct xfs_dir2_data_free *bf = hdr->bestfree;
+        return bf;
+}
+/* The best_free table, reached by viewing @hdr as a dir3 data header. */
+static struct xfs_dir2_data_free *
+xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+        struct xfs_dir3_data_hdr *hdr3 = (struct xfs_dir3_data_hdr *)hdr;
+        return hdr3->best_free;
+}
+/* Address just past the dir2 data header, viewed as a data entry. */
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+        char    *p = (char *)hdr;
+        p += sizeof(struct xfs_dir2_data_hdr);
+        return (struct xfs_dir2_data_entry *)p;
+}
+/* Address just past the dir2 data header, viewed as an unused entry. */
+static struct xfs_dir2_data_unused *
+xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+        char    *p = (char *)hdr;
+        p += sizeof(struct xfs_dir2_data_hdr);
+        return (struct xfs_dir2_data_unused *)p;
+}
+/* Address just past the dir3 data header, viewed as a data entry. */
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+        char    *p = (char *)hdr;
+        p += sizeof(struct xfs_dir3_data_hdr);
+        return (struct xfs_dir2_data_entry *)p;
+}
+/* Address just past the dir3 data header, viewed as an unused entry. */
+static struct xfs_dir2_data_unused *
+xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+        char    *p = (char *)hdr;
+        p += sizeof(struct xfs_dir3_data_hdr);
+        return (struct xfs_dir2_data_unused *)p;
+}
+/*
+ * Directory Leaf block operations
+ */
+/* Number of leaf entries that fit in a block after the dir2 leaf header. */
+static int
+xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo)
+{
+        size_t  avail = geo->blksize - sizeof(struct xfs_dir2_leaf_hdr);
+        return avail / (uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+/* The entry array of a dir2 leaf block. */
+static struct xfs_dir2_leaf_entry *
+xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+        struct xfs_dir2_leaf_entry *ents = lp->__ents;
+        return ents;
+}
+/* Number of leaf entries that fit in a block after the dir3 leaf header. */
+static int
+xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo)
+{
+        size_t  avail = geo->blksize - sizeof(struct xfs_dir3_leaf_hdr);
+        return avail / (uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+/* The entry array, reached by viewing @lp as a dir3 leaf block. */
+static struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+        struct xfs_dir3_leaf *lp3 = (struct xfs_dir3_leaf *)lp;
+        return lp3->__ents;
+}
+/*
+ * Convert a dir2 leaf header from its big-endian on-disk form in @from to the
+ * in-core header @to.  The magic must be one of the two dir2 leaf magics.
+ */
+static void
+xfs_dir2_leaf_hdr_from_disk(
+        struct xfs_dir3_icleaf_hdr      *to,
+        struct xfs_dir2_leaf            *from)
+{
+        /* block info first, then the leaf counters */
+        to->magic = be16_to_cpu(from->hdr.info.magic);
+        to->forw = be32_to_cpu(from->hdr.info.forw);
+        to->back = be32_to_cpu(from->hdr.info.back);
+        to->stale = be16_to_cpu(from->hdr.stale);
+        to->count = be16_to_cpu(from->hdr.count);
+        ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
+               to->magic == XFS_DIR2_LEAFN_MAGIC);
+}
+/*
+ * Write the in-core header @from into the dir2 leaf block @to in big-endian
+ * form.  The magic must be one of the two dir2 leaf magics.
+ */
+static void
+xfs_dir2_leaf_hdr_to_disk(
+        struct xfs_dir2_leaf            *to,
+        struct xfs_dir3_icleaf_hdr      *from)
+{
+        ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
+               from->magic == XFS_DIR2_LEAFN_MAGIC);
+        /* block info first, then the leaf counters */
+        to->hdr.info.magic = cpu_to_be16(from->magic);
+        to->hdr.info.forw = cpu_to_be32(from->forw);
+        to->hdr.info.back = cpu_to_be32(from->back);
+        to->hdr.stale = cpu_to_be16(from->stale);
+        to->hdr.count = cpu_to_be16(from->count);
+}
+/*
+ * Convert a dir3 leaf header to the in-core header @to.  @from is viewed as a
+ * struct xfs_dir3_leaf_hdr; the magic must be one of the two dir3 leaf magics.
+ */
+static void
+xfs_dir3_leaf_hdr_from_disk(
+        struct xfs_dir3_icleaf_hdr      *to,
+        struct xfs_dir2_leaf            *from)
+{
+        struct xfs_dir3_leaf_hdr *disk = (struct xfs_dir3_leaf_hdr *)from;
+        /* block info first, then the leaf counters */
+        to->magic = be16_to_cpu(disk->info.hdr.magic);
+        to->forw = be32_to_cpu(disk->info.hdr.forw);
+        to->back = be32_to_cpu(disk->info.hdr.back);
+        to->stale = be16_to_cpu(disk->stale);
+        to->count = be16_to_cpu(disk->count);
+        ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
+               to->magic == XFS_DIR3_LEAFN_MAGIC);
+}
+/*
+ * Write the in-core header @from into @to, viewed as a struct
+ * xfs_dir3_leaf_hdr, in big-endian form.  The magic must be one of the two
+ * dir3 leaf magics.
+ */
+static void
+xfs_dir3_leaf_hdr_to_disk(
+        struct xfs_dir2_leaf            *to,
+        struct xfs_dir3_icleaf_hdr      *from)
+{
+        struct xfs_dir3_leaf_hdr *disk = (struct xfs_dir3_leaf_hdr *)to;
+        ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
+               from->magic == XFS_DIR3_LEAFN_MAGIC);
+        /* block info first, then the leaf counters */
+        disk->info.hdr.magic = cpu_to_be16(from->magic);
+        disk->info.hdr.forw = cpu_to_be32(from->forw);
+        disk->info.hdr.back = cpu_to_be32(from->back);
+        disk->stale = cpu_to_be16(from->stale);
+        disk->count = cpu_to_be16(from->count);
+}
+/*
+ * Directory/Attribute Node block operations
+ */
+/* The entry array of a da node block. */
+static struct xfs_da_node_entry *
+xfs_da2_node_tree_p(struct xfs_da_intnode *dap)
+{
+        struct xfs_da_node_entry *btree = dap->__btree;
+        return btree;
+}
+/*
+ * Return the first btree entry of a v3 node block.  @dap is typed as a v2
+ * node, so view it as a v3 node to step over the larger header.
+ */
+static struct xfs_da_node_entry *
+xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
+{
+        struct xfs_da3_intnode  *node3 = (struct xfs_da3_intnode *)dap;
+        return &node3->__btree[0];
+}
+/*
+ * Decode the header of the v2 on-disk node block @from into the in-core
+ * header @to.
+ */
+static void
+xfs_da2_node_hdr_from_disk(
+        struct xfs_da3_icnode_hdr       *to,
+        struct xfs_da_intnode           *from)
+{
+        struct xfs_da_node_hdr  *dhdr = &from->hdr;
+        /* check the magic in its on-disk (big-endian) form before decoding */
+        ASSERT(dhdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+        to->magic = be16_to_cpu(dhdr->info.magic);
+        to->forw = be32_to_cpu(dhdr->info.forw);
+        to->back = be32_to_cpu(dhdr->info.back);
+        to->count = be16_to_cpu(dhdr->__count);
+        to->level = be16_to_cpu(dhdr->__level);
+}
+/*
+ * Encode the in-core node header @from into the header of the v2 on-disk
+ * node block @to.  Every field is stored big-endian.
+ */
+static void
+xfs_da2_node_hdr_to_disk(
+        struct xfs_da_intnode           *to,
+        struct xfs_da3_icnode_hdr       *from)
+{
+        struct xfs_da_node_hdr  *dhdr = &to->hdr;
+        ASSERT(from->magic == XFS_DA_NODE_MAGIC);
+        dhdr->info.magic = cpu_to_be16(from->magic);
+        dhdr->info.forw = cpu_to_be32(from->forw);
+        dhdr->info.back = cpu_to_be32(from->back);
+        dhdr->__count = cpu_to_be16(from->count);
+        dhdr->__level = cpu_to_be16(from->level);
+}
+/*
+ * Decode the header of a v3 on-disk node block into the in-core header @to.
+ * @from is typed as a v2 node; the fields are read through a v3 header view.
+ */
+static void
+xfs_da3_node_hdr_from_disk(
+        struct xfs_da3_icnode_hdr       *to,
+        struct xfs_da_intnode           *from)
+{
+        struct xfs_da3_node_hdr *dhdr = (struct xfs_da3_node_hdr *)from;
+        struct xfs_da_blkinfo   *info = &dhdr->info.hdr;
+        /* the magic is checked through the v2 view, in on-disk byte order */
+        ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+        to->magic = be16_to_cpu(info->magic);
+        to->forw = be32_to_cpu(info->forw);
+        to->back = be32_to_cpu(info->back);
+        to->count = be16_to_cpu(dhdr->__count);
+        to->level = be16_to_cpu(dhdr->__level);
+}
+/*
+ * Encode the in-core node header @from into a v3 on-disk node block.  @to is
+ * typed as a v2 node but is written through a v3 header view.  Only the
+ * links, magic, count and level fields are written.
+ */
+static void
+xfs_da3_node_hdr_to_disk(
+        struct xfs_da_intnode           *to,
+        struct xfs_da3_icnode_hdr       *from)
+{
+        struct xfs_da3_node_hdr *dhdr = (struct xfs_da3_node_hdr *)to;
+        struct xfs_da_blkinfo   *info = &dhdr->info.hdr;
+        ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
+        info->magic = cpu_to_be16(from->magic);
+        info->forw = cpu_to_be32(from->forw);
+        info->back = cpu_to_be32(from->back);
+        dhdr->__count = cpu_to_be16(from->count);
+        dhdr->__level = cpu_to_be16(from->level);
+}
+/*
+ * Directory free space block operations
+ */
+/*
+ * Number of bests entries that fit in a v2 free block: the geo->blksize bytes
+ * left over after the v2 free header, counted in xfs_dir2_data_off_t units.
+ */
+static int
+xfs_dir2_free_max_bests(struct xfs_da_geometry *geo)
+{
+        int     max_bests;
+        max_bests = (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) /
+                        sizeof(xfs_dir2_data_off_t);
+        return max_bests;
+}
+/* Return the bests array, which starts right after the v2 free header. */
+static __be16 *
+xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
+{
+        char    *base = (char *)free;
+        return (__be16 *)(base + sizeof(struct xfs_dir2_free_hdr));
+}
+/*
+ * Map data block number @db to the free block that covers it (v2 layout):
+ * the block number of XFS_DIR2_FREE_OFFSET plus one block per
+ * xfs_dir2_free_max_bests() data blocks.
+ */
+static xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+        xfs_dir2_db_t   first_fdb = xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET);
+        xfs_dir2_db_t   nth_fdb = db / xfs_dir2_free_max_bests(geo);
+        return first_fdb + nth_fdb;
+}
+/*
+ * Map data block number @db to its index within the covering free block
+ * (v2 layout).
+ */
+static int
+xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+        int     max_bests = xfs_dir2_free_max_bests(geo);
+        return db % max_bests;
+}
+/*
+ * Number of bests entries that fit in a v3 free block: the geo->blksize bytes
+ * left over after the v3 free header, counted in xfs_dir2_data_off_t units.
+ */
+static int
+xfs_dir3_free_max_bests(struct xfs_da_geometry *geo)
+{
+        int     max_bests;
+        max_bests = (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) /
+                        sizeof(xfs_dir2_data_off_t);
+        return max_bests;
+}
+/* Return the bests array, which starts right after the v3 free header. */
+static __be16 *
+xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
+{
+        char    *base = (char *)free;
+        return (__be16 *)(base + sizeof(struct xfs_dir3_free_hdr));
+}
+/*
+ * Map data block number @db to the free block that covers it (v3 layout):
+ * the block number of XFS_DIR2_FREE_OFFSET plus one block per
+ * xfs_dir3_free_max_bests() data blocks.
+ */
+static xfs_dir2_db_t
+xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+        xfs_dir2_db_t   first_fdb = xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET);
+        xfs_dir2_db_t   nth_fdb = db / xfs_dir3_free_max_bests(geo);
+        return first_fdb + nth_fdb;
+}
+/*
+ * Map data block number @db to its index within the covering free block
+ * (v3 layout).
+ */
+static int
+xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+        int     max_bests = xfs_dir3_free_max_bests(geo);
+        return db % max_bests;
+}
+/*
+ * Decode the header of the v2 on-disk free block @from into the in-core
+ * header @to.  All four fields are 32-bit big-endian on disk.
+ */
+static void
+xfs_dir2_free_hdr_from_disk(
+        struct xfs_dir3_icfree_hdr      *to,
+        struct xfs_dir2_free            *from)
+{
+        to->nused = be32_to_cpu(from->hdr.nused);
+        to->nvalid = be32_to_cpu(from->hdr.nvalid);
+        to->firstdb = be32_to_cpu(from->hdr.firstdb);
+        to->magic = be32_to_cpu(from->hdr.magic);
+        /* what we just decoded must be a v2 free block */
+        ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
+}
+/*
+ * Encode the in-core free header @from into the header of the v2 on-disk
+ * free block @to.  All four fields are stored as 32-bit big-endian.
+ */
+static void
+xfs_dir2_free_hdr_to_disk(
+        struct xfs_dir2_free            *to,
+        struct xfs_dir3_icfree_hdr      *from)
+{
+        ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
+        to->hdr.nused = cpu_to_be32(from->nused);
+        to->hdr.nvalid = cpu_to_be32(from->nvalid);
+        to->hdr.firstdb = cpu_to_be32(from->firstdb);
+        to->hdr.magic = cpu_to_be32(from->magic);
+}
+/*
+ * Decode the header of a v3 on-disk free block into the in-core header @to.
+ * @from is typed as a v2 free block; the fields are read through a v3 free
+ * header view.
+ */
+static void
+xfs_dir3_free_hdr_from_disk(
+        struct xfs_dir3_icfree_hdr      *to,
+        struct xfs_dir2_free            *from)
+{
+        struct xfs_dir3_free_hdr        *dhdr = (struct xfs_dir3_free_hdr *)from;
+        to->nused = be32_to_cpu(dhdr->nused);
+        to->nvalid = be32_to_cpu(dhdr->nvalid);
+        to->firstdb = be32_to_cpu(dhdr->firstdb);
+        to->magic = be32_to_cpu(dhdr->hdr.magic);
+        /* what we just decoded must be a v3 free block */
+        ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
+}
+/*
+ * Encode the in-core free header @from into a v3 on-disk free block.  @to is
+ * typed as a v2 free block but is written through a v3 free header view.
+ * Only magic, firstdb, nvalid and nused are written.
+ */
+static void
+xfs_dir3_free_hdr_to_disk(
+        struct xfs_dir2_free            *to,
+        struct xfs_dir3_icfree_hdr      *from)
+{
+        struct xfs_dir3_free_hdr        *dhdr = (struct xfs_dir3_free_hdr *)to;
+        ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
+        dhdr->nused = cpu_to_be32(from->nused);
+        dhdr->nvalid = cpu_to_be32(from->nvalid);
+        dhdr->firstdb = cpu_to_be32(from->firstdb);
+        dhdr->hdr.magic = cpu_to_be32(from->magic);
+}
+static const struct xfs_dir_ops xfs_dir2_ops = {        /* fallback in xfs_dir_get_ops(): neither hascrc nor hasftype */
+        .sf_entsize = xfs_dir2_sf_entsize,              /* shortform (sf_*) operations */
+        .sf_nextentry = xfs_dir2_sf_nextentry,
+        .sf_get_ftype = xfs_dir2_sfe_get_ftype,
+        .sf_put_ftype = xfs_dir2_sfe_put_ftype,
+        .sf_get_ino = xfs_dir2_sfe_get_ino,
+        .sf_put_ino = xfs_dir2_sfe_put_ino,
+        .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+        .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+        .data_entsize = xfs_dir2_data_entsize,          /* data block (data_*) operations */
+        .data_get_ftype = xfs_dir2_data_get_ftype,
+        .data_put_ftype = xfs_dir2_data_put_ftype,
+        .data_entry_tag_p = xfs_dir2_data_entry_tag_p,
+        .data_bestfree_p = xfs_dir2_data_bestfree_p,
+        .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+        .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+                                XFS_DIR2_DATA_ENTSIZE(1),
+        .data_first_offset =  sizeof(struct xfs_dir2_data_hdr) +
+                                XFS_DIR2_DATA_ENTSIZE(1) +
+                                XFS_DIR2_DATA_ENTSIZE(2),
+        .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
+        .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+        .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
+        .data_first_entry_p = xfs_dir2_data_first_entry_p,
+        .data_entry_p = xfs_dir2_data_entry_p,
+        .data_unused_p = xfs_dir2_data_unused_p,
+        .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),      /* leaf block (leaf_*) operations */
+        .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+        .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
+        .leaf_max_ents = xfs_dir2_max_leaf_ents,
+        .leaf_ents_p = xfs_dir2_leaf_ents_p,
+        .node_hdr_size = sizeof(struct xfs_da_node_hdr),        /* da node block (node_*) operations */
+        .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+        .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+        .node_tree_p = xfs_da2_node_tree_p,
+        .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),      /* free space block (free_*) operations */
+        .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+        .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+        .free_max_bests = xfs_dir2_free_max_bests,
+        .free_bests_p = xfs_dir2_free_bests_p,
+        .db_to_fdb = xfs_dir2_db_to_fdb,
+        .db_to_fdindex = xfs_dir2_db_to_fdindex,
+};
+static const struct xfs_dir_ops xfs_dir2_ftype_ops = {  /* chosen by xfs_dir_get_ops(): hasftype without hascrc */
+        .sf_entsize = xfs_dir3_sf_entsize,              /* shortform entries use the dir3 helpers */
+        .sf_nextentry = xfs_dir3_sf_nextentry,
+        .sf_get_ftype = xfs_dir3_sfe_get_ftype,
+        .sf_put_ftype = xfs_dir3_sfe_put_ftype,
+        .sf_get_ino = xfs_dir3_sfe_get_ino,
+        .sf_put_ino = xfs_dir3_sfe_put_ino,
+        .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+        .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+        .data_entsize = xfs_dir3_data_entsize,          /* data entries use the dir3 helpers ... */
+        .data_get_ftype = xfs_dir3_data_get_ftype,
+        .data_put_ftype = xfs_dir3_data_put_ftype,
+        .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+        .data_bestfree_p = xfs_dir2_data_bestfree_p,
+        .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),    /* ... behind the dir2 data header */
+        .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+                                XFS_DIR3_DATA_ENTSIZE(1),
+        .data_first_offset =  sizeof(struct xfs_dir2_data_hdr) +
+                                XFS_DIR3_DATA_ENTSIZE(1) +
+                                XFS_DIR3_DATA_ENTSIZE(2),
+        .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
+        .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+        .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p,
+        .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p,
+        .data_entry_p = xfs_dir2_data_entry_p,
+        .data_unused_p = xfs_dir2_data_unused_p,
+        .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),      /* leaf, node and free ops match xfs_dir2_ops */
+        .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+        .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
+        .leaf_max_ents = xfs_dir2_max_leaf_ents,
+        .leaf_ents_p = xfs_dir2_leaf_ents_p,
+        .node_hdr_size = sizeof(struct xfs_da_node_hdr),
+        .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+        .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+        .node_tree_p = xfs_da2_node_tree_p,
+        .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
+        .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+        .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+        .free_max_bests = xfs_dir2_free_max_bests,
+        .free_bests_p = xfs_dir2_free_bests_p,
+        .db_to_fdb = xfs_dir2_db_to_fdb,
+        .db_to_fdindex = xfs_dir2_db_to_fdindex,
+};
+static const struct xfs_dir_ops xfs_dir3_ops = {        /* chosen by xfs_dir_get_ops() when hascrc is set */
+        .sf_entsize = xfs_dir3_sf_entsize,              /* shortform (sf_*) operations */
+        .sf_nextentry = xfs_dir3_sf_nextentry,
+        .sf_get_ftype = xfs_dir3_sfe_get_ftype,
+        .sf_put_ftype = xfs_dir3_sfe_put_ftype,
+        .sf_get_ino = xfs_dir3_sfe_get_ino,
+        .sf_put_ino = xfs_dir3_sfe_put_ino,
+        .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+        .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+        .data_entsize = xfs_dir3_data_entsize,          /* data block (data_*) operations */
+        .data_get_ftype = xfs_dir3_data_get_ftype,
+        .data_put_ftype = xfs_dir3_data_put_ftype,
+        .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+        .data_bestfree_p = xfs_dir3_data_bestfree_p,
+        .data_dot_offset = sizeof(struct xfs_dir3_data_hdr),    /* offsets follow the larger v3 data header */
+        .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) +
+                                XFS_DIR3_DATA_ENTSIZE(1),
+        .data_first_offset =  sizeof(struct xfs_dir3_data_hdr) +
+                                XFS_DIR3_DATA_ENTSIZE(1) +
+                                XFS_DIR3_DATA_ENTSIZE(2),
+        .data_entry_offset = sizeof(struct xfs_dir3_data_hdr),
+        .data_dot_entry_p = xfs_dir3_data_dot_entry_p,
+        .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
+        .data_first_entry_p = xfs_dir3_data_first_entry_p,
+        .data_entry_p = xfs_dir3_data_entry_p,
+        .data_unused_p = xfs_dir3_data_unused_p,
+        .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr),      /* leaf block (leaf_*) operations */
+        .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk,
+        .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk,
+        .leaf_max_ents = xfs_dir3_max_leaf_ents,
+        .leaf_ents_p = xfs_dir3_leaf_ents_p,
+        .node_hdr_size = sizeof(struct xfs_da3_node_hdr),       /* da node block (node_*) operations */
+        .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+        .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
+        .node_tree_p = xfs_da3_node_tree_p,
+        .free_hdr_size = sizeof(struct xfs_dir3_free_hdr),      /* free space block (free_*) operations */
+        .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
+        .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
+        .free_max_bests = xfs_dir3_free_max_bests,
+        .free_bests_p = xfs_dir3_free_bests_p,
+        .db_to_fdb = xfs_dir3_db_to_fdb,
+        .db_to_fdindex = xfs_dir3_db_to_fdindex,
+};
+static const struct xfs_dir_ops xfs_dir2_nondir_ops = { /* fallback in xfs_nondir_get_ops(); only node_* members are set */
+        .node_hdr_size = sizeof(struct xfs_da_node_hdr),
+        .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+        .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+        .node_tree_p = xfs_da2_node_tree_p,
+};
+static const struct xfs_dir_ops xfs_dir3_nondir_ops = { /* chosen by xfs_nondir_get_ops() when hascrc; only node_* members are set */
+        .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
+        .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+        .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
+        .node_tree_p = xfs_da3_node_tree_p,
+};
+/*
+ * Pick the directory ops vector to use.  An inode, when supplied, always
+ * wins and its own d_ops is returned.  Otherwise the ops pointer stored in
+ * the mount (m_dir_inode_ops) is used if it is set, and failing that the
+ * choice is made from the superblock feature bits: CRC first, then ftype,
+ * then plain v2.
+ */
+const struct xfs_dir_ops *
+xfs_dir_get_ops(
+        struct xfs_mount        *mp,
+        struct xfs_inode        *dp)
+{
+        const struct xfs_dir_ops        *ops;
+        if (dp)
+                ops = dp->d_ops;
+        else if (mp->m_dir_inode_ops)
+                ops = mp->m_dir_inode_ops;
+        else if (xfs_sb_version_hascrc(&mp->m_sb))
+                ops = &xfs_dir3_ops;
+        else if (xfs_sb_version_hasftype(&mp->m_sb))
+                ops = &xfs_dir2_ftype_ops;
+        else
+                ops = &xfs_dir2_ops;
+        return ops;
+}
+/*
+ * Pick the ops vector for non-directory users.  An inode, when supplied,
+ * always wins and its own d_ops is returned.  Otherwise the ops pointer
+ * stored in the mount (m_nondir_inode_ops) is used if it is set, and failing
+ * that the CRC feature bit selects between the v3 and v2 node ops.
+ */
+const struct xfs_dir_ops *
+xfs_nondir_get_ops(
+        struct xfs_mount        *mp,
+        struct xfs_inode        *dp)
+{
+        const struct xfs_dir_ops        *ops;
+        if (dp)
+                ops = dp->d_ops;
+        else if (mp->m_nondir_inode_ops)
+                ops = mp->m_nondir_inode_ops;
+        else if (xfs_sb_version_hascrc(&mp->m_sb))
+                ops = &xfs_dir3_nondir_ops;
+        else
+                ops = &xfs_dir2_nondir_ops;
+        return ops;
+}
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
new file mode 100644
index 000000000000..0a49b0286372
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -0,0 +1,861 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DA_FORMAT_H__
+#define __XFS_DA_FORMAT_H__
+/*
+ * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
+ *
+ * It is used to manage a doubly linked list of all blocks at the same
+ * level in the Btree, and to identify which type of block this is.
+ */
+#define XFS_DA_NODE_MAGIC       0xfebe  /* magic number: non-leaf blocks */
+#define XFS_ATTR_LEAF_MAGIC     0xfbee  /* magic number: attribute leaf blks */
+#define XFS_DIR2_LEAF1_MAGIC    0xd2f1  /* magic number: v2 dirlf single blks */
+#define XFS_DIR2_LEAFN_MAGIC    0xd2ff  /* magic number: v2 dirlf multi blks */
+typedef struct xfs_da_blkinfo {
+        __be32          forw;                   /* forward link: following block in list */
+        __be32          back;                   /* backward link: previous block in list */
+        __be16          magic;                  /* validity check on block */
+        __be16          pad;                    /* unused */
+} xfs_da_blkinfo_t;
+/*
+ * CRC enabled directory structure types
+ *
+ * The headers change size for the additional verification information, but
+ * otherwise the tree layouts and contents are unchanged. Hence the da btree
+ * code can use the struct xfs_da_blkinfo for manipulating the tree links and
+ * magic numbers without modification for both v2 and v3 nodes.
+ */
+#define XFS_DA3_NODE_MAGIC      0x3ebe  /* magic number: non-leaf blocks */
+#define XFS_ATTR3_LEAF_MAGIC    0x3bee  /* magic number: attribute leaf blks */
+#define XFS_DIR3_LEAF1_MAGIC    0x3df1  /* magic number: v3 dirlf single blks */
+#define XFS_DIR3_LEAFN_MAGIC    0x3dff  /* magic number: v3 dirlf multi blks */
+struct xfs_da3_blkinfo {
+        /*
+         * the node link manipulation code relies on the fact that the first
+         * element of this structure is the struct xfs_da_blkinfo so it can
+         * ignore the differences in the rest of the structures.
+         */
+        struct xfs_da_blkinfo   hdr;
+        __be32                  crc;    /* CRC of block */
+        __be64                  blkno;  /* first block of the buffer */
+        __be64                  lsn;    /* sequence number of last write */
+        uuid_t                  uuid;   /* filesystem we belong to */
+        __be64                  owner;  /* inode that owns the block */
+};
+/*
+ * This is the structure of the root and intermediate nodes in the Btree.
+ * The leaf nodes are defined above.
+ *
+ * Entries are not packed.
+ *
+ * Since we have duplicate keys, use a binary search but always follow
+ * all matches in the block, not just the first match found.
+ */
+#define XFS_DA_NODE_MAXDEPTH    5       /* max depth of Btree */
+typedef struct xfs_da_node_hdr {
+        struct xfs_da_blkinfo   info;   /* block type, links, etc. */
+        __be16                  __count; /* count of active entries */
+        __be16                  __level; /* level above leaves (leaf == 0) */
+} xfs_da_node_hdr_t;
+struct xfs_da3_node_hdr {
+        struct xfs_da3_blkinfo  info;   /* block type, links, etc. */
+        __be16                  __count; /* count of active entries */
+        __be16                  __level; /* level above leaves (leaf == 0) */
+        __be32                  __pad32;
+};
+#define XFS_DA3_NODE_CRC_OFF    (offsetof(struct xfs_da3_node_hdr, info.crc))
+typedef struct xfs_da_node_entry {
+        __be32  hashval;        /* hash value for this descendant */
+        __be32  before;         /* Btree block before this key */
+} xfs_da_node_entry_t;
+typedef struct xfs_da_intnode {
+        struct xfs_da_node_hdr  hdr;            /* v2 node header */
+        struct xfs_da_node_entry __btree[];     /* entries; returned by xfs_da2_node_tree_p() */
+} xfs_da_intnode_t;
+struct xfs_da3_intnode {
+        struct xfs_da3_node_hdr hdr;            /* v3 node header */
+        struct xfs_da_node_entry __btree[];     /* entries; returned by xfs_da3_node_tree_p() */
+};
+/*
+ * In-core version of the node header to abstract the differences in the v2 and
+ * v3 disk format of the headers. Callers need to convert to/from disk format as
+ * appropriate.
+ */
+struct xfs_da3_icnode_hdr {
+        __uint32_t      forw;           /* blkinfo forw link, CPU byte order */
+        __uint32_t      back;           /* blkinfo back link, CPU byte order */
+        __uint16_t      magic;          /* XFS_DA_NODE_MAGIC or XFS_DA3_NODE_MAGIC */
+        __uint16_t      count;          /* count of active entries */
+        __uint16_t      level;          /* level above leaves (leaf == 0) */
+};
+/*
+ * Directory version 2.
+ *
+ * There are 4 possible formats:
+ *  - shortform - embedded into the inode
+ *  - single block - data with embedded leaf at the end
+ *  - multiple data blocks, single leaf+freeindex block
+ *  - data blocks, node and leaf blocks (btree), freeindex blocks
+ *
+ * Note: many node block structures and constants are shared with the attr
+ * code and defined in xfs_da_btree.h.
+ */
+#define XFS_DIR2_BLOCK_MAGIC    0x58443242      /* XD2B: single block dirs */
+#define XFS_DIR2_DATA_MAGIC     0x58443244      /* XD2D: multiblock dirs */
+#define XFS_DIR2_FREE_MAGIC     0x58443246      /* XD2F: free index blocks */
+/*
+ * Directory Version 3 With CRCs.
+ *
+ * The tree formats are the same as for version 2 directories.  The difference
+ * is in the block header and dirent formats. In many cases the v3 structures
+ * use v2 definitions as they are no different and this makes code sharing much
+ * easier.
+ *
+ * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the
+ * format is v2 then they switch to the existing v2 code, or the format is v3
+ * they implement the v3 functionality. This means the existing dir2 is a mix of
+ * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called
+ * where there is a difference in the formats, otherwise the code is unchanged.
+ *
+ * Where it is possible, the code decides what to do based on the magic numbers
+ * in the blocks rather than feature bits in the superblock. This means the code
+ * is as independent of the external XFS code as possible, as it doesn't require
+ * passing struct xfs_mount pointers into places where it isn't really
+ * necessary.
+ *
+ * Version 3 includes:
+ *
+ *      - a larger block header for CRC and identification purposes and so the
+ *      offsets of all the structures inside the blocks are different.
+ *
+ *      - new magic numbers to be able to detect the v2/v3 types on the fly.
+ */
+#define XFS_DIR3_BLOCK_MAGIC    0x58444233      /* XDB3: single block dirs */
+#define XFS_DIR3_DATA_MAGIC     0x58444433      /* XDD3: multiblock dirs */
+#define XFS_DIR3_FREE_MAGIC     0x58444633      /* XDF3: free index blocks */
+/*
+ * Dirents in version 3 directories have a file type field. Additions to this
+ * list are an on-disk format change, requiring feature bits. Valid values
+ * are as follows:
+ */
+#define XFS_DIR3_FT_UNKNOWN             0
+#define XFS_DIR3_FT_REG_FILE            1
+#define XFS_DIR3_FT_DIR                 2
+#define XFS_DIR3_FT_CHRDEV              3
+#define XFS_DIR3_FT_BLKDEV              4
+#define XFS_DIR3_FT_FIFO                5
+#define XFS_DIR3_FT_SOCK                6
+#define XFS_DIR3_FT_SYMLINK             7
+#define XFS_DIR3_FT_WHT                 8
+#define XFS_DIR3_FT_MAX                 9
+/*
+ * Byte offset in data block and shortform entry.
+ */
+typedef __uint16_t      xfs_dir2_data_off_t;
+#define NULLDATAOFF     0xffffU
+typedef uint            xfs_dir2_data_aoff_t;   /* argument form */
+/*
+ * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
+ * Only need 16 bits, this is the byte offset into the single block form.
+ */
+typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
+/*
+ * Offset in data space of a data entry.
+ */
+typedef __uint32_t      xfs_dir2_dataptr_t;
+#define XFS_DIR2_MAX_DATAPTR    ((xfs_dir2_dataptr_t)0xffffffff)
+#define XFS_DIR2_NULL_DATAPTR   ((xfs_dir2_dataptr_t)0)
+/*
+ * Byte offset in a directory.
+ */
+typedef xfs_off_t       xfs_dir2_off_t;
+/*
+ * Directory block number (logical dirblk in file)
+ */
+typedef __uint32_t      xfs_dir2_db_t;
+/*
+ * Inode number stored as 8 8-bit values.
+ */
+typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
+/*
+ * Inode number stored as 4 8-bit values.
+ * Works a lot of the time, when all the inode numbers in a directory
+ * fit in 32 bits.
+ */
+typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
+typedef union {
+        xfs_dir2_ino8_t i8;
+        xfs_dir2_ino4_t i4;
+} xfs_dir2_inou_t;
+#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
+/*
+ * Directory layout when stored internal to an inode.
+ *
+ * Small directories are packed as tightly as possible so as to fit into the
+ * literal area of the inode.  These "shortform" directories consist of a
+ * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry
+ * structures.  Due to the different inode number storage size and the variable
+ * length name field in the xfs_dir2_sf_entry all these structures are
+ * variable length, and the accessors in this file should be used to iterate
+ * over them.
+ */
+typedef struct xfs_dir2_sf_hdr {
+        __uint8_t               count;          /* count of entries */
+        __uint8_t               i8count;        /* count of 8-byte inode #s */
+        xfs_dir2_inou_t         parent;         /* parent dir inode number */
+} __arch_pack xfs_dir2_sf_hdr_t;
+typedef struct xfs_dir2_sf_entry {
+        __u8                    namelen;        /* actual name length */
+        xfs_dir2_sf_off_t       offset;         /* saved offset */
+        __u8                    name[];         /* name, variable size */
+        /*
+         * A single byte containing the file type field follows the inode
+         * number for version 3 directory entries.
+         *
+         * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
+         * variable offset after the name.
+         */
+} __arch_pack xfs_dir2_sf_entry_t;
+/*
+ * Size in bytes of a shortform directory header.  The header is declared
+ * with room for an 8-byte parent inode number; when @i8count is zero the
+ * size is reduced by the difference between the 8-byte and 4-byte forms.
+ */
+static inline int xfs_dir2_sf_hdr_size(int i8count)
+{
+        int     size = sizeof(struct xfs_dir2_sf_hdr);
+        if (i8count == 0)
+                size -= sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t);
+        return size;
+}
+/*
+ * Read the saved offset of a shortform entry.  It is stored as two bytes in
+ * big-endian order, so it is fetched with the unaligned accessor.
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
+{
+        xfs_dir2_data_aoff_t    off = get_unaligned_be16(&sfep->offset.i);
+        return off;
+}
+/*
+ * Store @off as the saved offset of a shortform entry, as two bytes in
+ * big-endian order written with the unaligned accessor.
+ */
+static inline void
+xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
+{
+        xfs_dir2_sf_off_t       *dst = &sfep->offset;
+        put_unaligned_be16(off, &dst->i);
+}
+/*
+ * Return the first shortform entry, which starts immediately after the
+ * header; the header size depends on hdr->i8count.
+ */
+static inline struct xfs_dir2_sf_entry *
+xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
+{
+        char    *base = (char *)hdr;
+        int     hdrsize = xfs_dir2_sf_hdr_size(hdr->i8count);
+        return (struct xfs_dir2_sf_entry *)(base + hdrsize);
+}
+/*
+ * Data block structures.
+ *
+ * A pure data block looks like the following drawing on disk:
+ *
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_hdr_t                             |
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | ...                                             |
+ *    +-------------------------------------------------+
+ *    | unused space                                    |
+ *    +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ *
+ * In addition to the pure data blocks for the data and node formats,
+ * most structures are also used for the combined data/freespace "block"
+ * format below.
+ */
+#define XFS_DIR2_DATA_ALIGN_LOG 3               /* i.e., 8 bytes */
+#define XFS_DIR2_DATA_ALIGN     (1 << XFS_DIR2_DATA_ALIGN_LOG)
+#define XFS_DIR2_DATA_FREE_TAG  0xffff
+#define XFS_DIR2_DATA_FD_COUNT  3
+/*
+ * Directory address space divided into sections,
+ * spaces separated by 32GB.
+ */
+#define XFS_DIR2_SPACE_SIZE     (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
+#define XFS_DIR2_DATA_SPACE     0
+#define XFS_DIR2_DATA_OFFSET    (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
+/*
+ * Describe a free area in the data block.
+ *
+ * The freespace will be formatted as a xfs_dir2_data_unused_t.
+ */
+typedef struct xfs_dir2_data_free {
+        __be16                  offset;         /* start of freespace */
+        __be16                  length;         /* length of freespace */
+} xfs_dir2_data_free_t;
+/*
+ * Header for the data blocks.
+ *
+ * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
+ */
+typedef struct xfs_dir2_data_hdr {
+        __be32                  magic;          /* XFS_DIR2_DATA_MAGIC or */
+                                                /* XFS_DIR2_BLOCK_MAGIC */
+        xfs_dir2_data_free_t    bestfree[XFS_DIR2_DATA_FD_COUNT];
+} xfs_dir2_data_hdr_t;
+/*
+ * define a structure for all the verification fields we are adding to the
+ * directory block structures. This will be used in several structures.
+ * The magic number must be the first entry to align with all the dir2
+ * structures so we determine how to decode them just by the magic number.
+ */
+struct xfs_dir3_blk_hdr {
+        __be32                  magic;  /* magic number */
+        __be32                  crc;    /* CRC of block */
+        __be64                  blkno;  /* first block of the buffer */
+        __be64                  lsn;    /* sequence number of last write */
+        uuid_t                  uuid;   /* filesystem we belong to */
+        __be64                  owner;  /* inode that owns the block */
+};
+struct xfs_dir3_data_hdr {
+        struct xfs_dir3_blk_hdr hdr;
+        xfs_dir2_data_free_t    best_free[XFS_DIR2_DATA_FD_COUNT];
+        __be32                  pad;    /* 64 bit alignment */
+};
+#define XFS_DIR3_DATA_CRC_OFF  offsetof(struct xfs_dir3_data_hdr, hdr.crc)
+/*
+ * Active entry in a data block.
+ *
+ * Aligned to 8 bytes.  After the variable length name field there is a
+ * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
+ *
+ * For dir3 structures, there is a file type field between the name and the tag.
+ * This can only be manipulated by helper functions. It is packed hard against
+ * the end of the name so any padding for rounding is between the file type and
+ * the tag.
+ */
+typedef struct xfs_dir2_data_entry {
+        __be64                  inumber;        /* inode number */
+        __u8                    namelen;        /* name length */
+        __u8                    name[];         /* name bytes, no null */
+     /* __u8                    filetype; */    /* type of inode we point to */
+     /* __be16                  tag; */         /* starting offset of us */
+} xfs_dir2_data_entry_t;
+/*
+ * Unused entry in a data block.
+ *
+ * Aligned to 8 bytes.  Tag appears as the last 2 bytes and must be accessed
+ * using xfs_dir2_data_unused_tag_p.
+ */
+typedef struct xfs_dir2_data_unused {
+        __be16                  freetag;        /* XFS_DIR2_DATA_FREE_TAG */
+        __be16                  length;         /* total free length */
+                                                /* variable offset */
+        __be16                  tag;            /* starting offset of us */
+} xfs_dir2_data_unused_t;
+/*
+ * Return a pointer to the tag word of a free-space entry: the last two bytes
+ * of the region, located from the entry's on-disk length field.
+ */
+static inline __be16 *
+xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
+{
+        char    *end = (char *)dup + be16_to_cpu(dup->length);
+        return (__be16 *)(end - sizeof(__be16));
+}
+/*
+ * Leaf block structures.
+ *
+ * A pure leaf block looks like the following drawing on disk:
+ *
+ *    +---------------------------+
+ *    | xfs_dir2_leaf_hdr_t       |
+ *    +---------------------------+
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | xfs_dir2_leaf_entry_t     |
+ *    | ...                       |
+ *    +---------------------------+
+ *    | xfs_dir2_data_off_t       |
+ *    | xfs_dir2_data_off_t       |
+ *    | xfs_dir2_data_off_t       |
+ *    | ...                       |
+ *    +---------------------------+
+ *    | xfs_dir2_leaf_tail_t      |
+ *    +---------------------------+
+ *
+ * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block
+ * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present
+ * for directories with separate leaf nodes and free space blocks
+ * (magic = XFS_DIR2_LEAFN_MAGIC).
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+/*
+ * Offset of the leaf/node space.  First block in this space
+ * is the btree root.
+ */
+#define XFS_DIR2_LEAF_SPACE     1
+#define XFS_DIR2_LEAF_OFFSET    (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
+/*
+ * Leaf block header.
+ */
+typedef struct xfs_dir2_leaf_hdr {
+        xfs_da_blkinfo_t        info;           /* header for da routines */
+        __be16                  count;          /* count of entries */
+        __be16                  stale;          /* count of stale entries */
+} xfs_dir2_leaf_hdr_t;
+struct xfs_dir3_leaf_hdr {
+        struct xfs_da3_blkinfo  info;           /* header for da routines */
+        __be16                  count;          /* count of entries */
+        __be16                  stale;          /* count of stale entries */
+        __be32                  pad;            /* 64 bit alignment */
+};
+/*
+ * Leaf header held in native-endian integer types instead of the __be types
+ * used by xfs_dir2_leaf_hdr and xfs_dir3_leaf_hdr.
+ * NOTE(review): presumably the in-core, format-neutral form of those two
+ * on-disk headers, with forw/back/magic taken from their info member and
+ * count/stale carrying the same meaning as on disk - confirm.
+ */
+struct xfs_dir3_icleaf_hdr {
+        __uint32_t              forw;
+        __uint32_t              back;
+        __uint16_t              magic;
+        __uint16_t              count;
+        __uint16_t              stale;
+};
+/*
+ * Leaf block entry.
+ */
+typedef struct xfs_dir2_leaf_entry {
+        __be32                  hashval;        /* hash value of name */
+        __be32                  address;        /* address of data entry */
+} xfs_dir2_leaf_entry_t;
+/*
+ * Leaf block tail.
+ */
+typedef struct xfs_dir2_leaf_tail {
+        __be32                  bestcount;      /* # of __be16 bests before tail */
+} xfs_dir2_leaf_tail_t;
+/*
+ * Leaf block.
+ */
+typedef struct xfs_dir2_leaf {
+        xfs_dir2_leaf_hdr_t     hdr;                    /* leaf header */
+        xfs_dir2_leaf_entry_t   __ents[];               /* entries */
+} xfs_dir2_leaf_t;
+struct xfs_dir3_leaf {
+        struct xfs_dir3_leaf_hdr        hdr;            /* leaf header */
+        struct xfs_dir2_leaf_entry      __ents[];       /* entries */
+};
+#define XFS_DIR3_LEAF_CRC_OFF  offsetof(struct xfs_dir3_leaf_hdr, info.crc)
+/*
+ * Find the start of the bests array in a single-leaf block.  The array ends
+ * where the tail begins, so back up bestcount __be16 slots from the tail.
+ */
+static inline __be16 *
+xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
+{
+        __be16  *bests_end = (__be16 *)ltp;
+        return bests_end - be32_to_cpu(ltp->bestcount);
+}
+/*
+ * Free space block definitions for the node format.
+ */
+/*
+ * Offset of the freespace index.
+ */
+#define XFS_DIR2_FREE_SPACE     2
+#define XFS_DIR2_FREE_OFFSET    (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
+typedef struct xfs_dir2_free_hdr {
+        __be32                  magic;          /* XFS_DIR2_FREE_MAGIC */
+        __be32                  firstdb;        /* db of first entry */
+        __be32                  nvalid;         /* count of valid entries */
+        __be32                  nused;          /* count of used entries */
+} xfs_dir2_free_hdr_t;
+typedef struct xfs_dir2_free {
+        xfs_dir2_free_hdr_t     hdr;            /* block header */
+        __be16                  bests[];        /* best free counts */
+                                                /* unused entries are -1 */
+} xfs_dir2_free_t;
+struct xfs_dir3_free_hdr {
+        struct xfs_dir3_blk_hdr hdr;            /* v3 block header, holds crc */
+        __be32                  firstdb;        /* db of first entry */
+        __be32                  nvalid;         /* count of valid entries */
+        __be32                  nused;          /* count of used entries */
+        __be32                  pad;            /* 64 bit alignment */
+};
+struct xfs_dir3_free {
+        struct xfs_dir3_free_hdr hdr;
+        __be16                  bests[];        /* best free counts */
+                                                /* unused entries are -1 */
+};
+#define XFS_DIR3_FREE_CRC_OFF  offsetof(struct xfs_dir3_free, hdr.hdr.crc)
+/*
+ * In core version of the free block header, abstracted away from on-disk format
+ * differences. Use this in the code, and convert to/from the disk version using
+ * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
+ */
+struct xfs_dir3_icfree_hdr {
+        __uint32_t      magic;          /* block magic number, native endian */
+        __uint32_t      firstdb;        /* db of first entry */
+        __uint32_t      nvalid;         /* count of valid entries */
+        __uint32_t      nused;          /* count of used entries */
+};
+/*
+ * Single block format.
+ *
+ * The single block format looks like the following drawing on disk:
+ *
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_hdr_t                             |
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t |
+ *    | ...                                             |
+ *    +-------------------------------------------------+
+ *    | unused space                                    |
+ *    +-------------------------------------------------+
+ *    | ...                                             |
+ *    | xfs_dir2_leaf_entry_t                           |
+ *    | xfs_dir2_leaf_entry_t                           |
+ *    +-------------------------------------------------+
+ *    | xfs_dir2_block_tail_t                           |
+ *    +-------------------------------------------------+
+ *
+ * As all the entries are variable size structures the accessors below should
+ * be used to iterate over them.
+ */
+typedef struct xfs_dir2_block_tail {
+        __be32          count;                  /* count of leaf entries */
+        __be32          stale;                  /* count of stale lf entries */
+} xfs_dir2_block_tail_t;
+/*
+ * Find the first leaf entry embedded in a data block (1-block format).  The
+ * leaf entries end where the block tail begins, so step back btp->count
+ * entries from the tail.
+ */
+static inline struct xfs_dir2_leaf_entry *
+xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
+{
+        struct xfs_dir2_leaf_entry      *ents_end;
+        ents_end = (struct xfs_dir2_leaf_entry *)btp;
+        return ents_end - be32_to_cpu(btp->count);
+}
+/*
+ * Attribute storage layout
+ *
+ * Attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys.  The
+ * internal links in the Btree are logical block offsets into the file.
+ *
+ * Struct leaf_entry's are packed from the top.  Name/values grow from the
+ * bottom but are not packed.  The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such,
+ * smaller runs are dropped.  When the freemap doesn't show enough space
+ * for an allocation, we compact the name/value area and try again.  If we
+ * still don't have enough space, then we have to split the block.  The
+ * name/value structs (both local and remote versions) must be 32bit aligned.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual name string.  The root and intermediate node search always
+ * takes the first-in-the-block key match found, so we should only have
+ * to work "forw"ard.  If none matches, continue with the "forw"ard leaf
+ * nodes until the hash key changes or the attribute name is found.
+ *
+ * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
+ * the leaf_entry.  The namespaces are independent only because we also look
+ * at the namespace bit when we are looking for a matching attribute name.
+ *
+ * We also store an "incomplete" bit in the leaf_entry.  It shows that an
+ * attribute is in the middle of being created and should not be shown to
+ * the user if we crash during the time that the bit is set.  We clear the
+ * bit when we have finished setting up the attribute.  We do this because
+ * we cannot create some large attributes inside a single transaction, and we
+ * need some indication that we weren't finished if we crash in the middle.
+ */
+#define XFS_ATTR_LEAF_MAPSIZE   3       /* how many freespace slots */
+typedef struct xfs_attr_leaf_map {      /* RLE map of free bytes */
+        __be16  base;                     /* base of free region */
+        __be16  size;                     /* length of free region */
+} xfs_attr_leaf_map_t;
+typedef struct xfs_attr_leaf_hdr {      /* constant-structure header block */
+        xfs_da_blkinfo_t info;          /* block type, links, etc. */
+        __be16  count;                  /* count of active leaf_entry's */
+        __be16  usedbytes;              /* num bytes of names/values stored */
+        __be16  firstused;              /* first used byte in name area */
+        __u8    holes;                  /* != 0 if blk needs compaction */
+        __u8    pad1;
+        xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
+                                        /* N largest free regions */
+} xfs_attr_leaf_hdr_t;
+typedef struct xfs_attr_leaf_entry {    /* sorted on key, not name */
+        __be32  hashval;                /* hash value of name */
+        __be16  nameidx;                /* index into buffer of name/value */
+        __u8    flags;                  /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
+        __u8    pad2;                   /* unused pad byte */
+} xfs_attr_leaf_entry_t;
+typedef struct xfs_attr_leaf_name_local {
+        __be16  valuelen;               /* number of bytes in value */
+        __u8    namelen;                /* length of name bytes */
+        __u8    nameval[1];             /* name/value bytes */
+} xfs_attr_leaf_name_local_t;
+typedef struct xfs_attr_leaf_name_remote {
+        __be32  valueblk;               /* block number of value bytes */
+        __be32  valuelen;               /* number of bytes in value */
+        __u8    namelen;                /* length of name bytes */
+        __u8    name[1];                /* name bytes */
+} xfs_attr_leaf_name_remote_t;
+typedef struct xfs_attr_leafblock {
+        xfs_attr_leaf_hdr_t     hdr;    /* constant-structure header block */
+        xfs_attr_leaf_entry_t   entries[1];     /* sorted on key, not name */
+        xfs_attr_leaf_name_local_t namelist;    /* grows from bottom of buf */
+        xfs_attr_leaf_name_remote_t valuelist;  /* grows from bottom of buf */
+} xfs_attr_leafblock_t;
+/*
+ * CRC enabled leaf structures. Called "version 3" structures to match the
+ * version number of the directory and dablk structures for this feature, and
+ * attr2 is already taken by the variable inode attribute fork size feature.
+ */
+struct xfs_attr3_leaf_hdr {
+        /* fields after info carry the same names as in xfs_attr_leaf_hdr */
+        struct xfs_da3_blkinfo  info;           /* da3 block header, holds crc */
+        __be16                  count;          /* count of active leaf_entry's */
+        __be16                  usedbytes;      /* bytes of names/values stored */
+        __be16                  firstused;      /* first used byte in name area */
+        __u8                    holes;          /* != 0 if blk needs compaction */
+        __u8                    pad1;
+        struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
+                                                /* N largest free regions */
+        __be32                  pad2;           /* 64 bit alignment */
+};
+#define XFS_ATTR3_LEAF_CRC_OFF  (offsetof(struct xfs_attr3_leaf_hdr, info.crc))
+struct xfs_attr3_leafblock {
+        struct xfs_attr3_leaf_hdr       hdr;
+        struct xfs_attr_leaf_entry      entries[1];
+        /*
+         * The rest of the block contains the following structures after the
+         * leaf entries, growing from the bottom up. The variables are never
+         * referenced, the locations accessed purely from helper functions.
+         *
+         * struct xfs_attr_leaf_name_local
+         * struct xfs_attr_leaf_name_remote
+         */
+};
+/*
+ * incore, neutral version of the attribute leaf header
+ */
+struct xfs_attr3_icleaf_hdr {
+        __uint32_t      forw;
+        __uint32_t      back;
+        __uint16_t      magic;
+        __uint16_t      count;
+        __uint16_t      usedbytes;
+        __uint16_t      firstused;
+        __u8            holes;
+        struct {
+                __uint16_t      base;
+                __uint16_t      size;
+        } freemap[XFS_ATTR_LEAF_MAPSIZE];
+};
+/*
+ * Flags used in the leaf_entry[i].flags field.
+ * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
+ * on the system call, they are "or"ed together for various operations.
+ */
+#define XFS_ATTR_LOCAL_BIT      0       /* attr is stored locally */
+#define XFS_ATTR_ROOT_BIT       1       /* limit access to trusted attrs */
+#define XFS_ATTR_SECURE_BIT     2       /* limit access to secure attrs */
+#define XFS_ATTR_INCOMPLETE_BIT 7       /* attr in middle of create/delete */
+#define XFS_ATTR_LOCAL          (1 << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT           (1 << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE         (1 << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE     (1 << XFS_ATTR_INCOMPLETE_BIT)
+/*
+ * Conversion macros for converting namespace bits from argument flags
+ * to ondisk flags.
+ */
+#define XFS_ATTR_NSP_ARGS_MASK          (ATTR_ROOT | ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK_MASK        (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK(flags)      ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
+#define XFS_ATTR_NSP_ARGS(flags)        ((flags) & XFS_ATTR_NSP_ARGS_MASK)
+#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x)  (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
+                                         ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
+#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x)  (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
+                                         ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
+/*
+ * Alignment for namelist and valuelist entries (since they are mixed
+ * there can be only one alignment value)
+ */
+#define XFS_ATTR_LEAF_NAME_ALIGN        ((uint)sizeof(xfs_dablk_t))
+static inline int
+xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
+{
+        /* the block magic tells the CRC enabled (v3) header from the old one */
+        return leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) ?
+                        sizeof(struct xfs_attr3_leaf_hdr) :
+                        sizeof(struct xfs_attr_leaf_hdr);
+}
+static inline struct xfs_attr_leaf_entry *
+xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
+{
+        struct xfs_attr3_leafblock      *leaf3;
+        /* old format: the entries follow the xfs_attr_leaf_hdr directly */
+        if (leafp->hdr.info.magic != cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+                return leafp->entries;
+        /* v3 format: view the block through the v3 layout to skip its header */
+        leaf3 = (struct xfs_attr3_leafblock *)leafp;
+        return leaf3->entries;
+}
+/*
+ * Cast typed pointers for "local" and "remote" name/value structs.
+ */
+static inline char *
+xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
+{
+        struct xfs_attr_leaf_entry      *entry;
+        /* nameidx is used as a byte offset from the start of the leaf block */
+        entry = xfs_attr3_leaf_entryp(leafp) + idx;
+        return (char *)leafp + be16_to_cpu(entry->nameidx);
+}
+static inline xfs_attr_leaf_name_remote_t *
+xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
+{
+        /* same address as xfs_attr3_leaf_name(), typed as a remote struct */
+        char    *name = xfs_attr3_leaf_name(leafp, idx);
+        return (xfs_attr_leaf_name_remote_t *)name;
+}
+static inline xfs_attr_leaf_name_local_t *
+xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
+{
+        /* same address as xfs_attr3_leaf_name(), typed as a local struct */
+        char    *name = xfs_attr3_leaf_name(leafp, idx);
+        return (xfs_attr_leaf_name_local_t *)name;
+}
+/*
+ * Calculate total bytes used (including trailing pad for alignment) for
+ * a "local" name/value structure, a "remote" name/value structure, and
+ * a pointer which might be either.
+ */
+static inline int xfs_attr_leaf_entsize_remote(int nlen)
+{
+        /* "- 1" discounts the one byte name[1] placeholder in the struct */
+        uint    sz = (uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + nlen;
+        /* round up to a multiple of the (power of two) name alignment */
+        uint    pad = XFS_ATTR_LEAF_NAME_ALIGN - 1;
+        return (sz + pad) & ~pad;
+}
+static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
+{
+        /* "- 1" discounts the one byte nameval[1] placeholder in the struct */
+        uint    sz = (uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + nlen + vlen;
+        /* round up to a multiple of the (power of two) name alignment */
+        uint    pad = XFS_ATTR_LEAF_NAME_ALIGN - 1;
+        return (sz + pad) & ~pad;
+}
+static inline int xfs_attr_leaf_entsize_local_max(int bsize)
+{
+        /* a quarter of the block size plus half of the block size */
+        return (bsize >> 2) + (bsize >> 1);
+}
+/*
+ * Remote attribute block format definition
+ *
+ * There is one of these headers per filesystem block in a remote attribute.
+ * This is done to ensure there is a 1:1 mapping between the attribute value
+ * length and the number of blocks needed to store the attribute. This makes the
+ * verification of a buffer a little more complex, but greatly simplifies the
+ * allocation, reading and writing of these attributes as we don't have to guess
+ * the number of blocks needed to store the attribute data.
+ */
+#define XFS_ATTR3_RMT_MAGIC     0x5841524d      /* XARM */
+struct xfs_attr3_rmt_hdr {
+        __be32  rm_magic;
+        __be32  rm_offset;
+        __be32  rm_bytes;
+        __be32  rm_crc;
+        uuid_t  rm_uuid;
+        __be64  rm_owner;
+        __be64  rm_blkno;
+        __be64  rm_lsn;
+};
+#define XFS_ATTR3_RMT_CRC_OFF   offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
+#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize)    \
+        ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+                        sizeof(struct xfs_attr3_rmt_hdr) : 0))
+#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_dinode.h b/fs/xfs/libxfs/xfs_dinode.h
new file mode 100644
index 000000000000..623bbe8fd921
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dinode.h
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DINODE_H__
+#define __XFS_DINODE_H__
+#define XFS_DINODE_MAGIC                0x494e  /* 'IN' */
+#define XFS_DINODE_GOOD_VERSION(v)      ((v) >= 1 && (v) <= 3)
+typedef struct xfs_timestamp {
+        __be32          t_sec;          /* timestamp seconds */
+        __be32          t_nsec;         /* timestamp nanoseconds */
+} xfs_timestamp_t;
+/*
+ * On-disk inode structure.
+ *
+ * This is just the header or "dinode core"; the inode is expanded to fill a
+ * variable size, with the leftover area split into data and attribute forks.
+ * The format of the data and attribute fork depends on the format of the
+ * inode as indicated by di_format and di_aformat.  To access the data and
+ * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
+ * below.
+ *
+ * There is a very similar struct icdinode in xfs_inode which matches the
+ * layout of the first 96 bytes of this structure, but is kept in native
+ * format instead of big endian.
+ *
+ * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
+ * padding field for v3 inodes.
+ */
+typedef struct xfs_dinode {
+        __be16          di_magic;       /* inode magic # = XFS_DINODE_MAGIC */
+        __be16          di_mode;        /* mode and type of file */
+        __u8            di_version;     /* inode version */
+        __u8            di_format;      /* format of data fork's data */
+        __be16          di_onlink;      /* old number of links to file */
+        __be32          di_uid;         /* owner's user id */
+        __be32          di_gid;         /* owner's group id */
+        __be32          di_nlink;       /* number of links to file */
+        __be16          di_projid_lo;   /* lower part of owner's project id */
+        __be16          di_projid_hi;   /* higher part of owner's project id */
+        __u8            di_pad[6];      /* unused, zeroed space */
+        __be16          di_flushiter;   /* incremented on flush */
+        xfs_timestamp_t di_atime;       /* time last accessed */
+        xfs_timestamp_t di_mtime;       /* time last modified */
+        xfs_timestamp_t di_ctime;       /* time created/inode modified */
+        __be64          di_size;        /* number of bytes in file */
+        __be64          di_nblocks;     /* # of direct & btree blocks used */
+        __be32          di_extsize;     /* basic/minimum extent size for file */
+        __be32          di_nextents;    /* number of extents in data fork */
+        __be16          di_anextents;   /* number of extents in attr fork */
+        __u8            di_forkoff;     /* attr fork offs, <<3 for 64b align */
+        __s8            di_aformat;     /* format of attr fork's data */
+        __be32          di_dmevmask;    /* DMIG event mask */
+        __be16          di_dmstate;     /* DMIG state info */
+        __be16          di_flags;       /* random flags, XFS_DIFLAG_... */
+        __be32          di_gen;         /* generation number */
+        /* di_next_unlinked is the only non-core field in the old dinode */
+        __be32          di_next_unlinked;/* agi unlinked list ptr */
+        /* start of the extended dinode (version 3 only), writable fields */
+        __le32          di_crc;         /* CRC of the inode, little endian */
+        __be64          di_changecount; /* number of attribute changes */
+        __be64          di_lsn;         /* flush sequence */
+        __be64          di_flags2;      /* more random flags */
+        __u8            di_pad2[16];    /* more padding for future expansion */
+        /* fields only written to during inode creation */
+        xfs_timestamp_t di_crtime;      /* time created */
+        __be64          di_ino;         /* inode number */
+        uuid_t          di_uuid;        /* UUID of the filesystem */
+        /* structure must be padded to 64 bit alignment */
+} xfs_dinode_t;
+#define XFS_DINODE_CRC_OFF      offsetof(struct xfs_dinode, di_crc)
+#define DI_MAX_FLUSH 0xffff
+/*
+ * Size of the core inode on disk.  Version 1 and 2 inodes have
+ * the same size, but version 3 has grown a few additional fields.
+ */
+static inline uint xfs_dinode_size(int version)
+{
+        /* only version 3 inodes extend past the core, from di_crc onwards */
+        return version == 3 ? sizeof(struct xfs_dinode) :
+                              offsetof(struct xfs_dinode, di_crc);
+}
+/*
+ * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
+ * Since the pathconf interface is signed, we use 2^31 - 1 instead.
+ * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
+ */
+#define XFS_MAXLINK             ((1U << 31) - 1U)
+#define XFS_MAXLINK_1           65535U
+/*
+ * Values for di_format
+ */
+typedef enum xfs_dinode_fmt {
+        XFS_DINODE_FMT_DEV,             /* xfs_dev_t */
+        XFS_DINODE_FMT_LOCAL,           /* bulk data */
+        XFS_DINODE_FMT_EXTENTS,         /* struct xfs_bmbt_rec */
+        XFS_DINODE_FMT_BTREE,           /* struct xfs_bmdr_block */
+        XFS_DINODE_FMT_UUID             /* uuid_t */
+} xfs_dinode_fmt_t;
+/*
+ * Inode minimum and maximum sizes.
+ */
+#define XFS_DINODE_MIN_LOG      8
+#define XFS_DINODE_MAX_LOG      11
+#define XFS_DINODE_MIN_SIZE     (1 << XFS_DINODE_MIN_LOG)
+#define XFS_DINODE_MAX_SIZE     (1 << XFS_DINODE_MAX_LOG)
+/*
+ * Inode size for given fs.
+ */
+#define XFS_LITINO(mp, version) \
+        ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
+/*
+ * Inode data & attribute fork sizes, per inode.
+ */
+#define XFS_DFORK_Q(dip)                ((dip)->di_forkoff != 0)
+#define XFS_DFORK_BOFF(dip)             ((int)((dip)->di_forkoff << 3))
+#define XFS_DFORK_DSIZE(dip,mp) \
+        (XFS_DFORK_Q(dip) ? \
+                XFS_DFORK_BOFF(dip) : \
+                XFS_LITINO(mp, (dip)->di_version))
+#define XFS_DFORK_ASIZE(dip,mp) \
+        (XFS_DFORK_Q(dip) ? \
+                XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
+                0)
+#define XFS_DFORK_SIZE(dip,mp,w) \
+        ((w) == XFS_DATA_FORK ? \
+                XFS_DFORK_DSIZE(dip, mp) : \
+                XFS_DFORK_ASIZE(dip, mp))
+/*
+ * Return pointers to the data or attribute forks.
+ *
+ * The data fork begins immediately after the dinode core, whose size depends
+ * on the inode version.  The argument is parenthesized so that expressions
+ * such as "dip + 1" expand correctly.  XFS_DFORK_DPTR() evaluates dip twice,
+ * so the argument must not have side effects.
+ */
+#define XFS_DFORK_DPTR(dip) \
+        ((char *)(dip) + xfs_dinode_size((dip)->di_version))
+#define XFS_DFORK_APTR(dip)     \
+        (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
+#define XFS_DFORK_PTR(dip,w)    \
+        ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
+#define XFS_DFORK_FORMAT(dip,w) \
+        ((w) == XFS_DATA_FORK ? \
+                (dip)->di_format : \
+                (dip)->di_aformat)
+#define XFS_DFORK_NEXTENTS(dip,w) \
+        ((w) == XFS_DATA_FORK ? \
+                be32_to_cpu((dip)->di_nextents) : \
+                be16_to_cpu((dip)->di_anextents))
+#define XFS_BUF_TO_DINODE(bp)   ((xfs_dinode_t *)((bp)->b_addr))
+/*
+ * For block and character special files the 32bit dev_t is stored at the
+ * beginning of the data fork.
+ */
+static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
+{
+        /* the device number is the first big endian word of the data fork */
+        __be32  *rdevp = (__be32 *)XFS_DFORK_DPTR(dip);
+        return be32_to_cpu(*rdevp);
+}
+static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
+{
+        /* store the device number big endian at the start of the data fork */
+        __be32  *rdevp = (__be32 *)XFS_DFORK_DPTR(dip);
+        *rdevp = cpu_to_be32(rdev);
+}
+/*
+ * Values for di_flags
+ * There should be a one-to-one correspondence between these flags and the
+ * XFS_XFLAG_s.
+ */
+#define XFS_DIFLAG_REALTIME_BIT  0      /* file's blocks come from rt area */
+#define XFS_DIFLAG_PREALLOC_BIT  1      /* file space has been preallocated */
+#define XFS_DIFLAG_NEWRTBM_BIT   2      /* for rtbitmap inode, new format */
+#define XFS_DIFLAG_IMMUTABLE_BIT 3      /* inode is immutable */
+#define XFS_DIFLAG_APPEND_BIT    4      /* inode is append-only */
+#define XFS_DIFLAG_SYNC_BIT      5      /* inode is written synchronously */
+#define XFS_DIFLAG_NOATIME_BIT   6      /* do not update atime */
+#define XFS_DIFLAG_NODUMP_BIT    7      /* do not dump */
+#define XFS_DIFLAG_RTINHERIT_BIT 8      /* create with realtime bit set */
+#define XFS_DIFLAG_PROJINHERIT_BIT   9  /* create with parents projid */
+#define XFS_DIFLAG_NOSYMLINKS_BIT   10  /* disallow symlink creation */
+#define XFS_DIFLAG_EXTSIZE_BIT      11  /* inode extent size allocator hint */
+#define XFS_DIFLAG_EXTSZINHERIT_BIT 12  /* inherit inode extent size */
+#define XFS_DIFLAG_NODEFRAG_BIT     13  /* do not reorganize/defragment */
+#define XFS_DIFLAG_FILESTREAM_BIT   14  /* use filestream allocator */
+#define XFS_DIFLAG_REALTIME      (1 << XFS_DIFLAG_REALTIME_BIT)
+#define XFS_DIFLAG_PREALLOC      (1 << XFS_DIFLAG_PREALLOC_BIT)
+#define XFS_DIFLAG_NEWRTBM       (1 << XFS_DIFLAG_NEWRTBM_BIT)
+#define XFS_DIFLAG_IMMUTABLE     (1 << XFS_DIFLAG_IMMUTABLE_BIT)
+#define XFS_DIFLAG_APPEND        (1 << XFS_DIFLAG_APPEND_BIT)
+#define XFS_DIFLAG_SYNC          (1 << XFS_DIFLAG_SYNC_BIT)
+#define XFS_DIFLAG_NOATIME       (1 << XFS_DIFLAG_NOATIME_BIT)
+#define XFS_DIFLAG_NODUMP        (1 << XFS_DIFLAG_NODUMP_BIT)
+#define XFS_DIFLAG_RTINHERIT     (1 << XFS_DIFLAG_RTINHERIT_BIT)
+#define XFS_DIFLAG_PROJINHERIT   (1 << XFS_DIFLAG_PROJINHERIT_BIT)
+#define XFS_DIFLAG_NOSYMLINKS    (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
+#define XFS_DIFLAG_EXTSIZE       (1 << XFS_DIFLAG_EXTSIZE_BIT)
+#define XFS_DIFLAG_EXTSZINHERIT  (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
+#define XFS_DIFLAG_NODEFRAG      (1 << XFS_DIFLAG_NODEFRAG_BIT)
+#define XFS_DIFLAG_FILESTREAM    (1 << XFS_DIFLAG_FILESTREAM_BIT)
+#ifdef CONFIG_XFS_RT
+#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
+#else
+#define XFS_IS_REALTIME_INODE(ip) (0)
+#endif
+#define XFS_DIFLAG_ANY \
+        (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
+         XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
+         XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
+         XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
+         XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
+#endif  /* __XFS_DINODE_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
new file mode 100644
index 000000000000..6cef22152fd6
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -0,0 +1,762 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_dinode.h"
+/* Shared name for the ".." entry: the 2-byte string "..", tagged XFS_DIR3_FT_DIR. */
+struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
+/*
+ * ASCII case-insensitive (ie. A-Z) support for directories that was
+ * used in IRIX.
+ */
+/*
+ * Hash a name after passing each byte through tolower(), so two names
+ * that differ only in ASCII case hash to the same value.
+ */
+STATIC xfs_dahash_t
+xfs_ascii_ci_hashname(
+        struct xfs_name *name)
+{
+        xfs_dahash_t    acc = 0;
+        int             pos = 0;
+        while (pos < name->len) {
+                acc = tolower(name->name[pos]) ^ rol32(acc, 7);
+                pos++;
+        }
+        return acc;
+}
+/*
+ * Compare args->name with a candidate name of length len.
+ *
+ * Returns XFS_CMP_EXACT when every byte is identical, XFS_CMP_CASE when
+ * the names are equal only after tolower(), and XFS_CMP_DIFFERENT
+ * otherwise (which includes a length mismatch).
+ */
+STATIC enum xfs_dacmp
+xfs_ascii_ci_compname(
+        struct xfs_da_args *args,
+        const unsigned char *name,
+        int             len)
+{
+        enum xfs_dacmp  verdict = XFS_CMP_EXACT;
+        int             pos;
+        if (len != args->namelen)
+                return XFS_CMP_DIFFERENT;
+        for (pos = 0; pos < len; pos++) {
+                if (args->name[pos] != name[pos]) {
+                        if (tolower(args->name[pos]) != tolower(name[pos]))
+                                return XFS_CMP_DIFFERENT;
+                        /* the bytes differ only by case: no longer exact */
+                        verdict = XFS_CMP_CASE;
+                }
+        }
+        return verdict;
+}
+/*
+ * Hash/compare pair installed as mp->m_dirnameops by xfs_da_mount() when
+ * xfs_sb_version_hasasciici() is true for the superblock.
+ */
+static struct xfs_nameops xfs_ascii_ci_nameops = {
+        .hashname       = xfs_ascii_ci_hashname,
+        .compname       = xfs_ascii_ci_compname,
+};
+/*
+ * Set up the directory/attribute state hanging off the mount: the
+ * m_dir_inode_ops and m_nondir_inode_ops vectors, the two
+ * xfs_da_geometry structures (m_dir_geo and m_attr_geo) and the
+ * directory name operations.
+ *
+ * Returns 0 on success, or -ENOMEM if either geometry allocation fails.
+ */
+int
+xfs_da_mount(
+        struct xfs_mount        *mp)
+{
+        struct xfs_da_geometry  *dir_geo;
+        struct xfs_da_geometry  *attr_geo;
+        int                     hdr_bytes;
+        ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
+        ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
+               XFS_MAX_BLOCKSIZE);
+        mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
+        mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL);
+        hdr_bytes = mp->m_dir_inode_ops->node_hdr_size;
+        mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+                                    KM_SLEEP | KM_MAYFAIL);
+        mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+                                     KM_SLEEP | KM_MAYFAIL);
+        dir_geo = mp->m_dir_geo;
+        attr_geo = mp->m_attr_geo;
+        if (dir_geo == NULL || attr_geo == NULL) {
+                /* hand back whichever of the two allocations succeeded */
+                kmem_free(dir_geo);
+                kmem_free(attr_geo);
+                return -ENOMEM;
+        }
+        /* directory geometry: sb_dirblklog scales the fs block size */
+        dir_geo->fsblog = mp->m_sb.sb_blocklog;
+        dir_geo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog;
+        dir_geo->blksize = 1 << dir_geo->blklog;
+        dir_geo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
+        /*
+         * The segment block constants are derived through the geometry
+         * structure, so the conversion fields above must be set first.
+         */
+        dir_geo->datablk = xfs_dir2_byte_to_da(dir_geo, XFS_DIR2_DATA_OFFSET);
+        dir_geo->leafblk = xfs_dir2_byte_to_da(dir_geo, XFS_DIR2_LEAF_OFFSET);
+        dir_geo->freeblk = xfs_dir2_byte_to_da(dir_geo, XFS_DIR2_FREE_OFFSET);
+        dir_geo->node_ents = (dir_geo->blksize - hdr_bytes) /
+                                (uint)sizeof(xfs_da_node_entry_t);
+        dir_geo->magicpct = (dir_geo->blksize * 37) / 100;
+        /* attribute geometry: always a single filesystem block */
+        attr_geo->fsblog = mp->m_sb.sb_blocklog;
+        attr_geo->blklog = mp->m_sb.sb_blocklog;
+        attr_geo->blksize = 1 << attr_geo->blklog;
+        attr_geo->fsbcount = 1;
+        attr_geo->node_ents = (attr_geo->blksize - hdr_bytes) /
+                                (uint)sizeof(xfs_da_node_entry_t);
+        attr_geo->magicpct = (attr_geo->blksize * 37) / 100;
+        /* pick the name hash/compare operations for this filesystem */
+        if (!xfs_sb_version_hasasciici(&mp->m_sb))
+                mp->m_dirnameops = &xfs_default_nameops;
+        else
+                mp->m_dirnameops = &xfs_ascii_ci_nameops;
+        return 0;
+}
+/*
+ * Release the attribute and directory geometry structures that
+ * xfs_da_mount() attached to the mount.
+ */
+void
+xfs_da_unmount(
+        struct xfs_mount        *mp)
+{
+        kmem_free(mp->m_attr_geo);
+        kmem_free(mp->m_dir_geo);
+}
+/*
+ * Report whether a directory contains only "." and "..".
+ * Returns 1 if so, 0 otherwise.
+ */
+int
+xfs_dir_isempty(
+        xfs_inode_t     *dp)
+{
+        xfs_dir2_sf_hdr_t       *hdr;
+        ASSERT(S_ISDIR(dp->i_d.di_mode));
+        /* a zero size might happen during shutdown; report it as empty */
+        if (dp->i_d.di_size == 0)
+                return 1;
+        /* larger than XFS_IFORK_DSIZE(): reported as not empty */
+        if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
+                return 0;
+        hdr = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        return hdr->count == 0;
+}
+/*
+ * Validate a given inode number against the mount's geometry.
+ *
+ * Returns 0 if the number passes, or -EFSCORRUPTED after emitting a
+ * warning and an error report if it does not (or if the
+ * XFS_ERRTAG_DIR_INO_VALIDATE error test triggers).
+ */
+int
+xfs_dir_ino_validate(
+        xfs_mount_t     *mp,
+        xfs_ino_t       ino)
+{
+        xfs_agnumber_t  agno = XFS_INO_TO_AGNO(mp, ino);
+        xfs_agblock_t   agblkno = XFS_INO_TO_AGBNO(mp, ino);
+        int             ioff = XFS_INO_TO_OFFSET(mp, ino);
+        xfs_agino_t     agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
+        int             ino_ok = 1;
+        /*
+         * Every component has to be in range, and recombining the AG
+         * number with the AG inode number must give back the original.
+         */
+        if (agno >= mp->m_sb.sb_agcount)
+                ino_ok = 0;
+        else if (agblkno >= mp->m_sb.sb_agblocks || agblkno == 0)
+                ino_ok = 0;
+        else if (ioff >= (1 << mp->m_sb.sb_inopblog))
+                ino_ok = 0;
+        else if (XFS_AGINO_TO_INO(mp, agno, agino) != ino)
+                ino_ok = 0;
+        if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
+                        XFS_RANDOM_DIR_INO_VALIDATE))) {
+                xfs_warn(mp, "Invalid inode number 0x%Lx",
+                                (unsigned long long) ino);
+                XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
+                return -EFSCORRUPTED;
+        }
+        return 0;
+}
+/*
+ * Initialize a directory with its "." and ".." entries.
+ *
+ * The parent's inode number is validated first and then handed to
+ * xfs_dir2_sf_create().  Returns the validation error, -ENOMEM if the
+ * argument structure cannot be allocated, or the create result.
+ */
+int
+xfs_dir_init(
+        xfs_trans_t     *tp,
+        xfs_inode_t     *dp,
+        xfs_inode_t     *pdp)
+{
+        struct xfs_da_args *da;
+        int             rc;
+        ASSERT(S_ISDIR(dp->i_d.di_mode));
+        rc = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
+        if (rc != 0)
+                return rc;
+        da = kmem_zalloc(sizeof(*da), KM_SLEEP | KM_NOFS);
+        if (da == NULL)
+                return -ENOMEM;
+        da->trans = tp;
+        da->dp = dp;
+        da->geo = dp->i_mount->m_dir_geo;
+        rc = xfs_dir2_sf_create(da, pdp->i_ino);
+        kmem_free(da);
+        return rc;
+}
+/*
+ * Enter a name in a directory.
+ *
+ * The new entry's inode number is validated, then the add is routed to
+ * the sf, block, leaf or node addname routine according to the
+ * directory's current format.  Returns an early validation/allocation
+ * error, a format-check error, or the result of the chosen routine.
+ */
+int
+xfs_dir_createname(
+        xfs_trans_t             *tp,
+        xfs_inode_t             *dp,
+        struct xfs_name         *name,
+        xfs_ino_t               inum,           /* new entry inode number */
+        xfs_fsblock_t           *first,         /* bmap's firstblock */
+        xfs_bmap_free_t         *flist,         /* bmap's freeblock list */
+        xfs_extlen_t            total)          /* bmap's total block count */
+{
+        struct xfs_da_args      *args;
+        int                     error;
+        int                     match;          /* format probe result */
+        ASSERT(S_ISDIR(dp->i_d.di_mode));
+        error = xfs_dir_ino_validate(tp->t_mountp, inum);
+        if (error)
+                return error;
+        XFS_STATS_INC(xs_dir_create);
+        args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+        if (!args)
+                return -ENOMEM;
+        /* what to add */
+        args->name = name->name;
+        args->namelen = name->len;
+        args->filetype = name->type;
+        args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+        args->inumber = inum;
+        /* where, and with which transaction/allocation context */
+        args->dp = dp;
+        args->geo = dp->i_mount->m_dir_geo;
+        args->whichfork = XFS_DATA_FORK;
+        args->trans = tp;
+        args->firstblock = first;
+        args->flist = flist;
+        args->total = total;
+        args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+                error = xfs_dir2_sf_addname(args);
+        } else {
+                error = xfs_dir2_isblock(args, &match);
+                if (!error && match) {
+                        error = xfs_dir2_block_addname(args);
+                } else if (!error) {
+                        error = xfs_dir2_isleaf(args, &match);
+                        if (!error && match)
+                                error = xfs_dir2_leaf_addname(args);
+                        else if (!error)
+                                error = xfs_dir2_node_addname(args);
+                }
+        }
+        kmem_free(args);
+        return error;
+}
+/*
+ * Translate a name comparison result into a lookup return value.
+ *
+ * Returns -ENOENT when the names differ.  Otherwise returns -EEXIST to
+ * signal that the name was found; if this was a CI lookup and the match
+ * was case-insensitive only, the actual name is first duplicated into
+ * args->value (length in args->valuelen), and -ENOMEM is returned if
+ * that copy cannot be allocated.
+ */
+int
+xfs_dir_cilookup_result(
+        struct xfs_da_args *args,
+        const unsigned char *name,
+        int             len)
+{
+        if (args->cmpresult == XFS_CMP_DIFFERENT)
+                return -ENOENT;
+        if (args->cmpresult == XFS_CMP_CASE &&
+            (args->op_flags & XFS_DA_OP_CILOOKUP)) {
+                args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
+                if (!args->value)
+                        return -ENOMEM;
+                memcpy(args->value, name, len);
+                args->valuelen = len;
+        }
+        return -EEXIST;
+}
+/*
+ * Lookup a name in a directory, give back the inode number.
+ *
+ * If ci_name is not NULL, returns the actual name in ci_name if it differs
+ * to name, or ci_name->name is set to NULL for an exact match.
+ *
+ * The format specific lookup routines report a hit as -EEXIST, which is
+ * mapped to 0 here.  Returns 0 with *inum filled in when the name is
+ * found, -ENOMEM if the argument structure cannot be allocated, or the
+ * error from the format checks or from the lookup itself.
+ */
+int
+xfs_dir_lookup(
+        xfs_trans_t     *tp,
+        xfs_inode_t     *dp,
+        struct xfs_name *name,
+        xfs_ino_t       *inum,          /* out: inode number */
+        struct xfs_name *ci_name)       /* out: actual name if CI match */
+{
+        struct xfs_da_args *args;
+        int             rval;
+        int             v;              /* type-checking value */
+        ASSERT(S_ISDIR(dp->i_d.di_mode));
+        XFS_STATS_INC(xs_dir_lookup);
+        /*
+         * We need to use KM_NOFS here so that lockdep will not throw false
+         * positive deadlock warnings on a non-transactional lookup path. It is
+         * safe to recurse into inode reclaim in that case, but lockdep can't
+         * easily be taught about it. Hence KM_NOFS avoids having to add a
+         * bunch of lockdep class annotations into the reclaim path for the
+         * ilock.
+         */
+        args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+        /* same allocation-failure handling as the other xfs_dir_* entry points */
+        if (!args)
+                return -ENOMEM;
+        args->geo = dp->i_mount->m_dir_geo;
+        args->name = name->name;
+        args->namelen = name->len;
+        args->filetype = name->type;
+        args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+        args->dp = dp;
+        args->whichfork = XFS_DATA_FORK;
+        args->trans = tp;
+        args->op_flags = XFS_DA_OP_OKNOENT;
+        if (ci_name)
+                args->op_flags |= XFS_DA_OP_CILOOKUP;
+        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+                rval = xfs_dir2_sf_lookup(args);
+                goto out_check_rval;
+        }
+        rval = xfs_dir2_isblock(args, &v);
+        if (rval)
+                goto out_free;
+        if (v) {
+                rval = xfs_dir2_block_lookup(args);
+                goto out_check_rval;
+        }
+        rval = xfs_dir2_isleaf(args, &v);
+        if (rval)
+                goto out_free;
+        if (v)
+                rval = xfs_dir2_leaf_lookup(args);
+        else
+                rval = xfs_dir2_node_lookup(args);
+out_check_rval:
+        /* -EEXIST from a lookup routine means "found", not failure */
+        if (rval == -EEXIST)
+                rval = 0;
+        if (!rval) {
+                *inum = args->inumber;
+                if (ci_name) {
+                        /* args->value stays NULL unless a CI-only match copied a name */
+                        ci_name->name = args->value;
+                        ci_name->len = args->valuelen;
+                }
+        }
+out_free:
+        kmem_free(args);
+        return rval;
+}
+/*
+ * Remove an entry from a directory.
+ *
+ * The removal is routed to the sf, block, leaf or node removename
+ * routine according to the directory's current format.  Returns -ENOMEM
+ * if the argument structure cannot be allocated, a format-check error,
+ * or the result of the chosen routine.
+ */
+int
+xfs_dir_removename(
+        xfs_trans_t     *tp,
+        xfs_inode_t     *dp,
+        struct xfs_name *name,
+        xfs_ino_t       ino,
+        xfs_fsblock_t   *first,         /* bmap's firstblock */
+        xfs_bmap_free_t *flist,         /* bmap's freeblock list */
+        xfs_extlen_t    total)          /* bmap's total block count */
+{
+        struct xfs_da_args *args;
+        int             error;
+        int             match;          /* format probe result */
+        ASSERT(S_ISDIR(dp->i_d.di_mode));
+        XFS_STATS_INC(xs_dir_remove);
+        args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+        if (!args)
+                return -ENOMEM;
+        /* which entry to remove */
+        args->name = name->name;
+        args->namelen = name->len;
+        args->filetype = name->type;
+        args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+        args->inumber = ino;
+        /* where, and with which transaction/allocation context */
+        args->dp = dp;
+        args->geo = dp->i_mount->m_dir_geo;
+        args->whichfork = XFS_DATA_FORK;
+        args->trans = tp;
+        args->firstblock = first;
+        args->flist = flist;
+        args->total = total;
+        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+                error = xfs_dir2_sf_removename(args);
+        } else {
+                error = xfs_dir2_isblock(args, &match);
+                if (!error && match) {
+                        error = xfs_dir2_block_removename(args);
+                } else if (!error) {
+                        error = xfs_dir2_isleaf(args, &match);
+                        if (!error && match)
+                                error = xfs_dir2_leaf_removename(args);
+                        else if (!error)
+                                error = xfs_dir2_node_removename(args);
+                }
+        }
+        kmem_free(args);
+        return error;
+}
+/*
+ * Replace the inode number of a directory entry.
+ *
+ * The new inode number is validated, then the replace is routed to the
+ * sf, block, leaf or node replace routine according to the directory's
+ * current format.  Returns an early validation/allocation error, a
+ * format-check error, or the result of the chosen routine.
+ */
+int
+xfs_dir_replace(
+        xfs_trans_t     *tp,
+        xfs_inode_t     *dp,
+        struct xfs_name *name,          /* name of entry to replace */
+        xfs_ino_t       inum,           /* new inode number */
+        xfs_fsblock_t   *first,         /* bmap's firstblock */
+        xfs_bmap_free_t *flist,         /* bmap's freeblock list */
+        xfs_extlen_t    total)          /* bmap's total block count */
+{
+        struct xfs_da_args *args;
+        int             error;
+        int             match;          /* format probe result */
+        ASSERT(S_ISDIR(dp->i_d.di_mode));
+        error = xfs_dir_ino_validate(tp->t_mountp, inum);
+        if (error)
+                return error;
+        args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+        if (!args)
+                return -ENOMEM;
+        /* which entry, and its new inode number */
+        args->name = name->name;
+        args->namelen = name->len;
+        args->filetype = name->type;
+        args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+        args->inumber = inum;
+        /* where, and with which transaction/allocation context */
+        args->dp = dp;
+        args->geo = dp->i_mount->m_dir_geo;
+        args->whichfork = XFS_DATA_FORK;
+        args->trans = tp;
+        args->firstblock = first;
+        args->flist = flist;
+        args->total = total;
+        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+                error = xfs_dir2_sf_replace(args);
+        } else {
+                error = xfs_dir2_isblock(args, &match);
+                if (!error && match) {
+                        error = xfs_dir2_block_replace(args);
+                } else if (!error) {
+                        error = xfs_dir2_isleaf(args, &match);
+                        if (!error && match)
+                                error = xfs_dir2_leaf_replace(args);
+                        else if (!error)
+                                error = xfs_dir2_node_replace(args);
+                }
+        }
+        kmem_free(args);
+        return error;
+}
+/*
+ * See if this entry can be added to the directory without allocating space.
+ *
+ * Only meaningful when the caller could not reserve space: a non-zero
+ * resblks returns 0 straight away.  Otherwise the normal addname routine
+ * for the directory's format is run with XFS_DA_OP_JUSTCHECK set and its
+ * result is returned.
+ */
+int
+xfs_dir_canenter(
+        xfs_trans_t     *tp,
+        xfs_inode_t     *dp,
+        struct xfs_name *name,          /* name of entry to add */
+        uint            resblks)
+{
+        struct xfs_da_args *args;
+        int             error;
+        int             match;          /* format probe result */
+        if (resblks != 0)
+                return 0;
+        ASSERT(S_ISDIR(dp->i_d.di_mode));
+        args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+        if (!args)
+                return -ENOMEM;
+        args->name = name->name;
+        args->namelen = name->len;
+        args->filetype = name->type;
+        args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+        args->dp = dp;
+        args->geo = dp->i_mount->m_dir_geo;
+        args->whichfork = XFS_DATA_FORK;
+        args->trans = tp;
+        args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
+                                                        XFS_DA_OP_OKNOENT;
+        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+                error = xfs_dir2_sf_addname(args);
+        } else {
+                error = xfs_dir2_isblock(args, &match);
+                if (!error && match) {
+                        error = xfs_dir2_block_addname(args);
+                } else if (!error) {
+                        error = xfs_dir2_isleaf(args, &match);
+                        if (!error && match)
+                                error = xfs_dir2_leaf_addname(args);
+                        else if (!error)
+                                error = xfs_dir2_node_addname(args);
+                }
+        }
+        kmem_free(args);
+        return error;
+}
+/*
+ * Utility routines.
+ */
+/*
+ * Add a block to the directory.
+ *
+ * This routine is for data and free blocks, not leaf/node blocks which are
+ * handled by xfs_da_grow_inode.
+ *
+ * On success *dbp receives the number of the block that was added and,
+ * for the data space, the inode size is raised (and logged) if the new
+ * block extends past the current size.  Returns 0 or the error from
+ * xfs_da_grow_inode_int().
+ */
+int
+xfs_dir2_grow_inode(
+        struct xfs_da_args      *args,
+        int                     space,  /* v2 dir's space XFS_DIR2_xxx_SPACE */
+        xfs_dir2_db_t           *dbp)   /* out: block number added */
+{
+        struct xfs_inode        *dp = args->dp;
+        struct xfs_mount        *mp = dp->i_mount;
+        int                     nblks = args->geo->fsbcount;
+        xfs_fileoff_t           off;    /* directory offset of new block */
+        xfs_fsize_t             new_size;
+        int                     error;
+        trace_xfs_dir2_grow_inode(args, space);
+        /* start the search at the lowest possible block of the space */
+        off = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
+        error = xfs_da_grow_inode_int(args, &off, nblks);
+        if (error)
+                return error;
+        *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)off);
+        /* only the data space contributes to the file size */
+        if (space != XFS_DIR2_DATA_SPACE)
+                return 0;
+        new_size = XFS_FSB_TO_B(mp, off + nblks);
+        if (new_size > dp->i_d.di_size) {
+                dp->i_d.di_size = new_size;
+                xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+        }
+        return 0;
+}
+/*
+ * See if the directory is a single-block form directory.
+ *
+ * The answer is yes when the data fork's last offset, converted to
+ * bytes, equals the directory block size.  Returns 0 with *vp set, or
+ * the error from xfs_bmap_last_offset() with *vp untouched.
+ */
+int
+xfs_dir2_isblock(
+        struct xfs_da_args      *args,
+        int                     *vp)    /* out: 1 is block, 0 is not block */
+{
+        xfs_fileoff_t           last;   /* last file offset */
+        int                     rval;
+        rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK);
+        if (rval != 0)
+                return rval;
+        if (XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize)
+                rval = 1;
+        else
+                rval = 0;
+        ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize);
+        *vp = rval;
+        return 0;
+}
+/*
+ * See if the directory is a single-leaf form directory.
+ *
+ * The answer is yes when the data fork's last offset equals
+ * geo->leafblk + geo->fsbcount.  Returns 0 with *vp set, or the error
+ * from xfs_bmap_last_offset() with *vp untouched.
+ */
+int
+xfs_dir2_isleaf(
+        struct xfs_da_args      *args,
+        int                     *vp)    /* out: 1 is leaf, 0 is not leaf */
+{
+        xfs_fileoff_t           last;   /* last file offset */
+        int                     error;
+        error = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK);
+        if (error)
+                return error;
+        if (last == args->geo->leafblk + args->geo->fsbcount)
+                *vp = 1;
+        else
+                *vp = 0;
+        return 0;
+}
+/*
+ * Remove the given block from the directory.
+ * This routine is used for data and free blocks, leaf/node are done
+ * by xfs_da_shrink_inode.
+ *
+ * The block is unmapped and its buffer invalidated in the transaction.
+ * If it was the last data block, the inode size is pulled back to the
+ * new last block and logged.  Returns 0 or the error from the unmap or
+ * from xfs_bmap_last_before().
+ */
+int
+xfs_dir2_shrink_inode(
+        xfs_da_args_t   *args,
+        xfs_dir2_db_t   db,
+        struct xfs_buf  *bp)
+{
+        xfs_inode_t     *dp = args->dp;
+        xfs_mount_t     *mp = dp->i_mount;
+        xfs_trans_t     *tp = args->trans;
+        xfs_fileoff_t   bno;            /* directory file offset */
+        xfs_dablk_t     da;             /* directory file offset */
+        int             done;           /* bunmap is finished */
+        int             error;
+        trace_xfs_dir2_shrink_inode(args, db);
+        da = xfs_dir2_db_to_da(args->geo, db);
+        /* unmap the fsblock(s) backing this directory block */
+        error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
+                        XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
+                        &done);
+        if (error) {
+                /*
+                 * ENOSPC actually can happen if we're in a removename with
+                 * no space reservation, and the resulting block removal
+                 * would cause a bmap btree split or conversion from extents
+                 * to btree.  This can only happen for un-fragmented
+                 * directory blocks, since you need to be punching out
+                 * the middle of an extent.
+                 * In this case we need to leave the block in the file,
+                 * and not binval it.
+                 * So the block has to be in a consistent empty state
+                 * and appropriately logged.
+                 * We don't free up the buffer, the caller can tell it
+                 * hasn't happened since it got an error back.
+                 */
+                return error;
+        }
+        ASSERT(done);
+        /* the block is gone: invalidate its buffer in the transaction */
+        xfs_trans_binval(tp, bp);
+        /*
+         * Nothing more to do unless this was a data block and it was the
+         * last one in the directory.
+         */
+        if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) ||
+            dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
+                return 0;
+        bno = da;
+        error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK);
+        if (error) {
+                /* this can't really happen unless there's kernel corruption */
+                return error;
+        }
+        if (db == args->geo->datablk)
+                ASSERT(bno == 0);
+        else
+                ASSERT(bno > 0);
+        /* shrink the size down to the new last block */
+        dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
+        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+        return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
new file mode 100644
index 000000000000..c8e86b0b5e99
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DIR2_H__
+#define __XFS_DIR2_H__
+struct xfs_bmap_free;
+struct xfs_da_args;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dir2_sf_hdr;
+struct xfs_dir2_sf_entry;
+struct xfs_dir2_data_hdr;
+struct xfs_dir2_data_entry;
+struct xfs_dir2_data_unused;
+extern struct xfs_name  xfs_name_dotdot;
+/*
+ * directory operations vector for encode/decode routines
+ *
+ * Instances are obtained through xfs_dir_get_ops() and
+ * xfs_nondir_get_ops().  The members are grouped by the structure they
+ * operate on: sf_* (struct xfs_dir2_sf_hdr / xfs_dir2_sf_entry), data_*
+ * (struct xfs_dir2_data_hdr / xfs_dir2_data_entry), leaf_*, node_* and
+ * free_* (header size plus to/from-disk header conversion), and the
+ * db_to_* block number conversions.
+ */
+struct xfs_dir_ops {
+        /* accessors for xfs_dir2_sf_hdr / xfs_dir2_sf_entry */
+        int     (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len);
+        struct xfs_dir2_sf_entry *
+                (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr,
+                                struct xfs_dir2_sf_entry *sfep);
+        __uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep);
+        void    (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep,
+                                __uint8_t ftype);
+        xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr,
+                                struct xfs_dir2_sf_entry *sfep);
+        void    (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr,
+                              struct xfs_dir2_sf_entry *sfep,
+                              xfs_ino_t ino);
+        xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr);
+        void    (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr,
+                                     xfs_ino_t ino);
+        /* accessors for xfs_dir2_data_hdr / xfs_dir2_data_entry */
+        int     (*data_entsize)(int len);
+        __uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep);
+        void    (*data_put_ftype)(struct xfs_dir2_data_entry *dep,
+                                __uint8_t ftype);
+        __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep);
+        struct xfs_dir2_data_free *
+                (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr);
+        /* fixed offsets within a data block (plain values, not callbacks) */
+        xfs_dir2_data_aoff_t data_dot_offset;
+        xfs_dir2_data_aoff_t data_dotdot_offset;
+        xfs_dir2_data_aoff_t data_first_offset;
+        size_t  data_entry_offset;
+        struct xfs_dir2_data_entry *
+                (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+        struct xfs_dir2_data_entry *
+                (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+        struct xfs_dir2_data_entry *
+                (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr);
+        struct xfs_dir2_data_entry *
+                (*data_entry_p)(struct xfs_dir2_data_hdr *hdr);
+        struct xfs_dir2_data_unused *
+                (*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
+        /* xfs_dir2_leaf: header size and xfs_dir3_icleaf_hdr conversion */
+        int     leaf_hdr_size;
+        void    (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to,
+                                    struct xfs_dir3_icleaf_hdr *from);
+        void    (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to,
+                                      struct xfs_dir2_leaf *from);
+        int     (*leaf_max_ents)(struct xfs_da_geometry *geo);
+        struct xfs_dir2_leaf_entry *
+                (*leaf_ents_p)(struct xfs_dir2_leaf *lp);
+        /* xfs_da_intnode: header size (read by xfs_da_mount) and conversion */
+        int     node_hdr_size;
+        void    (*node_hdr_to_disk)(struct xfs_da_intnode *to,
+                                    struct xfs_da3_icnode_hdr *from);
+        void    (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to,
+                                      struct xfs_da_intnode *from);
+        struct xfs_da_node_entry *
+                (*node_tree_p)(struct xfs_da_intnode *dap);
+        /* xfs_dir2_free: header size and xfs_dir3_icfree_hdr conversion */
+        int     free_hdr_size;
+        void    (*free_hdr_to_disk)(struct xfs_dir2_free *to,
+                                    struct xfs_dir3_icfree_hdr *from);
+        void    (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
+                                      struct xfs_dir2_free *from);
+        int     (*free_max_bests)(struct xfs_da_geometry *geo);
+        __be16 * (*free_bests_p)(struct xfs_dir2_free *free);
+        /* directory block number conversions, parameterised by geometry */
+        xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo,
+                                   xfs_dir2_db_t db);
+        int     (*db_to_fdindex)(struct xfs_da_geometry *geo,
+                                 xfs_dir2_db_t db);
+};
+/*
+ * Select an operations vector for a mount and/or inode.  xfs_da_mount()
+ * calls both with a NULL inode to fill in the mount-wide defaults.
+ */
+extern const struct xfs_dir_ops *
+        xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
+extern const struct xfs_dir_ops *
+        xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
+/*
+ * Generic directory interface routines
+ */
+extern void xfs_dir_startup(void);
+/* allocate / free the per-mount directory and attribute geometry */
+extern int xfs_da_mount(struct xfs_mount *mp);
+extern void xfs_da_unmount(struct xfs_mount *mp);
+/* returns 1 if the directory contains only "." and ".." */
+extern int xfs_dir_isempty(struct xfs_inode *dp);
+/* initialize directory dp; pdp supplies the parent inode number */
+extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
+                                struct xfs_inode *pdp);
+/* first/flist/tot below are the bmap firstblock, free list and block total */
+extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
+                                struct xfs_name *name, xfs_ino_t inum,
+                                xfs_fsblock_t *first,
+                                struct xfs_bmap_free *flist, xfs_extlen_t tot);
+/* *inum is filled in on success; ci_name may be NULL */
+extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
+                                struct xfs_name *name, xfs_ino_t *inum,
+                                struct xfs_name *ci_name);
+extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
+                                struct xfs_name *name, xfs_ino_t ino,
+                                xfs_fsblock_t *first,
+                                struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
+                                struct xfs_name *name, xfs_ino_t inum,
+                                xfs_fsblock_t *first,
+                                struct xfs_bmap_free *flist, xfs_extlen_t tot);
+/* returns 0 immediately when resblks is non-zero */
+extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
+                                struct xfs_name *name, uint resblks);
+/*
+ * Direct call from the bmap code, bypassing the generic directory layer.
+ */
+extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
+/*
+ * Interface routines used by userspace utilities
+ */
+/* *r is set to 1 or 0; the return value is 0 or an error */
+extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
+/* remove data/free block db from the directory and invalidate bp */
+extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
+                                struct xfs_buf *bp);
+extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
+                struct xfs_dir2_data_hdr *hdr, int *loghead);
+extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
+                struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
+extern void xfs_dir2_data_log_header(struct xfs_da_args *args,
+                struct xfs_buf *bp);
+extern void xfs_dir2_data_log_unused(struct xfs_da_args *args,
+                struct xfs_buf *bp, struct xfs_dir2_data_unused *dup);
+extern void xfs_dir2_data_make_free(struct xfs_da_args *args,
+                struct xfs_buf *bp, xfs_dir2_data_aoff_t offset,
+                xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
+extern void xfs_dir2_data_use_free(struct xfs_da_args *args,
+                struct xfs_buf *bp, struct xfs_dir2_data_unused *dup,
+                xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
+                int *needlogp, int *needscanp);
+extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
+                struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
+                struct xfs_dir2_data_unused *dup);
+/* buffer operations vectors; declared here, defined in other files */
+extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
+#endif  /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
new file mode 100644
index 000000000000..9628ceccfa02
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -0,0 +1,1265 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_buf_item.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+/*
+ * Local function prototypes.
+ */
+static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp,
+                                    int first, int last);
+static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp);
+static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp,
+                                     int *entno);
+static int xfs_dir2_block_sort(const void *a, const void *b);
+/*
+ * Hash values of the names "." and "..", filled in once by xfs_dir_startup().
+ */
+static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+/*
+ * One-time startup routine called from xfs_init().
+ * Precomputes the hash values of "." and ".." into the file-scope
+ * xfs_dir_hash_dot and xfs_dir_hash_dotdot variables.
+ */
+void
+xfs_dir_startup(void)
+{
+        unsigned char           *dot = (unsigned char *)".";
+        unsigned char           *dotdot = (unsigned char *)"..";
+        xfs_dir_hash_dotdot = xfs_da_hashname(dotdot, 2);
+        xfs_dir_hash_dot = xfs_da_hashname(dot, 1);
+}
+/*
+ * Structural check of a block-format directory buffer.
+ * On CRC-enabled filesystems the header must carry the v3 magic, the
+ * superblock UUID and the buffer's own block number; otherwise only the v2
+ * magic is checked.  In both cases the contents must also pass
+ * __xfs_dir3_data_check().  Returns true if the buffer looks valid.
+ */
+static bool
+xfs_dir3_block_verify(
+        struct xfs_buf          *bp)
+{
+        struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+                /* No v3 header fields to validate, just the magic number. */
+                if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+                        return false;
+        } else if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) ||
+                   !uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid) ||
+                   be64_to_cpu(hdr3->blkno) != bp->b_bn) {
+                return false;
+        }
+        /* A non-zero result from the data check means corruption. */
+        return !__xfs_dir3_data_check(NULL, bp);
+}
+/*
+ * Read verifier: flag a checksum mismatch as -EFSBADCRC (CRC filesystems
+ * only), a structural failure as -EFSCORRUPTED, and report any error set on
+ * the buffer through xfs_verifier_error().
+ */
+static void
+xfs_dir3_block_read_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        bool                    crc_ok;
+        /* The checksum is only examined when the filesystem has CRCs. */
+        crc_ok = !xfs_sb_version_hascrc(&mp->m_sb) ||
+                 xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+        if (!crc_ok)
+                xfs_buf_ioerror(bp, -EFSBADCRC);
+        else if (!xfs_dir3_block_verify(bp))
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        if (bp->b_error)
+                xfs_verifier_error(bp);
+}
+/*
+ * Write verifier: refuse to write a structurally bad block (sets
+ * -EFSCORRUPTED and reports it).  On CRC filesystems, stamp the header with
+ * the LSN of the attached buffer log item, if there is one, and recompute
+ * the checksum.
+ */
+static void
+xfs_dir3_block_write_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+        struct xfs_buf_log_item *bip = bp->b_fspriv;
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        if (!xfs_dir3_block_verify(bp)) {
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                xfs_verifier_error(bp);
+                return;
+        }
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                if (bip)
+                        hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+                xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+        }
+}
+/*
+ * Buffer operations for block-format directory buffers: the read and write
+ * verifiers defined above.
+ */
+const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+        .verify_read = xfs_dir3_block_read_verify,
+        .verify_write = xfs_dir3_block_write_verify,
+};
+/*
+ * Read the directory's data block at m_dir_geo->datablk with the block
+ * verifier ops attached.  When called inside a transaction, the buffer is
+ * also tagged as XFS_BLFT_DIR_BLOCK_BUF.
+ * Returns 0 with *bpp set, or the error from xfs_da_read_buf().
+ */
+int
+xfs_dir3_block_read(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *dp,
+        struct xfs_buf          **bpp)
+{
+        int                     error;
+        error = xfs_da_read_buf(tp, dp, dp->i_mount->m_dir_geo->datablk, -1,
+                                bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
+        if (error)
+                return error;
+        if (tp)
+                xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+        return 0;
+}
+/*
+ * Initialise the header of a new block-format directory buffer: attach the
+ * block buffer ops, tag the buffer type in the transaction, and write either
+ * the v2 magic alone or, on CRC filesystems, a zeroed v3 header carrying the
+ * magic, block number, owner inode number and filesystem UUID.
+ */
+static void
+xfs_dir3_block_init(
+        struct xfs_mount        *mp,
+        struct xfs_trans        *tp,
+        struct xfs_buf          *bp,
+        struct xfs_inode        *dp)
+{
+        struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+        bp->b_ops = &xfs_dir3_block_buf_ops;
+        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
+        if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+                hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+                return;
+        }
+        memset(hdr3, 0, sizeof(*hdr3));
+        hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+        hdr3->blkno = cpu_to_be64(bp->b_bn);
+        hdr3->owner = cpu_to_be64(dp->i_ino);
+        uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+}
+/*
+ * Work out where a new data entry of 'len' bytes can be placed in a
+ * block-format directory, and whether the leaf array must be compacted first.
+ *
+ * Outputs:
+ *   *compact  - set to 1 if stale leaf entries must be compacted to make
+ *               room, 0 otherwise.
+ *   *dupp     - unused data region chosen to hold the new entry, or NULL if
+ *               the entry does not fit.  In the compaction case this may be
+ *               the start of the current leaf array.
+ *   *enddupp  - in the no-stale case, the data object immediately before the
+ *               leaf array; NULL when there are stale leaf entries.
+ *   *tagpp    - pointer to the tag just before the first leaf entry, or NULL
+ *               if the largest free region sufficed without compaction.
+ */
+static void
+xfs_dir2_block_need_space(
+        struct xfs_inode                *dp,
+        struct xfs_dir2_data_hdr        *hdr,
+        struct xfs_dir2_block_tail      *btp,
+        struct xfs_dir2_leaf_entry      *blp,
+        __be16                          **tagpp,
+        struct xfs_dir2_data_unused     **dupp,
+        struct xfs_dir2_data_unused     **enddupp,
+        int                             *compact,
+        int                             len)
+{
+        struct xfs_dir2_data_free       *bf;
+        __be16                          *tagp = NULL;
+        struct xfs_dir2_data_unused     *dup = NULL;
+        struct xfs_dir2_data_unused     *enddup = NULL;
+        *compact = 0;
+        bf = dp->d_ops->data_bestfree_p(hdr);
+        /*
+         * If there are stale entries we'll use one for the leaf.
+         */
+        if (btp->stale) {
+                if (be16_to_cpu(bf[0].length) >= len) {
+                        /*
+                         * The biggest free entry is big enough, so no
+                         * compaction is needed.
+                         */
+                        dup = (xfs_dir2_data_unused_t *)
+                              ((char *)hdr + be16_to_cpu(bf[0].offset));
+                        goto out;
+                }
+                /*
+                 * Will need to compact to make this work.
+                 * Tag just before the first leaf entry.
+                 */
+                *compact = 1;
+                tagp = (__be16 *)blp - 1;
+                /* Data object just before the first leaf entry.  */
+                dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+                /*
+                 * If it's not free then the data will go where the
+                 * leaf data starts now, if it works at all.
+                 * Compaction gives back (stale - 1) leaf slots, so that many
+                 * bytes are counted as available in addition to any free
+                 * space already sitting before the leaf array.
+                 */
+                if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+                        if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
+                            (uint)sizeof(*blp) < len)
+                                dup = NULL;
+                } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
+                        dup = NULL;
+                else
+                        dup = (xfs_dir2_data_unused_t *)blp;
+                goto out;
+        }
+        /*
+         * no stale entries, so just use free space.
+         * Tag just before the first leaf entry.
+         */
+        tagp = (__be16 *)blp - 1;
+        /* Data object just before the first leaf entry.  */
+        enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+        /*
+         * If it's not free then can't do this add without cleaning up:
+         * the space before the first leaf entry needs to be free so it
+         * can be expanded to hold the pointer to the new entry.
+         * (dup stays NULL in that case.)
+         */
+        if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+                /*
+                 * Check out the biggest freespace and see if it's the same one.
+                 */
+                dup = (xfs_dir2_data_unused_t *)
+                      ((char *)hdr + be16_to_cpu(bf[0].offset));
+                if (dup != enddup) {
+                        /*
+                         * Not the same free entry, just check its length.
+                         */
+                        if (be16_to_cpu(dup->length) < len)
+                                dup = NULL;
+                        goto out;
+                }
+                /*
+                 * It is the biggest freespace, can it hold the leaf too?
+                 */
+                if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
+                        /*
+                         * No, it cannot hold both: use the second-largest
+                         * free entry for the data instead, if it works.
+                         */
+                        if (be16_to_cpu(bf[1].length) >= len)
+                                dup = (xfs_dir2_data_unused_t *)
+                                      ((char *)hdr + be16_to_cpu(bf[1].offset));
+                        else
+                                dup = NULL;
+                }
+        }
+out:
+        *tagpp = tagp;
+        *dupp = dup;
+        *enddupp = enddup;
+}
+/*
+ * compact the leaf entries.
+ * Leave the highest-numbered stale entry stale.
+ * XXX should be the one closest to mid but mid is not yet computed.
+ *
+ * The leaf array is walked from the highest index downwards; live entries
+ * (and the first stale entry met) are packed towards the high end, every
+ * other stale entry is dropped.  The (stale - 1) slots released at the low
+ * end, starting at blp, are handed back to the data area as free space, and
+ * the tail is updated so that count shrinks by (stale - 1) and stale
+ * becomes 1.
+ *
+ * *lfloglow and *lfloghigh are set to leaf-logging bounds, reduced by
+ * (stale - 1) to account for the smaller count.  *needlog is passed through
+ * to the free-space helpers.  blp is passed by value; the caller in this
+ * file recomputes it from btp afterwards.
+ */
+static void
+xfs_dir2_block_compact(
+        struct xfs_da_args              *args,
+        struct xfs_buf                  *bp,
+        struct xfs_dir2_data_hdr        *hdr,
+        struct xfs_dir2_block_tail      *btp,
+        struct xfs_dir2_leaf_entry      *blp,
+        int                             *needlog,
+        int                             *lfloghigh,
+        int                             *lfloglow)
+{
+        int                     fromidx;        /* source leaf index */
+        int                     toidx;          /* target leaf index */
+        int                     needscan = 0;
+        int                     highstale;      /* high stale index */
+        fromidx = toidx = be32_to_cpu(btp->count) - 1;
+        highstale = *lfloghigh = -1;
+        for (; fromidx >= 0; fromidx--) {
+                if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+                        /* Keep the first stale entry seen, drop the rest. */
+                        if (highstale == -1)
+                                highstale = toidx;
+                        else {
+                                if (*lfloghigh == -1)
+                                        *lfloghigh = toidx;
+                                continue;
+                        }
+                }
+                /* Slide the surviving entry up once a gap has opened. */
+                if (fromidx < toidx)
+                        blp[toidx] = blp[fromidx];
+                toidx--;
+        }
+        *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
+        *lfloghigh -= be32_to_cpu(btp->stale) - 1;
+        be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
+        /* Return the vacated low end of the old leaf array to free space. */
+        xfs_dir2_data_make_free(args, bp,
+                (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+                (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
+                needlog, &needscan);
+        btp->stale = cpu_to_be32(1);
+        /*
+         * If we now need to rebuild the bestfree map, do so.
+         * This needs to happen before the next call to use_free.
+         */
+        if (needscan)
+                xfs_dir2_data_freescan(args->dp, hdr, needlog);
+}
+/*
+ * Add an entry to a block directory.
+ *
+ * Reads the single directory block, finds room for a data entry of the
+ * required size plus a leaf entry (compacting stale leaf entries if that is
+ * what it takes), inserts the leaf entry in hash order and writes the new
+ * data entry.
+ *
+ * If XFS_DA_OP_JUSTCHECK is set only the space check is done: the buffer is
+ * released and 0 or -ENOSPC is returned.  If the block has no room, -ENOSPC
+ * is returned when args->total is zero; otherwise the directory is converted
+ * to leaf format and the name is added there.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int                                             /* error */
+xfs_dir2_block_addname(
+        xfs_da_args_t           *args)          /* directory op arguments */
+{
+        xfs_dir2_data_hdr_t     *hdr;           /* block header */
+        xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
+        struct xfs_buf          *bp;            /* buffer for block */
+        xfs_dir2_block_tail_t   *btp;           /* block tail */
+        int                     compact;        /* need to compact leaf ents */
+        xfs_dir2_data_entry_t   *dep;           /* block data entry */
+        xfs_inode_t             *dp;            /* directory inode */
+        xfs_dir2_data_unused_t  *dup;           /* block unused entry */
+        int                     error;          /* error return value */
+        xfs_dir2_data_unused_t  *enddup=NULL;   /* unused at end of data */
+        xfs_dahash_t            hash;           /* hash value of found entry */
+        int                     high;           /* high index for binary srch */
+        int                     highstale;      /* high stale index */
+        int                     lfloghigh=0;    /* last final leaf to log */
+        int                     lfloglow=0;     /* first final leaf to log */
+        int                     len;            /* length of the new entry */
+        int                     low;            /* low index for binary srch */
+        int                     lowstale;       /* low stale index */
+        int                     mid=0;          /* midpoint for binary srch */
+        int                     needlog;        /* need to log header */
+        int                     needscan;       /* need to rescan freespace */
+        __be16                  *tagp;          /* pointer to tag value */
+        xfs_trans_t             *tp;            /* transaction structure */
+        trace_xfs_dir2_block_addname(args);
+        dp = args->dp;
+        tp = args->trans;
+        /* Read the (one and only) directory block into bp. */
+        error = xfs_dir3_block_read(tp, dp, &bp);
+        if (error)
+                return error;
+        len = dp->d_ops->data_entsize(args->namelen);
+        /*
+         * Set up pointers to parts of the block.
+         */
+        hdr = bp->b_addr;
+        btp = xfs_dir2_block_tail_p(args->geo, hdr);
+        blp = xfs_dir2_block_leaf_p(btp);
+        /*
+         * Find out if we can reuse stale entries or whether we need extra
+         * space for entry and new leaf.
+         */
+        xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup,
+                                  &enddup, &compact, len);
+        /*
+         * Done everything we need for a space check now.
+         */
+        if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+                xfs_trans_brelse(tp, bp);
+                if (!dup)
+                        return -ENOSPC;
+                return 0;
+        }
+        /*
+         * If we don't have space for the new entry & leaf ...
+         */
+        if (!dup) {
+                /* Don't have a space reservation: return no-space.  */
+                if (args->total == 0)
+                        return -ENOSPC;
+                /*
+                 * Convert to the next larger format.
+                 * Then add the new entry in that format.
+                 */
+                error = xfs_dir2_block_to_leaf(args, bp);
+                if (error)
+                        return error;
+                return xfs_dir2_leaf_addname(args);
+        }
+        needlog = needscan = 0;
+        /*
+         * If need to compact the leaf entries, do it now.
+         */
+        if (compact) {
+                xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog,
+                                      &lfloghigh, &lfloglow);
+                /* recalculate blp post-compaction */
+                blp = xfs_dir2_block_leaf_p(btp);
+        } else if (btp->stale) {
+                /*
+                 * Set leaf logging boundaries to impossible state.
+                 * For the no-stale case they're set explicitly.
+                 */
+                lfloglow = be32_to_cpu(btp->count);
+                lfloghigh = -1;
+        }
+        /*
+         * Find the slot that's first lower than our hash value, -1 if none.
+         */
+        for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) {
+                mid = (low + high) >> 1;
+                if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+                        break;
+                if (hash < args->hashval)
+                        low = mid + 1;
+                else
+                        high = mid - 1;
+        }
+        while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) {
+                mid--;
+        }
+        /*
+         * No stale entries, will use enddup space to hold new leaf.
+         */
+        if (!btp->stale) {
+                /*
+                 * Mark the space needed for the new leaf entry, now in use.
+                 */
+                xfs_dir2_data_use_free(args, bp, enddup,
+                        (xfs_dir2_data_aoff_t)
+                        ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) -
+                         sizeof(*blp)),
+                        (xfs_dir2_data_aoff_t)sizeof(*blp),
+                        &needlog, &needscan);
+                /*
+                 * Update the tail (entry count).
+                 */
+                be32_add_cpu(&btp->count, 1);
+                /*
+                 * If we now need to rebuild the bestfree map, do so.
+                 * This needs to happen before the next call to use_free.
+                 */
+                if (needscan) {
+                        xfs_dir2_data_freescan(dp, hdr, &needlog);
+                        needscan = 0;
+                }
+                /*
+                 * Adjust pointer to the first leaf entry, we're about to move
+                 * the table up one to open up space for the new leaf entry.
+                 * Then adjust our index to match.
+                 */
+                blp--;
+                mid++;
+                if (mid)
+                        memmove(blp, &blp[1], mid * sizeof(*blp));
+                lfloglow = 0;
+                lfloghigh = mid;
+        }
+        /*
+         * Use a stale leaf for our new entry.
+         */
+        else {
+                /* Nearest stale entry at or below mid, -1 if none. */
+                for (lowstale = mid;
+                     lowstale >= 0 &&
+                        blp[lowstale].address !=
+                        cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+                     lowstale--)
+                        continue;
+                /*
+                 * Nearest stale entry above mid; the search stops early once
+                 * it can no longer beat the distance to lowstale.
+                 */
+                for (highstale = mid + 1;
+                     highstale < be32_to_cpu(btp->count) &&
+                        blp[highstale].address !=
+                        cpu_to_be32(XFS_DIR2_NULL_DATAPTR) &&
+                        (lowstale < 0 || mid - lowstale > highstale - mid);
+                     highstale++)
+                        continue;
+                /*
+                 * Move entries toward the low-numbered stale entry.
+                 */
+                if (lowstale >= 0 &&
+                    (highstale == be32_to_cpu(btp->count) ||
+                     mid - lowstale <= highstale - mid)) {
+                        if (mid - lowstale)
+                                memmove(&blp[lowstale], &blp[lowstale + 1],
+                                        (mid - lowstale) * sizeof(*blp));
+                        lfloglow = MIN(lowstale, lfloglow);
+                        lfloghigh = MAX(mid, lfloghigh);
+                }
+                /*
+                 * Move entries toward the high-numbered stale entry.
+                 */
+                else {
+                        ASSERT(highstale < be32_to_cpu(btp->count));
+                        mid++;
+                        if (highstale - mid)
+                                memmove(&blp[mid + 1], &blp[mid],
+                                        (highstale - mid) * sizeof(*blp));
+                        lfloglow = MIN(mid, lfloglow);
+                        lfloghigh = MAX(highstale, lfloghigh);
+                }
+                be32_add_cpu(&btp->stale, -1);
+        }
+        /*
+         * Point to the new data entry.
+         */
+        dep = (xfs_dir2_data_entry_t *)dup;
+        /*
+         * Fill in the leaf entry.
+         */
+        blp[mid].hashval = cpu_to_be32(args->hashval);
+        blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+                                (char *)dep - (char *)hdr));
+        xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
+        /*
+         * Mark space for the data entry used.
+         */
+        xfs_dir2_data_use_free(args, bp, dup,
+                (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+                (xfs_dir2_data_aoff_t)len, &needlog, &needscan);
+        /*
+         * Create the new data entry.
+         */
+        dep->inumber = cpu_to_be64(args->inumber);
+        dep->namelen = args->namelen;
+        memcpy(dep->name, args->name, args->namelen);
+        dp->d_ops->data_put_ftype(dep, args->filetype);
+        tagp = dp->d_ops->data_entry_tag_p(dep);
+        *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+        /*
+         * Clean up the bestfree array and log the header, tail, and entry.
+         */
+        if (needscan)
+                xfs_dir2_data_freescan(dp, hdr, &needlog);
+        if (needlog)
+                xfs_dir2_data_log_header(args, bp);
+        xfs_dir2_block_log_tail(tp, bp);
+        xfs_dir2_data_log_entry(args, bp, dep);
+        xfs_dir3_data_check(dp, bp);
+        return 0;
+}
+/*
+ * Log the leaf entries of the block with indices first..last, inclusive.
+ * The logged byte range runs from the start of entry 'first' to the last
+ * byte of entry 'last', both as offsets from the start of the buffer.
+ */
+static void
+xfs_dir2_block_log_leaf(
+        xfs_trans_t             *tp,            /* transaction structure */
+        struct xfs_buf          *bp,            /* block buffer */
+        int                     first,          /* index of first logged leaf */
+        int                     last)           /* index of last logged leaf */
+{
+        char                    *base = bp->b_addr;
+        xfs_dir2_block_tail_t   *tail;
+        xfs_dir2_leaf_entry_t   *leaves;
+        uint                    start_off;
+        uint                    end_off;
+        tail = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, bp->b_addr);
+        leaves = xfs_dir2_block_leaf_p(tail);
+        start_off = (uint)((char *)&leaves[first] - base);
+        end_off = (uint)((char *)&leaves[last + 1] - base - 1);
+        xfs_trans_log_buf(tp, bp, start_off, end_off);
+}
+/*
+ * Log the block tail: the byte range covering the whole tail structure,
+ * expressed as offsets from the start of the buffer.
+ */
+static void
+xfs_dir2_block_log_tail(
+        xfs_trans_t             *tp,            /* transaction structure */
+        struct xfs_buf          *bp)            /* block buffer */
+{
+        char                    *base = bp->b_addr;
+        xfs_dir2_block_tail_t   *tail;
+        uint                    start_off;
+        uint                    end_off;
+        tail = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, bp->b_addr);
+        start_off = (uint)((char *)tail - base);
+        end_off = (uint)((char *)(tail + 1) - base - 1);
+        xfs_trans_log_buf(tp, bp, start_off, end_off);
+}
+/*
+ * Look up an entry in the block.  This is the external routine,
+ * xfs_dir2_block_lookup_int does the real work.
+ *
+ * On a successful lookup, args->inumber and args->filetype are filled in
+ * from the data entry and the buffer is released before returning the
+ * result of xfs_dir_cilookup_result().  If the internal lookup fails, its
+ * error is returned and no buffer is held.
+ */
+int                                             /* error */
+xfs_dir2_block_lookup(
+        xfs_da_args_t           *args)          /* dir lookup arguments */
+{
+        xfs_dir2_data_hdr_t     *hdr;           /* block header */
+        xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
+        struct xfs_buf          *bp;            /* block buffer */
+        xfs_dir2_block_tail_t   *btp;           /* block tail */
+        xfs_dir2_data_entry_t   *dep;           /* block data entry */
+        xfs_inode_t             *dp;            /* incore inode */
+        int                     ent;            /* entry index */
+        int                     error;          /* error return value */
+        trace_xfs_dir2_block_lookup(args);
+        /*
+         * Get the buffer, look up the entry.
+         * If not found (ENOENT) then return, have no buffer.
+         */
+        if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
+                return error;
+        dp = args->dp;
+        hdr = bp->b_addr;
+        xfs_dir3_data_check(dp, bp);
+        btp = xfs_dir2_block_tail_p(args->geo, hdr);
+        blp = xfs_dir2_block_leaf_p(btp);
+        /*
+         * Get the offset from the leaf entry, to point to the data.
+         */
+        dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+                        xfs_dir2_dataptr_to_off(args->geo,
+                                                be32_to_cpu(blp[ent].address)));
+        /*
+         * Fill in inode number, CI name if appropriate, release the block.
+         */
+        args->inumber = be64_to_cpu(dep->inumber);
+        args->filetype = dp->d_ops->data_get_ftype(dep);
+        error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+        xfs_trans_brelse(args->trans, bp);
+        return error;
+}
+/*
+ * Internal block lookup routine.
+ *
+ * Reads the directory block, binary searches the leaf array for
+ * args->hashval, then walks every leaf entry carrying that hash value and
+ * compares names through the mount's compname operation.
+ *
+ * On an exact match returns 0 with *bpp set to the (still held) buffer and
+ * *entno to the leaf index.  A case-insensitive match is remembered in
+ * args->cmpresult, *bpp and *entno while the scan keeps looking for an exact
+ * one; if that is all that was found, 0 is returned as well.  Otherwise the
+ * buffer is released and -ENOENT is returned.  A read failure returns the
+ * error from xfs_dir3_block_read().
+ */
+static int                                      /* error */
+xfs_dir2_block_lookup_int(
+        xfs_da_args_t           *args,          /* dir lookup arguments */
+        struct xfs_buf          **bpp,          /* returned block buffer */
+        int                     *entno)         /* returned entry number */
+{
+        xfs_dir2_dataptr_t      addr;           /* data entry address */
+        xfs_dir2_data_hdr_t     *hdr;           /* block header */
+        xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
+        struct xfs_buf          *bp;            /* block buffer */
+        xfs_dir2_block_tail_t   *btp;           /* block tail */
+        xfs_dir2_data_entry_t   *dep;           /* block data entry */
+        xfs_inode_t             *dp;            /* incore inode */
+        int                     error;          /* error return value */
+        xfs_dahash_t            hash;           /* found hash value */
+        int                     high;           /* binary search high index */
+        int                     low;            /* binary search low index */
+        int                     mid;            /* binary search current idx */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        enum xfs_dacmp          cmp;            /* comparison result */
+        dp = args->dp;
+        tp = args->trans;
+        mp = dp->i_mount;
+        error = xfs_dir3_block_read(tp, dp, &bp);
+        if (error)
+                return error;
+        hdr = bp->b_addr;
+        xfs_dir3_data_check(dp, bp);
+        btp = xfs_dir2_block_tail_p(args->geo, hdr);
+        blp = xfs_dir2_block_leaf_p(btp);
+        /*
+         * Loop doing a binary search for our hash value.
+         * Find our entry, ENOENT if it's not there.
+         */
+        for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) {
+                ASSERT(low <= high);
+                mid = (low + high) >> 1;
+                if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+                        break;
+                if (hash < args->hashval)
+                        low = mid + 1;
+                else
+                        high = mid - 1;
+                /* Search range is empty: the hash value is not present. */
+                if (low > high) {
+                        ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+                        xfs_trans_brelse(tp, bp);
+                        return -ENOENT;
+                }
+        }
+        /*
+         * Back up to the first one with the right hash value.
+         */
+        while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) {
+                mid--;
+        }
+        /*
+         * Now loop forward through all the entries with the
+         * right hash value looking for our name.
+         */
+        do {
+                /* Skip stale leaf entries; they point at no data entry. */
+                if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR)
+                        continue;
+                /*
+                 * Get pointer to the entry from the leaf.
+                 */
+                dep = (xfs_dir2_data_entry_t *)
+                        ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr));
+                /*
+                 * Compare name and if it's an exact match, return the index
+                 * and buffer. If it's the first case-insensitive match, store
+                 * the index and buffer and continue looking for an exact match.
+                 */
+                cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+                if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+                        args->cmpresult = cmp;
+                        *bpp = bp;
+                        *entno = mid;
+                        if (cmp == XFS_CMP_EXACT)
+                                return 0;
+                }
+        } while (++mid < be32_to_cpu(btp->count) &&
+                        be32_to_cpu(blp[mid].hashval) == hash);
+        ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+        /*
+         * Here, we can only be doing a lookup (not a rename or replace).
+         * If a case-insensitive match was found earlier, return success.
+         */
+        if (args->cmpresult == XFS_CMP_CASE)
+                return 0;
+        /*
+         * No match, release the buffer and return ENOENT.
+         */
+        xfs_trans_brelse(tp, bp);
+        return -ENOENT;
+}
+/*
+ * Remove an entry from a block format directory.
+ * If that makes the block small enough to fit in shortform, transform it.
+ *
+ * The data entry's space is marked free, the tail's stale count is bumped
+ * and the leaf entry is marked stale by setting its address to
+ * XFS_DIR2_NULL_DATAPTR.  Returns the lookup error if the name is not found,
+ * 0 if the directory stays in block format, or the result of the shortform
+ * conversion.
+ */
+int                                             /* error */
+xfs_dir2_block_removename(
+        xfs_da_args_t           *args)          /* directory operation args */
+{
+        xfs_dir2_data_hdr_t     *hdr;           /* block header */
+        xfs_dir2_leaf_entry_t   *blp;           /* block leaf pointer */
+        struct xfs_buf          *bp;            /* block buffer */
+        xfs_dir2_block_tail_t   *btp;           /* block tail */
+        xfs_dir2_data_entry_t   *dep;           /* block data entry */
+        xfs_inode_t             *dp;            /* incore inode */
+        int                     ent;            /* block leaf entry index */
+        int                     error;          /* error return value */
+        int                     needlog;        /* need to log block header */
+        int                     needscan;       /* need to fixup bestfree */
+        xfs_dir2_sf_hdr_t       sfh;            /* shortform header */
+        int                     size;           /* shortform size */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        trace_xfs_dir2_block_removename(args);
+        /*
+         * Look up the entry in the block.  Gets the buffer and entry index.
+         * It will always be there, the vnodeops level does a lookup first.
+         */
+        if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+                return error;
+        }
+        dp = args->dp;
+        tp = args->trans;
+        hdr = bp->b_addr;
+        btp = xfs_dir2_block_tail_p(args->geo, hdr);
+        blp = xfs_dir2_block_leaf_p(btp);
+        /*
+         * Point to the data entry using the leaf entry.
+         */
+        dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+                        xfs_dir2_dataptr_to_off(args->geo,
+                                                be32_to_cpu(blp[ent].address)));
+        /*
+         * Mark the data entry's space free.
+         */
+        needlog = needscan = 0;
+        xfs_dir2_data_make_free(args, bp,
+                (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+                dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+        /*
+         * Fix up the block tail.
+         */
+        be32_add_cpu(&btp->stale, 1);
+        xfs_dir2_block_log_tail(tp, bp);
+        /*
+         * Remove the leaf entry by marking it stale.
+         */
+        blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+        xfs_dir2_block_log_leaf(tp, bp, ent, ent);
+        /*
+         * Fix up bestfree, log the header if necessary.
+         */
+        if (needscan)
+                xfs_dir2_data_freescan(dp, hdr, &needlog);
+        if (needlog)
+                xfs_dir2_data_log_header(args, bp);
+        xfs_dir3_data_check(dp, bp);
+        /*
+         * See if the size as a shortform is good enough.
+         */
+        size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+        if (size > XFS_IFORK_DSIZE(dp))
+                return 0;
+        /*
+         * If it works, do the conversion.
+         */
+        return xfs_dir2_block_to_sf(args, bp, size, &sfh);
+}
+/*
+ * Replace an entry in a V2 block directory.
+ * Change the inode number to the new value.
+ *
+ * The entry described by args is located with xfs_dir2_block_lookup_int(),
+ * its inode number and file type are overwritten in place, and the modified
+ * data entry is logged.  Returns 0 on success, or the error reported by the
+ * lookup.
+ */
+int                                             /* error */
+xfs_dir2_block_replace(
+        xfs_da_args_t           *args)          /* directory operation args */
+{
+        xfs_dir2_data_hdr_t     *hdr;           /* block header */
+        xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
+        struct xfs_buf          *bp;            /* block buffer */
+        xfs_dir2_block_tail_t   *btp;           /* block tail */
+        xfs_dir2_data_entry_t   *dep;           /* block data entry */
+        xfs_inode_t             *dp;            /* incore inode */
+        int                     ent;            /* leaf entry index */
+        int                     error;          /* error return value */
+        trace_xfs_dir2_block_replace(args);
+        /*
+         * Lookup the entry in the directory.  Get buffer and entry index.
+         * This will always succeed since the caller has already done a lookup.
+         */
+        error = xfs_dir2_block_lookup_int(args, &bp, &ent);
+        if (error)
+                return error;
+        dp = args->dp;
+        hdr = bp->b_addr;
+        btp = xfs_dir2_block_tail_p(args->geo, hdr);
+        blp = xfs_dir2_block_leaf_p(btp);
+        /*
+         * Point to the data entry we need to change; the leaf entry holds
+         * its address, which is converted to a byte offset from the header.
+         */
+        dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+                        xfs_dir2_dataptr_to_off(args->geo,
+                                                be32_to_cpu(blp[ent].address)));
+        ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
+        /*
+         * Change the inode number and file type to the new values, then log
+         * the entry that was modified.
+         */
+        dep->inumber = cpu_to_be64(args->inumber);
+        dp->d_ops->data_put_ftype(dep, args->filetype);
+        xfs_dir2_data_log_entry(args, bp, dep);
+        xfs_dir3_data_check(dp, bp);
+        return 0;
+}
+/*
+ * Comparison callback used when sorting block leaf entries.
+ * Orders two leaf entries by their (big-endian on disk) hash value:
+ * negative if a sorts first, positive if b sorts first, zero if equal.
+ */
+static int                                      /* sort order */
+xfs_dir2_block_sort(
+        const void                      *a,     /* first leaf entry */
+        const void                      *b)     /* second leaf entry */
+{
+        const xfs_dir2_leaf_entry_t     *first = a;
+        const xfs_dir2_leaf_entry_t     *second = b;
+        xfs_dahash_t                    hash_a = be32_to_cpu(first->hashval);
+        xfs_dahash_t                    hash_b = be32_to_cpu(second->hashval);
+        if (hash_a < hash_b)
+                return -1;
+        if (hash_a > hash_b)
+                return 1;
+        return 0;
+}
+/*
+ * Try to collapse a V2 leaf-format directory back into block format.
+ *
+ * lbp is the leaf buffer; dbp is the data buffer, or NULL in which case the
+ * first data block is read here.  Returns 0 both when the conversion was done
+ * and when it turned out not to be possible; a non-zero value is an error
+ * from one of the helpers.
+ */
+int                                             /* error */
+xfs_dir2_leaf_to_block(
+        xfs_da_args_t           *args,          /* operation arguments */
+        struct xfs_buf          *lbp,           /* leaf buffer */
+        struct xfs_buf          *dbp)           /* data buffer */
+{
+        __be16                  *bestsp;        /* leaf bests table */
+        xfs_dir2_data_hdr_t     *hdr;           /* block header */
+        xfs_dir2_block_tail_t   *btp;           /* block tail */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        xfs_dir2_data_unused_t  *dup;           /* unused data entry */
+        int                     error;          /* error return value */
+        int                     src;            /* leaf source index */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+        xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
+        xfs_mount_t             *mp;            /* file system mount point */
+        int                     needlog;        /* need to log data header */
+        int                     needscan;       /* need to scan for bestfree */
+        xfs_dir2_sf_hdr_t       sfh;            /* shortform header */
+        int                     size;           /* bytes used */
+        __be16                  *tagp;          /* end of entry (tag) */
+        int                     dst;            /* block leaf destination index */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        struct xfs_dir2_leaf_entry *ents;
+        struct xfs_dir3_icleaf_hdr leafhdr;
+        trace_xfs_dir2_leaf_to_block(args);
+        dp = args->dp;
+        tp = args->trans;
+        mp = dp->i_mount;
+        leaf = lbp->b_addr;
+        dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+        ents = dp->d_ops->leaf_ents_p(leaf);
+        ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+        ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
+               leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
+        /*
+         * While the directory is larger than a single block, peel off
+         * trailing data blocks that the leaf bests table reports as
+         * completely empty (these may have been left behind during
+         * no-space-reservation operations).  As soon as the last block
+         * is not empty the directory cannot become a single block.
+         */
+        while (dp->i_d.di_size > args->geo->blksize) {
+                xfs_dir2_db_t   lastdb;         /* last data block number */
+                int             hdrsz;          /* data block header size */
+                hdrsz = dp->d_ops->data_entry_offset;
+                bestsp = xfs_dir2_leaf_bests_p(ltp);
+                lastdb = be32_to_cpu(ltp->bestcount) - 1;
+                if (be16_to_cpu(bestsp[lastdb]) !=
+                                            args->geo->blksize - hdrsz)
+                        return 0;
+                error = xfs_dir2_leaf_trim_data(args, lbp, lastdb);
+                if (error)
+                        return error;
+        }
+        /*
+         * The caller may not have handed us the data block; fetch it now
+         * and give up if that fails.
+         */
+        if (!dbp) {
+                error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp);
+                if (error)
+                        return error;
+        }
+        hdr = dbp->b_addr;
+        ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+        /*
+         * Bytes the block-format "leaf" area will need: the tail plus one
+         * leaf entry per non-stale entry.
+         */
+        size = (uint)sizeof(xfs_dir2_block_tail_t) +
+               (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);
+        /*
+         * The final two bytes of the block are the tag of the last data
+         * object; follow it to find that object.
+         */
+        tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
+        dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+        /*
+         * The last object must be free space at least as large as the leaf
+         * area, otherwise the conversion is not possible.
+         */
+        if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG ||
+            be16_to_cpu(dup->length) < size)
+                return 0;
+        /*
+         * Committed to converting from here on.
+         */
+        xfs_dir3_block_init(mp, tp, dbp, dp);
+        needlog = 1;
+        needscan = 0;
+        /*
+         * Claim the space at the very end of the block for blp/btp.
+         */
+        xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size,
+                &needlog, &needscan);
+        /*
+         * Set up the block tail: no stale entries survive the conversion.
+         */
+        btp = xfs_dir2_block_tail_p(args->geo, hdr);
+        btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
+        btp->stale = 0;
+        xfs_dir2_block_log_tail(tp, dbp);
+        /*
+         * Copy the live leaf entries across, dropping the stale ones.
+         */
+        lep = xfs_dir2_block_leaf_p(btp);
+        dst = 0;
+        for (src = 0; src < leafhdr.count; src++) {
+                if (ents[src].address != cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+                        lep[dst++] = ents[src];
+        }
+        ASSERT(dst == be32_to_cpu(btp->count));
+        xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
+        /*
+         * Rebuild the bestfree table if asked to, then log the header.
+         */
+        if (needscan)
+                xfs_dir2_data_freescan(dp, hdr, &needlog);
+        if (needlog)
+                xfs_dir2_data_log_header(args, dbp);
+        /*
+         * The old leaf block is no longer needed.
+         */
+        error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
+        if (error)
+                return error;
+        /*
+         * If the block would now fit in the inode's data fork, go one step
+         * further and convert it to shortform.
+         */
+        size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+        if (size <= XFS_IFORK_DSIZE(dp))
+                return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
+        return 0;
+}
+/*
+ * Convert the shortform directory to block form.
+ *
+ * The inline shortform data is copied to a temporary buffer, the inode's
+ * inline data is released, a new directory block is added and initialised,
+ * and the "." and ".." entries plus every shortform entry are written into
+ * it at the offsets recorded in the shortform directory.  The leaf entries
+ * are then sorted by hash value and logged.
+ *
+ * Returns 0 on success, -EIO if the shortform data is smaller than the
+ * header up to its parent field, or the error from growing the inode or
+ * initialising the data block.  The temporary buffer is freed on every
+ * return path that follows its allocation.
+ */
+int                                             /* error */
+xfs_dir2_sf_to_block(
+        xfs_da_args_t           *args)          /* operation arguments */
+{
+        xfs_dir2_db_t           blkno;          /* dir-relative block # (0) */
+        xfs_dir2_data_hdr_t     *hdr;           /* block header */
+        xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
+        struct xfs_buf          *bp;            /* block buffer */
+        xfs_dir2_block_tail_t   *btp;           /* block tail pointer */
+        xfs_dir2_data_entry_t   *dep;           /* data entry pointer */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     dummy;          /* trash */
+        xfs_dir2_data_unused_t  *dup;           /* unused entry pointer */
+        int                     endoffset;      /* end of data objects */
+        int                     error;          /* error return value */
+        int                     i;              /* index */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        int                     needlog;        /* need to log block header */
+        int                     needscan;       /* need to scan block freespc */
+        int                     newoffset;      /* offset from current entry */
+        int                     offset;         /* target block offset */
+        xfs_dir2_sf_entry_t     *sfep;          /* sf entry pointer */
+        xfs_dir2_sf_hdr_t       *oldsfp;        /* old shortform header  */
+        xfs_dir2_sf_hdr_t       *sfp;           /* shortform header  */
+        __be16                  *tagp;          /* end of data entry */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        struct xfs_name         name;
+        struct xfs_ifork        *ifp;
+        trace_xfs_dir2_sf_to_block(args);
+        dp = args->dp;
+        tp = args->trans;
+        mp = dp->i_mount;
+        ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+        ASSERT(ifp->if_flags & XFS_IFINLINE);
+        /*
+         * Bomb out if the shortform directory is way too short.
+         */
+        if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+                ASSERT(XFS_FORCED_SHUTDOWN(mp));
+                return -EIO;
+        }
+        oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
+        ASSERT(ifp->if_bytes == dp->i_d.di_size);
+        ASSERT(ifp->if_u1.if_data != NULL);
+        ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
+        ASSERT(dp->i_d.di_nextents == 0);
+        /*
+         * Copy the directory into a temporary buffer.
+         * Then pitch the incore inode data so we can make extents.
+         */
+        sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+        memcpy(sfp, oldsfp, ifp->if_bytes);
+        xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
+        xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
+        dp->i_d.di_size = 0;
+        /*
+         * Add block 0 to the inode.
+         */
+        error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
+        if (error) {
+                kmem_free(sfp);
+                return error;
+        }
+        /*
+         * Initialize the data block, then convert it to block format.
+         */
+        error = xfs_dir3_data_init(args, blkno, &bp);
+        if (error) {
+                kmem_free(sfp);
+                return error;
+        }
+        xfs_dir3_block_init(mp, tp, bp, dp);
+        hdr = bp->b_addr;
+        /*
+         * Compute size of block "tail" area.
+         * That is the tail structure plus one leaf entry per shortform
+         * entry and two more for "." and "..".
+         */
+        i = (uint)sizeof(*btp) +
+            (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
+        /*
+         * The whole thing is initialized to free by the init routine.
+         * Say we're using the leaf and tail area.
+         */
+        dup = dp->d_ops->data_unused_p(hdr);
+        needlog = needscan = 0;
+        xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
+                               i, &needlog, &needscan);
+        ASSERT(needscan == 0);
+        /*
+         * Fill in the tail.
+         */
+        btp = xfs_dir2_block_tail_p(args->geo, hdr);
+        btp->count = cpu_to_be32(sfp->count + 2);       /* ., .. */
+        btp->stale = 0;
+        blp = xfs_dir2_block_leaf_p(btp);
+        /* data objects may occupy the block only up to the leaf array */
+        endoffset = (uint)((char *)blp - (char *)hdr);
+        /*
+         * Remove the freespace, we'll manage it.
+         */
+        xfs_dir2_data_use_free(args, bp, dup,
+                (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+                be16_to_cpu(dup->length), &needlog, &needscan);
+        /*
+         * Create entry for .
+         * Each entry's trailing tag records the entry's own offset from
+         * the block header.
+         */
+        dep = dp->d_ops->data_dot_entry_p(hdr);
+        dep->inumber = cpu_to_be64(dp->i_ino);
+        dep->namelen = 1;
+        dep->name[0] = '.';
+        dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+        tagp = dp->d_ops->data_entry_tag_p(dep);
+        *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+        xfs_dir2_data_log_entry(args, bp, dep);
+        blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
+        blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+                                (char *)dep - (char *)hdr));
+        /*
+         * Create entry for ..
+         * The parent inode number comes from the saved shortform header.
+         */
+        dep = dp->d_ops->data_dotdot_entry_p(hdr);
+        dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
+        dep->namelen = 2;
+        dep->name[0] = dep->name[1] = '.';
+        dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+        tagp = dp->d_ops->data_entry_tag_p(dep);
+        *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+        xfs_dir2_data_log_entry(args, bp, dep);
+        blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
+        blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+                                (char *)dep - (char *)hdr));
+        offset = dp->d_ops->data_first_offset;
+        /*
+         * Loop over existing entries, stuff them in.
+         * i counts the shortform entries copied so far; sfep is NULL once
+         * there are none left (or when there were none to begin with).
+         */
+        i = 0;
+        if (!sfp->count)
+                sfep = NULL;
+        else
+                sfep = xfs_dir2_sf_firstentry(sfp);
+        /*
+         * Need to preserve the existing offset values in the sf directory.
+         * Insert holes (unused entries) where necessary.
+         */
+        while (offset < endoffset) {
+                /*
+                 * sfep is null when we reach the end of the list.
+                 */
+                if (sfep == NULL)
+                        newoffset = endoffset;
+                else
+                        newoffset = xfs_dir2_sf_get_offset(sfep);
+                /*
+                 * There should be a hole here, make one.
+                 * The hole spans from the current position up to the next
+                 * entry's offset and is entered into the bestfree table.
+                 */
+                if (offset < newoffset) {
+                        dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+                        dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+                        dup->length = cpu_to_be16(newoffset - offset);
+                        *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
+                                ((char *)dup - (char *)hdr));
+                        xfs_dir2_data_log_unused(args, bp, dup);
+                        /* the flag written to dummy is never looked at */
+                        xfs_dir2_data_freeinsert(hdr,
+                                                 dp->d_ops->data_bestfree_p(hdr),
+                                                 dup, &dummy);
+                        offset += be16_to_cpu(dup->length);
+                        continue;
+                }
+                /*
+                 * Copy a real entry.
+                 */
+                dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
+                dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
+                dep->namelen = sfep->namelen;
+                dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep));
+                memcpy(dep->name, sfep->name, dep->namelen);
+                tagp = dp->d_ops->data_entry_tag_p(dep);
+                *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+                xfs_dir2_data_log_entry(args, bp, dep);
+                name.name = sfep->name;
+                name.len = sfep->namelen;
+                /* leaf slots 0 and 1 already belong to "." and ".." */
+                blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
+                                                        hashname(&name));
+                blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+                                                 (char *)dep - (char *)hdr));
+                /* the next object starts right after this entry's tag */
+                offset = (int)((char *)(tagp + 1) - (char *)hdr);
+                if (++i == sfp->count)
+                        sfep = NULL;
+                else
+                        sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+        }
+        /* Done with the temporary buffer */
+        kmem_free(sfp);
+        /*
+         * Sort the leaf entries by hash value.
+         */
+        xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort);
+        /*
+         * Log the leaf entry area and tail.
+         * Already logged the header in data_init, ignore needlog.
+         */
+        ASSERT(needscan == 0);
+        xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
+        xfs_dir2_block_log_tail(tp, bp);
+        xfs_dir3_data_check(dp, bp);
+        return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
new file mode 100644
index 000000000000..fdd803fecb8e
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -0,0 +1,1050 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+/*
+ * Check the consistency of the data block.
+ * The input can also be a block-format directory.
+ * Return 0 if the buffer is good, otherwise an error.
+ *
+ * dp may be NULL (the verifier in this file passes NULL); the directory ops
+ * are then obtained from the mount via xfs_dir_get_ops().
+ *
+ * The lengths of unused and real entries come from the buffer being checked,
+ * so each one is bounds-checked against the end of the entry area before its
+ * trailing tag is read and before the walk advances.
+ */
+int
+__xfs_dir3_data_check(
+        struct xfs_inode        *dp,            /* incore inode pointer */
+        struct xfs_buf          *bp)            /* data block's buffer */
+{
+        xfs_dir2_dataptr_t      addr;           /* addr for leaf lookup */
+        xfs_dir2_data_free_t    *bf;            /* bestfree table */
+        xfs_dir2_block_tail_t   *btp=NULL;      /* block tail */
+        int                     count;          /* count of entries found */
+        xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+        xfs_dir2_data_entry_t   *dep;           /* data entry */
+        xfs_dir2_data_free_t    *dfp;           /* bestfree entry */
+        xfs_dir2_data_unused_t  *dup;           /* unused entry */
+        char                    *endp;          /* end of useful data */
+        int                     freeseen;       /* mask of bestfrees seen */
+        xfs_dahash_t            hash;           /* hash of current name */
+        int                     i;              /* leaf index */
+        int                     lastfree;       /* last entry was unused */
+        xfs_dir2_leaf_entry_t   *lep=NULL;      /* block leaf entries */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        char                    *p;             /* current data position */
+        int                     stale;          /* count of stale leaves */
+        struct xfs_name         name;
+        const struct xfs_dir_ops *ops;
+        struct xfs_da_geometry  *geo;
+        mp = bp->b_target->bt_mount;
+        geo = mp->m_dir_geo;
+        /*
+         * We can be passed a null dp here from a verifier, so we need to go the
+         * hard way to get them.
+         */
+        ops = xfs_dir_get_ops(mp, dp);
+        hdr = bp->b_addr;
+        p = (char *)ops->data_entry_p(hdr);
+        switch (hdr->magic) {
+        case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+        case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+                btp = xfs_dir2_block_tail_p(geo, hdr);
+                lep = xfs_dir2_block_leaf_p(btp);
+                endp = (char *)lep;
+                /*
+                 * The number of leaf entries is limited by the size of the
+                 * block and the amount of space used by the data entries.
+                 * We don't know how much space is used by the data entries yet,
+                 * so just ensure that the count falls somewhere inside the
+                 * block right now.
+                 */
+                XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) <
+                        ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
+                break;
+        case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+        case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+                endp = (char *)hdr + geo->blksize;
+                break;
+        default:
+                XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
+                return -EFSCORRUPTED;
+        }
+        /*
+         * Account for zero bestfree entries.
+         */
+        bf = ops->data_bestfree_p(hdr);
+        count = lastfree = freeseen = 0;
+        if (!bf[0].length) {
+                XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
+                freeseen |= 1 << 0;
+        }
+        if (!bf[1].length) {
+                XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
+                freeseen |= 1 << 1;
+        }
+        if (!bf[2].length) {
+                XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
+                freeseen |= 1 << 2;
+        }
+        XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
+                                                be16_to_cpu(bf[1].length));
+        XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
+                                                be16_to_cpu(bf[2].length));
+        /*
+         * Loop over the data/unused entries.
+         */
+        while (p < endp) {
+                dup = (xfs_dir2_data_unused_t *)p;
+                /*
+                 * If it's unused, look for the space in the bestfree table.
+                 * If we find it, account for that, else make sure it
+                 * doesn't need to be there.
+                 */
+                if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+                        XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
+                        /*
+                         * A zero length would never advance p, and a length
+                         * reaching past endp would place the tag (and the
+                         * next entry) outside the entry area.
+                         */
+                        XFS_WANT_CORRUPTED_RETURN(
+                                be16_to_cpu(dup->length) != 0);
+                        XFS_WANT_CORRUPTED_RETURN(
+                                be16_to_cpu(dup->length) <= endp - p);
+                        XFS_WANT_CORRUPTED_RETURN(
+                                be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
+                                               (char *)dup - (char *)hdr);
+                        dfp = xfs_dir2_data_freefind(hdr, bf, dup);
+                        if (dfp) {
+                                i = (int)(dfp - bf);
+                                XFS_WANT_CORRUPTED_RETURN(
+                                        (freeseen & (1 << i)) == 0);
+                                freeseen |= 1 << i;
+                        } else {
+                                XFS_WANT_CORRUPTED_RETURN(
+                                        be16_to_cpu(dup->length) <=
+                                                be16_to_cpu(bf[2].length));
+                        }
+                        p += be16_to_cpu(dup->length);
+                        lastfree = 1;
+                        continue;
+                }
+                /*
+                 * It's a real entry.  Validate the fields.
+                 * If this is a block directory then make sure it's
+                 * in the leaf section of the block.
+                 * The linear search is crude but this is DEBUG code.
+                 */
+                dep = (xfs_dir2_data_entry_t *)p;
+                XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
+                XFS_WANT_CORRUPTED_RETURN(
+                        !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
+                /*
+                 * The whole entry, including its trailing tag, has to fit
+                 * before endp; check that before the tag is dereferenced.
+                 */
+                XFS_WANT_CORRUPTED_RETURN(
+                        ops->data_entsize(dep->namelen) <= endp - p);
+                XFS_WANT_CORRUPTED_RETURN(
+                        be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
+                                               (char *)dep - (char *)hdr);
+                XFS_WANT_CORRUPTED_RETURN(
+                                ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
+                count++;
+                lastfree = 0;
+                if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+                    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+                        addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+                                                (xfs_dir2_data_aoff_t)
+                                                ((char *)dep - (char *)hdr));
+                        name.name = dep->name;
+                        name.len = dep->namelen;
+                        hash = mp->m_dirnameops->hashname(&name);
+                        for (i = 0; i < be32_to_cpu(btp->count); i++) {
+                                if (be32_to_cpu(lep[i].address) == addr &&
+                                    be32_to_cpu(lep[i].hashval) == hash)
+                                        break;
+                        }
+                        XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
+                }
+                p += ops->data_entsize(dep->namelen);
+        }
+        /*
+         * Need to have seen all the entries and all the bestfree slots.
+         */
+        XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
+        if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+            hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+                /*
+                 * Count the stale leaf entries and make sure the leaf array
+                 * is in non-decreasing hash order.
+                 */
+                for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
+                        if (lep[i].address ==
+                            cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+                                stale++;
+                        if (i > 0)
+                                XFS_WANT_CORRUPTED_RETURN(
+                                        be32_to_cpu(lep[i].hashval) >=
+                                                be32_to_cpu(lep[i - 1].hashval));
+                }
+                XFS_WANT_CORRUPTED_RETURN(count ==
+                        be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
+                XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
+        }
+        return 0;
+}
+/*
+ * Structural verification of a directory data block buffer.
+ * With CRCs enabled in the superblock the header must carry the DIR3 data
+ * magic, the filesystem UUID and the buffer's own block number; without
+ * CRCs only the DIR2 data magic is required.  The contents are then run
+ * through __xfs_dir3_data_check().  Returns true if everything checks out.
+ */
+static bool
+xfs_dir3_data_verify(
+        struct xfs_buf          *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+        if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+                if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
+                        return false;
+        } else if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+                   !uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid) ||
+                   be64_to_cpu(hdr3->blkno) != bp->b_bn) {
+                return false;
+        }
+        return __xfs_dir3_data_check(NULL, bp) == 0;
+}
+/*
+ * Readahead of the first block of the directory when it is opened is completely
+ * oblivious to the format of the directory. Hence we can either get a block
+ * format buffer or a data format buffer on readahead.
+ *
+ * Once the magic number tells us which format we have, switch the buffer to
+ * that format's buffer ops and run its read verifier, so that a failure is
+ * recorded on the buffer instead of being silently dropped.  Any other magic
+ * number is reported as corruption.
+ */
+static void
+xfs_dir3_data_reada_verify(
+        struct xfs_buf          *bp)
+{
+        struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+        switch (hdr->magic) {
+        case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+        case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+                bp->b_ops = &xfs_dir3_block_buf_ops;
+                bp->b_ops->verify_read(bp);
+                return;
+        case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+        case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+                /*
+                 * Calling xfs_dir3_data_verify() directly and ignoring its
+                 * result would skip the checksum and leave a bad buffer
+                 * unflagged; go through the data read verifier instead.
+                 */
+                bp->b_ops = &xfs_dir3_data_buf_ops;
+                bp->b_ops->verify_read(bp);
+                return;
+        default:
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                xfs_verifier_error(bp);
+                break;
+        }
+}
+/*
+ * Read verifier for directory data blocks.
+ * On CRC-enabled filesystems the checksum is tested first and a mismatch is
+ * flagged as -EFSBADCRC; otherwise a structural failure is flagged as
+ * -EFSCORRUPTED.  Any error left on the buffer is then reported.
+ */
+static void
+xfs_dir3_data_read_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        bool                    crc_bad;
+        crc_bad = xfs_sb_version_hascrc(&mp->m_sb) &&
+                  !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+        if (crc_bad) {
+                xfs_buf_ioerror(bp, -EFSBADCRC);
+        } else if (!xfs_dir3_data_verify(bp)) {
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        }
+        if (!bp->b_error)
+                return;
+        xfs_verifier_error(bp);
+}
+/*
+ * Write verifier for directory data blocks.
+ * A buffer that fails structural verification is flagged -EFSCORRUPTED and
+ * reported.  Otherwise, on CRC-enabled filesystems, the header LSN is taken
+ * from the attached buffer log item (when there is one) and the checksum is
+ * recomputed.
+ */
+static void
+xfs_dir3_data_write_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        struct xfs_buf_log_item *bip = bp->b_fspriv;
+        struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+        if (xfs_dir3_data_verify(bp)) {
+                if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                        if (bip)
+                                hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+                        xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+                }
+                return;
+        }
+        xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        xfs_verifier_error(bp);
+}
+/*
+ * Buffer operations for directory data blocks: the read and write verifiers
+ * defined above.
+ */
+const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+        .verify_read = xfs_dir3_data_read_verify,
+        .verify_write = xfs_dir3_data_write_verify,
+};
+/*
+ * Buffer operations used for readahead.  Reads go through
+ * xfs_dir3_data_reada_verify(), which dispatches on the block's magic number;
+ * writes use the ordinary data block write verifier.
+ */
+static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+        .verify_read = xfs_dir3_data_reada_verify,
+        .verify_write = xfs_dir3_data_write_verify,
+};
+/*
+ * Read a directory data block through xfs_da_read_buf() with the data block
+ * buffer ops attached.  When the read is done inside a transaction and a
+ * buffer was actually returned, the buffer is typed as a directory data
+ * buffer.
+ *
+ * Returns whatever xfs_da_read_buf() returned.
+ */
+int
+xfs_dir3_data_read(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *dp,
+        xfs_dablk_t             bno,
+        xfs_daddr_t             mapped_bno,
+        struct xfs_buf          **bpp)
+{
+        int                     err;
+        err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
+                                XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
+        /*
+         * Only type a buffer that exists.  NOTE(review): a zero return with
+         * *bpp left NULL is presumably possible when the lookup is allowed
+         * to land in a hole (depends on mapped_bno) - confirm against
+         * xfs_da_read_buf().  Without the *bpp test that case would hand a
+         * NULL buffer to xfs_trans_buf_set_type().
+         */
+        if (!err && tp && *bpp)
+                xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
+        return err;
+}
+/*
+ * Start readahead of a directory block via xfs_da_reada_buf().  The readahead
+ * buffer ops are used because the block may turn out to be either block
+ * format or data format.  Returns the value from xfs_da_reada_buf().
+ */
+int
+xfs_dir3_data_readahead(
+        struct xfs_inode        *dp,
+        xfs_dablk_t             bno,
+        xfs_daddr_t             mapped_bno)
+{
+        return xfs_da_reada_buf(dp, bno, mapped_bno,
+                                XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
+}
+/*
+ * Given a data block and an unused entry from that block,
+ * return the bestfree entry if any that corresponds to it.
+ *
+ * Entries are matched by the byte offset of the unused entry from the start
+ * of the block.  Returns NULL when no bestfree slot refers to that offset.
+ * DEBUG builds additionally cross-check the table's ordering and contents.
+ */
+xfs_dir2_data_free_t *
+xfs_dir2_data_freefind(
+        struct xfs_dir2_data_hdr *hdr,          /* data block header */
+        struct xfs_dir2_data_free *bf,          /* bestfree table pointer */
+        struct xfs_dir2_data_unused *dup)       /* unused space */
+{
+        xfs_dir2_data_free_t    *dfp;           /* bestfree entry */
+        xfs_dir2_data_aoff_t    off;            /* offset value needed */
+#ifdef DEBUG
+        int                     matched;        /* matched the value */
+        int                     seenzero;       /* saw a 0 bestfree entry */
+#endif
+        /* Offset of the unused entry within the block. */
+        off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+#ifdef DEBUG
+        /*
+         * Validate some consistency in the bestfree table.
+         * Check order, non-overlapping entries, and if we find the
+         * one we're looking for it has to be exact.
+         */
+        ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+        for (dfp = &bf[0], seenzero = matched = 0;
+             dfp < &bf[XFS_DIR2_DATA_FD_COUNT];
+             dfp++) {
+                /* An empty slot must be fully empty and end the used slots. */
+                if (!dfp->offset) {
+                        ASSERT(!dfp->length);
+                        seenzero = 1;
+                        continue;
+                }
+                ASSERT(seenzero == 0);
+                if (be16_to_cpu(dfp->offset) == off) {
+                        matched = 1;
+                        ASSERT(dfp->length == dup->length);
+                } else if (off < be16_to_cpu(dfp->offset))
+                        ASSERT(off + be16_to_cpu(dup->length) <= be16_to_cpu(dfp->offset));
+                else
+                        ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off);
+                ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length));
+                /* Slots are kept in order of non-increasing length. */
+                if (dfp > &bf[0])
+                        ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length));
+        }
+#endif
+        /*
+         * If this is smaller than the smallest bestfree entry,
+         * it can't be there since they're sorted.
+         */
+        if (be16_to_cpu(dup->length) <
+            be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
+                return NULL;
+        /*
+         * Look at the three bestfree entries for our guy.
+         */
+        for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
+                /* First empty slot: nothing further in the table is in use. */
+                if (!dfp->offset)
+                        return NULL;
+                if (be16_to_cpu(dfp->offset) == off)
+                        return dfp;
+        }
+        /*
+         * Didn't find it.  This only happens if there are duplicate lengths.
+         */
+        return NULL;
+}
+/*
+ * Insert an unused-space entry into the bestfree table.
+ *
+ * The new entry is placed in front of the first slot whose length is strictly
+ * smaller than its own; the slots from there on move down one place and the
+ * last one falls off the end.  Returns the slot used and sets *loghead, or
+ * returns NULL without touching *loghead when the new entry is not longer
+ * than any slot in the table.
+ */
+xfs_dir2_data_free_t *                          /* entry inserted */
+xfs_dir2_data_freeinsert(
+        struct xfs_dir2_data_hdr *hdr,          /* data block pointer */
+        struct xfs_dir2_data_free *dfp,         /* bestfree table pointer */
+        struct xfs_dir2_data_unused *dup,       /* unused space */
+        int                     *loghead)       /* log the data header (out) */
+{
+        xfs_dir2_data_free_t    new;            /* new bestfree entry */
+        int                     slot;           /* insertion position */
+        int                     i;              /* shift index */
+        ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+        new.length = dup->length;
+        new.offset = cpu_to_be16((char *)dup - (char *)hdr);
+        /*
+         * Find the first slot that the new entry beats on length.
+         */
+        for (slot = 0; slot < XFS_DIR2_DATA_FD_COUNT; slot++) {
+                if (be16_to_cpu(new.length) > be16_to_cpu(dfp[slot].length))
+                        break;
+        }
+        /*
+         * Not longer than anything in the table: it does not belong there.
+         */
+        if (slot == XFS_DIR2_DATA_FD_COUNT)
+                return NULL;
+        /*
+         * Open up the chosen slot by sliding the tail of the table down.
+         */
+        for (i = XFS_DIR2_DATA_FD_COUNT - 1; i > slot; i--)
+                dfp[i] = dfp[i - 1];
+        dfp[slot] = new;
+        *loghead = 1;
+        return &dfp[slot];
+}
+/*
+ * Remove a bestfree entry from the table.
+ *
+ * The slots behind the removed one move up a place, the last slot is zeroed
+ * and *loghead is set.
+ */
+STATIC void
+xfs_dir2_data_freeremove(
+        struct xfs_dir2_data_hdr *hdr,          /* data block header */
+        struct xfs_dir2_data_free *bf,          /* bestfree table pointer */
+        struct xfs_dir2_data_free *dfp,         /* bestfree entry pointer */
+        int                     *loghead)       /* out: log data header */
+{
+        ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+        /*
+         * Removing the first entry pulls the second one into its place.
+         */
+        if (dfp == &bf[0])
+                bf[0] = bf[1];
+        /*
+         * Removing the first or second entry pulls the third one up;
+         * anything else has to be the third entry itself.
+         */
+        if (dfp == &bf[0] || dfp == &bf[1])
+                bf[1] = bf[2];
+        else
+                ASSERT(dfp == &bf[2]);
+        /*
+         * The third slot is vacant in every case.
+         */
+        bf[2].length = 0;
+        bf[2].offset = 0;
+        *loghead = 1;
+}
+/*
+ * Given a data block, reconstruct its bestfree map.
+ *
+ * The table is zeroed, then the block's entries are walked and every unused
+ * entry found is offered to xfs_dir2_data_freeinsert().  *loghead is always
+ * set because the table is rewritten.
+ */
+void
+xfs_dir2_data_freescan(
+        struct xfs_inode        *dp,
+        struct xfs_dir2_data_hdr *hdr,
+        int                     *loghead)
+{
+        struct xfs_da_geometry  *geo = dp->i_mount->m_dir_geo;
+        struct xfs_dir2_data_free *bf;          /* bestfree table */
+        char                    *cur;           /* entry being examined */
+        char                    *end;           /* end of block's data */
+        ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+        /*
+         * Wipe the table; it is rebuilt from scratch below.
+         */
+        bf = dp->d_ops->data_bestfree_p(hdr);
+        memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
+        *loghead = 1;
+        /*
+         * Work out where the entries begin and end.  For the block format
+         * magics the data stops at the leaf array found through the block
+         * tail; otherwise it runs for the full directory block size.
+         */
+        cur = (char *)dp->d_ops->data_entry_p(hdr);
+        if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+            hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+                xfs_dir2_block_tail_t   *btp = xfs_dir2_block_tail_p(geo, hdr);
+                end = (char *)xfs_dir2_block_leaf_p(btp);
+        } else {
+                end = (char *)hdr + geo->blksize;
+        }
+        /*
+         * Walk the entries one at a time.
+         */
+        while (cur < end) {
+                xfs_dir2_data_unused_t  *dup = (xfs_dir2_data_unused_t *)cur;
+                xfs_dir2_data_entry_t   *dep;
+                if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+                        /*
+                         * Unused space: offer it to the table and step
+                         * over its recorded length.
+                         */
+                        ASSERT((char *)dup - (char *)hdr ==
+                               be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+                        xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
+                        cur += be16_to_cpu(dup->length);
+                        continue;
+                }
+                /*
+                 * Active entry: check its tag and step over it.
+                 */
+                dep = (xfs_dir2_data_entry_t *)cur;
+                ASSERT((char *)dep - (char *)hdr ==
+                       be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
+                cur += dp->d_ops->data_entsize(dep->namelen);
+        }
+}
+/*
+ * Initialize a data block at the given block number in the directory.
+ * Give back the buffer for the created block.
+ *
+ * The header gets the v3 or v2 data magic depending on whether the
+ * filesystem has CRCs enabled.  The block is set up with one unused entry of
+ * length (blksize - data_entry_offset), which is also recorded as bestfree
+ * slot 0; the other bestfree slots are zeroed.  The header and the unused
+ * entry are logged before returning.
+ *
+ * Returns 0 and sets *bpp on success, or the error from xfs_da_get_buf().
+ */
+int                                             /* error */
+xfs_dir3_data_init(
+        xfs_da_args_t           *args,          /* directory operation args */
+        xfs_dir2_db_t           blkno,          /* logical dir block number */
+        struct xfs_buf          **bpp)          /* output block buffer */
+{
+        struct xfs_buf          *bp;            /* block buffer */
+        xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        xfs_dir2_data_unused_t  *dup;           /* unused entry pointer */
+        struct xfs_dir2_data_free *bf;
+        int                     error;          /* error return value */
+        int                     i;              /* bestfree index */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        int                     t;              /* temp */
+        dp = args->dp;
+        mp = dp->i_mount;
+        tp = args->trans;
+        /*
+         * Get the buffer set up for the block.
+         */
+        error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
+                               -1, &bp, XFS_DATA_FORK);
+        if (error)
+                return error;
+        /* Attach the data block verifiers and type the buffer. */
+        bp->b_ops = &xfs_dir3_data_buf_ops;
+        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
+        /*
+         * Initialize the header.
+         */
+        hdr = bp->b_addr;
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+                /* v3 header: zero it, then fill in the identifying fields. */
+                memset(hdr3, 0, sizeof(*hdr3));
+                hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+                hdr3->blkno = cpu_to_be64(bp->b_bn);
+                hdr3->owner = cpu_to_be64(dp->i_ino);
+                uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+        } else
+                hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+        /*
+         * Bestfree slot 0 points at data_entry_offset (its length is filled
+         * in below); the remaining slots are empty.
+         */
+        bf = dp->d_ops->data_bestfree_p(hdr);
+        bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset);
+        for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
+                bf[i].length = 0;
+                bf[i].offset = 0;
+        }
+        /*
+         * Set up an unused entry for the block's body.
+         */
+        dup = dp->d_ops->data_unused_p(hdr);
+        dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+        /* t = bytes in the block beyond data_entry_offset. */
+        t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset;
+        bf[0].length = cpu_to_be16(t);
+        dup->length = cpu_to_be16(t);
+        *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
+        /*
+         * Log it and return it.
+         */
+        xfs_dir2_data_log_header(args, bp);
+        xfs_dir2_data_log_unused(args, bp, dup);
+        *bpp = bp;
+        return 0;
+}
+/*
+ * Log an active data entry from the block.
+ *
+ * The logged byte range starts at the entry and ends at the last byte of the
+ * entry's tag.
+ */
+void
+xfs_dir2_data_log_entry(
+        struct xfs_da_args      *args,
+        struct xfs_buf          *bp,
+        xfs_dir2_data_entry_t   *dep)           /* data entry pointer */
+{
+        struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+        uint                    first;          /* first byte to log */
+        uint                    last;           /* last byte to log */
+        ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+        first = (uint)((char *)dep - (char *)hdr);
+        /*
+         * One element past the tag, less one byte, is the tag's final byte.
+         */
+        last = (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) -
+                      (char *)hdr - 1);
+        xfs_trans_log_buf(args->trans, bp, first, last);
+}
+/*
+ * Log a data block header.
+ *
+ * The logged range covers bytes 0 through data_entry_offset - 1 of the
+ * buffer.
+ */
+void
+xfs_dir2_data_log_header(
+        struct xfs_da_args      *args,
+        struct xfs_buf          *bp)
+{
+        uint                    last = args->dp->d_ops->data_entry_offset - 1;
+#ifdef DEBUG
+        struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+        /* The header is only examined for this sanity check. */
+        ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+#endif
+        xfs_trans_log_buf(args->trans, bp, 0, last);
+}
+/*
+ * Log a data unused entry.
+ *
+ * Two separate ranges are logged: the front of the entry, up to and
+ * including its length field, and the tag located by
+ * xfs_dir2_data_unused_tag_p().
+ */
+void
+xfs_dir2_data_log_unused(
+        struct xfs_da_args      *args,
+        struct xfs_buf          *bp,
+        xfs_dir2_data_unused_t  *dup)           /* data unused pointer */
+{
+        xfs_dir2_data_hdr_t     *hdr = bp->b_addr;
+        char                    *base = (char *)hdr;
+        char                    *tag;           /* the entry's tag */
+        uint                    first;          /* first byte to log */
+        uint                    last;           /* last byte to log */
+        ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+        /*
+         * Front part: start of the entry through the end of its length.
+         */
+        first = (uint)((char *)dup - base);
+        last = (uint)((char *)&dup->length + sizeof(dup->length) - 1 - base);
+        xfs_trans_log_buf(args->trans, bp, first, last);
+        /*
+         * Back part: the tag itself.
+         */
+        tag = (char *)xfs_dir2_data_unused_tag_p(dup);
+        first = (uint)(tag - base);
+        last = (uint)(tag - base + sizeof(xfs_dir2_data_off_t) - 1);
+        xfs_trans_log_buf(args->trans, bp, first, last);
+}
+/*
+ * Make a byte range in the data block unused.
+ * Its current contents are unimportant.
+ *
+ * If the entry directly before and/or after the range is already unused, the
+ * range is merged with it; otherwise a new unused entry is created.  The
+ * bestfree table is updated where that can be done incrementally, which sets
+ * *needlogp through the bestfree helpers.  *needscanp must be 0 on entry and
+ * is set non-zero when the table needs to be regenerated instead.
+ */
+void
+xfs_dir2_data_make_free(
+        struct xfs_da_args      *args,
+        struct xfs_buf          *bp,
+        xfs_dir2_data_aoff_t    offset,         /* starting byte offset */
+        xfs_dir2_data_aoff_t    len,            /* length in bytes */
+        int                     *needlogp,      /* out: log header */
+        int                     *needscanp)     /* out: regen bestfree */
+{
+        xfs_dir2_data_hdr_t     *hdr;           /* data block pointer */
+        xfs_dir2_data_free_t    *dfp;           /* bestfree pointer */
+        char                    *endptr;        /* end of data area */
+        int                     needscan;       /* need to regen bestfree */
+        xfs_dir2_data_unused_t  *newdup;        /* new unused entry */
+        xfs_dir2_data_unused_t  *postdup;       /* unused entry after us */
+        xfs_dir2_data_unused_t  *prevdup;       /* unused entry before us */
+        struct xfs_dir2_data_free *bf;
+        hdr = bp->b_addr;
+        /*
+         * Figure out where the end of the data area is.
+         */
+        if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+            hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+                endptr = (char *)hdr + args->geo->blksize;
+        else {
+                xfs_dir2_block_tail_t   *btp;   /* block tail */
+                ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+                        hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+                btp = xfs_dir2_block_tail_p(args->geo, hdr);
+                endptr = (char *)xfs_dir2_block_leaf_p(btp);
+        }
+        /*
+         * If this isn't the start of the block, then back up to
+         * the previous entry and see if it's free.
+         */
+        if (offset > args->dp->d_ops->data_entry_offset) {
+                __be16                  *tagp;  /* tag just before us */
+                /* The 16-bit tag just before us holds that entry's offset. */
+                tagp = (__be16 *)((char *)hdr + offset) - 1;
+                prevdup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+                if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+                        prevdup = NULL;
+        } else
+                prevdup = NULL;
+        /*
+         * If this isn't the end of the block, see if the entry after
+         * us is free.
+         */
+        if ((char *)hdr + offset + len < endptr) {
+                postdup =
+                        (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+                if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+                        postdup = NULL;
+        } else
+                postdup = NULL;
+        ASSERT(*needscanp == 0);
+        needscan = 0;
+        /*
+         * Previous and following entries are both free,
+         * merge everything into a single free entry.
+         */
+        bf = args->dp->d_ops->data_bestfree_p(hdr);
+        if (prevdup && postdup) {
+                xfs_dir2_data_free_t    *dfp2;  /* another bestfree pointer */
+                /*
+                 * See if prevdup and/or postdup are in bestfree table.
+                 */
+                dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+                dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup);
+                /*
+                 * We need a rescan unless there are exactly 2 free entries
+                 * namely our two.  Then we know what's happening, otherwise
+                 * since the third bestfree is there, there might be more
+                 * entries.
+                 */
+                needscan = (bf[2].length != 0);
+                /*
+                 * Fix up the new big freespace.
+                 */
+                be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
+                *xfs_dir2_data_unused_tag_p(prevdup) =
+                        cpu_to_be16((char *)prevdup - (char *)hdr);
+                xfs_dir2_data_log_unused(args, bp, prevdup);
+                if (!needscan) {
+                        /*
+                         * Has to be the case that entries 0 and 1 are
+                         * dfp and dfp2 (don't know which is which), and
+                         * entry 2 is empty.
+                         * Remove entry 1 first then entry 0.
+                         */
+                        ASSERT(dfp && dfp2);
+                        if (dfp == &bf[1]) {
+                                dfp = &bf[0];
+                                ASSERT(dfp2 == dfp);
+                                dfp2 = &bf[1];
+                        }
+                        xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp);
+                        xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+                        /*
+                         * Now insert the new entry.
+                         */
+                        dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup,
+                                                       needlogp);
+                        ASSERT(dfp == &bf[0]);
+                        ASSERT(dfp->length == prevdup->length);
+                        ASSERT(!dfp[1].length);
+                        ASSERT(!dfp[2].length);
+                }
+        }
+        /*
+         * The entry before us is free, merge with it.
+         */
+        else if (prevdup) {
+                /* Look it up before its length changes. */
+                dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+                be16_add_cpu(&prevdup->length, len);
+                *xfs_dir2_data_unused_tag_p(prevdup) =
+                        cpu_to_be16((char *)prevdup - (char *)hdr);
+                xfs_dir2_data_log_unused(args, bp, prevdup);
+                /*
+                 * If the previous entry was in the table, the new entry
+                 * is longer, so it will be in the table too.  Remove
+                 * the old one and add the new one.
+                 */
+                if (dfp) {
+                        xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+                        xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp);
+                }
+                /*
+                 * Otherwise we need a scan if the new entry is big enough.
+                 */
+                else {
+                        needscan = be16_to_cpu(prevdup->length) >
+                                   be16_to_cpu(bf[2].length);
+                }
+        }
+        /*
+         * The following entry is free, merge with it.
+         */
+        else if (postdup) {
+                dfp = xfs_dir2_data_freefind(hdr, bf, postdup);
+                /* The merged entry starts at our offset and absorbs postdup. */
+                newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+                newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+                newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
+                *xfs_dir2_data_unused_tag_p(newdup) =
+                        cpu_to_be16((char *)newdup - (char *)hdr);
+                xfs_dir2_data_log_unused(args, bp, newdup);
+                /*
+                 * If the following entry was in the table, the new entry
+                 * is longer, so it will be in the table too.  Remove
+                 * the old one and add the new one.
+                 */
+                if (dfp) {
+                        xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+                        xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
+                }
+                /*
+                 * Otherwise we need a scan if the new entry is big enough.
+                 */
+                else {
+                        needscan = be16_to_cpu(newdup->length) >
+                                   be16_to_cpu(bf[2].length);
+                }
+        }
+        /*
+         * Neither neighbor is free.  Make a new entry.
+         */
+        else {
+                newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+                newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+                newdup->length = cpu_to_be16(len);
+                *xfs_dir2_data_unused_tag_p(newdup) =
+                        cpu_to_be16((char *)newdup - (char *)hdr);
+                xfs_dir2_data_log_unused(args, bp, newdup);
+                xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
+        }
+        *needscanp = needscan;
+}
+/*
+ * Take a byte range out of an existing unused space and make it un-free.
+ *
+ * The range [offset, offset + len) has to lie within dup (asserted).
+ * Depending on whether the range lines up with the front and/or back of dup,
+ * dup disappears, is trimmed at one end, or is split into two unused
+ * entries.  The bestfree table is updated where that can be done
+ * incrementally, which sets *needlogp through the bestfree helpers.
+ * *needscanp must be 0 on entry and is set non-zero when the table needs to
+ * be regenerated instead.
+ */
+void
+xfs_dir2_data_use_free(
+        struct xfs_da_args      *args,
+        struct xfs_buf          *bp,
+        xfs_dir2_data_unused_t  *dup,           /* unused entry */
+        xfs_dir2_data_aoff_t    offset,         /* starting offset to use */
+        xfs_dir2_data_aoff_t    len,            /* length to use */
+        int                     *needlogp,      /* out: need to log header */
+        int                     *needscanp)     /* out: need regen bestfree */
+{
+        xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+        xfs_dir2_data_free_t    *dfp;           /* bestfree pointer */
+        int                     matchback;      /* matches end of freespace */
+        int                     matchfront;     /* matches start of freespace */
+        int                     needscan;       /* need to regen bestfree */
+        xfs_dir2_data_unused_t  *newdup;        /* new unused entry */
+        xfs_dir2_data_unused_t  *newdup2;       /* another new unused entry */
+        int                     oldlen;         /* old unused entry's length */
+        struct xfs_dir2_data_free *bf;
+        hdr = bp->b_addr;
+        ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+        ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
+        ASSERT(offset >= (char *)dup - (char *)hdr);
+        ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr);
+        ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+        /*
+         * Look up the entry in the bestfree table.
+         */
+        oldlen = be16_to_cpu(dup->length);
+        bf = args->dp->d_ops->data_bestfree_p(hdr);
+        dfp = xfs_dir2_data_freefind(hdr, bf, dup);
+        ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
+        /*
+         * Check for alignment with front and back of the entry.
+         */
+        matchfront = (char *)dup - (char *)hdr == offset;
+        matchback = (char *)dup + oldlen - (char *)hdr == offset + len;
+        ASSERT(*needscanp == 0);
+        needscan = 0;
+        /*
+         * If we matched it exactly we just need to get rid of it from
+         * the bestfree table.
+         */
+        if (matchfront && matchback) {
+                if (dfp) {
+                        /* A full table means a rescan instead of a removal. */
+                        needscan = (bf[2].offset != 0);
+                        if (!needscan)
+                                xfs_dir2_data_freeremove(hdr, bf, dfp,
+                                                         needlogp);
+                }
+        }
+        /*
+         * We match the first part of the entry.
+         * Make a new entry with the remaining freespace.
+         */
+        else if (matchfront) {
+                newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+                newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+                newdup->length = cpu_to_be16(oldlen - len);
+                *xfs_dir2_data_unused_tag_p(newdup) =
+                        cpu_to_be16((char *)newdup - (char *)hdr);
+                xfs_dir2_data_log_unused(args, bp, newdup);
+                /*
+                 * If it was in the table, remove it and add the new one.
+                 */
+                if (dfp) {
+                        xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+                        dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+                                                       needlogp);
+                        ASSERT(dfp != NULL);
+                        ASSERT(dfp->length == newdup->length);
+                        ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
+                        /*
+                         * If we got inserted at the last slot,
+                         * that means we don't know if there was a better
+                         * choice for the last slot, or not.  Rescan.
+                         */
+                        needscan = dfp == &bf[2];
+                }
+        }
+        /*
+         * We match the last part of the entry.
+         * Trim the allocated space off the tail of the entry.
+         */
+        else if (matchback) {
+                newdup = dup;
+                newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+                *xfs_dir2_data_unused_tag_p(newdup) =
+                        cpu_to_be16((char *)newdup - (char *)hdr);
+                xfs_dir2_data_log_unused(args, bp, newdup);
+                /*
+                 * If it was in the table, remove it and add the new one.
+                 */
+                if (dfp) {
+                        xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+                        dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+                                                       needlogp);
+                        ASSERT(dfp != NULL);
+                        ASSERT(dfp->length == newdup->length);
+                        ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
+                        /*
+                         * If we got inserted at the last slot,
+                         * that means we don't know if there was a better
+                         * choice for the last slot, or not.  Rescan.
+                         */
+                        needscan = dfp == &bf[2];
+                }
+        }
+        /*
+         * Poking out the middle of an entry.
+         * Make two new entries.
+         */
+        else {
+                /* Front remainder: dup itself, shortened to end at offset. */
+                newdup = dup;
+                newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+                *xfs_dir2_data_unused_tag_p(newdup) =
+                        cpu_to_be16((char *)newdup - (char *)hdr);
+                xfs_dir2_data_log_unused(args, bp, newdup);
+                /* Back remainder: a new entry starting after the used range. */
+                newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+                newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+                newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
+                *xfs_dir2_data_unused_tag_p(newdup2) =
+                        cpu_to_be16((char *)newdup2 - (char *)hdr);
+                xfs_dir2_data_log_unused(args, bp, newdup2);
+                /*
+                 * If the old entry was in the table, we need to scan
+                 * if the 3rd entry was valid, since these entries
+                 * are smaller than the old one.
+                 * If we don't need to scan that means there were 1 or 2
+                 * entries in the table, and removing the old and adding
+                 * the 2 new will work.
+                 */
+                if (dfp) {
+                        needscan = (bf[2].length != 0);
+                        if (!needscan) {
+                                xfs_dir2_data_freeremove(hdr, bf, dfp,
+                                                         needlogp);
+                                xfs_dir2_data_freeinsert(hdr, bf, newdup,
+                                                         needlogp);
+                                xfs_dir2_data_freeinsert(hdr, bf, newdup2,
+                                                         needlogp);
+                        }
+                }
+        }
+        *needscanp = needscan;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
new file mode 100644
index 000000000000..a19174eb3cb2
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -0,0 +1,1831 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+/*
+ * Local function declarations.
+ */
+static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
+                                    int *indexp, struct xfs_buf **dbpp);
+static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
+                                    struct xfs_buf *bp, int first, int last);
+static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
+                                   struct xfs_buf *bp);
+/*
+ * Check the internal consistency of a leaf1 block.
+ * Pop an assert if something is wrong.
+ *
+ * On DEBUG builds xfs_dir3_leaf_check() expands to a single do/while(0)
+ * statement.  The macro itself carries no trailing semicolon; the caller
+ * supplies it, so the macro can be used as the body of an unbraced if/else
+ * without leaving a stray empty statement behind.  On non-DEBUG builds it
+ * expands to nothing.
+ */
+#ifdef DEBUG
+#define xfs_dir3_leaf_check(dp, bp) \
+do { \
+        if (!xfs_dir3_leaf1_check((dp), (bp))) \
+                ASSERT(0); \
+} while (0)
+/*
+ * Validate the leaf1 block held in @bp for directory @dp.
+ *
+ * The on-disk header is decoded through dp->d_ops.  A block carrying
+ * XFS_DIR3_LEAF1_MAGIC must also record the buffer's own address (bp->b_bn)
+ * in info.blkno; any other block must carry XFS_DIR2_LEAF1_MAGIC.  The entry
+ * table is then checked by xfs_dir3_leaf_check_int().
+ *
+ * Returns true if the block passes every check, false otherwise.
+ */
+STATIC bool
+xfs_dir3_leaf1_check(
+        struct xfs_inode        *dp,
+        struct xfs_buf          *bp)
+{
+        struct xfs_dir2_leaf    *leaf = bp->b_addr;
+        struct xfs_dir3_icleaf_hdr leafhdr;
+        dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+        if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
+                struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+                /* v3 blocks are self-describing: the address must match. */
+                if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+                        return false;
+        } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
+                return false;
+        return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+}
+#else
+#define xfs_dir3_leaf_check(dp, bp)
+#endif
+/*
+ * Sanity check the entry table of a leaf block.
+ *
+ * @mp:   mount the block belongs to (supplies the directory geometry)
+ * @dp:   directory inode, or NULL when called from a buffer verifier
+ * @hdr:  already decoded in-core header, or NULL to decode it from @leaf here
+ * @leaf: the leaf block itself
+ *
+ * Checks that the entry count does not exceed the per-block maximum, that
+ * (for leaf1 magics) the entries do not run into the bests table, that hash
+ * values are in non-decreasing order, and that the number of entries with a
+ * null data pointer equals the stale count in the header.
+ *
+ * Returns true if all checks pass, false otherwise.
+ */
+bool
+xfs_dir3_leaf_check_int(
+        struct xfs_mount        *mp,
+        struct xfs_inode        *dp,
+        struct xfs_dir3_icleaf_hdr *hdr,
+        struct xfs_dir2_leaf    *leaf)
+{
+        struct xfs_da_geometry  *geo = mp->m_dir_geo;
+        struct xfs_dir3_icleaf_hdr leafhdr;
+        const struct xfs_dir_ops *ops;
+        struct xfs_dir2_leaf_entry *ents;
+        xfs_dir2_leaf_tail_t    *ltp;
+        bool                    is_leaf1;
+        int                     nstale = 0;
+        int                     i;
+        /*
+         * dp may be NULL when we are called from a verifier, so look the
+         * directory ops up through xfs_dir_get_ops() instead of dp->d_ops.
+         */
+        ops = xfs_dir_get_ops(mp, dp);
+        if (!hdr) {
+                ops->leaf_hdr_from_disk(&leafhdr, leaf);
+                hdr = &leafhdr;
+        }
+        ents = ops->leaf_ents_p(leaf);
+        ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+        /*
+         * XXX (dgc): This value is not restrictive enough.
+         * Should factor in the size of the bests table as well.
+         * We can deduce a value for that from di_size.
+         */
+        if (hdr->count > ops->leaf_max_ents(geo))
+                return false;
+        /* Leaves and bests don't overlap in leaf format. */
+        is_leaf1 = hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+                   hdr->magic == XFS_DIR3_LEAF1_MAGIC;
+        if (is_leaf1 &&
+            (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
+                return false;
+        /* Check hash value order, count stale entries.  */
+        for (i = 0; i < hdr->count; i++) {
+                if (i + 1 < hdr->count &&
+                    be32_to_cpu(ents[i].hashval) >
+                                be32_to_cpu(ents[i + 1].hashval))
+                        return false;
+                if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+                        nstale++;
+        }
+        /* The header's stale count must agree with what we found. */
+        return hdr->stale == nstale;
+}
+/*
+ * Verify the leaf1 or leafn block in @bp; @magic is the v2 magic expected.
+ *
+ * We verify the magic numbers before decoding the leaf header so that on debug
+ * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due
+ * to incorrect magic numbers.
+ *
+ * On CRC-enabled filesystems the matching v3 magic is expected instead, and
+ * the block's UUID and recorded block number must match the superblock UUID
+ * and bp->b_bn.  The entry table is then handed to xfs_dir3_leaf_check_int().
+ *
+ * Returns true if the block is acceptable, false otherwise.
+ */
+static bool
+xfs_dir3_leaf_verify(
+        struct xfs_buf          *bp,
+        __uint16_t              magic)
+{
+        struct xfs_dir2_leaf    *leaf = bp->b_addr;
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+                __uint16_t              magic3;
+                /* Translate the v2 magic into its CRC-format counterpart. */
+                if (magic == XFS_DIR2_LEAF1_MAGIC)
+                        magic3 = XFS_DIR3_LEAF1_MAGIC;
+                else
+                        magic3 = XFS_DIR3_LEAFN_MAGIC;
+                /* Magic, filesystem UUID and self-address must all match. */
+                if (leaf3->info.hdr.magic != cpu_to_be16(magic3) ||
+                    !uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid) ||
+                    be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+                        return false;
+        } else if (leaf->hdr.info.magic != cpu_to_be16(magic)) {
+                return false;
+        }
+        return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
+}
+/*
+ * Common read verifier for leaf1/leafn buffers; @magic is the v2 magic.
+ *
+ * On CRC-enabled filesystems a checksum mismatch flags the buffer with
+ * -EFSBADCRC and the structural check is skipped.  Otherwise a failed
+ * xfs_dir3_leaf_verify() flags it with -EFSCORRUPTED.  Any error left on
+ * the buffer is reported through xfs_verifier_error().
+ */
+static void
+__read_verify(
+        struct xfs_buf  *bp,
+        __uint16_t      magic)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        bool                    cksum_ok;
+        /* The checksum is only consulted on CRC-enabled filesystems. */
+        cksum_ok = !xfs_sb_version_hascrc(&mp->m_sb) ||
+                   xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
+        if (!cksum_ok) {
+                xfs_buf_ioerror(bp, -EFSBADCRC);
+        } else if (!xfs_dir3_leaf_verify(bp, magic)) {
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        }
+        if (bp->b_error)
+                xfs_verifier_error(bp);
+}
+/*
+ * Common write verifier for leaf1/leafn buffers; @magic is the v2 magic.
+ *
+ * A block that fails xfs_dir3_leaf_verify() is flagged -EFSCORRUPTED and
+ * reported, and nothing else is done.  On CRC-enabled filesystems the LSN of
+ * the attached buffer log item (if there is one) is stamped into the header
+ * and the checksum is then recomputed.
+ */
+static void
+__write_verify(
+        struct xfs_buf  *bp,
+        __uint16_t      magic)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        if (!xfs_dir3_leaf_verify(bp, magic)) {
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                xfs_verifier_error(bp);
+                return;
+        }
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                struct xfs_buf_log_item *bip = bp->b_fspriv;
+                struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
+                /* Stamp the LSN before the checksum so it is covered. */
+                if (bip)
+                        hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+                xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
+        }
+}
+/* Read verifier for leaf1 buffers: common read check with the leaf1 magic. */
+static void
+xfs_dir3_leaf1_read_verify(
+        struct xfs_buf  *bp)
+{
+        __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
+}
+/* Write verifier for leaf1 buffers: common write check with the leaf1 magic. */
+static void
+xfs_dir3_leaf1_write_verify(
+        struct xfs_buf  *bp)
+{
+        __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
+}
+/* Read verifier for leafn buffers: common read check with the leafn magic. */
+static void
+xfs_dir3_leafn_read_verify(
+        struct xfs_buf  *bp)
+{
+        __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
+}
+/* Write verifier for leafn buffers: common write check with the leafn magic. */
+static void
+xfs_dir3_leafn_write_verify(
+        struct xfs_buf  *bp)
+{
+        __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
+}
+/* Buffer verifier operations attached to leaf1 directory blocks. */
+const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+        .verify_read = xfs_dir3_leaf1_read_verify,
+        .verify_write = xfs_dir3_leaf1_write_verify,
+};
+/* Buffer verifier operations attached to leafn directory blocks. */
+const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+        .verify_read = xfs_dir3_leafn_read_verify,
+        .verify_write = xfs_dir3_leafn_write_verify,
+};
+/*
+ * Read the leaf1 block at directory block @fbno of @dp through
+ * xfs_da_read_buf(), using the leaf1 buffer ops, and return it in *bpp.
+ *
+ * When the read succeeds inside a transaction (@tp non-NULL) and a buffer
+ * was actually returned, the buffer is tagged XFS_BLFT_DIR_LEAF1_BUF.  The
+ * tagging is skipped when *bpp is NULL so that a NULL buffer is never
+ * handed to xfs_trans_buf_set_type().
+ * NOTE(review): whether xfs_da_read_buf() can report success while leaving
+ * *bpp NULL depends on its @mappedbno handling, which is not visible
+ * here - confirm; the extra check is harmless either way.
+ *
+ * Returns 0 on success or the error from xfs_da_read_buf().
+ */
+static int
+xfs_dir3_leaf_read(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *dp,
+        xfs_dablk_t             fbno,
+        xfs_daddr_t             mappedbno,
+        struct xfs_buf          **bpp)
+{
+        int                     err;
+        err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+                                XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
+        if (!err && tp && *bpp)
+                xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
+        return err;
+}
+/*
+ * Read the leafn block at directory block @fbno of @dp through
+ * xfs_da_read_buf(), using the leafn buffer ops, and return it in *bpp.
+ *
+ * When the read succeeds inside a transaction (@tp non-NULL) and a buffer
+ * was actually returned, the buffer is tagged XFS_BLFT_DIR_LEAFN_BUF.  The
+ * tagging is skipped when *bpp is NULL so that a NULL buffer is never
+ * handed to xfs_trans_buf_set_type().
+ * NOTE(review): whether xfs_da_read_buf() can report success while leaving
+ * *bpp NULL depends on its @mappedbno handling, which is not visible
+ * here - confirm; the extra check is harmless either way.
+ *
+ * Returns 0 on success or the error from xfs_da_read_buf().
+ */
+int
+xfs_dir3_leafn_read(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *dp,
+        xfs_dablk_t             fbno,
+        xfs_daddr_t             mappedbno,
+        struct xfs_buf          **bpp)
+{
+        int                     err;
+        err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+                                XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
+        if (!err && tp && *bpp)
+                xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
+        return err;
+}
+/*
+ * Initialize a new leaf block, leaf1 or leafn magic accepted.
+ *
+ * The header is zeroed and stamped with the magic matching @type; on
+ * CRC-enabled filesystems the v3 header additionally records the buffer's
+ * block number, the @owner inode and the filesystem UUID.  Finally the
+ * buffer gets the verifier ops and transaction buffer type for its kind.
+ */
+static void
+xfs_dir3_leaf_init(
+        struct xfs_mount        *mp,
+        struct xfs_trans        *tp,
+        struct xfs_buf          *bp,
+        xfs_ino_t               owner,
+        __uint16_t              type)
+{
+        struct xfs_dir2_leaf    *leaf = bp->b_addr;
+        bool                    leaf1 = (type == XFS_DIR2_LEAF1_MAGIC);
+        ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
+        if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+                memset(leaf, 0, sizeof(*leaf));
+                leaf->hdr.info.magic = cpu_to_be16(type);
+        } else {
+                struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+                memset(leaf3, 0, sizeof(*leaf3));
+                if (leaf1)
+                        leaf3->info.hdr.magic =
+                                        cpu_to_be16(XFS_DIR3_LEAF1_MAGIC);
+                else
+                        leaf3->info.hdr.magic =
+                                        cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+                leaf3->info.blkno = cpu_to_be64(bp->b_bn);
+                leaf3->info.owner = cpu_to_be64(owner);
+                uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
+        }
+        /*
+         * If it's a leaf-format directory initialize the tail.
+         * Caller is responsible for initialising the bests table.
+         */
+        if (leaf1) {
+                struct xfs_dir2_leaf_tail *ltp;
+                ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf);
+                ltp->bestcount = 0;
+        }
+        bp->b_ops = leaf1 ? &xfs_dir3_leaf1_buf_ops : &xfs_dir3_leafn_buf_ops;
+        xfs_trans_buf_set_type(tp, bp, leaf1 ? XFS_BLFT_DIR_LEAF1_BUF
+                                             : XFS_BLFT_DIR_LEAFN_BUF);
+}
+/*
+ * Get a buffer for directory block @bno and set it up as a fresh leaf block
+ * of the kind selected by @magic (leaf1 or leafn).
+ *
+ * The new header is logged, and for leaf1 blocks the tail is logged too.
+ * On success the buffer is returned in *bpp and 0 is returned; otherwise
+ * the error from xfs_da_get_buf() is returned and *bpp is left untouched.
+ */
+int
+xfs_dir3_leaf_get_buf(
+        xfs_da_args_t           *args,
+        xfs_dir2_db_t           bno,
+        struct xfs_buf          **bpp,
+        __uint16_t              magic)
+{
+        struct xfs_inode        *dp = args->dp;
+        struct xfs_buf          *leaf_bp;
+        int                     err;
+        ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+        ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) &&
+               bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+        err = xfs_da_get_buf(args->trans, dp,
+                             xfs_dir2_db_to_da(args->geo, bno),
+                             -1, &leaf_bp, XFS_DATA_FORK);
+        if (err)
+                return err;
+        xfs_dir3_leaf_init(dp->i_mount, args->trans, leaf_bp, dp->i_ino,
+                           magic);
+        xfs_dir3_leaf_log_header(args, leaf_bp);
+        /* Only leaf1 blocks carry a tail that needs logging. */
+        if (magic == XFS_DIR2_LEAF1_MAGIC)
+                xfs_dir3_leaf_log_tail(args, leaf_bp);
+        *bpp = leaf_bp;
+        return 0;
+}
+/*
+ * Convert a block form directory to a leaf form directory.
+ *
+ * A new leaf1 block is added to the directory and the leaf entries stored
+ * in the tail of the single block-form block (@dbp) are copied into it,
+ * along with the count and stale values from the block tail.  The space
+ * those entries and the block tail occupied is then turned into free space,
+ * the block's magic is switched to the data-block magic, and the new leaf's
+ * bests table is set up with a single entry taken from bf[0].length.
+ *
+ * Returns 0 on success, or the error from xfs_da_grow_inode() or
+ * xfs_dir3_leaf_get_buf().
+ */
+int                                             /* error */
+xfs_dir2_block_to_leaf(
+        xfs_da_args_t           *args,          /* operation arguments */
+        struct xfs_buf          *dbp)           /* input block's buffer */
+{
+        __be16                  *bestsp;        /* leaf's bestsp entries */
+        xfs_dablk_t             blkno;          /* leaf block's bno */
+        xfs_dir2_data_hdr_t     *hdr;           /* block header */
+        xfs_dir2_leaf_entry_t   *blp;           /* block's leaf entries */
+        xfs_dir2_block_tail_t   *btp;           /* block's tail */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     error;          /* error return code */
+        struct xfs_buf          *lbp;           /* leaf block's buffer */
+        xfs_dir2_db_t           ldb;            /* leaf block's bno */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        xfs_dir2_leaf_tail_t    *ltp;           /* leaf's tail */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        int                     needlog;        /* need to log block header */
+        int                     needscan;       /* need to rescan bestfree */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        struct xfs_dir2_data_free *bf;
+        struct xfs_dir2_leaf_entry *ents;
+        struct xfs_dir3_icleaf_hdr leafhdr;
+        trace_xfs_dir2_block_to_leaf(args);
+        dp = args->dp;
+        /* NOTE(review): mp is assigned here but not used again below. */
+        mp = dp->i_mount;
+        tp = args->trans;
+        /*
+         * Add the leaf block to the inode.
+         * This interface will only put blocks in the leaf/node range.
+         * Since that's empty now, we'll get the root (block 0 in range).
+         */
+        if ((error = xfs_da_grow_inode(args, &blkno))) {
+                return error;
+        }
+        ldb = xfs_dir2_da_to_db(args->geo, blkno);
+        ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET));
+        /*
+         * Initialize the leaf block, get a buffer for it.
+         */
+        error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC);
+        if (error)
+                return error;
+        leaf = lbp->b_addr;
+        hdr = dbp->b_addr;
+        xfs_dir3_data_check(dp, dbp);
+        /* Locate the block-form tail, its leaf entries and the bestfree table. */
+        btp = xfs_dir2_block_tail_p(args->geo, hdr);
+        blp = xfs_dir2_block_leaf_p(btp);
+        bf = dp->d_ops->data_bestfree_p(hdr);
+        ents = dp->d_ops->leaf_ents_p(leaf);
+        /*
+         * Set the counts in the leaf header.
+         */
+        dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+        leafhdr.count = be32_to_cpu(btp->count);
+        leafhdr.stale = be32_to_cpu(btp->stale);
+        dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+        xfs_dir3_leaf_log_header(args, lbp);
+        /*
+         * Could compact these but I think we always do the conversion
+         * after squeezing out stale entries.
+         */
+        memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
+        xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1);
+        needscan = 0;
+        needlog = 1;
+        /*
+         * Make the space formerly occupied by the leaf entries and block
+         * tail be free.
+         */
+        xfs_dir2_data_make_free(args, dbp,
+                (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+                (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize -
+                                       (char *)blp),
+                &needlog, &needscan);
+        /*
+         * Fix up the block header, make it a data block.
+         */
+        dbp->b_ops = &xfs_dir3_data_buf_ops;
+        xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF);
+        /* Keep the format generation: v2 block -> v2 data, else v3 data. */
+        if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+                hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+        else
+                hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+        if (needscan)
+                xfs_dir2_data_freescan(dp, hdr, &needlog);
+        /*
+         * Set up leaf tail and bests table.
+         */
+        ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+        ltp->bestcount = cpu_to_be32(1);
+        bestsp = xfs_dir2_leaf_bests_p(ltp);
+        /* bf[0].length is already in on-disk byte order, so copy it as-is. */
+        bestsp[0] =  bf[0].length;
+        /*
+         * Log the data header and leaf bests table.
+         */
+        if (needlog)
+                xfs_dir2_data_log_header(args, dbp);
+        xfs_dir3_leaf_check(dp, lbp);
+        xfs_dir3_data_check(dp, dbp);
+        xfs_dir3_leaf_log_bests(args, lbp, 0, 0);
+        return 0;
+}
+/*
+ * Locate the stale entries nearest to @index in the leaf entry table.
+ *
+ * On return *lowstale is the index of the closest stale entry below @index,
+ * or -1 if there is none.  *highstale is the index of the closest stale
+ * entry at or above @index; the upward scan also stops early once a high
+ * candidate could no longer be closer than *lowstale, and it ends at
+ * leafhdr->count if neither stop condition is met.
+ */
+STATIC void
+xfs_dir3_leaf_find_stale(
+        struct xfs_dir3_icleaf_hdr *leafhdr,
+        struct xfs_dir2_leaf_entry *ents,
+        int                     index,
+        int                     *lowstale,
+        int                     *highstale)
+{
+        const __be32            stale_addr = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+        /*
+         * Walk down from just below our index until we hit a stale entry
+         * or run off the front of the table.
+         */
+        *lowstale = index - 1;
+        while (*lowstale >= 0 && ents[*lowstale].address != stale_addr)
+                --*lowstale;
+        /*
+         * Walk up from our index until we hit a stale entry, run off the
+         * end of the table, or reach the point where using lowstale would
+         * move no more entries than anything further up.
+         */
+        *highstale = index;
+        while (*highstale < leafhdr->count) {
+                if (ents[*highstale].address == stale_addr)
+                        break;
+                if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
+                        break;
+                ++*highstale;
+        }
+}
+/*
+ * Make room in the leaf entry table for a new entry at position @index and
+ * return a pointer to the slot the caller should fill in.
+ *
+ * With no stale entries the tail of the table is shifted up by one and the
+ * entry count is bumped.  Otherwise the nearest stale entry (below or above
+ * @index) is reused: the entries between it and the insertion point are
+ * shifted over it and the stale count is dropped by one.  *lfloglow and
+ * *lfloghigh are updated to cover the range of entries that were touched.
+ */
+struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_find_entry(
+        struct xfs_dir3_icleaf_hdr *leafhdr,
+        struct xfs_dir2_leaf_entry *ents,
+        int                     index,          /* leaf table position */
+        int                     compact,        /* need to compact leaves */
+        int                     lowstale,       /* index of prev stale leaf */
+        int                     highstale,      /* index of next stale leaf */
+        int                     *lfloglow,      /* low leaf logging index */
+        int                     *lfloghigh)     /* high leaf logging index */
+{
+        bool                    use_low;        /* reuse the lower stale slot */
+        int                     nmove;          /* entries to shift */
+        if (leafhdr->stale == 0) {
+                struct xfs_dir2_leaf_entry *slot = &ents[index];
+                /*
+                 * No stale entries to reuse: open a hole at index by
+                 * sliding everything from there on up by one.
+                 */
+                if (index < leafhdr->count)
+                        memmove(slot + 1, slot,
+                                (leafhdr->count - index) * sizeof(*slot));
+                /*
+                 * Everything from index to the old end of the table has
+                 * changed and needs logging.
+                 */
+                *lfloglow = index;
+                *lfloghigh = leafhdr->count;
+                leafhdr->count++;
+                return slot;
+        }
+        /*
+         * There are stale entries and we will reuse one of them.  It's
+         * probably not at the right location, so entries have to be
+         * shifted up or down first.
+         *
+         * If the caller did not compact, the nearest stale entries on
+         * either side of the insertion point still have to be found.
+         */
+        if (!compact)
+                xfs_dir3_leaf_find_stale(leafhdr, ents, index,
+                                         &lowstale, &highstale);
+        /*
+         * Prefer the low one when it exists and either there is no high
+         * one or the low one needs fewer entries moved.
+         */
+        use_low = lowstale >= 0 &&
+                  (highstale == leafhdr->count ||
+                   index - lowstale - 1 < highstale - index);
+        if (use_low) {
+                ASSERT(index - lowstale - 1 >= 0);
+                ASSERT(ents[lowstale].address ==
+                       cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+                /*
+                 * Slide the entries between lowstale and index down over
+                 * the stale slot, which frees up slot index - 1.
+                 */
+                nmove = index - lowstale - 1;
+                if (nmove > 0)
+                        memmove(&ents[lowstale], &ents[lowstale + 1],
+                                nmove * sizeof(*ents));
+                *lfloglow = MIN(lowstale, *lfloglow);
+                *lfloghigh = MAX(index - 1, *lfloghigh);
+                leafhdr->stale--;
+                return &ents[index - 1];
+        }
+        /*
+         * The high one is better, so use that one.
+         */
+        ASSERT(highstale - index >= 0);
+        ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+        /*
+         * Slide the entries from index up to highstale over the stale
+         * slot, which frees up slot index.
+         */
+        nmove = highstale - index;
+        if (nmove > 0)
+                memmove(&ents[index + 1], &ents[index],
+                        nmove * sizeof(*ents));
+        *lfloglow = MIN(index, *lfloglow);
+        *lfloghigh = MAX(highstale, *lfloghigh);
+        leafhdr->stale--;
+        return &ents[index];
+}
+/*
+ * Add an entry to a leaf form directory.
+ */
+int                                             /* error */
+xfs_dir2_leaf_addname(
+        xfs_da_args_t           *args)          /* operation arguments */
+{
+        __be16                  *bestsp;        /* freespace table in leaf */
+        int                     compact;        /* need to compact leaves */
+        xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+        struct xfs_buf          *dbp;           /* data block buffer */
+        xfs_dir2_data_entry_t   *dep;           /* data block entry */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        xfs_dir2_data_unused_t  *dup;           /* data unused entry */
+        int                     error;          /* error return value */
+        int                     grown;          /* allocated new data block */
+        int                     highstale;      /* index of next stale leaf */
+        int                     i;              /* temporary, index */
+        int                     index;          /* leaf table position */
+        struct xfs_buf          *lbp;           /* leaf's buffer */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        int                     length;         /* length of new entry */
+        xfs_dir2_leaf_entry_t   *lep;           /* leaf entry table pointer */
+        int                     lfloglow;       /* low leaf logging index */
+        int                     lfloghigh;      /* high leaf logging index */
+        int                     lowstale;       /* index of prev stale leaf */
+        xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail pointer */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        int                     needbytes;      /* leaf block bytes needed */
+        int                     needlog;        /* need to log data header */
+        int                     needscan;       /* need to rescan data free */
+        __be16                  *tagp;          /* end of data entry */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        xfs_dir2_db_t           use_block;      /* data block number */
+        struct xfs_dir2_data_free *bf;          /* bestfree table */
+        struct xfs_dir2_leaf_entry *ents;
+        struct xfs_dir3_icleaf_hdr leafhdr;
+        trace_xfs_dir2_leaf_addname(args);
+        dp = args->dp;
+        tp = args->trans;
+        mp = dp->i_mount;
+        error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
+        if (error)
+                return error;
+        /*
+         * Look up the entry by hash value and name.
+         * We know it's not there, our caller has already done a lookup.
+         * So the index is of the entry to insert in front of.
+         * But if there are dup hash values the index is of the first of those.
+         */
+        index = xfs_dir2_leaf_search_hash(args, lbp);
+        leaf = lbp->b_addr;
+        ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+        ents = dp->d_ops->leaf_ents_p(leaf);
+        dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+        bestsp = xfs_dir2_leaf_bests_p(ltp);
+        length = dp->d_ops->data_entsize(args->namelen);
+        /*
+         * See if there are any entries with the same hash value
+         * and space in their block for the new entry.
+         * This is good because it puts multiple same-hash value entries
+         * in a data block, improving the lookup of those entries.
+         */
+        for (use_block = -1, lep = &ents[index];
+             index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+             index++, lep++) {
+                if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+                        continue;
+                i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+                ASSERT(i < be32_to_cpu(ltp->bestcount));
+                ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF));
+                if (be16_to_cpu(bestsp[i]) >= length) {
+                        use_block = i;
+                        break;
+                }
+        }
+        /*
+         * Didn't find a block yet, linear search all the data blocks.
+         */
+        if (use_block == -1) {
+                for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) {
+                        /*
+                         * Remember a block we see that's missing.
+                         */
+                        if (bestsp[i] == cpu_to_be16(NULLDATAOFF) &&
+                            use_block == -1)
+                                use_block = i;
+                        else if (be16_to_cpu(bestsp[i]) >= length) {
+                                use_block = i;
+                                break;
+                        }
+                }
+        }
+        /*
+         * How many bytes do we need in the leaf block?
+         */
+        needbytes = 0;
+        if (!leafhdr.stale)
+                needbytes += sizeof(xfs_dir2_leaf_entry_t);
+        if (use_block == -1)
+                needbytes += sizeof(xfs_dir2_data_off_t);
+        /*
+         * Now kill use_block if it refers to a missing block, so we
+         * can use it as an indication of allocation needed.
+         */
+        if (use_block != -1 && bestsp[use_block] == cpu_to_be16(NULLDATAOFF))
+                use_block = -1;
+        /*
+         * If we don't have enough free bytes but we can make enough
+         * by compacting out stale entries, we'll do that.
+         */
+        if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes &&
+            leafhdr.stale > 1)
+                compact = 1;
+        /*
+         * Otherwise if we don't have enough free bytes we need to
+         * convert to node form.
+         */
+        else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) {
+                /*
+                 * Just checking or no space reservation, give up.
+                 */
+                if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
+                                                        args->total == 0) {
+                        xfs_trans_brelse(tp, lbp);
+                        return -ENOSPC;
+                }
+                /*
+                 * Convert to node form.
+                 */
+                error = xfs_dir2_leaf_to_node(args, lbp);
+                if (error)
+                        return error;
+                /*
+                 * Then add the new entry.
+                 */
+                return xfs_dir2_node_addname(args);
+        }
+        /*
+         * Otherwise it will fit without compaction.
+         */
+        else
+                compact = 0;
+        /*
+         * If just checking, then it will fit unless we needed to allocate
+         * a new data block.
+         */
+        if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+                xfs_trans_brelse(tp, lbp);
+                return use_block == -1 ? -ENOSPC : 0;
+        }
+        /*
+         * If no allocations are allowed, return now before we've
+         * changed anything.
+         */
+        if (args->total == 0 && use_block == -1) {
+                xfs_trans_brelse(tp, lbp);
+                return -ENOSPC;
+        }
+        /*
+         * Need to compact the leaf entries, removing stale ones.
+         * Leave one stale entry behind - the one closest to our
+         * insertion index - and we'll shift that one to our insertion
+         * point later.
+         */
+        if (compact) {
+                xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+                        &highstale, &lfloglow, &lfloghigh);
+        }
+        /*
+         * There are stale entries, so start log-low and log-high at
+         * impossibly bad values; they get corrected later.
+         */
+        else if (leafhdr.stale) {
+                lfloglow = leafhdr.count;
+                lfloghigh = -1;
+        }
+        /*
+         * If there was no data block space found, we need to allocate
+         * a new one.
+         */
+        if (use_block == -1) {
+                /*
+                 * Add the new data block.
+                 */
+                if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
+                                &use_block))) {
+                        xfs_trans_brelse(tp, lbp);
+                        return error;
+                }
+                /*
+                 * Initialize the block.
+                 */
+                if ((error = xfs_dir3_data_init(args, use_block, &dbp))) {
+                        xfs_trans_brelse(tp, lbp);
+                        return error;
+                }
+                /*
+                 * If we're adding a new data block on the end we need to
+                 * extend the bests table.  Copy it up one entry.
+                 */
+                if (use_block >= be32_to_cpu(ltp->bestcount)) {
+                        bestsp--;
+                        memmove(&bestsp[0], &bestsp[1],
+                                be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
+                        be32_add_cpu(&ltp->bestcount, 1);
+                        xfs_dir3_leaf_log_tail(args, lbp);
+                        xfs_dir3_leaf_log_bests(args, lbp, 0,
+                                                be32_to_cpu(ltp->bestcount) - 1);
+                }
+                /*
+                 * If we're filling in a previously empty block just log it.
+                 */
+                else
+                        xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
+                hdr = dbp->b_addr;
+                bf = dp->d_ops->data_bestfree_p(hdr);
+                bestsp[use_block] = bf[0].length;
+                grown = 1;
+        } else {
+                /*
+                 * Already had space in some data block.
+                 * Just read that one in.
+                 */
+                error = xfs_dir3_data_read(tp, dp,
+                                   xfs_dir2_db_to_da(args->geo, use_block),
+                                   -1, &dbp);
+                if (error) {
+                        xfs_trans_brelse(tp, lbp);
+                        return error;
+                }
+                hdr = dbp->b_addr;
+                bf = dp->d_ops->data_bestfree_p(hdr);
+                grown = 0;
+        }
+        /*
+         * Point to the biggest freespace in our data block.
+         */
+        dup = (xfs_dir2_data_unused_t *)
+              ((char *)hdr + be16_to_cpu(bf[0].offset));
+        ASSERT(be16_to_cpu(dup->length) >= length);
+        needscan = needlog = 0;
+        /*
+         * Mark the initial part of our freespace in use for the new entry.
+         */
+        xfs_dir2_data_use_free(args, dbp, dup,
+                (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
+                &needlog, &needscan);
+        /*
+         * Initialize our new entry (at last).
+         */
+        dep = (xfs_dir2_data_entry_t *)dup;
+        dep->inumber = cpu_to_be64(args->inumber);
+        dep->namelen = args->namelen;
+        memcpy(dep->name, args->name, dep->namelen);
+        dp->d_ops->data_put_ftype(dep, args->filetype);
+        tagp = dp->d_ops->data_entry_tag_p(dep);
+        *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+        /*
+         * Need to rescan to fix up the bestfree table.
+         */
+        if (needscan)
+                xfs_dir2_data_freescan(dp, hdr, &needlog);
+        /*
+         * Need to log the data block's header.
+         */
+        if (needlog)
+                xfs_dir2_data_log_header(args, dbp);
+        xfs_dir2_data_log_entry(args, dbp, dep);
+        /*
+         * If the bests table needs to be changed, do it.
+         * Log the change unless we've already done that.
+         */
+        if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
+                bestsp[use_block] = bf[0].length;
+                if (!grown)
+                        xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
+        }
+        lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
+                                       highstale, &lfloglow, &lfloghigh);
+        /*
+         * Fill in the new leaf entry.
+         */
+        lep->hashval = cpu_to_be32(args->hashval);
+        lep->address = cpu_to_be32(
+                                xfs_dir2_db_off_to_dataptr(args->geo, use_block,
+                                be16_to_cpu(*tagp)));
+        /*
+         * Log the leaf fields and give up the buffers.
+         */
+        dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+        xfs_dir3_leaf_log_header(args, lbp);
+        xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh);
+        xfs_dir3_leaf_check(dp, lbp);
+        xfs_dir3_data_check(dp, dbp);
+        return 0;
+}
+/*
+ * Squeeze every stale entry out of a leaf's entry table, in place.
+ * The in-core header is adjusted, converted back to its disk form and
+ * logged; the range of entry slots that were overwritten is logged too.
+ * A leaf whose header records no stale entries is left untouched.
+ */
+void
+xfs_dir3_leaf_compact(
+        xfs_da_args_t   *args,          /* operation arguments */
+        struct xfs_dir3_icleaf_hdr *leafhdr,
+        struct xfs_buf  *bp)            /* leaf buffer */
+{
+        struct xfs_inode *dp = args->dp;
+        xfs_dir2_leaf_t *leaf = bp->b_addr;
+        struct xfs_dir2_leaf_entry *ents;
+        int             src;            /* entry being examined */
+        int             dst = 0;        /* next slot to fill */
+        int             firstmoved = -1; /* lowest slot overwritten */
+        /*
+         * Nothing stale, nothing to do.
+         */
+        if (leafhdr->stale == 0)
+                return;
+        ents = dp->d_ops->leaf_ents_p(leaf);
+        /*
+         * Slide each live entry down over the holes left by stale ones.
+         * Entries that are already in their final slot are not rewritten,
+         * so they do not need to be logged either.
+         */
+        for (src = 0; src < leafhdr->count; src++) {
+                if (ents[src].address != cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+                        if (src > dst) {
+                                if (firstmoved == -1)
+                                        firstmoved = dst;
+                                ents[dst] = ents[src];
+                        }
+                        dst++;
+                }
+        }
+        /*
+         * The number of skipped entries must match the stale count.
+         * Fix up the header and push it, and any moved entries, to the log.
+         */
+        ASSERT(leafhdr->stale == src - dst);
+        leafhdr->count -= leafhdr->stale;
+        leafhdr->stale = 0;
+        dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr);
+        xfs_dir3_leaf_log_header(args, bp);
+        if (firstmoved != -1)
+                xfs_dir3_leaf_log_ents(args, bp, firstmoved, dst - 1);
+}
+/*
+ * Compact the leaf entries, removing stale ones.
+ * Leave one stale entry behind - the one closest to our
+ * insertion index - and the caller will shift that one to our insertion
+ * point later.
+ * Return new insertion index, where the remaining stale entry is,
+ * and leaf logging indices.
+ *
+ * On entry *indexp is the insertion index in the uncompacted table and
+ * leafhdr->stale must be greater than one.  On return leafhdr->count has
+ * been reduced by the number of entries removed and leafhdr->stale is 1.
+ * Exactly one of *lowstalep / *highstalep refers to the kept stale entry;
+ * the other is set to its "none" value (-1 or leafhdr->count).
+ * This routine only updates the in-core header and the entries array; it
+ * does not log anything itself.
+ */
+void
+xfs_dir3_leaf_compact_x1(
+        struct xfs_dir3_icleaf_hdr *leafhdr,
+        struct xfs_dir2_leaf_entry *ents,
+        int             *indexp,        /* insertion index */
+        int             *lowstalep,     /* out: stale entry before us */
+        int             *highstalep,    /* out: stale entry after us */
+        int             *lowlogp,       /* out: low log index */
+        int             *highlogp)      /* out: high log index */
+{
+        int             from;           /* source copy index */
+        int             highstale;      /* stale entry at/after index */
+        int             index;          /* insertion index */
+        int             keepstale;      /* source index of kept stale */
+        int             lowstale;       /* stale entry before index */
+        int             newindex=0;     /* new insertion index */
+        int             to;             /* destination copy index */
+        ASSERT(leafhdr->stale > 1);
+        index = *indexp;
+        xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale);
+        /*
+         * Pick the better of lowstale and highstale.
+         * lowstale wins if it exists and either there is no highstale
+         * (highstale == count) or it is at least as close to index.
+         */
+        if (lowstale >= 0 &&
+            (highstale == leafhdr->count ||
+             index - lowstale <= highstale - index))
+                keepstale = lowstale;
+        else
+                keepstale = highstale;
+        /*
+         * Copy the entries in place, removing all the stale entries
+         * except keepstale.
+         */
+        for (from = to = 0; from < leafhdr->count; from++) {
+                /*
+                 * Notice the new value of index.
+                 */
+                if (index == from)
+                        newindex = to;
+                if (from != keepstale &&
+                    ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+                        /*
+                         * from == to only holds until the first entry is
+                         * dropped, so this records the lowest slot that
+                         * will be overwritten.
+                         */
+                        if (from == to)
+                                *lowlogp = to;
+                        continue;
+                }
+                /*
+                 * Record the new keepstale value for the insertion.
+                 */
+                if (from == keepstale)
+                        lowstale = highstale = to;
+                /*
+                 * Copy only the entries that have moved.
+                 */
+                if (from > to)
+                        ents[to] = ents[from];
+                to++;
+        }
+        /* At least one stale entry must have been removed. */
+        ASSERT(from > to);
+        /*
+         * If the insertion point was past the last entry,
+         * set the new insertion point accordingly.
+         */
+        if (index == from)
+                newindex = to;
+        *indexp = newindex;
+        /*
+         * Adjust the leaf header values.
+         */
+        leafhdr->count -= from - to;
+        leafhdr->stale = 1;
+        /*
+         * Remember the low/high stale value only in the "right"
+         * direction.
+         * Both were set to the kept entry's new slot above; clear the one
+         * that lies on the wrong side of the new insertion index.
+         */
+        if (lowstale >= newindex)
+                lowstale = -1;
+        else
+                highstale = leafhdr->count;
+        /* Everything from *lowlogp to the new last entry has changed. */
+        *highlogp = leafhdr->count - 1;
+        *lowstalep = lowstale;
+        *highstalep = highstale;
+}
+/*
+ * Log an inclusive range [first, last] of bests table entries in a
+ * leaf1 block.  The byte range handed to the transaction runs from the
+ * start of entry "first" to the final byte of entry "last".
+ */
+static void
+xfs_dir3_leaf_log_bests(
+        struct xfs_da_args      *args,
+        struct xfs_buf          *bp,            /* leaf buffer */
+        int                     first,          /* first entry to log */
+        int                     last)           /* last entry to log */
+{
+        struct xfs_dir2_leaf    *leaf = bp->b_addr;
+        xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
+        __be16                  *bests;         /* start of bests table */
+        uint                    startoff;       /* first byte to log */
+        uint                    endoff;         /* last byte to log */
+        ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+               leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
+        ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+        bests = xfs_dir2_leaf_bests_p(ltp);
+        /*
+         * Convert the entry indices into byte offsets within the buffer.
+         */
+        startoff = (uint)((char *)&bests[first] - (char *)leaf);
+        endoff = (uint)((char *)&bests[last] - (char *)leaf +
+                        sizeof(bests[last]) - 1);
+        xfs_trans_log_buf(args->trans, bp, startoff, endoff);
+}
+/*
+ * Log an inclusive range [first, last] of leaf entries in a leaf1 or
+ * leafn block.  The logged byte range covers the whole of both end
+ * entries and everything between them.
+ */
+void
+xfs_dir3_leaf_log_ents(
+        struct xfs_da_args      *args,
+        struct xfs_buf          *bp,
+        int                     first,
+        int                     last)
+{
+        struct xfs_dir2_leaf    *leaf = bp->b_addr;
+        struct xfs_dir2_leaf_entry *ents;
+        uint                    startoff;       /* first byte to log */
+        uint                    endoff;         /* last byte to log */
+        ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+               leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+               leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+               leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+        ents = args->dp->d_ops->leaf_ents_p(leaf);
+        /*
+         * Convert the entry indices into byte offsets within the buffer.
+         */
+        startoff = (uint)((char *)&ents[first] - (char *)leaf);
+        endoff = (uint)((char *)&ents[last] - (char *)leaf +
+                        sizeof(ents[last]) - 1);
+        xfs_trans_log_buf(args->trans, bp, startoff, endoff);
+}
+/*
+ * Log the header of a leaf1 or leafn block: the bytes from the start of
+ * the header up to the format-specific header size taken from d_ops.
+ */
+void
+xfs_dir3_leaf_log_header(
+        struct xfs_da_args      *args,
+        struct xfs_buf          *bp)
+{
+        struct xfs_dir2_leaf    *leaf = bp->b_addr;
+        uint                    startoff;       /* first byte to log */
+        uint                    endoff;         /* last byte to log */
+        ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+               leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+               leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+               leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+        startoff = (uint)((char *)&leaf->hdr - (char *)leaf);
+        endoff = args->dp->d_ops->leaf_hdr_size - 1;
+        xfs_trans_log_buf(args->trans, bp, startoff, endoff);
+}
+/*
+ * Log the tail of the leaf1 block: everything from the tail structure
+ * through the last byte of the directory block.
+ */
+STATIC void
+xfs_dir3_leaf_log_tail(
+        struct xfs_da_args      *args,
+        struct xfs_buf          *bp)
+{
+        struct xfs_dir2_leaf    *leaf = bp->b_addr;
+        xfs_dir2_leaf_tail_t    *tail;          /* leaf tail structure */
+        uint                    startoff;       /* first byte to log */
+        uint                    endoff;         /* last byte to log */
+        ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+               leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+               leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+               leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+        tail = xfs_dir2_leaf_tail_p(args->geo, leaf);
+        startoff = (uint)((char *)tail - (char *)leaf);
+        endoff = (uint)(args->geo->blksize - 1);
+        xfs_trans_log_buf(args->trans, bp, startoff, endoff);
+}
+/*
+ * Look up the name described by args in a leaf format directory.
+ * xfs_dir2_leaf_lookup_int() does the searching and hands back the leaf
+ * buffer, the data buffer and the index of the matching leaf entry; this
+ * routine copies the inode number and file type out of the data entry
+ * into args, lets xfs_dir_cilookup_result() produce the return value,
+ * and releases both buffers.
+ */
+int
+xfs_dir2_leaf_lookup(
+        xfs_da_args_t           *args)          /* operation arguments */
+{
+        struct xfs_buf          *lbp;           /* leaf buffer */
+        struct xfs_buf          *dbp;           /* data block buffer */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        struct xfs_dir2_leaf_entry *ents;       /* leaf entry table */
+        xfs_dir2_leaf_entry_t   *lep;           /* matching leaf entry */
+        xfs_dir2_data_entry_t   *dep;           /* matching data entry */
+        int                     index;          /* found entry index */
+        int                     error;          /* error return code */
+        trace_xfs_dir2_leaf_lookup(args);
+        /*
+         * Find the name; on failure there is nothing for us to release.
+         */
+        error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp);
+        if (error)
+                return error;
+        dp = args->dp;
+        tp = args->trans;
+        xfs_dir3_leaf_check(dp, lbp);
+        /*
+         * The leaf entry's address locates the data entry inside dbp.
+         */
+        ents = dp->d_ops->leaf_ents_p(lbp->b_addr);
+        lep = &ents[index];
+        dep = (xfs_dir2_data_entry_t *)
+              ((char *)dbp->b_addr +
+               xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+        /*
+         * Hand back the inode number, file type, and the name as stored
+         * on disk where the case-insensitive lookup result calls for it.
+         */
+        args->inumber = be64_to_cpu(dep->inumber);
+        args->filetype = dp->d_ops->data_get_ftype(dep);
+        error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+        xfs_trans_brelse(tp, dbp);
+        xfs_trans_brelse(tp, lbp);
+        return error;
+}
+/*
+ * Look up name/hash in the leaf block.
+ * On success (return 0) *lbpp holds the leaf buffer, *dbpp the data buffer
+ * containing the matching entry, and *indexp the index of the matching
+ * leaf entry; the caller owns both buffers.
+ * On any non-zero return, including -ENOENT, both the leaf buffer and any
+ * data buffer read here have already been released and *dbpp has not been
+ * written, so the caller must not use either buffer pointer.
+ * args->cmpresult is updated as matches are found: an exact match returns
+ * immediately, a case-insensitive match is remembered and returned only if
+ * no exact match follows.
+ */
+static int                                      /* error */
+xfs_dir2_leaf_lookup_int(
+        xfs_da_args_t           *args,          /* operation arguments */
+        struct xfs_buf          **lbpp,         /* out: leaf buffer */
+        int                     *indexp,        /* out: index in leaf block */
+        struct xfs_buf          **dbpp)         /* out: data buffer */
+{
+        xfs_dir2_db_t           curdb = -1;     /* current data block number */
+        struct xfs_buf          *dbp = NULL;    /* data buffer */
+        xfs_dir2_data_entry_t   *dep;           /* data entry */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     error;          /* error return code */
+        int                     index;          /* index in leaf block */
+        struct xfs_buf          *lbp;           /* leaf buffer */
+        xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        xfs_dir2_db_t           newdb;          /* new data block number */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        xfs_dir2_db_t           cidb = -1;      /* case match data block no. */
+        enum xfs_dacmp          cmp;            /* name compare result */
+        struct xfs_dir2_leaf_entry *ents;
+        struct xfs_dir3_icleaf_hdr leafhdr;
+        dp = args->dp;
+        tp = args->trans;
+        mp = dp->i_mount;
+        error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
+        if (error)
+                return error;
+        /* Published early, but only meaningful to the caller on success. */
+        *lbpp = lbp;
+        leaf = lbp->b_addr;
+        xfs_dir3_leaf_check(dp, lbp);
+        ents = dp->d_ops->leaf_ents_p(leaf);
+        dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+        /*
+         * Look for the first leaf entry with our hash value.
+         */
+        index = xfs_dir2_leaf_search_hash(args, lbp);
+        /*
+         * Loop over all the entries with the right hash value
+         * looking to match the name.
+         */
+        for (lep = &ents[index];
+             index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+             lep++, index++) {
+                /*
+                 * Skip over stale leaf entries.
+                 */
+                if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+                        continue;
+                /*
+                 * Get the new data block number.
+                 */
+                newdb = xfs_dir2_dataptr_to_db(args->geo,
+                                               be32_to_cpu(lep->address));
+                /*
+                 * If it's not the same as the old data block number,
+                 * need to pitch the old one and read the new one.
+                 */
+                if (newdb != curdb) {
+                        if (dbp)
+                                xfs_trans_brelse(tp, dbp);
+                        error = xfs_dir3_data_read(tp, dp,
+                                           xfs_dir2_db_to_da(args->geo, newdb),
+                                           -1, &dbp);
+                        if (error) {
+                                /* The old dbp is already gone; drop the leaf too. */
+                                xfs_trans_brelse(tp, lbp);
+                                return error;
+                        }
+                        curdb = newdb;
+                }
+                /*
+                 * Point to the data entry.
+                 */
+                dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr +
+                        xfs_dir2_dataptr_to_off(args->geo,
+                                                be32_to_cpu(lep->address)));
+                /*
+                 * Compare name and if it's an exact match, return the index
+                 * and buffer. If it's the first case-insensitive match, store
+                 * the index and buffer and continue looking for an exact match.
+                 * The cmp != args->cmpresult test is what limits this to the
+                 * first match of each kind.
+                 */
+                cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+                if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+                        args->cmpresult = cmp;
+                        *indexp = index;
+                        /* case exact match: return the current buffer. */
+                        if (cmp == XFS_CMP_EXACT) {
+                                *dbpp = dbp;
+                                return 0;
+                        }
+                        /* Remember which data block held the case match. */
+                        cidb = curdb;
+                }
+        }
+        ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+        /*
+         * Here, we can only be doing a lookup (not a rename or remove).
+         * If a case-insensitive match was found earlier, re-read the
+         * appropriate data block if required and return it.
+         */
+        if (args->cmpresult == XFS_CMP_CASE) {
+                ASSERT(cidb != -1);
+                if (cidb != curdb) {
+                        /* The loop moved on to a later data block; go back. */
+                        xfs_trans_brelse(tp, dbp);
+                        error = xfs_dir3_data_read(tp, dp,
+                                           xfs_dir2_db_to_da(args->geo, cidb),
+                                           -1, &dbp);
+                        if (error) {
+                                xfs_trans_brelse(tp, lbp);
+                                return error;
+                        }
+                }
+                *dbpp = dbp;
+                return 0;
+        }
+        /*
+         * No match found, return -ENOENT.
+         * Both buffers are released here; *dbpp is left untouched.
+         */
+        ASSERT(cidb == -1);
+        if (dbp)
+                xfs_trans_brelse(tp, dbp);
+        xfs_trans_brelse(tp, lbp);
+        return -ENOENT;
+}
+/*
+ * Remove an entry from a leaf format directory.
+ *
+ * The name in args is looked up, its data entry is turned into free space,
+ * and its leaf entry is marked stale (not physically removed).  The bests
+ * table is updated if the data block's largest free region changed.  If the
+ * data block ends up completely empty it is given back via
+ * xfs_dir2_shrink_inode() and the bests table is trimmed or the slot is
+ * marked NULLDATAOFF.  Finally xfs_dir2_leaf_to_block() is called to see
+ * whether the directory can be converted to block form.
+ *
+ * Returns 0 or a negative errno.  An -ENOSPC from shrinking the inode is
+ * turned into success when args->total is 0, leaving the empty data block
+ * in place.
+ */
+int                                             /* error */
+xfs_dir2_leaf_removename(
+        xfs_da_args_t           *args)          /* operation arguments */
+{
+        __be16                  *bestsp;        /* leaf block best freespace */
+        xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+        xfs_dir2_db_t           db;             /* data block number */
+        struct xfs_buf          *dbp;           /* data block buffer */
+        xfs_dir2_data_entry_t   *dep;           /* data entry structure */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     error;          /* error return code */
+        xfs_dir2_db_t           i;              /* temporary data block # */
+        int                     index;          /* index into leaf entries */
+        struct xfs_buf          *lbp;           /* leaf buffer */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+        xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        int                     needlog;        /* need to log data header */
+        int                     needscan;       /* need to rescan data frees */
+        xfs_dir2_data_off_t     oldbest;        /* old value of best free */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        struct xfs_dir2_data_free *bf;          /* bestfree table */
+        struct xfs_dir2_leaf_entry *ents;
+        struct xfs_dir3_icleaf_hdr leafhdr;
+        trace_xfs_dir2_leaf_removename(args);
+        /*
+         * Lookup the leaf entry, get the leaf and data blocks read in.
+         */
+        if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+                return error;
+        }
+        dp = args->dp;
+        tp = args->trans;
+        /* NOTE(review): mp and tp are assigned but never read below. */
+        mp = dp->i_mount;
+        leaf = lbp->b_addr;
+        hdr = dbp->b_addr;
+        xfs_dir3_data_check(dp, dbp);
+        bf = dp->d_ops->data_bestfree_p(hdr);
+        dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+        ents = dp->d_ops->leaf_ents_p(leaf);
+        /*
+         * Point to the leaf entry, use that to point to the data entry.
+         */
+        lep = &ents[index];
+        db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+        dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+                xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+        needscan = needlog = 0;
+        /*
+         * Remember the current longest free region so we can tell below
+         * whether freeing the entry changed it.
+         */
+        oldbest = be16_to_cpu(bf[0].length);
+        ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+        bestsp = xfs_dir2_leaf_bests_p(ltp);
+        ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
+        /*
+         * Mark the former data entry unused.
+         */
+        xfs_dir2_data_make_free(args, dbp,
+                (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+                dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+        /*
+         * We just mark the leaf entry stale by putting a null in it.
+         * The stale count in the header is bumped and logged to match.
+         */
+        leafhdr.stale++;
+        dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+        xfs_dir3_leaf_log_header(args, lbp);
+        lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+        xfs_dir3_leaf_log_ents(args, lbp, index, index);
+        /*
+         * Scan the freespace in the data block again if necessary,
+         * log the data block header if necessary.
+         */
+        if (needscan)
+                xfs_dir2_data_freescan(dp, hdr, &needlog);
+        if (needlog)
+                xfs_dir2_data_log_header(args, dbp);
+        /*
+         * If the longest freespace in the data block has changed,
+         * put the new value in the bests table and log that.
+         */
+        if (be16_to_cpu(bf[0].length) != oldbest) {
+                bestsp[db] = bf[0].length;
+                xfs_dir3_leaf_log_bests(args, lbp, db, db);
+        }
+        xfs_dir3_data_check(dp, dbp);
+        /*
+         * If the data block is now empty then get rid of the data block.
+         * Empty means the longest free region spans everything after
+         * data_entry_offset.
+         */
+        if (be16_to_cpu(bf[0].length) ==
+                        args->geo->blksize - dp->d_ops->data_entry_offset) {
+                ASSERT(db != args->geo->datablk);
+                if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+                        /*
+                         * Nope, can't get rid of it because it caused
+                         * allocation of a bmap btree block to do so.
+                         * Just go on, returning success, leaving the
+                         * empty block in place.
+                         */
+                        if (error == -ENOSPC && args->total == 0)
+                                error = 0;
+                        xfs_dir3_leaf_check(dp, lbp);
+                        return error;
+                }
+                /* The data block is gone; don't hand it on below. */
+                dbp = NULL;
+                /*
+                 * If this is the last data block then compact the
+                 * bests table by getting rid of entries.
+                 */
+                if (db == be32_to_cpu(ltp->bestcount) - 1) {
+                        /*
+                         * Look for the last active entry (i).
+                         * The scan stops at 0 without testing slot 0.
+                         */
+                        for (i = db - 1; i > 0; i--) {
+                                if (bestsp[i] != cpu_to_be16(NULLDATAOFF))
+                                        break;
+                        }
+                        /*
+                         * Copy the table down so inactive entries at the
+                         * end are removed.
+                         * db - i slots are dropped: the removed block plus
+                         * any NULLDATAOFF slots immediately before it.
+                         */
+                        memmove(&bestsp[db - i], bestsp,
+                                (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
+                        be32_add_cpu(&ltp->bestcount, -(db - i));
+                        xfs_dir3_leaf_log_tail(args, lbp);
+                        xfs_dir3_leaf_log_bests(args, lbp, 0,
+                                                be32_to_cpu(ltp->bestcount) - 1);
+                } else
+                        bestsp[db] = cpu_to_be16(NULLDATAOFF);
+        }
+        /*
+         * If the data block was not the first one, drop it.
+         * Only the first data block's buffer is passed on to the
+         * leaf-to-block conversion below.
+         */
+        else if (db != args->geo->datablk)
+                dbp = NULL;
+        xfs_dir3_leaf_check(dp, lbp);
+        /*
+         * See if we can convert to block form.
+         */
+        return xfs_dir2_leaf_to_block(args, lbp, dbp);
+}
+/*
+ * Replace the inode number in a leaf format directory entry.
+ * The entry named by args is located, its inode number and file type are
+ * overwritten with the values from args, and the data entry is logged.
+ * The leaf buffer is released before returning; the data buffer is not
+ * released here.
+ */
+int                                             /* error */
+xfs_dir2_leaf_replace(
+        xfs_da_args_t           *args)          /* operation arguments */
+{
+        struct xfs_buf          *lbp;           /* leaf buffer */
+        struct xfs_buf          *dbp;           /* data block buffer */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        struct xfs_dir2_leaf_entry *ents;       /* leaf entry table */
+        xfs_dir2_leaf_entry_t   *lep;           /* matching leaf entry */
+        xfs_dir2_data_entry_t   *dep;           /* matching data entry */
+        int                     index;          /* index of leaf entry */
+        int                     error;          /* error return code */
+        trace_xfs_dir2_leaf_replace(args);
+        /*
+         * Find the entry; on failure there is nothing for us to release.
+         */
+        error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp);
+        if (error)
+                return error;
+        dp = args->dp;
+        /*
+         * The leaf entry's address locates the data entry inside dbp.
+         */
+        ents = dp->d_ops->leaf_ents_p(lbp->b_addr);
+        lep = &ents[index];
+        dep = (xfs_dir2_data_entry_t *)
+              ((char *)dbp->b_addr +
+               xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+        ASSERT(args->inumber != be64_to_cpu(dep->inumber));
+        /*
+         * Store the new inode number and file type, then log the entry.
+         */
+        dep->inumber = cpu_to_be64(args->inumber);
+        dp->d_ops->data_put_ftype(dep, args->filetype);
+        tp = args->trans;
+        xfs_dir2_data_log_entry(args, dbp, dep);
+        xfs_dir3_leaf_check(dp, lbp);
+        xfs_trans_brelse(tp, lbp);
+        return 0;
+}
+/*
+ * Return index in the leaf block (lbp) which is either the first
+ * one with this hash value, or if there are none, the insert point
+ * for that hash value.
+ *
+ * The leaf entries are binary searched on hashval, so they are expected
+ * to be ordered by hash.  The returned index lies in [0, count]; a value
+ * of count means the hash sorts after every existing entry.  An empty
+ * table always yields insertion point 0.
+ */
+int                                             /* index value */
+xfs_dir2_leaf_search_hash(
+        xfs_da_args_t           *args,          /* operation arguments */
+        struct xfs_buf          *lbp)           /* leaf buffer */
+{
+        xfs_dahash_t            hash=0;         /* hash from this entry */
+        xfs_dahash_t            hashwant;       /* hash value looking for */
+        int                     high;           /* high leaf index */
+        int                     low;            /* low leaf index */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        int                     mid=0;          /* current leaf index */
+        struct xfs_dir2_leaf_entry *ents;
+        struct xfs_dir3_icleaf_hdr leafhdr;
+        leaf = lbp->b_addr;
+        ents = args->dp->d_ops->leaf_ents_p(leaf);
+        args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+        /*
+         * An empty table has exactly one valid insertion point.  Without
+         * this check the search loop below never runs, hash stays 0, and
+         * any non-zero wanted hash would take the "point higher" branch
+         * and return index 1, which is beyond the end of the table.
+         */
+        if (!leafhdr.count)
+                return 0;
+        /*
+         * Binary search the leaf entries looking for our hash value.
+         * The table is not empty here, so the loop runs at least once and
+         * hash/mid describe the last entry that was probed.
+         */
+        hashwant = args->hashval;
+        low = 0;
+        high = leafhdr.count - 1;
+        while (low <= high) {
+                mid = (low + high) >> 1;
+                hash = be32_to_cpu(ents[mid].hashval);
+                if (hash == hashwant)
+                        break;
+                if (hash < hashwant)
+                        low = mid + 1;
+                else
+                        high = mid - 1;
+        }
+        /*
+         * Found one, back up through all the equal hash values.
+         */
+        if (hash == hashwant) {
+                while (mid > 0 &&
+                       be32_to_cpu(ents[mid - 1].hashval) == hashwant)
+                        mid--;
+        }
+        /*
+         * Need to point to an entry higher than ours.
+         */
+        else if (hash < hashwant)
+                mid++;
+        return mid;
+}
+/*
+ * Trim off a trailing data block.  We know it's empty since the leaf
+ * freespace table says so.
+ *
+ * The data block db is read in, handed to xfs_dir2_shrink_inode() to be
+ * removed, and the last entry is then dropped from the leaf's bests
+ * table, with the tail and the remaining bests entries logged.
+ * Returns 0 or a negative errno; on a shrink failure the data buffer is
+ * released and the leaf is left unmodified.
+ */
+int                                             /* error */
+xfs_dir2_leaf_trim_data(
+        xfs_da_args_t           *args,          /* operation arguments */
+        struct xfs_buf          *lbp,           /* leaf buffer */
+        xfs_dir2_db_t           db)             /* data block number */
+{
+        __be16                  *bestsp;        /* leaf bests table */
+        struct xfs_buf          *dbp;           /* data block buffer */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     error;          /* error return value */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        dp = args->dp;
+        tp = args->trans;
+        /*
+         * Read the offending data block.  We need its buffer.
+         */
+        error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db),
+                                   -1, &dbp);
+        if (error)
+                return error;
+        leaf = lbp->b_addr;
+        ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+#ifdef DEBUG
+{
+        /*
+         * Sanity check: it must be a data block, it must be entirely
+         * free, and it must be the last block the bests table covers.
+         */
+        struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
+        struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr);
+        ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+               hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+        ASSERT(be16_to_cpu(bf[0].length) ==
+               args->geo->blksize - dp->d_ops->data_entry_offset);
+        ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
+}
+#endif
+        /*
+         * Get rid of the data block.
+         */
+        error = xfs_dir2_shrink_inode(args, db, dbp);
+        if (error) {
+                ASSERT(error != -ENOSPC);
+                xfs_trans_brelse(tp, dbp);
+                return error;
+        }
+        /*
+         * Eliminate the last bests entry from the table.
+         * bestsp is taken before bestcount is decremented, so it is the
+         * start of the old table; the surviving entries are then moved up
+         * one slot, overwriting the old last entry.
+         */
+        bestsp = xfs_dir2_leaf_bests_p(ltp);
+        be32_add_cpu(&ltp->bestcount, -1);
+        memmove(&bestsp[1], &bestsp[0],
+                be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
+        xfs_dir3_leaf_log_tail(args, lbp);
+        xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+        return 0;
+}
+/*
+ * Number of bytes needed for a leaf block holding the live (non-stale)
+ * entries described by @hdr, @counts bests entries and the leaf tail.
+ * A DIR2 leaf magic selects the xfs_dir2_leaf_hdr size; any other magic
+ * selects the xfs_dir3_leaf_hdr size.
+ */
+static inline size_t
+xfs_dir3_leaf_size(
+        struct xfs_dir3_icleaf_hdr      *hdr,
+        int                             counts)
+{
+        int     live = hdr->count - hdr->stale;
+        int     hsize = sizeof(struct xfs_dir3_leaf_hdr);
+        if (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+            hdr->magic == XFS_DIR2_LEAFN_MAGIC)
+                hsize = sizeof(struct xfs_dir2_leaf_hdr);
+        return hsize + live * sizeof(xfs_dir2_leaf_entry_t)
+                     + counts * sizeof(xfs_dir2_data_off_t)
+                     + sizeof(xfs_dir2_leaf_tail_t);
+}
+/*
+ * Convert node form directory to leaf form directory.
+ * The root of the node form dir needs to already be a LEAFN block.
+ * Just return if we can't do anything.
+ *
+ * Returns 0 both when the conversion was carried out and when it is not
+ * possible (more than one btree level, a trailing freespace block that
+ * could not be trimmed, more than a single leaf block, or the leaf entries
+ * plus bests not fitting in one block).  Any other value is an error
+ * passed up from one of the helpers called below.
+ */
+int                                             /* error */
+xfs_dir2_node_to_leaf(
+        xfs_da_state_t          *state)         /* directory operation state */
+{
+        xfs_da_args_t           *args;          /* operation arguments */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     error;          /* error return code */
+        struct xfs_buf          *fbp;           /* buffer for freespace block */
+        xfs_fileoff_t           fo;             /* freespace file offset */
+        xfs_dir2_free_t         *free;          /* freespace structure */
+        struct xfs_buf          *lbp;           /* buffer for leaf block */
+        xfs_dir2_leaf_tail_t    *ltp;           /* tail of leaf structure */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        int                     rval;           /* successful free trim? */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        struct xfs_dir3_icleaf_hdr leafhdr;
+        struct xfs_dir3_icfree_hdr freehdr;
+        /*
+         * There's more than a leaf level in the btree, so there must
+         * be multiple leafn blocks.  Give up.
+         */
+        if (state->path.active > 1)
+                return 0;
+        args = state->args;
+        trace_xfs_dir2_node_to_leaf(args);
+        mp = state->mp;
+        dp = args->dp;
+        tp = args->trans;
+        /*
+         * Get the last offset in the file.
+         */
+        if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) {
+                return error;
+        }
+        fo -= args->geo->fsbcount;
+        /*
+         * If there are freespace blocks other than the first one,
+         * take this opportunity to remove trailing empty freespace blocks
+         * that may have been left behind during no-space-reservation
+         * operations.
+         */
+        while (fo > args->geo->freeblk) {
+                if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
+                        return error;
+                }
+                /*
+                 * A successful trim lets us look at the block before it;
+                 * otherwise the directory has to stay in node form.
+                 */
+                if (rval)
+                        fo -= args->geo->fsbcount;
+                else
+                        return 0;
+        }
+        /*
+         * Now find the block just before the freespace block.
+         */
+        if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
+                return error;
+        }
+        /*
+         * If it's not the single leaf block, give up.
+         */
+        if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize)
+                return 0;
+        lbp = state->path.blk[0].bp;
+        leaf = lbp->b_addr;
+        dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+        ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+               leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+        /*
+         * Read the freespace block.
+         */
+        error = xfs_dir2_free_read(tp, dp,  args->geo->freeblk, &fbp);
+        if (error)
+                return error;
+        free = fbp->b_addr;
+        dp->d_ops->free_hdr_from_disk(&freehdr, free);
+        ASSERT(!freehdr.firstdb);
+        /*
+         * Now see if the leafn and free data will fit in a leaf1.
+         * If not, release the buffer and give up.
+         */
+        if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) {
+                xfs_trans_brelse(tp, fbp);
+                return 0;
+        }
+        /*
+         * If the leaf has any stale entries in it, compress them out.
+         */
+        if (leafhdr.stale)
+                xfs_dir3_leaf_compact(args, &leafhdr, lbp);
+        /*
+         * Switch the buffer over to the leaf1 verifier and log type, and
+         * pick the LEAF1 magic of the same format (DIR2 or DIR3) as the
+         * LEAFN magic the block had.
+         */
+        lbp->b_ops = &xfs_dir3_leaf1_buf_ops;
+        xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF);
+        leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC)
+                                        ? XFS_DIR2_LEAF1_MAGIC
+                                        : XFS_DIR3_LEAF1_MAGIC;
+        /*
+         * Set up the leaf tail from the freespace block.
+         */
+        ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+        ltp->bestcount = cpu_to_be32(freehdr.nvalid);
+        /*
+         * Set up the leaf bests table.
+         */
+        memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free),
+                freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
+        /*
+         * Write the updated header back and log header, bests and tail.
+         */
+        dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+        xfs_dir3_leaf_log_header(args, lbp);
+        xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+        xfs_dir3_leaf_log_tail(args, lbp);
+        xfs_dir3_leaf_check(dp, lbp);
+        /*
+         * Get rid of the freespace block.
+         */
+        error = xfs_dir2_shrink_inode(args,
+                        xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET),
+                        fbp);
+        if (error) {
+                /*
+                 * This can't fail here because it can only happen when
+                 * punching out the middle of an extent, and this is an
+                 * isolated block.
+                 */
+                ASSERT(error != -ENOSPC);
+                return error;
+        }
+        /* fbp is not referenced again below. */
+        fbp = NULL;
+        /*
+         * Now see if we can convert the single-leaf directory
+         * down to a block form directory.
+         * This routine always kills the dabuf for the leaf, so
+         * eliminate it from the path.
+         */
+        error = xfs_dir2_leaf_to_block(args, lbp, NULL);
+        state->path.blk[0].bp = NULL;
+        return error;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
new file mode 100644
index 000000000000..2ae6ac2c11ae
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -0,0 +1,2284 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+/*
+ * Function declarations.
+ */
+static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args,
+                              int index);
+static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
+                                     xfs_da_state_blk_t *blk1,
+                                     xfs_da_state_blk_t *blk2);
+static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
+                                 int index, xfs_da_state_blk_t *dblk,
+                                 int *rval);
+static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
+                                     xfs_da_state_blk_t *fblk);
+/*
+ * Check internal consistency of a leafn block.
+ *
+ * On DEBUG builds xfs_dir3_leaf_check() asserts when xfs_dir3_leafn_check()
+ * rejects the buffer; on other builds it does nothing.  Both expansions are
+ * a single statement without a trailing semicolon, so a call written as
+ * "xfs_dir3_leaf_check(dp, bp);" stays one statement even as the body of an
+ * unbraced if/else.
+ */
+#ifdef DEBUG
+#define xfs_dir3_leaf_check(dp, bp) \
+do { \
+        if (!xfs_dir3_leafn_check((dp), (bp))) \
+                ASSERT(0); \
+} while (0)
+/*
+ * Returns true if the buffer looks like a sane leafn block: a DIR3 leafn
+ * block must record its own block number, a DIR2 leafn block only needs the
+ * right magic, and either way the entries must pass
+ * xfs_dir3_leaf_check_int().
+ */
+static bool
+xfs_dir3_leafn_check(
+        struct xfs_inode        *dp,
+        struct xfs_buf          *bp)
+{
+        struct xfs_dir2_leaf    *leaf = bp->b_addr;
+        struct xfs_dir3_icleaf_hdr leafhdr;
+        dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+        if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
+                struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+                if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+                        return false;
+        } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
+                return false;
+        return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+}
+#else
+#define xfs_dir3_leaf_check(dp, bp)     do { } while (0)
+#endif
+/*
+ * Structural check of a directory free space block, shared by the read and
+ * write verifiers.  With CRCs enabled the magic, filesystem UUID and
+ * recorded block number must all match; without CRCs only the magic is
+ * checked.
+ *
+ * XXX: should bounds check the xfs_dir3_icfree_hdr here
+ */
+static bool
+xfs_dir3_free_verify(
+        struct xfs_buf          *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        struct xfs_dir3_blk_hdr *v3hdr;
+        struct xfs_dir2_free_hdr *v2hdr;
+        if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+                v2hdr = bp->b_addr;
+                return v2hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC);
+        }
+        v3hdr = bp->b_addr;
+        if (v3hdr->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC) ||
+            !uuid_equal(&v3hdr->uuid, &mp->m_sb.sb_uuid) ||
+            be64_to_cpu(v3hdr->blkno) != bp->b_bn)
+                return false;
+        return true;
+}
+/*
+ * Read verifier for free space blocks.  A checksum mismatch (CRC
+ * filesystems only) is flagged as -EFSBADCRC, a structural failure as
+ * -EFSCORRUPTED; any error left on the buffer is then reported.
+ */
+static void
+xfs_dir3_free_read_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        bool                    crc_bad;
+        crc_bad = xfs_sb_version_hascrc(&mp->m_sb) &&
+                  !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
+        if (crc_bad) {
+                xfs_buf_ioerror(bp, -EFSBADCRC);
+        } else if (!xfs_dir3_free_verify(bp)) {
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        }
+        if (bp->b_error)
+                xfs_verifier_error(bp);
+}
+/*
+ * Write verifier for free space blocks.  A block that fails the structural
+ * check is flagged -EFSCORRUPTED and reported.  Otherwise, on CRC
+ * filesystems, the LSN of the attached log item (if any) is stamped into
+ * the header and the checksum is recomputed.
+ */
+static void
+xfs_dir3_free_write_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_buf_log_item *logitem = bp->b_fspriv;
+        struct xfs_dir3_blk_hdr *v3hdr = bp->b_addr;
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        if (xfs_dir3_free_verify(bp)) {
+                if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                        if (logitem)
+                                v3hdr->lsn =
+                                        cpu_to_be64(logitem->bli_item.li_lsn);
+                        xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
+                }
+                return;
+        }
+        xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        xfs_verifier_error(bp);
+}
+/*
+ * Buffer verifier operations for directory free space blocks, wiring up
+ * the read and write verifiers defined above.
+ */
+const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+        .verify_read = xfs_dir3_free_read_verify,
+        .verify_write = xfs_dir3_free_write_verify,
+};
+/*
+ * Read free space block @fbno of directory @dp through the da layer with
+ * the free block verifier attached.  When the read succeeds, a transaction
+ * is supplied and a buffer came back, the buffer is tagged as a directory
+ * free block.  Returns the result of xfs_da_read_buf().
+ */
+static int
+__xfs_dir3_free_read(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *dp,
+        xfs_dablk_t             fbno,
+        xfs_daddr_t             mappedbno,
+        struct xfs_buf          **bpp)
+{
+        int                     error;
+        error = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+                                XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
+        if (error)
+                return error;
+        /* try read returns without an error or *bpp if it lands in a hole */
+        if (tp && *bpp)
+                xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
+        return 0;
+}
+/*
+ * Read free space block @fbno of directory @dp into *bpp, passing -1 as the
+ * mapped block number to __xfs_dir3_free_read().
+ * NOTE(review): -1 presumably makes the da layer map @fbno itself and treat
+ * a hole as an error -- confirm against xfs_da_read_buf().
+ */
+int
+xfs_dir2_free_read(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *dp,
+        xfs_dablk_t             fbno,
+        struct xfs_buf          **bpp)
+{
+        return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp);
+}
+/*
+ * Like xfs_dir2_free_read(), but passes -2 as the mapped block number.
+ * Per the comment in __xfs_dir3_free_read(), a try read that lands in a
+ * hole returns without an error and without a buffer in *bpp, so callers
+ * have to check *bpp as well as the return value.
+ */
+static int
+xfs_dir2_free_try_read(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *dp,
+        xfs_dablk_t             fbno,
+        struct xfs_buf          **bpp)
+{
+        return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
+}
+/*
+ * Get a buffer for free space block @fbno and initialise it as an empty
+ * free block: the buffer is typed, given the free block verifier, and its
+ * header is zeroed and rewritten with the magic for this filesystem format
+ * (plus block number, owner and UUID when CRCs are enabled).
+ * On success the buffer is returned in *bpp and 0 is returned; otherwise
+ * the error from xfs_da_get_buf() is returned.
+ */
+static int
+xfs_dir3_free_get_buf(
+        xfs_da_args_t           *args,
+        xfs_dir2_db_t           fbno,
+        struct xfs_buf          **bpp)
+{
+        struct xfs_inode        *dp = args->dp;
+        struct xfs_mount        *mp = dp->i_mount;
+        struct xfs_trans        *tp = args->trans;
+        struct xfs_dir3_icfree_hdr icfree;
+        struct xfs_buf          *fbp;
+        int                     rc;
+        rc = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
+                            -1, &fbp, XFS_DATA_FORK);
+        if (rc)
+                return rc;
+        xfs_trans_buf_set_type(tp, fbp, XFS_BLFT_DIR_FREE_BUF);
+        fbp->b_ops = &xfs_dir3_free_buf_ops;
+        /*
+         * Start from an all-zero header, both on disk and in core, so the
+         * new block is empty.
+         */
+        memset(fbp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
+        memset(&icfree, 0, sizeof(icfree));
+        if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+                icfree.magic = XFS_DIR2_FREE_MAGIC;
+        } else {
+                struct xfs_dir3_free_hdr *hdr3 = fbp->b_addr;
+                icfree.magic = XFS_DIR3_FREE_MAGIC;
+                hdr3->hdr.blkno = cpu_to_be64(fbp->b_bn);
+                hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
+                uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
+        }
+        dp->d_ops->free_hdr_to_disk(fbp->b_addr, &icfree);
+        *bpp = fbp;
+        return 0;
+}
+/*
+ * Log the bests entries @first through @last (inclusive) of a freespace
+ * block, expressed as a byte range relative to the start of the buffer.
+ */
+STATIC void
+xfs_dir2_free_log_bests(
+        struct xfs_da_args      *args,
+        struct xfs_buf          *bp,
+        int                     first,          /* first entry to log */
+        int                     last)           /* last entry to log */
+{
+        xfs_dir2_free_t         *free = bp->b_addr;
+        __be16                  *bests = args->dp->d_ops->free_bests_p(free);
+        char                    *base = (char *)free;
+        uint                    lo;             /* first byte to log */
+        uint                    hi;             /* last byte to log */
+        ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+               free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+        lo = (uint)((char *)&bests[first] - base);
+        hi = (uint)((char *)&bests[last] - base + sizeof(bests[0]) - 1);
+        xfs_trans_log_buf(args->trans, bp, lo, hi);
+}
+/*
+ * Log header from a freespace block.
+ *
+ * The logged range runs from byte 0 to the last byte of the free block
+ * header, whose size comes from the directory ops (free_hdr_size).
+ */
+static void
+xfs_dir2_free_log_header(
+        struct xfs_da_args      *args,
+        struct xfs_buf          *bp)
+{
+#ifdef DEBUG
+        /* Debug builds also check that the buffer holds a free block. */
+        xfs_dir2_free_t         *free;          /* freespace structure */
+        free = bp->b_addr;
+        ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+               free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+#endif
+        xfs_trans_log_buf(args->trans, bp, 0,
+                          args->dp->d_ops->free_hdr_size - 1);
+}
+/*
+ * Convert a leaf-format directory to a node-format directory.
+ * We need to change the magic number of the leaf block, and copy
+ * the freespace table out of the leaf block into its own block.
+ *
+ * @lbp is the existing leaf1 buffer; it is converted in place to a leafn
+ * block.  Returns 0 on success, or the error from xfs_dir2_grow_inode() or
+ * xfs_dir3_free_get_buf().
+ */
+int                                             /* error */
+xfs_dir2_leaf_to_node(
+        xfs_da_args_t           *args,          /* operation arguments */
+        struct xfs_buf          *lbp)           /* leaf buffer */
+{
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     error;          /* error return value */
+        struct xfs_buf          *fbp;           /* freespace buffer */
+        xfs_dir2_db_t           fdb;            /* freespace block number */
+        xfs_dir2_free_t         *free;          /* freespace structure */
+        __be16                  *from;          /* pointer to freespace entry */
+        int                     i;              /* leaf freespace index */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        xfs_dir2_leaf_tail_t    *ltp;           /* leaf tail structure */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        int                     n;              /* count of live freespc ents */
+        xfs_dir2_data_off_t     off;            /* freespace entry value */
+        __be16                  *to;            /* pointer to freespace entry */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        struct xfs_dir3_icfree_hdr freehdr;
+        trace_xfs_dir2_leaf_to_node(args);
+        dp = args->dp;
+        mp = dp->i_mount;
+        tp = args->trans;
+        /*
+         * Add a freespace block to the directory.
+         */
+        if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
+                return error;
+        }
+        ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+        /*
+         * Get the buffer for the new freespace block.
+         */
+        error = xfs_dir3_free_get_buf(args, fdb, &fbp);
+        if (error)
+                return error;
+        free = fbp->b_addr;
+        dp->d_ops->free_hdr_from_disk(&freehdr, free);
+        leaf = lbp->b_addr;
+        ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+        ASSERT(be32_to_cpu(ltp->bestcount) <=
+                                (uint)dp->i_d.di_size / args->geo->blksize);
+        /*
+         * Copy freespace entries from the leaf block to the new block.
+         * Count active entries.
+         * Every entry is copied, including NULLDATAOFF ones; only entries
+         * other than NULLDATAOFF are counted in n.
+         */
+        from = xfs_dir2_leaf_bests_p(ltp);
+        to = dp->d_ops->free_bests_p(free);
+        for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
+                if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
+                        n++;
+                *to = cpu_to_be16(off);
+        }
+        /*
+         * Now initialize the freespace block header.
+         */
+        freehdr.nused = n;
+        freehdr.nvalid = be32_to_cpu(ltp->bestcount);
+        dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+        xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1);
+        xfs_dir2_free_log_header(args, fbp);
+        /*
+         * Converting the leaf to a leafnode is just a matter of changing the
+         * magic number and the ops. Do the change directly to the buffer as
+         * it's less work (and less code) than decoding the header to host
+         * format and back again.
+         */
+        if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC))
+                leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+        else
+                leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+        lbp->b_ops = &xfs_dir3_leafn_buf_ops;
+        xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
+        xfs_dir3_leaf_log_header(args, lbp);
+        xfs_dir3_leaf_check(dp, lbp);
+        return 0;
+}
+/*
+ * Add a leaf entry to a leaf block in a node-form directory.
+ * The other work necessary is done from the caller.
+ *
+ * @bp is the leafn buffer and @index the insertion point for the new
+ * entry.  The entry takes its hash from args->hashval and its address from
+ * args->blkno and args->index.
+ *
+ * Returns 0 on success (or, with XFS_DA_OP_JUSTCHECK set, when the entry
+ * would fit), -ENOSPC if the block is full and has no stale entries to
+ * reuse, and -EFSCORRUPTED for a negative @index.
+ */
+static int                                      /* error */
+xfs_dir2_leafn_add(
+        struct xfs_buf          *bp,            /* leaf buffer */
+        xfs_da_args_t           *args,          /* operation arguments */
+        int                     index)          /* insertion pt for new entry */
+{
+        int                     compact;        /* compacting stale leaves */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     highstale = 0;  /* next stale entry */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+        int                     lfloghigh;      /* high leaf entry logging */
+        int                     lfloglow;       /* low leaf entry logging */
+        int                     lowstale = 0;   /* previous stale entry */
+        struct xfs_dir3_icleaf_hdr leafhdr;
+        struct xfs_dir2_leaf_entry *ents;
+        trace_xfs_dir2_leafn_add(args, index);
+        dp = args->dp;
+        leaf = bp->b_addr;
+        dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+        ents = dp->d_ops->leaf_ents_p(leaf);
+        /*
+         * Quick check just to make sure we are not going to index
+         * into other people's memory
+         */
+        if (index < 0)
+                return -EFSCORRUPTED;
+        /*
+         * If there are already the maximum number of leaf entries in
+         * the block, if there are no stale entries it won't fit.
+         * Caller will do a split.  If there are stale entries we'll do
+         * a compact.
+         */
+        if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
+                if (!leafhdr.stale)
+                        return -ENOSPC;
+                compact = leafhdr.stale > 1;
+        } else
+                compact = 0;
+        ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval);
+        ASSERT(index == leafhdr.count ||
+               be32_to_cpu(ents[index].hashval) >= args->hashval);
+        if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+                return 0;
+        /*
+         * Compact out all but one stale leaf entry.  Leaves behind
+         * the entry closest to index.
+         */
+        if (compact)
+                xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+                                         &highstale, &lfloglow, &lfloghigh);
+        else if (leafhdr.stale) {
+                /*
+                 * Set impossible logging indices for this case.
+                 */
+                lfloglow = leafhdr.count;
+                lfloghigh = -1;
+        }
+        /*
+         * Insert the new entry, log everything.
+         * lowstale and highstale are only filled in by the compaction
+         * above; they start out as 0 so that no indeterminate value is
+         * passed by value when no compaction was done.
+         */
+        lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
+                                       highstale, &lfloglow, &lfloghigh);
+        lep->hashval = cpu_to_be32(args->hashval);
+        lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
+                                args->blkno, args->index));
+        dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+        xfs_dir3_leaf_log_header(args, bp);
+        xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh);
+        xfs_dir3_leaf_check(dp, bp);
+        return 0;
+}
+#ifdef DEBUG
+/*
+ * Debug-only check that the free block in @bp covers data block @db: its
+ * first data block number is a multiple of the per-block bests capacity,
+ * and @db lies in [firstdb, firstdb + nvalid).  Compiles away to nothing
+ * on non-DEBUG builds.
+ */
+static void
+xfs_dir2_free_hdr_check(
+        struct xfs_inode *dp,
+        struct xfs_buf  *bp,
+        xfs_dir2_db_t   db)
+{
+        struct xfs_dir3_icfree_hdr hdr;
+        dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);
+        ASSERT((hdr.firstdb %
+                dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0);
+        ASSERT(hdr.firstdb <= db);
+        ASSERT(db < hdr.firstdb + hdr.nvalid);
+}
+#else
+#define xfs_dir2_free_hdr_check(dp, bp, db)
+#endif  /* DEBUG */
+/*
+ * Return the hash value of the final entry in a leafn block, or 0 when the
+ * block holds no entries.  Stale entries are ok.
+ * If @count is non-NULL it receives the number of entries in the leaf.
+ */
+xfs_dahash_t                                    /* hash value */
+xfs_dir2_leafn_lasthash(
+        struct xfs_inode *dp,
+        struct xfs_buf  *bp,                    /* leaf buffer */
+        int             *count)                 /* count of entries in leaf */
+{
+        struct xfs_dir3_icleaf_hdr leafhdr;
+        struct xfs_dir2_leaf_entry *last;
+        struct xfs_dir2_leaf    *lp = bp->b_addr;
+        dp->d_ops->leaf_hdr_from_disk(&leafhdr, lp);
+        ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+               leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+        if (count != NULL)
+                *count = leafhdr.count;
+        if (leafhdr.count == 0)
+                return 0;
+        last = dp->d_ops->leaf_ents_p(lp) + (leafhdr.count - 1);
+        return be32_to_cpu(last->hashval);
+}
+/*
+ * Look up a leaf entry for space to add a name in a node-format leaf block.
+ * The extrablk in state is a freespace block.
+ *
+ * Walks the leaf entries whose hash equals args->hashval and, for each new
+ * data block they reference, checks the matching free block's bests entry
+ * for room for the new name.
+ *
+ * On the normal path this returns -ENOENT with *indexp set to the leaf
+ * insertion point and state->extrablk describing the free block last
+ * examined (extrablk.index is the bests index with room, or -1 if none was
+ * found).  A failed free block read returns that error, and a bests entry
+ * of NULLDATAOFF returns -EFSCORRUPTED.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_addname(
+        struct xfs_buf          *bp,            /* leaf buffer */
+        xfs_da_args_t           *args,          /* operation arguments */
+        int                     *indexp,        /* out: leaf entry index */
+        xfs_da_state_t          *state)         /* state to fill in */
+{
+        struct xfs_buf          *curbp = NULL;  /* current data/free buffer */
+        xfs_dir2_db_t           curdb = -1;     /* current data block number */
+        xfs_dir2_db_t           curfdb = -1;    /* current free block number */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     error;          /* error return value */
+        int                     fi;             /* free entry index */
+        xfs_dir2_free_t         *free = NULL;   /* free block structure */
+        int                     index;          /* leaf entry index */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        int                     length;         /* length of new data entry */
+        xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        xfs_dir2_db_t           newdb;          /* new data block number */
+        xfs_dir2_db_t           newfdb;         /* new free block number */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        struct xfs_dir2_leaf_entry *ents;
+        struct xfs_dir3_icleaf_hdr leafhdr;
+        dp = args->dp;
+        tp = args->trans;
+        mp = dp->i_mount;
+        leaf = bp->b_addr;
+        dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+        ents = dp->d_ops->leaf_ents_p(leaf);
+        xfs_dir3_leaf_check(dp, bp);
+        ASSERT(leafhdr.count > 0);
+        /*
+         * Look up the hash value in the leaf entries.
+         */
+        index = xfs_dir2_leaf_search_hash(args, bp);
+        /*
+         * Do we have a buffer coming in?
+         */
+        if (state->extravalid) {
+                /* If so, it's a free block buffer, get the block number. */
+                curbp = state->extrablk.bp;
+                curfdb = state->extrablk.blkno;
+                free = curbp->b_addr;
+                ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+                       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+        }
+        length = dp->d_ops->data_entsize(args->namelen);
+        /*
+         * Loop over leaf entries with the right hash value.
+         */
+        for (lep = &ents[index];
+             index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+             lep++, index++) {
+                /*
+                 * Skip stale leaf entries.
+                 */
+                if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+                        continue;
+                /*
+                 * Pull the data block number from the entry.
+                 */
+                newdb = xfs_dir2_dataptr_to_db(args->geo,
+                                               be32_to_cpu(lep->address));
+                /*
+                 * For addname, we're looking for a place to put the new entry.
+                 * We want to use a data block with an entry of equal
+                 * hash value to ours if there is one with room.
+                 *
+                 * If this block isn't the data block we already have
+                 * in hand, take a look at it.
+                 */
+                if (newdb != curdb) {
+                        __be16 *bests;
+                        curdb = newdb;
+                        /*
+                         * Convert the data block to the free block
+                         * holding its freespace information.
+                         */
+                        newfdb = dp->d_ops->db_to_fdb(args->geo, newdb);
+                        /*
+                         * If it's not the one we have in hand, read it in.
+                         */
+                        if (newfdb != curfdb) {
+                                /*
+                                 * If we had one before, drop it.
+                                 */
+                                if (curbp)
+                                        xfs_trans_brelse(tp, curbp);
+                                error = xfs_dir2_free_read(tp, dp,
+                                                xfs_dir2_db_to_da(args->geo,
+                                                                  newfdb),
+                                                &curbp);
+                                if (error)
+                                        return error;
+                                free = curbp->b_addr;
+                                xfs_dir2_free_hdr_check(dp, curbp, curdb);
+                        }
+                        /*
+                         * Get the index for our entry.
+                         */
+                        fi = dp->d_ops->db_to_fdindex(args->geo, curdb);
+                        /*
+                         * If it has room, return it.
+                         *
+                         * A NULLDATAOFF bests entry for a data block that a
+                         * live leaf entry points at is reported as
+                         * corruption.  curfdb still holds the previous free
+                         * block number here, so the buffer is only released
+                         * if it was read in by this iteration.
+                         */
+                        bests = dp->d_ops->free_bests_p(free);
+                        if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
+                                XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
+                                                        XFS_ERRLEVEL_LOW, mp);
+                                if (curfdb != newfdb)
+                                        xfs_trans_brelse(tp, curbp);
+                                return -EFSCORRUPTED;
+                        }
+                        curfdb = newfdb;
+                        if (be16_to_cpu(bests[fi]) >= length)
+                                goto out;
+                }
+        }
+        /* Didn't find any space */
+        fi = -1;
+out:
+        ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+        if (curbp) {
+                /* Giving back a free block. */
+                state->extravalid = 1;
+                state->extrablk.bp = curbp;
+                state->extrablk.index = fi;
+                state->extrablk.blkno = curfdb;
+                /*
+                 * Important: this magic number is not in the buffer - it's for
+                 * buffer type information and therefore only the free/data type
+                 * matters here, not whether CRCs are enabled or not.
+                 */
+                state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+        } else {
+                state->extravalid = 0;
+        }
+        /*
+         * Return the index, that will be the insertion point.
+         */
+        *indexp = index;
+        return -ENOENT;
+}
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * The extrablk in state is a data block.
+ *
+ * Walks the leaf entries whose hash equals args->hashval and compares the
+ * name of each referenced data entry against the name in args.
+ *
+ * Returns -EEXIST as soon as an exact match is found; args->inumber,
+ * args->filetype, args->cmpresult and *indexp describe the match and
+ * state->extrablk refers to the data block holding it.
+ * Returns -ENOENT when the loop finishes without an exact match; *indexp is
+ * then the leaf index at which the loop stopped.  A case-insensitive match
+ * found on the way is remembered in args and state->extrablk.
+ * Any error from reading a data block is returned unchanged.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_entry(
+        struct xfs_buf          *bp,            /* leaf buffer */
+        xfs_da_args_t           *args,          /* operation arguments */
+        int                     *indexp,        /* out: leaf entry index */
+        xfs_da_state_t          *state)         /* state to fill in */
+{
+        struct xfs_buf          *curbp = NULL;  /* current data/free buffer */
+        xfs_dir2_db_t           curdb = -1;     /* current data block number */
+        xfs_dir2_data_entry_t   *dep;           /* data block entry */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     error;          /* error return value */
+        int                     index;          /* leaf entry index */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        xfs_dir2_db_t           newdb;          /* new data block number */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        enum xfs_dacmp          cmp;            /* comparison result */
+        struct xfs_dir2_leaf_entry *ents;       /* leaf entry array */
+        struct xfs_dir3_icleaf_hdr leafhdr;     /* in-core leaf header */
+        dp = args->dp;
+        tp = args->trans;
+        mp = dp->i_mount;
+        leaf = bp->b_addr;
+        dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+        ents = dp->d_ops->leaf_ents_p(leaf);
+        xfs_dir3_leaf_check(dp, bp);
+        ASSERT(leafhdr.count > 0);
+        /*
+         * Look up the hash value in the leaf entries.
+         */
+        index = xfs_dir2_leaf_search_hash(args, bp);
+        /*
+         * Do we have a buffer coming in?
+         */
+        if (state->extravalid) {
+                curbp = state->extrablk.bp;
+                curdb = state->extrablk.blkno;
+        }
+        /*
+         * Loop over leaf entries with the right hash value.
+         */
+        for (lep = &ents[index];
+             index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+             lep++, index++) {
+                /*
+                 * Skip stale leaf entries.
+                 */
+                if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+                        continue;
+                /*
+                 * Pull the data block number from the entry.
+                 */
+                newdb = xfs_dir2_dataptr_to_db(args->geo,
+                                               be32_to_cpu(lep->address));
+                /*
+                 * Not adding a new entry, so we really want to find
+                 * the name given to us.
+                 *
+                 * If it's a different data block, go get it.
+                 */
+                if (newdb != curdb) {
+                        /*
+                         * If we had a block before that we aren't saving
+                         * for a CI name, drop it
+                         */
+                        if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
+                                                curdb != state->extrablk.blkno))
+                                xfs_trans_brelse(tp, curbp);
+                        /*
+                         * If needing the block that is saved with a CI match,
+                         * use it otherwise read in the new data block.
+                         */
+                        if (args->cmpresult != XFS_CMP_DIFFERENT &&
+                                        newdb == state->extrablk.blkno) {
+                                ASSERT(state->extravalid);
+                                curbp = state->extrablk.bp;
+                        } else {
+                                error = xfs_dir3_data_read(tp, dp,
+                                                xfs_dir2_db_to_da(args->geo,
+                                                                  newdb),
+                                                -1, &curbp);
+                                if (error)
+                                        return error;
+                        }
+                        xfs_dir3_data_check(dp, curbp);
+                        curdb = newdb;
+                }
+                /*
+                 * Point to the data entry.
+                 */
+                dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
+                        xfs_dir2_dataptr_to_off(args->geo,
+                                                be32_to_cpu(lep->address)));
+                /*
+                 * Compare the entry and if it's an exact match, return
+                 * EEXIST immediately. If it's the first case-insensitive
+                 * match, store the block & inode number and continue looking.
+                 */
+                cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+                if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+                        /* If there is a CI match block, drop it */
+                        if (args->cmpresult != XFS_CMP_DIFFERENT &&
+                                                curdb != state->extrablk.blkno)
+                                xfs_trans_brelse(tp, state->extrablk.bp);
+                        /*
+                         * Record the match in args and make extrablk refer to
+                         * the data block (and byte offset) of this entry.
+                         */
+                        args->cmpresult = cmp;
+                        args->inumber = be64_to_cpu(dep->inumber);
+                        args->filetype = dp->d_ops->data_get_ftype(dep);
+                        *indexp = index;
+                        state->extravalid = 1;
+                        state->extrablk.bp = curbp;
+                        state->extrablk.blkno = curdb;
+                        state->extrablk.index = (int)((char *)dep -
+                                                        (char *)curbp->b_addr);
+                        state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+                        curbp->b_ops = &xfs_dir3_data_buf_ops;
+                        xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
+                        if (cmp == XFS_CMP_EXACT)
+                                return -EEXIST;
+                }
+        }
+        ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
+        if (curbp) {
+                if (args->cmpresult == XFS_CMP_DIFFERENT) {
+                        /* Giving back last used data block. */
+                        state->extravalid = 1;
+                        state->extrablk.bp = curbp;
+                        state->extrablk.index = -1;
+                        state->extrablk.blkno = curdb;
+                        state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+                        curbp->b_ops = &xfs_dir3_data_buf_ops;
+                        xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
+                } else {
+                        /* If the curbp is not the CI match block, drop it */
+                        if (state->extrablk.bp != curbp)
+                                xfs_trans_brelse(tp, curbp);
+                }
+        } else {
+                state->extravalid = 0;
+        }
+        /*
+         * No exact match: report the index at which the scan stopped.
+         */
+        *indexp = index;
+        return -ENOENT;
+}
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ *
+ * Dispatches on XFS_DA_OP_ADDNAME in args->op_flags: an addname lookup goes
+ * to the variant whose extrablk in state is a freespace block, every other
+ * lookup goes to the variant whose extrablk is a data block.  The result of
+ * the chosen variant is returned unchanged.
+ */
+int
+xfs_dir2_leafn_lookup_int(
+        struct xfs_buf          *bp,            /* leaf buffer */
+        xfs_da_args_t           *args,          /* operation arguments */
+        int                     *indexp,        /* out: leaf entry index */
+        xfs_da_state_t          *state)         /* state to fill in */
+{
+        if (!(args->op_flags & XFS_DA_OP_ADDNAME))
+                return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp,
+                                                       state);
+
+        return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp, state);
+}
+/*
+ * Move count leaf entries from source to destination leaf.
+ * Log the entries that change.  Stale entries are preserved.
+ *
+ * Only the in-core headers (shdr/dhdr) are updated here; converting them
+ * back to disk format and logging them is left to the caller.
+ */
+static void
+xfs_dir3_leafn_moveents(
+        xfs_da_args_t                   *args,  /* operation arguments */
+        struct xfs_buf                  *bp_s,  /* source */
+        struct xfs_dir3_icleaf_hdr      *shdr,
+        struct xfs_dir2_leaf_entry      *sents,
+        int                             start_s,/* source leaf index */
+        struct xfs_buf                  *bp_d,  /* destination */
+        struct xfs_dir3_icleaf_hdr      *dhdr,
+        struct xfs_dir2_leaf_entry      *dents,
+        int                             start_d,/* destination leaf index */
+        int                             count)  /* count of leaves to copy */
+{
+        int                             stale;  /* count stale leaves copied */
+        trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
+        /*
+         * Silently return if nothing to do.
+         */
+        if (count == 0)
+                return;
+        /*
+         * If the destination index is not the end of the current
+         * destination leaf entries, open up a hole in the destination
+         * to hold the new entries.
+         */
+        if (start_d < dhdr->count) {
+                memmove(&dents[start_d + count], &dents[start_d],
+                        (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
+                xfs_dir3_leaf_log_ents(args, bp_d, start_d + count,
+                                       count + dhdr->count - 1);
+        }
+        /*
+         * If the source has stale leaves, count the ones in the copy range
+         * so we can update the header correctly.
+         */
+        if (shdr->stale) {
+                int     i;                      /* temp leaf index */
+                for (i = start_s, stale = 0; i < start_s + count; i++) {
+                        if (sents[i].address ==
+                                        cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+                                stale++;
+                }
+        } else
+                stale = 0;
+        /*
+         * Copy the leaf entries from source to destination.
+         */
+        memcpy(&dents[start_d], &sents[start_s],
+                count * sizeof(xfs_dir2_leaf_entry_t));
+        xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1);
+        /*
+         * If there are source entries after the ones we copied,
+         * delete the ones we copied by sliding the next ones down.
+         * The number of entries to slide is what remains after the copied
+         * range, which is not necessarily the number of entries copied.
+         */
+        if (start_s + count < shdr->count) {
+                int     remain = shdr->count - (start_s + count);
+
+                memmove(&sents[start_s], &sents[start_s + count],
+                        remain * sizeof(xfs_dir2_leaf_entry_t));
+                xfs_dir3_leaf_log_ents(args, bp_s, start_s,
+                                       start_s + remain - 1);
+        }
+        /*
+         * Update the in-core headers; the caller writes them back and
+         * logs them.
+         */
+        shdr->count -= count;
+        shdr->stale -= stale;
+        dhdr->count += count;
+        dhdr->stale += stale;
+}
+/*
+ * Determine the sort order of two leaf blocks.
+ *
+ * Returns 1 when both blocks hold at least one entry and leaf2 sorts before
+ * leaf1, judged by either the first or the last hash value of each block;
+ * returns 0 in every other case.
+ */
+int                                             /* sort order */
+xfs_dir2_leafn_order(
+        struct xfs_inode        *dp,
+        struct xfs_buf          *leaf1_bp,              /* leaf1 buffer */
+        struct xfs_buf          *leaf2_bp)              /* leaf2 buffer */
+{
+        struct xfs_dir2_leaf    *lf1 = leaf1_bp->b_addr;
+        struct xfs_dir2_leaf    *lf2 = leaf2_bp->b_addr;
+        struct xfs_dir3_icleaf_hdr hdr1;
+        struct xfs_dir3_icleaf_hdr hdr2;
+        struct xfs_dir2_leaf_entry *ents1;
+        struct xfs_dir2_leaf_entry *ents2;
+
+        dp->d_ops->leaf_hdr_from_disk(&hdr1, lf1);
+        dp->d_ops->leaf_hdr_from_disk(&hdr2, lf2);
+        ents1 = dp->d_ops->leaf_ents_p(lf1);
+        ents2 = dp->d_ops->leaf_ents_p(lf2);
+
+        /* An empty block cannot be ordered against anything. */
+        if (!(hdr1.count > 0 && hdr2.count > 0))
+                return 0;
+
+        /* leaf2 starts with a lower hash than leaf1. */
+        if (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval))
+                return 1;
+
+        /* Otherwise decide on the last hash value of each block. */
+        if (be32_to_cpu(ents2[hdr2.count - 1].hashval) <
+            be32_to_cpu(ents1[hdr1.count - 1].hashval))
+                return 1;
+
+        return 0;
+}
+/*
+ * Rebalance leaf entries between two leaf blocks.
+ * This is actually only called when the second block is new,
+ * though the code deals with the general case.
+ * A new entry will be inserted in one of the blocks, and that
+ * entry is taken into account when balancing.
+ *
+ * Besides moving entries and logging both leaf headers, this sets
+ * state->inleaf (the caller uses it to pick the block that receives the new
+ * entry) and adjusts the insertion index of the second block.
+ */
+static void
+xfs_dir2_leafn_rebalance(
+        xfs_da_state_t          *state,         /* btree cursor */
+        xfs_da_state_blk_t      *blk1,          /* first btree block */
+        xfs_da_state_blk_t      *blk2)          /* second btree block */
+{
+        xfs_da_args_t           *args;          /* operation arguments */
+        int                     count;          /* count (& direction) leaves */
+        int                     isleft;         /* new goes in left leaf */
+        xfs_dir2_leaf_t         *leaf1;         /* first leaf structure */
+        xfs_dir2_leaf_t         *leaf2;         /* second leaf structure */
+        int                     mid;            /* midpoint leaf index */
+#if defined(DEBUG) || defined(XFS_WARN)
+        int                     oldstale;       /* old count of stale leaves */
+#endif
+        int                     oldsum;         /* old total leaf count */
+        int                     swap;           /* swapped leaf blocks */
+        struct xfs_dir2_leaf_entry *ents1;
+        struct xfs_dir2_leaf_entry *ents2;
+        struct xfs_dir3_icleaf_hdr hdr1;
+        struct xfs_dir3_icleaf_hdr hdr2;
+        struct xfs_inode        *dp = state->args->dp;
+        args = state->args;
+        /*
+         * If the block order is wrong, swap the arguments.
+         */
+        swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp);
+        if (swap) {
+                xfs_da_state_blk_t      *tmp;   /* temp for block swap */
+                tmp = blk1;
+                blk1 = blk2;
+                blk2 = tmp;
+        }
+        leaf1 = blk1->bp->b_addr;
+        leaf2 = blk2->bp->b_addr;
+        dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+        dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
+        ents1 = dp->d_ops->leaf_ents_p(leaf1);
+        ents2 = dp->d_ops->leaf_ents_p(leaf2);
+        oldsum = hdr1.count + hdr2.count;
+#if defined(DEBUG) || defined(XFS_WARN)
+        oldstale = hdr1.stale + hdr2.stale;
+#endif
+        mid = oldsum >> 1;
+        /*
+         * If the old leaf count was odd then the new one will be even,
+         * so we need to divide the new count evenly.
+         */
+        if (oldsum & 1) {
+                xfs_dahash_t    midhash;        /* middle entry hash value */
+                if (mid >= hdr1.count)
+                        midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval);
+                else
+                        midhash = be32_to_cpu(ents1[mid].hashval);
+                isleft = args->hashval <= midhash;
+        }
+        /*
+         * If the old count is even then the new count is odd, so there's
+         * no preferred side for the new entry.
+         * Pick the left one.
+         */
+        else
+                isleft = 1;
+        /*
+         * Calculate moved entry count.  Positive means left-to-right,
+         * negative means right-to-left.  Then move the entries.
+         * The sign only selects the direction: moveents takes the number
+         * of entries to move, so the right-to-left case passes -count.
+         */
+        count = hdr1.count - mid + (isleft == 0);
+        if (count > 0)
+                xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1,
+                                        hdr1.count - count, blk2->bp,
+                                        &hdr2, ents2, 0, count);
+        else if (count < 0)
+                xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0,
+                                        blk1->bp, &hdr1, ents1,
+                                        hdr1.count, -count);
+        ASSERT(hdr1.count + hdr2.count == oldsum);
+        ASSERT(hdr1.stale + hdr2.stale == oldstale);
+        /* log the changes made when moving the entries */
+        dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1);
+        dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2);
+        xfs_dir3_leaf_log_header(args, blk1->bp);
+        xfs_dir3_leaf_log_header(args, blk2->bp);
+        xfs_dir3_leaf_check(dp, blk1->bp);
+        xfs_dir3_leaf_check(dp, blk2->bp);
+        /*
+         * Mark whether we're inserting into the old or new leaf.
+         */
+        if (hdr1.count < hdr2.count)
+                state->inleaf = swap;
+        else if (hdr1.count > hdr2.count)
+                state->inleaf = !swap;
+        else
+                state->inleaf = swap ^ (blk1->index <= hdr1.count);
+        /*
+         * Adjust the expected index for insertion.
+         */
+        if (!state->inleaf)
+                blk2->index = blk1->index - hdr1.count;
+        /*
+         * Finally sanity check just to make sure we are not returning a
+         * negative index
+         */
+        if (blk2->index < 0) {
+                state->inleaf = 1;
+                blk2->index = 0;
+                xfs_alert(dp->i_mount,
+        "%s: picked the wrong leaf? reverting original leaf: blk1->index %d",
+                        __func__, blk1->index);
+        }
+}
+/*
+ * Update the free block entry for a data block after an entry was removed.
+ *
+ * If hdr is non-NULL the data block still exists and its entry is set to
+ * longest.  If hdr is NULL the entry is dropped from the table; when that
+ * leaves no used entries, an attempt is made to remove the free block too.
+ *
+ * Returns 0, or the error from xfs_dir2_shrink_inode unless that error is
+ * -ENOSPC with args->total == 0, which is tolerated.
+ */
+static int
+xfs_dir3_data_block_free(
+        xfs_da_args_t           *args,
+        struct xfs_dir2_data_hdr *hdr,
+        struct xfs_dir2_free    *free,
+        xfs_dir2_db_t           fdb,
+        int                     findex,
+        struct xfs_buf          *fbp,
+        int                     longest)
+{
+        struct xfs_inode        *dp = args->dp;
+        struct xfs_dir3_icfree_hdr freehdr;
+        __be16                  *bests;
+        int                     logfree;        /* bests[findex] needs logging */
+        int                     error;
+
+        dp->d_ops->free_hdr_from_disk(&freehdr, free);
+        bests = dp->d_ops->free_bests_p(free);
+
+        /* Data block survives: only its best-free value changes. */
+        if (hdr != NULL) {
+                bests[findex] = cpu_to_be16(longest);
+                xfs_dir2_free_log_bests(args, fbp, findex, findex);
+                return 0;
+        }
+
+        /* The data block is gone, so the table has one less used entry. */
+        freehdr.nused--;
+
+        if (findex != freehdr.nvalid - 1) {
+                /* Entry in the middle of the table: punch it out. */
+                bests[findex] = cpu_to_be16(NULLDATAOFF);
+                logfree = 1;
+        } else {
+                /*
+                 * Last entry of the table: shrink the valid range, also
+                 * dropping any trailing entries that refer to no data block.
+                 */
+                int     last = findex - 1;
+
+                while (last >= 0 && bests[last] == cpu_to_be16(NULLDATAOFF))
+                        last--;
+                freehdr.nvalid = last + 1;
+                logfree = 0;
+        }
+
+        dp->d_ops->free_hdr_to_disk(free, &freehdr);
+        xfs_dir2_free_log_header(args, fbp);
+
+        /* Nothing useful left in the free block: try to get rid of it. */
+        if (freehdr.nused == 0) {
+                error = xfs_dir2_shrink_inode(args, fdb, fbp);
+                if (error == 0) {
+                        fbp = NULL;
+                        logfree = 0;
+                } else if (error != -ENOSPC || args->total != 0) {
+                        return error;
+                }
+                /*
+                 * -ENOSPC without a space reservation is tolerated; the
+                 * block stays and someone else will remove it eventually.
+                 */
+        }
+
+        /* Log the punched-out entry, unless the block itself went away. */
+        if (logfree)
+                xfs_dir2_free_log_bests(args, fbp, findex, findex);
+        return 0;
+}
+/*
+ * Remove an entry from a node directory.
+ * This removes the leaf entry and the data entry,
+ * and updates the free block if necessary.
+ *
+ * The leaf entry at index is marked stale, the data entry it points to is
+ * turned into free space, and if the longest free region of the data block
+ * grew the matching free block entry is updated.  A data block that became
+ * completely empty is removed when possible, in which case dblk->bp is set
+ * to NULL.
+ *
+ * On success returns 0 and sets *rval to non-zero when the live entries of
+ * the leaf block occupy fewer bytes than args->geo->magicpct.  Errors from
+ * reading the free block, shrinking the inode or updating the free block
+ * are returned unchanged, except -ENOSPC from the shrink when args->total
+ * is 0, which is tolerated.
+ */
+static int                                      /* error */
+xfs_dir2_leafn_remove(
+        xfs_da_args_t           *args,          /* operation arguments */
+        struct xfs_buf          *bp,            /* leaf buffer */
+        int                     index,          /* leaf entry index */
+        xfs_da_state_blk_t      *dblk,          /* data block */
+        int                     *rval)          /* resulting block needs join */
+{
+        xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+        xfs_dir2_db_t           db;             /* data block number */
+        struct xfs_buf          *dbp;           /* data block buffer */
+        xfs_dir2_data_entry_t   *dep;           /* data block entry */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        xfs_dir2_leaf_entry_t   *lep;           /* leaf entry */
+        int                     longest;        /* longest data free entry */
+        int                     off;            /* data block entry offset */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        int                     needlog;        /* need to log data header */
+        int                     needscan;       /* need to rescan data frees */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        struct xfs_dir2_data_free *bf;          /* bestfree table */
+        struct xfs_dir3_icleaf_hdr leafhdr;     /* in-core leaf header */
+        struct xfs_dir2_leaf_entry *ents;       /* leaf entry array */
+        trace_xfs_dir2_leafn_remove(args, index);
+        dp = args->dp;
+        tp = args->trans;
+        /* NOTE(review): mp is assigned here but not read again in this function. */
+        mp = dp->i_mount;
+        leaf = bp->b_addr;
+        dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+        ents = dp->d_ops->leaf_ents_p(leaf);
+        /*
+         * Point to the entry we're removing.
+         */
+        lep = &ents[index];
+        /*
+         * Extract the data block and offset from the entry.
+         */
+        db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+        ASSERT(dblk->blkno == db);
+        off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address));
+        ASSERT(dblk->index == off);
+        /*
+         * Kill the leaf entry by marking it stale.
+         * Log the leaf block changes.
+         */
+        leafhdr.stale++;
+        dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+        xfs_dir3_leaf_log_header(args, bp);
+        lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+        xfs_dir3_leaf_log_ents(args, bp, index, index);
+        /*
+         * Make the data entry free.  Keep track of the longest freespace
+         * in the data block in case it changes.
+         */
+        dbp = dblk->bp;
+        hdr = dbp->b_addr;
+        dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
+        bf = dp->d_ops->data_bestfree_p(hdr);
+        longest = be16_to_cpu(bf[0].length);
+        needlog = needscan = 0;
+        xfs_dir2_data_make_free(args, dbp, off,
+                dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+        /*
+         * Rescan the data block freespaces for bestfree.
+         * Log the data block header if needed.
+         */
+        if (needscan)
+                xfs_dir2_data_freescan(dp, hdr, &needlog);
+        if (needlog)
+                xfs_dir2_data_log_header(args, dbp);
+        xfs_dir3_data_check(dp, dbp);
+        /*
+         * If the longest data block freespace changes, need to update
+         * the corresponding freeblock entry.
+         */
+        if (longest < be16_to_cpu(bf[0].length)) {
+                int             error;          /* error return value */
+                struct xfs_buf  *fbp;           /* freeblock buffer */
+                xfs_dir2_db_t   fdb;            /* freeblock block number */
+                int             findex;         /* index in freeblock entries */
+                xfs_dir2_free_t *free;          /* freeblock structure */
+                /*
+                 * Convert the data block number to a free block,
+                 * read in the free block.
+                 */
+                fdb = dp->d_ops->db_to_fdb(args->geo, db);
+                error = xfs_dir2_free_read(tp, dp,
+                                           xfs_dir2_db_to_da(args->geo, fdb),
+                                           &fbp);
+                if (error)
+                        return error;
+                free = fbp->b_addr;
+#ifdef DEBUG
+        {
+                struct xfs_dir3_icfree_hdr freehdr;
+                dp->d_ops->free_hdr_from_disk(&freehdr, free);
+                ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) *
+                        (fdb - xfs_dir2_byte_to_db(args->geo,
+                                                   XFS_DIR2_FREE_OFFSET)));
+        }
+#endif
+                /*
+                 * Calculate which entry we need to fix.
+                 */
+                findex = dp->d_ops->db_to_fdindex(args->geo, db);
+                longest = be16_to_cpu(bf[0].length);
+                /*
+                 * If the data block is now empty we can get rid of it
+                 * (usually).
+                 */
+                if (longest == args->geo->blksize -
+                               dp->d_ops->data_entry_offset) {
+                        /*
+                         * Try to punch out the data block.
+                         */
+                        error = xfs_dir2_shrink_inode(args, db, dbp);
+                        if (error == 0) {
+                                dblk->bp = NULL;
+                                hdr = NULL;
+                        }
+                        /*
+                         * We can get ENOSPC if there's no space reservation.
+                         * In this case just drop the buffer and some one else
+                         * will eventually get rid of the empty block.
+                         */
+                        else if (!(error == -ENOSPC && args->total == 0))
+                                return error;
+                }
+                /*
+                 * Update the free block.  If we got rid of the data block
+                 * (hdr is NULL) its entry is eliminated, otherwise the entry
+                 * is set to the new longest free length.
+                 */
+                error = xfs_dir3_data_block_free(args, hdr, free,
+                                                 fdb, findex, fbp, longest);
+                if (error)
+                        return error;
+        }
+        xfs_dir3_leaf_check(dp, bp);
+        /*
+         * Return indication of whether this leaf block is empty enough
+         * to justify trying to join it with a neighbor.
+         */
+        *rval = (dp->d_ops->leaf_hdr_size +
+                 (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
+                args->geo->magicpct;
+        return 0;
+}
+/*
+ * Split the leaf entries in the old block into old and new blocks.
+ *
+ * Allocates and initializes a new leaf block, rebalances the entries of
+ * oldblk across both blocks, links newblk into the tree and then adds the
+ * new entry to whichever block the rebalance selected (state->inleaf).
+ * The hashval of both blocks is refreshed afterwards.
+ *
+ * Returns 0 on success or the first error encountered.  A failure of the
+ * final add is returned only after both hashvals have been updated.
+ */
+int                                             /* error */
+xfs_dir2_leafn_split(
+        xfs_da_state_t          *state,         /* btree cursor */
+        xfs_da_state_blk_t      *oldblk,        /* original block */
+        xfs_da_state_blk_t      *newblk)        /* newly created block */
+{
+        xfs_da_args_t           *args;          /* operation arguments */
+        xfs_dablk_t             blkno;          /* new leaf block number */
+        int                     error;          /* error return value */
+        struct xfs_inode        *dp;
+        /*
+         * Allocate space for a new leaf node.
+         */
+        args = state->args;
+        dp = args->dp;
+        ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
+        error = xfs_da_grow_inode(args, &blkno);
+        if (error)
+                return error;
+        /*
+         * Initialize the new leaf block.
+         */
+        error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno),
+                                      &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
+        if (error)
+                return error;
+        newblk->blkno = blkno;
+        newblk->magic = XFS_DIR2_LEAFN_MAGIC;
+        /*
+         * Rebalance the entries across the two leaves, link the new
+         * block into the leaves.
+         */
+        xfs_dir2_leafn_rebalance(state, oldblk, newblk);
+        error = xfs_da3_blk_link(state, oldblk, newblk);
+        if (error)
+                return error;
+        /*
+         * Insert the new entry in the correct block.
+         */
+        if (state->inleaf)
+                error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index);
+        else
+                error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index);
+        /*
+         * Update last hashval in each block since we added the name.
+         * This is done even if the add failed; the error is returned below.
+         */
+        oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL);
+        newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL);
+        xfs_dir3_leaf_check(dp, oldblk->bp);
+        xfs_dir3_leaf_check(dp, newblk->bp);
+        return error;
+}
+/*
+ * Decide whether the leaf block at the bottom of state->path is small
+ * enough to be removed or merged with one of its siblings.
+ *
+ * The function itself returns 0, or the error from reading a sibling or
+ * from xfs_da3_path_shift().  The verdict is stored in *action:
+ *   0 - do nothing,
+ *   1 - the block can be collapsed into a sibling,
+ *   2 - the block has no live entries.
+ * In the empty case *action is 2 when xfs_da3_path_shift() hands back a
+ * non-zero result and 0 otherwise; in the merge case it is 1 when that
+ * result is zero and 0 otherwise.
+ * When a sibling is chosen, altpath is made to refer to the block to keep
+ * (the lower numbered one) and path to the block to drop.
+ */
+int                                             /* error */
+xfs_dir2_leafn_toosmall(
+        xfs_da_state_t          *state,         /* btree cursor */
+        int                     *action)        /* resulting action to take */
+{
+        struct xfs_inode        *dp = state->args->dp;
+        struct xfs_dir3_icleaf_hdr leafhdr;     /* incore header of this leaf */
+        struct xfs_dir2_leaf_entry *ents;       /* leaf entry array */
+        xfs_da_state_blk_t      *cur;           /* leaf block being examined */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        struct xfs_buf          *sib_bp;        /* sibling leaf buffer */
+        xfs_dablk_t             sib_no;         /* sibling block number */
+        int                     live;           /* live leaf entry count */
+        int                     room;           /* bytes in use / bytes left */
+        int                     forward;        /* sibling block direction */
+        int                     pass;           /* sibling counter */
+        int                     shifted;        /* result from path_shift */
+        int                     error;          /* error return value */
+        /*
+         * A block that is more than half full is never joined, so do not
+         * bother looking at the siblings at all.
+         */
+        cur = &state->path.blk[state->path.active - 1];
+        leaf = cur->bp->b_addr;
+        dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+        ents = dp->d_ops->leaf_ents_p(leaf);
+        xfs_dir3_leaf_check(dp, cur->bp);
+        live = leafhdr.count - leafhdr.stale;
+        room = dp->d_ops->leaf_hdr_size + live * sizeof(ents[0]);
+        if (room > (state->args->geo->blksize >> 1)) {
+                *action = 0;
+                return 0;
+        }
+        /*
+         * An empty block is simply deleted rather than coalesced.  Shift
+         * altpath to the forward sibling if there is one, else backward;
+         * path keeps pointing at this (doomed) block.
+         */
+        if (live == 0) {
+                forward = (leafhdr.forw != 0);
+                memcpy(&state->altpath, &state->path, sizeof(state->path));
+                error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
+                                           &shifted);
+                if (error)
+                        return error;
+                if (shifted)
+                        *action = 2;
+                else
+                        *action = 0;
+                return 0;
+        }
+        /*
+         * Try both siblings, lower numbered one first so that directories
+         * shrink over time.  A sibling qualifies when the live entries of
+         * both blocks together fit in 75% of a block.
+         */
+        sib_bp = NULL;
+        forward = leafhdr.forw < leafhdr.back;
+        for (pass = 0; pass < 2; pass++, forward = !forward) {
+                struct xfs_dir3_icleaf_hdr sibhdr;
+                if (forward)
+                        sib_no = leafhdr.forw;
+                else
+                        sib_no = leafhdr.back;
+                if (!sib_no)
+                        continue;
+                /*
+                 * Pull in the sibling leaf block.
+                 */
+                error = xfs_dir3_leafn_read(state->args->trans, dp,
+                                            sib_no, -1, &sib_bp);
+                if (error)
+                        return error;
+                leaf = sib_bp->b_addr;
+                dp->d_ops->leaf_hdr_from_disk(&sibhdr, leaf);
+                ents = dp->d_ops->leaf_ents_p(leaf);
+                /*
+                 * Space left over from 75% of a block once the entries
+                 * of both blocks are accounted for.
+                 */
+                live = (leafhdr.count - leafhdr.stale) +
+                       (sibhdr.count - sibhdr.stale);
+                room = state->args->geo->blksize -
+                       (state->args->geo->blksize >> 2);
+                room -= live * sizeof(ents[0]);
+                /*
+                 * Good enough: stop here.  The sibling buffer is only
+                 * released below when the sibling is rejected.
+                 */
+                if (room >= 0)
+                        break;
+                xfs_trans_brelse(state->args->trans, sib_bp);
+        }
+        /*
+         * Neither sibling was acceptable.
+         */
+        if (pass == 2) {
+                *action = 0;
+                return 0;
+        }
+        /*
+         * Shift whichever path must move so that altpath ends up on the
+         * lower numbered block (kept) and path on the other (dropped).
+         */
+        memcpy(&state->altpath, &state->path, sizeof(state->path));
+        if (sib_no >= cur->blkno)
+                error = xfs_da3_path_shift(state, &state->path, forward, 0,
+                                           &shifted);
+        else
+                error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
+                                           &shifted);
+        if (error)
+                return error;
+        *action = !shifted;
+        return 0;
+}
+/*
+ * Join helper: move every leaf entry of drop_blk into save_blk.
+ * Stale entries in either block are compacted away first.  The cached
+ * hash values of both blocks are refreshed, and both headers are written
+ * back to their buffers and logged.
+ */
+void
+xfs_dir2_leafn_unbalance(
+        xfs_da_state_t          *state,         /* cursor */
+        xfs_da_state_blk_t      *drop_blk,      /* dead block */
+        xfs_da_state_blk_t      *save_blk)      /* surviving block */
+{
+        xfs_da_args_t           *args = state->args;    /* operation arguments */
+        struct xfs_inode        *dp = args->dp;
+        xfs_dir2_leaf_t         *dead_leaf;     /* dead leaf structure */
+        xfs_dir2_leaf_t         *live_leaf;     /* surviving leaf structure */
+        struct xfs_dir3_icleaf_hdr drophdr;     /* incore header, dead leaf */
+        struct xfs_dir3_icleaf_hdr savehdr;     /* incore header, survivor */
+        struct xfs_dir2_leaf_entry *dents;      /* entries of dead leaf */
+        struct xfs_dir2_leaf_entry *sents;      /* entries of survivor */
+        int                     dst;            /* insertion index in survivor */
+        ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+        ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+        dead_leaf = drop_blk->bp->b_addr;
+        live_leaf = save_blk->bp->b_addr;
+        dp->d_ops->leaf_hdr_from_disk(&savehdr, live_leaf);
+        dp->d_ops->leaf_hdr_from_disk(&drophdr, dead_leaf);
+        sents = dp->d_ops->leaf_ents_p(live_leaf);
+        dents = dp->d_ops->leaf_ents_p(dead_leaf);
+        /*
+         * Use the occasion to get rid of stale entries in both blocks.
+         */
+        if (drophdr.stale)
+                xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp);
+        if (savehdr.stale)
+                xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp);
+        /*
+         * Remember the dead block's last hash before it is emptied.
+         */
+        drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
+        /*
+         * The entries go to the front of the survivor when
+         * xfs_dir2_leafn_order() is non-zero, otherwise behind the
+         * survivor's current last entry.
+         */
+        if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp))
+                dst = 0;
+        else
+                dst = savehdr.count;
+        xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+                                save_blk->bp, &savehdr, sents, dst,
+                                drophdr.count);
+        save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
+        /*
+         * Push the updated incore headers back to the buffers and log
+         * them.
+         */
+        dp->d_ops->leaf_hdr_to_disk(live_leaf, &savehdr);
+        dp->d_ops->leaf_hdr_to_disk(dead_leaf, &drophdr);
+        xfs_dir3_leaf_log_header(args, save_blk->bp);
+        xfs_dir3_leaf_log_header(args, drop_blk->bp);
+        xfs_dir3_leaf_check(dp, save_blk->bp);
+        xfs_dir3_leaf_check(dp, drop_blk->bp);
+}
+/*
+ * Add a name to a node-format directory.
+ *
+ * The name is looked up first to obtain the insertion point; any lookup
+ * outcome other than -ENOENT ends the operation and is returned as is.
+ * The data entry is then stored, followed by the leaf entry.  If the leaf
+ * insert fails and args->total is non-zero the leaf block is split.
+ * The btree cursor is freed on every exit path.
+ */
+int                                             /* error */
+xfs_dir2_node_addname(
+        xfs_da_args_t           *args)          /* operation arguments */
+{
+        xfs_da_state_t          *state;         /* btree cursor */
+        xfs_da_state_blk_t      *leafblk;       /* leaf block for insert */
+        xfs_da_state_blk_t      *freeblk;       /* freeblock from lookup or NULL */
+        int                     error;          /* lookup error value */
+        int                     rval;           /* value handed to the caller */
+        trace_xfs_dir2_node_addname(args);
+        /*
+         * Set up the btree cursor.
+         */
+        state = xfs_da_state_alloc();
+        state->args = args;
+        state->mp = args->dp->i_mount;
+        /*
+         * The name is expected to be absent; the lookup is done for the
+         * insertion point it leaves in the cursor.
+         */
+        error = xfs_da3_node_lookup_int(state, &rval);
+        if (error)
+                rval = error;
+        if (rval != -ENOENT)
+                goto done;
+        /*
+         * Store the data entry, passing along the freeblock the lookup
+         * found (extravalid set) if there is one.
+         */
+        freeblk = NULL;
+        if (state->extravalid)
+                freeblk = &state->extrablk;
+        rval = xfs_dir2_node_addname_int(args, freeblk);
+        if (rval)
+                goto done;
+        /*
+         * Now the leaf entry.
+         */
+        leafblk = &state->path.blk[state->path.active - 1];
+        ASSERT(leafblk->magic == XFS_DIR2_LEAFN_MAGIC);
+        rval = xfs_dir2_leafn_add(leafblk->bp, args, leafblk->index);
+        if (rval) {
+                /*
+                 * No room in the leaf.  Split it and insert the entry,
+                 * unless args->total is zero, in which case the failure
+                 * is passed back unchanged.
+                 */
+                if (args->total != 0) {
+                        rval = xfs_da3_split(state);
+                } else {
+                        ASSERT(rval == -ENOSPC);
+                }
+        } else if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) {
+                /*
+                 * Insert succeeded: propagate hash values up the btree.
+                 */
+                xfs_da3_fixhashpath(state, &state->path);
+        }
+done:
+        xfs_da_state_free(state);
+        return rval;
+}
+/*
+ * Add the data entry for a node-format directory name addition.
+ * The leaf entry is added in xfs_dir2_leafn_add.
+ * We may enter with a freespace block that the lookup found.
+ *
+ * A data block with at least data_entsize(namelen) bytes free is taken
+ * from fblk when fblk->index is non-negative, otherwise it is searched
+ * for by scanning the freespace blocks.  If no block qualifies, a new
+ * data block (and, when needed, a new freespace block) is allocated.
+ *
+ * Returns 0 on success, with args->blkno set to the data block number
+ * and args->index set to the offset of the new entry within that block.
+ * With XFS_DA_OP_JUSTCHECK set, returns 0 before the data block is read
+ * when an existing block has room.  Returns -ENOSPC when a new data
+ * block would be needed but XFS_DA_OP_JUSTCHECK is set or args->total
+ * is 0, -EFSCORRUPTED when a newly grown freespace block is not at the
+ * expected block number, or the error from a failing helper.
+ *
+ * At the points where the freespace buffer in hand is dropped, fblk->bp
+ * (if fblk was supplied) is cleared as well.
+ */
+static int                                      /* error */
+xfs_dir2_node_addname_int(
+        xfs_da_args_t           *args,          /* operation arguments */
+        xfs_da_state_blk_t      *fblk)          /* optional freespace block */
+{
+        xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+        xfs_dir2_db_t           dbno;           /* data block number */
+        struct xfs_buf          *dbp;           /* data block buffer */
+        xfs_dir2_data_entry_t   *dep;           /* data entry pointer */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        xfs_dir2_data_unused_t  *dup;           /* data unused entry pointer */
+        int                     error;          /* error return value */
+        xfs_dir2_db_t           fbno;           /* freespace block number */
+        struct xfs_buf          *fbp;           /* freespace buffer */
+        int                     findex;         /* freespace entry index */
+        xfs_dir2_free_t         *free=NULL;     /* freespace block structure */
+        xfs_dir2_db_t           ifbno;          /* initial freespace block no */
+        xfs_dir2_db_t           lastfbno=0;     /* highest freespace block no */
+        int                     length;         /* length of the new entry */
+        int                     logfree;        /* need to log free entry */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        int                     needlog;        /* need to log data header */
+        int                     needscan;       /* need to rescan data frees */
+        __be16                  *tagp;          /* data entry tag pointer */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        __be16                  *bests;         /* bests array of the freespace block */
+        struct xfs_dir3_icfree_hdr freehdr;     /* incore freespace block header */
+        struct xfs_dir2_data_free *bf;          /* bestfree table of the data block */
+        dp = args->dp;
+        mp = dp->i_mount;
+        tp = args->trans;
+        length = dp->d_ops->data_entsize(args->namelen);
+        /*
+         * If we came in with a freespace block that means that lookup
+         * found an entry with our hash value.  This is the freespace
+         * block for that data entry.
+         */
+        if (fblk) {
+                fbp = fblk->bp;
+                /*
+                 * Remember initial freespace block number.
+                 */
+                ifbno = fblk->blkno;
+                free = fbp->b_addr;
+                findex = fblk->index;
+                bests = dp->d_ops->free_bests_p(free);
+                dp->d_ops->free_hdr_from_disk(&freehdr, free);
+                /*
+                 * This means the free entry showed that the data block had
+                 * space for our entry, so we remembered it.
+                 * Use that data block.
+                 */
+                if (findex >= 0) {
+                        ASSERT(findex < freehdr.nvalid);
+                        ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
+                        ASSERT(be16_to_cpu(bests[findex]) >= length);
+                        dbno = freehdr.firstdb + findex;
+                } else {
+                        /*
+                         * The data block looked at didn't have enough room.
+                         * We'll start at the beginning of the freespace entries.
+                         */
+                        dbno = -1;
+                        findex = 0;
+                }
+        } else {
+                /*
+                 * Didn't come in with a freespace block, so no data block.
+                 */
+                ifbno = dbno = -1;
+                fbp = NULL;
+                findex = 0;
+        }
+        /*
+         * If we don't have a data block yet, we're going to scan the
+         * freespace blocks looking for one.  Figure out what the
+         * highest freespace block number is.
+         */
+        if (dbno == -1) {
+                xfs_fileoff_t   fo;             /* freespace block number */
+                if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
+                        return error;
+                lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
+                fbno = ifbno;
+        }
+        /*
+         * While we haven't identified a data block, search the freeblock
+         * data for a good data block.  If we find a null freeblock entry,
+         * indicating a hole in the data blocks, remember that.
+         */
+        while (dbno == -1) {
+                /*
+                 * If we don't have a freeblock in hand, get the next one.
+                 */
+                if (fbp == NULL) {
+                        /*
+                         * Happens the first time through unless lookup gave
+                         * us a freespace block to start with.
+                         * fbno was seeded from ifbno, which is -1 when no
+                         * freespace block was passed in, so the increment
+                         * yields 0 in that case.
+                         */
+                        if (++fbno == 0)
+                                fbno = xfs_dir2_byte_to_db(args->geo,
+                                                        XFS_DIR2_FREE_OFFSET);
+                        /*
+                         * If it's ifbno we already looked at it.
+                         */
+                        if (fbno == ifbno)
+                                fbno++;
+                        /*
+                         * If it's off the end we're done.
+                         */
+                        if (fbno >= lastfbno)
+                                break;
+                        /*
+                         * Read the block.  There can be holes in the
+                         * freespace blocks, so this might not succeed.
+                         * This should be really rare, so there's no reason
+                         * to avoid it.
+                         */
+                        error = xfs_dir2_free_try_read(tp, dp,
+                                        xfs_dir2_db_to_da(args->geo, fbno),
+                                        &fbp);
+                        if (error)
+                                return error;
+                        if (!fbp)
+                                continue;
+                        free = fbp->b_addr;
+                        findex = 0;
+                }
+                /*
+                 * Look at the current free entry.  Is it good enough?
+                 *
+                 * The bests initialisation should be where the buffer is
+                 * read in the above branch. But gcc is too stupid to realise
+                 * that bests and the freehdr are actually initialised if they
+                 * are placed there, so we have to do it here to avoid
+                 * warnings. Blech.
+                 */
+                bests = dp->d_ops->free_bests_p(free);
+                dp->d_ops->free_hdr_from_disk(&freehdr, free);
+                if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
+                    be16_to_cpu(bests[findex]) >= length)
+                        dbno = freehdr.firstdb + findex;
+                else {
+                        /*
+                         * Are we done with the freeblock?
+                         */
+                        if (++findex == freehdr.nvalid) {
+                                /*
+                                 * Drop the block.
+                                 */
+                                xfs_trans_brelse(tp, fbp);
+                                fbp = NULL;
+                                if (fblk && fblk->bp)
+                                        fblk->bp = NULL;
+                        }
+                }
+        }
+        /*
+         * If we don't have a data block, we need to allocate one and make
+         * the freespace entries refer to it.
+         */
+        if (unlikely(dbno == -1)) {
+                /*
+                 * Not allowed to allocate, return failure.
+                 */
+                if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+                        return -ENOSPC;
+                /*
+                 * Allocate and initialize the new data block.
+                 */
+                if (unlikely((error = xfs_dir2_grow_inode(args,
+                                                         XFS_DIR2_DATA_SPACE,
+                                                         &dbno)) ||
+                    (error = xfs_dir3_data_init(args, dbno, &dbp))))
+                        return error;
+                /*
+                 * If (somehow) we have a freespace block, get rid of it.
+                 */
+                if (fbp)
+                        xfs_trans_brelse(tp, fbp);
+                if (fblk && fblk->bp)
+                        fblk->bp = NULL;
+                /*
+                 * Get the freespace block corresponding to the data block
+                 * that was just allocated.
+                 */
+                fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
+                error = xfs_dir2_free_try_read(tp, dp,
+                                       xfs_dir2_db_to_da(args->geo, fbno),
+                                       &fbp);
+                if (error)
+                        return error;
+                /*
+                 * If there wasn't a freespace block, the read will
+                 * return a NULL fbp.  Allocate and initialize a new one.
+                 */
+                if (!fbp) {
+                        error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
+                                                    &fbno);
+                        if (error)
+                                return error;
+                        if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
+                                xfs_alert(mp,
+                        "%s: dir ino %llu needed freesp block %lld for\n"
+                        "  data block %lld, got %lld ifbno %llu lastfbno %d",
+                                        __func__, (unsigned long long)dp->i_ino,
+                                        (long long)dp->d_ops->db_to_fdb(
+                                                                args->geo, dbno),
+                                        (long long)dbno, (long long)fbno,
+                                        (unsigned long long)ifbno, lastfbno);
+                                if (fblk) {
+                                        xfs_alert(mp,
+                                " fblk 0x%p blkno %llu index %d magic 0x%x",
+                                                fblk,
+                                                (unsigned long long)fblk->blkno,
+                                                fblk->index,
+                                                fblk->magic);
+                                } else {
+                                        xfs_alert(mp, " ... fblk is NULL");
+                                }
+                                XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
+                                                 XFS_ERRLEVEL_LOW, mp);
+                                return -EFSCORRUPTED;
+                        }
+                        /*
+                         * Get a buffer for the new block.
+                         */
+                        error = xfs_dir3_free_get_buf(args, fbno, &fbp);
+                        if (error)
+                                return error;
+                        free = fbp->b_addr;
+                        bests = dp->d_ops->free_bests_p(free);
+                        dp->d_ops->free_hdr_from_disk(&freehdr, free);
+                        /*
+                         * Remember the first slot as our empty slot.
+                         */
+                        freehdr.firstdb =
+                                (fbno - xfs_dir2_byte_to_db(args->geo,
+                                                        XFS_DIR2_FREE_OFFSET)) *
+                                        dp->d_ops->free_max_bests(args->geo);
+                } else {
+                        free = fbp->b_addr;
+                        bests = dp->d_ops->free_bests_p(free);
+                        dp->d_ops->free_hdr_from_disk(&freehdr, free);
+                }
+                /*
+                 * Set the freespace block index from the data block number.
+                 */
+                findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
+                /*
+                 * If it's after the end of the current entries in the
+                 * freespace block, extend that table.
+                 */
+                if (findex >= freehdr.nvalid) {
+                        ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
+                        freehdr.nvalid = findex + 1;
+                        /*
+                         * Tag new entry so nused will go up.
+                         */
+                        bests[findex] = cpu_to_be16(NULLDATAOFF);
+                }
+                /*
+                 * If this entry was for an empty data block
+                 * (this should always be true) then update the header.
+                 */
+                if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
+                        freehdr.nused++;
+                        dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+                        xfs_dir2_free_log_header(args, fbp);
+                }
+                /*
+                 * Update the real value in the table.
+                 * We haven't allocated the data entry yet so this will
+                 * change again.
+                 */
+                hdr = dbp->b_addr;
+                bf = dp->d_ops->data_bestfree_p(hdr);
+                bests[findex] = bf[0].length;
+                logfree = 1;
+        }
+        /*
+         * We had a data block so we don't have to make a new one.
+         */
+        else {
+                /*
+                 * If just checking, we succeeded.
+                 */
+                if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+                        return 0;
+                /*
+                 * Read the data block in.
+                 */
+                error = xfs_dir3_data_read(tp, dp,
+                                           xfs_dir2_db_to_da(args->geo, dbno),
+                                           -1, &dbp);
+                if (error)
+                        return error;
+                hdr = dbp->b_addr;
+                bf = dp->d_ops->data_bestfree_p(hdr);
+                logfree = 0;
+        }
+        ASSERT(be16_to_cpu(bf[0].length) >= length);
+        /*
+         * Point to the existing unused space.
+         */
+        dup = (xfs_dir2_data_unused_t *)
+              ((char *)hdr + be16_to_cpu(bf[0].offset));
+        needscan = needlog = 0;
+        /*
+         * Mark the first part of the unused space, inuse for us.
+         */
+        xfs_dir2_data_use_free(args, dbp, dup,
+                (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
+                &needlog, &needscan);
+        /*
+         * Fill in the new entry and log it.
+         * The tag at the end of the entry records the entry's own offset
+         * from the start of the data block.
+         */
+        dep = (xfs_dir2_data_entry_t *)dup;
+        dep->inumber = cpu_to_be64(args->inumber);
+        dep->namelen = args->namelen;
+        memcpy(dep->name, args->name, dep->namelen);
+        dp->d_ops->data_put_ftype(dep, args->filetype);
+        tagp = dp->d_ops->data_entry_tag_p(dep);
+        *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+        xfs_dir2_data_log_entry(args, dbp, dep);
+        /*
+         * Rescan the block for bestfree if needed.
+         */
+        if (needscan)
+                xfs_dir2_data_freescan(dp, hdr, &needlog);
+        /*
+         * Log the data block header if needed.
+         */
+        if (needlog)
+                xfs_dir2_data_log_header(args, dbp);
+        /*
+         * If the freespace entry is now wrong, update it.
+         */
+        bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
+        if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
+                bests[findex] = bf[0].length;
+                logfree = 1;
+        }
+        /*
+         * Log the freespace entry if needed.
+         */
+        if (logfree)
+                xfs_dir2_free_log_bests(args, fbp, findex, findex);
+        /*
+         * Return the data block and offset in args.
+         * NOTE(review): an earlier wording said the data block is dropped
+         * here, but no release of dbp happens in this function.
+         */
+        args->blkno = (xfs_dablk_t)dbno;
+        args->index = be16_to_cpu(*tagp);
+        return 0;
+}
+/*
+ * Look a name up in a node-format directory.
+ * xfs_da3_node_lookup_int() does the searching; this wrapper turns its
+ * outcome into the return value, hands a case-insensitive match to
+ * xfs_dir_cilookup_result(), and releases every buffer the cursor holds
+ * before freeing the cursor.
+ */
+int                                             /* error */
+xfs_dir2_node_lookup(
+        xfs_da_args_t   *args)                  /* operation arguments */
+{
+        xfs_da_state_t  *state;                 /* btree cursor */
+        xfs_dir2_data_entry_t *dep;             /* CI-matched data entry */
+        char            *data;                  /* start of the data block */
+        int             error;                  /* lookup error value */
+        int             rval;                   /* operation return value */
+        int             level;                  /* btree level */
+        trace_xfs_dir2_node_lookup(args);
+        /*
+         * Set up the btree cursor.
+         */
+        state = xfs_da_state_alloc();
+        state->args = args;
+        state->mp = args->dp->i_mount;
+        /*
+         * Run the lookup; it fills in the path to the entry.
+         */
+        error = xfs_da3_node_lookup_int(state, &rval);
+        if (error) {
+                rval = error;
+        } else if (args->cmpresult == XFS_CMP_CASE && rval == -ENOENT) {
+                /*
+                 * Case-insensitive match: the entry sits at extrablk.index
+                 * bytes into the extra block.  Let
+                 * xfs_dir_cilookup_result() produce the return value from
+                 * its actual name.
+                 */
+                data = state->extrablk.bp->b_addr;
+                dep = (xfs_dir2_data_entry_t *)(data + state->extrablk.index);
+                rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+        }
+        /*
+         * Let go of the btree and leaf buffers along the path.
+         */
+        level = 0;
+        while (level < state->path.active) {
+                xfs_trans_brelse(args->trans, state->path.blk[level].bp);
+                state->path.blk[level].bp = NULL;
+                level++;
+        }
+        /*
+         * Let go of the data block too, if one is held.
+         */
+        if (state->extravalid && state->extrablk.bp) {
+                xfs_trans_brelse(args->trans, state->extrablk.bp);
+                state->extrablk.bp = NULL;
+        }
+        xfs_da_state_free(state);
+        return rval;
+}
+/*
+ * Remove a name from a node-format directory.
+ *
+ * The entry must exist: a lookup outcome other than -EEXIST is returned
+ * to the caller unchanged.  After the leaf and data entries are removed
+ * the hash path is fixed up, leaf blocks are joined when the removal asks
+ * for it and the tree has more than one level, and finally a conversion
+ * to leaf format is attempted.  The cursor is freed on every exit path.
+ */
+int                                             /* error */
+xfs_dir2_node_removename(
+        struct xfs_da_args      *args)          /* operation arguments */
+{
+        struct xfs_da_state     *state;         /* btree cursor */
+        struct xfs_da_state_blk *leafblk;       /* leaf block */
+        int                     rval;           /* lookup result / join hint */
+        int                     error;          /* error return value */
+        trace_xfs_dir2_node_removename(args);
+        /*
+         * Set up the btree cursor.
+         */
+        state = xfs_da_state_alloc();
+        state->args = args;
+        state->mp = args->dp->i_mount;
+        /*
+         * Find the entry to delete; this positions the cursor.
+         */
+        error = xfs_da3_node_lookup_int(state, &rval);
+        if (error)
+                goto out;
+        /*
+         * Not there: the upper layer got it wrong.
+         */
+        if (rval != -EEXIST) {
+                error = rval;
+                goto out;
+        }
+        leafblk = &state->path.blk[state->path.active - 1];
+        ASSERT(leafblk->magic == XFS_DIR2_LEAFN_MAGIC);
+        ASSERT(state->extravalid);
+        /*
+         * Take out the leaf entry and the data entry; extrablk is the
+         * data block.
+         */
+        error = xfs_dir2_leafn_remove(args, leafblk->bp, leafblk->index,
+                                      &state->extrablk, &rval);
+        if (error)
+                goto out;
+        /*
+         * Propagate the changed hash values up the btree.
+         */
+        xfs_da3_fixhashpath(state, &state->path);
+        /*
+         * Join leaf blocks if the removal asked for it.
+         */
+        if (rval && state->path.active > 1) {
+                error = xfs_da3_join(state);
+                if (error)
+                        goto out;
+        }
+        /*
+         * Everything went fine so far, try converting to leaf format.
+         */
+        error = xfs_dir2_node_to_leaf(state);
+out:
+        xfs_da_state_free(state);
+        return error;
+}
+/*
+ * Change the inode number (and file type) stored for an existing name
+ * in a node-format directory.
+ * Returns 0 when the entry was found and rewritten; otherwise the
+ * lookup's error or result is returned.  All buffers along the cursor
+ * path are released before the cursor is freed.
+ */
+int                                             /* error */
+xfs_dir2_node_replace(
+        xfs_da_args_t           *args)          /* operation arguments */
+{
+        xfs_da_state_t          *state;         /* btree cursor */
+        xfs_da_state_blk_t      *leafblk;       /* leaf block */
+        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
+        struct xfs_dir2_leaf_entry *ents;       /* leaf entry array */
+        xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+        xfs_dir2_data_entry_t   *dep;           /* data entry changed */
+        xfs_ino_t               inum;           /* new inode number */
+        int                     error;          /* lookup error value */
+        int                     rval;           /* value handed to the caller */
+        int                     level;          /* btree level */
+        trace_xfs_dir2_node_replace(args);
+        /*
+         * Set up the btree cursor.
+         */
+        state = xfs_da_state_alloc();
+        state->args = args;
+        state->mp = args->dp->i_mount;
+        /*
+         * Capture the new inode number before the lookup runs.
+         * NOTE(review): presumably because the lookup may overwrite
+         * args->inumber with the value it finds; confirm.
+         */
+        inum = args->inumber;
+        /*
+         * Find the entry that is to be changed.
+         */
+        error = xfs_da3_node_lookup_int(state, &rval);
+        if (error)
+                rval = error;
+        if (rval != -EEXIST) {
+                /*
+                 * Not found.  If a data block is being held, drop it.
+                 */
+                if (state->extravalid) {
+                        xfs_trans_brelse(args->trans, state->extrablk.bp);
+                        state->extrablk.bp = NULL;
+                }
+        } else {
+                /*
+                 * Found, as expected: the vnodeops layer looked the name
+                 * up and locked it beforehand.  Locate the leaf entry.
+                 */
+                leafblk = &state->path.blk[state->path.active - 1];
+                ASSERT(leafblk->magic == XFS_DIR2_LEAFN_MAGIC);
+                leaf = leafblk->bp->b_addr;
+                ents = args->dp->d_ops->leaf_ents_p(leaf);
+                ASSERT(state->extravalid);
+                /*
+                 * The leaf entry's address gives the offset of the data
+                 * entry inside the data block held in extrablk.
+                 */
+                hdr = state->extrablk.bp->b_addr;
+                ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+                       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+                dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+                        xfs_dir2_dataptr_to_off(args->geo,
+                                be32_to_cpu(ents[leafblk->index].address)));
+                ASSERT(inum != be64_to_cpu(dep->inumber));
+                /*
+                 * Store the new inode number and file type, then log the
+                 * entry.
+                 */
+                dep->inumber = cpu_to_be64(inum);
+                args->dp->d_ops->data_put_ftype(dep, args->filetype);
+                xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
+                rval = 0;
+        }
+        /*
+         * Let go of every buffer along the cursor path.
+         */
+        level = 0;
+        while (level < state->path.active) {
+                xfs_trans_brelse(args->trans, state->path.blk[level].bp);
+                state->path.blk[level].bp = NULL;
+                level++;
+        }
+        xfs_da_state_free(state);
+        return rval;
+}
+/*
+ * Trim off a trailing empty freespace block.
+ *
+ * Reads the freespace block at file offset fo and, if it holds no used
+ * entries, removes it from the directory.
+ *
+ * *rvalp is always written: 1 if the block was removed, 0 if nothing was
+ * done (read error, hole at fo, block still in use, or shrink failure).
+ *
+ * Returns 0 on success, or the error reported by the freespace read or by
+ * xfs_dir2_shrink_inode().
+ */
+int                                             /* error */
+xfs_dir2_node_trim_free(
+        xfs_da_args_t           *args,          /* operation arguments */
+        xfs_fileoff_t           fo,             /* free block number */
+        int                     *rvalp)         /* out: did something */
+{
+        struct xfs_buf          *bp;            /* freespace buffer */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     error;          /* error return code */
+        xfs_dir2_free_t         *free;          /* freespace structure */
+        xfs_trans_t             *tp;            /* transaction pointer */
+        struct xfs_dir3_icfree_hdr freehdr;
+        dp = args->dp;
+        tp = args->trans;
+        /*
+         * Default to "did nothing" so that every return path, including
+         * the hole case below, hands the caller a defined value.
+         */
+        *rvalp = 0;
+        /*
+         * Read the freespace block.
+         */
+        error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
+        if (error)
+                return error;
+        /*
+         * There can be holes in freespace.  If fo is a hole, there's
+         * nothing to do.
+         */
+        if (!bp)
+                return 0;
+        free = bp->b_addr;
+        dp->d_ops->free_hdr_from_disk(&freehdr, free);
+        /*
+         * If there are used entries, there's nothing to do.
+         */
+        if (freehdr.nused > 0) {
+                xfs_trans_brelse(tp, bp);
+                return 0;
+        }
+        /*
+         * Blow the block away.
+         */
+        error = xfs_dir2_shrink_inode(args,
+                        xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp);
+        if (error) {
+                /*
+                 * Can't fail with ENOSPC since that only happens with no
+                 * space reservation, when breaking up an extent into two
+                 * pieces.  This is the last block of an extent.
+                 */
+                ASSERT(error != -ENOSPC);
+                xfs_trans_brelse(tp, bp);
+                return error;
+        }
+        /*
+         * Return that we succeeded.
+         */
+        *rvalp = 1;
+        return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
new file mode 100644
index 000000000000..27ce0794d196
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DIR2_PRIV_H__
+#define __XFS_DIR2_PRIV_H__
+struct dir_context;
+/*
+ * Directory offset/block conversion functions.
+ *
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
+/*
+ * Expand a dataptr into a byte offset in directory file space by
+ * shifting it left by XFS_DIR2_DATA_ALIGN_LOG bits.
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
+{
+        xfs_dir2_off_t          byte = dp;      /* widen before shifting */
+        byte <<= XFS_DIR2_DATA_ALIGN_LOG;
+        return byte;
+}
+/*
+ * Compress a byte offset in directory file space into a dataptr by
+ * dropping the low XFS_DIR2_DATA_ALIGN_LOG bits.  The offset is expected
+ * to be aligned.
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
+{
+        by >>= XFS_DIR2_DATA_ALIGN_LOG;
+        return (xfs_dir2_dataptr_t)by;
+}
+/*
+ * Map a byte offset in directory space to the (DB) directory block
+ * containing it, using the geometry's block size log.
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+        by >>= geo->blklog;
+        return (xfs_dir2_db_t)by;
+}
+/*
+ * Map a dataptr to the (DB) directory block number it refers to.
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+        xfs_dir2_off_t          byte = xfs_dir2_dataptr_to_byte(dp);
+        return xfs_dir2_byte_to_db(geo, byte);
+}
+/*
+ * Reduce a byte offset in directory space to the offset within its
+ * directory block by masking with (blksize - 1).
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+        by &= geo->blksize - 1;
+        return (xfs_dir2_data_aoff_t)by;
+}
+/*
+ * Map a dataptr to the byte offset of its target within a directory block.
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+        xfs_dir2_off_t          byte = xfs_dir2_dataptr_to_byte(dp);
+        return xfs_dir2_byte_to_off(geo, byte);
+}
+/*
+ * Combine a (DB) directory block number and an offset within that block
+ * into a byte offset in directory space.
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+                        xfs_dir2_data_aoff_t o)
+{
+        xfs_dir2_off_t          base = db;      /* widen before shifting */
+        base <<= geo->blklog;
+        return base + o;
+}
+/*
+ * Scale a (DB) directory block number up to a dablk number; the shift is
+ * the difference between the directory and filesystem block size logs.
+ */
+static inline xfs_dablk_t
+xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+        db <<= geo->blklog - geo->fsblog;
+        return (xfs_dablk_t)db;
+}
+/*
+ * Map a byte offset in directory space to the (DA) block containing it,
+ * going through the (DB) directory block number.
+ */
+static inline xfs_dablk_t
+xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+        xfs_dir2_db_t           db = xfs_dir2_byte_to_db(geo, by);
+        return xfs_dir2_db_to_da(geo, db);
+}
+/*
+ * Combine a (DB) directory block number and an offset within that block
+ * into a dataptr.
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+                           xfs_dir2_data_aoff_t o)
+{
+        xfs_dir2_off_t          byte = xfs_dir2_db_off_to_byte(geo, db, o);
+        return xfs_dir2_byte_to_dataptr(byte);
+}
+/*
+ * Scale a dablk number down to a (DB) directory block number; the shift
+ * is the difference between the directory and filesystem block size logs.
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+        da >>= geo->blklog - geo->fsblog;
+        return (xfs_dir2_db_t)da;
+}
+/*
+ * Map a dablk number to the byte offset in directory space at which its
+ * (DB) directory block starts.
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+        xfs_dir2_db_t           db = xfs_dir2_da_to_db(geo, da);
+        return xfs_dir2_db_off_to_byte(geo, db, 0);
+}
+/*
+ * Directory tail pointer accessor functions. Based on block geometry.
+ *
+ * The block tail is the last struct xfs_dir2_block_tail that fits in the
+ * geo->blksize bytes starting at hdr.
+ */
+static inline struct xfs_dir2_block_tail *
+xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
+{
+        struct xfs_dir2_block_tail *blk_end;    /* one past the block */
+        blk_end = (struct xfs_dir2_block_tail *)((char *)hdr + geo->blksize);
+        return blk_end - 1;
+}
+/*
+ * The leaf tail is the last struct xfs_dir2_leaf_tail that fits in the
+ * geo->blksize bytes starting at lp.
+ */
+static inline struct xfs_dir2_leaf_tail *
+xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
+{
+        struct xfs_dir2_leaf_tail *blk_end;     /* one past the block */
+        blk_end = (struct xfs_dir2_leaf_tail *)((char *)lp + geo->blksize);
+        return blk_end - 1;
+}
+/* xfs_dir2.c */
+extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
+                                xfs_dir2_db_t *dbp);
+extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
+                                const unsigned char *name, int len);
+#define S_SHIFT 12
+extern const unsigned char xfs_mode_to_ftype[];
+extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
+                                        __uint8_t filetype);
+/* xfs_dir2_block.c */
+extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
+                               struct xfs_buf **bpp);
+extern int xfs_dir2_block_addname(struct xfs_da_args *args);
+extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_block_removename(struct xfs_da_args *args);
+extern int xfs_dir2_block_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
+                struct xfs_buf *lbp, struct xfs_buf *dbp);
+/* xfs_dir2_data.c */
+/*
+ * Data block consistency check, compiled in only for DEBUG builds.
+ * Neither variant carries its own trailing semicolon: the caller's ';'
+ * terminates the statement, so both expand safely inside if/else.
+ */
+#ifdef DEBUG
+#define xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp)
+#else
+#define xfs_dir3_data_check(dp,bp)
+#endif
+extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+                xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
+extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
+                xfs_daddr_t mapped_bno);
+extern struct xfs_dir2_data_free *
+xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
+                struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup,
+                int *loghead);
+extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
+                struct xfs_buf **bpp);
+/* xfs_dir2_leaf.c */
+extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+                xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
+extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
+                struct xfs_buf *dbp);
+extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
+extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
+                struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp);
+extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
+                struct xfs_dir2_leaf_entry *ents, int *indexp,
+                int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
+extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
+                struct xfs_buf **bpp, __uint16_t magic);
+extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
+                struct xfs_buf *bp, int first, int last);
+extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
+                struct xfs_buf *bp);
+extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
+                struct xfs_buf *lbp);
+extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
+                struct xfs_buf *lbp, xfs_dir2_db_t db);
+extern struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
+                struct xfs_dir2_leaf_entry *ents, int index, int compact,
+                int lowstale, int highstale, int *lfloglow, int *lfloghigh);
+extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
+extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp,
+                struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
+/* xfs_dir2_node.c */
+extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
+                struct xfs_buf *lbp);
+extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp,
+                struct xfs_buf *bp, int *count);
+extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
+                struct xfs_da_args *args, int *indexp,
+                struct xfs_da_state *state);
+extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp,
+                struct xfs_buf *leaf2_bp);
+extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
+        struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk);
+extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
+extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
+                struct xfs_da_state_blk *drop_blk,
+                struct xfs_da_state_blk *save_blk);
+extern int xfs_dir2_node_addname(struct xfs_da_args *args);
+extern int xfs_dir2_node_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_node_removename(struct xfs_da_args *args);
+extern int xfs_dir2_node_replace(struct xfs_da_args *args);
+extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
+                int *rvalp);
+extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+                xfs_dablk_t fbno, struct xfs_buf **bpp);
+/* xfs_dir2_sf.c */
+extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
+                struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
+extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
+                int size, xfs_dir2_sf_hdr_t *sfhp);
+extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
+extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
+extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
+/* xfs_dir2_readdir.c */
+extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
+                       size_t bufsize);
+#endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
new file mode 100644
index 000000000000..8f4f26af35e1
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -0,0 +1,1184 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_error.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trace.h"
+#include "xfs_dinode.h"
+/*
+ * Prototypes for internal functions.
+ */
+static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
+                                     xfs_dir2_sf_entry_t *sfep,
+                                     xfs_dir2_data_aoff_t offset,
+                                     int new_isize);
+static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
+                                     int new_isize);
+static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
+                                    xfs_dir2_sf_entry_t **sfepp,
+                                    xfs_dir2_data_aoff_t *offsetp);
+#ifdef DEBUG
+static void xfs_dir2_sf_check(xfs_da_args_t *args);
+#else
+#define xfs_dir2_sf_check(args)
+#endif /* DEBUG */
+#if XFS_BIG_INUMS
+static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
+static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
+#endif /* XFS_BIG_INUMS */
+/*
+ * Given a block directory (dp/hdr), work out how many bytes it would
+ * occupy as a shortform (sf) directory and, if that fits in the space
+ * currently present in the inode, build the sf header in *sfhp.
+ *
+ * If it does not fit, the scan stops early: the value returned is larger
+ * than the available space (but is not the exact final size) and *sfhp
+ * is left untouched.
+ */
+int                                             /* size for sf form */
+xfs_dir2_block_sfsize(
+        xfs_inode_t             *dp,            /* incore inode pointer */
+        xfs_dir2_data_hdr_t     *hdr,           /* block directory data */
+        xfs_dir2_sf_hdr_t       *sfhp)          /* output: header for sf form */
+{
+        xfs_dir2_dataptr_t      addr;           /* data entry address */
+        xfs_dir2_leaf_entry_t   *blp;           /* leaf area of the block */
+        xfs_dir2_block_tail_t   *btp;           /* tail area of the block */
+        int                     count = 0;      /* shortform entry count */
+        xfs_dir2_data_entry_t   *dep;           /* data entry in the block */
+        int                     idx;            /* block entry index */
+        int                     i8count = 0;    /* count of big-inode entries */
+        int                     isdot;          /* entry is "." */
+        int                     isdotdot;       /* entry is ".." */
+        uint                    inosize;        /* bytes per sf inode number */
+        xfs_mount_t             *mp;            /* mount structure pointer */
+        int                     namelen = 0;    /* total name bytes */
+        xfs_ino_t               parent = 0;     /* parent inode number */
+        int                     size = 0;       /* total computed size */
+        int                     has_ftype;      /* extra name byte for ftype */
+        struct xfs_da_geometry  *geo;
+        mp = dp->i_mount;
+        geo = mp->m_dir_geo;
+        /*
+         * When the filesystem stores a file type per entry, each name
+         * costs one extra byte.
+         */
+        has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
+        btp = xfs_dir2_block_tail_p(geo, hdr);
+        blp = xfs_dir2_block_leaf_p(btp);
+        /*
+         * Walk the leaf entries to reach every live data entry.
+         */
+        for (idx = 0; idx < be32_to_cpu(btp->count); idx++) {
+                addr = be32_to_cpu(blp[idx].address);
+                if (addr == XFS_DIR2_NULL_DATAPTR)
+                        continue;
+                /*
+                 * Locate the data entry this leaf entry points at.
+                 */
+                dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+                                xfs_dir2_dataptr_to_off(geo, addr));
+                /*
+                 * "." and ".." are special: "." is not stored in sf
+                 * directories at all, and ".." is stored only as the
+                 * parent inode number in the header.
+                 */
+                isdot = dep->namelen == 1 && dep->name[0] == '.';
+                isdotdot =
+                        dep->namelen == 2 &&
+                        dep->name[0] == '.' && dep->name[1] == '.';
+#if XFS_BIG_INUMS
+                if (!isdot)
+                        i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
+#endif
+                if (isdotdot) {
+                        parent = be64_to_cpu(dep->inumber);
+                } else if (!isdot) {
+                        /* regular entry: name plus optional ftype byte */
+                        count++;
+                        namelen += dep->namelen + has_ftype;
+                }
+                /*
+                 * Recompute the running size and bail out as soon as it
+                 * no longer fits in the inode's data fork.
+                 */
+                inosize = i8count ? (uint)sizeof(xfs_dir2_ino8_t) :
+                                    (uint)sizeof(xfs_dir2_ino4_t);
+                size = xfs_dir2_sf_hdr_size(i8count) +          /* header */
+                       count +                                  /* namelen */
+                       count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
+                       namelen +                                /* name */
+                       inosize * count;                         /* inumber */
+                if (size > XFS_IFORK_DSIZE(dp))
+                        return size;            /* size value is a failure */
+        }
+        /*
+         * Everything fits: fill in the caller's shortform header.
+         */
+        sfhp->count = count;
+        sfhp->i8count = i8count;
+        dp->d_ops->sf_put_parent_ino(sfhp, parent);
+        return size;
+}
+/*
+ * Convert a block format directory to shortform.
+ *
+ * The caller has already established that the result fits (size) and
+ * has built the shortform header (sfhp).  The shortform image is staged
+ * in a temporary buffer, the directory block is then released via
+ * xfs_dir2_shrink_inode(), and finally the image is copied into the
+ * inode's data fork.
+ *
+ * Returns 0 on success or the error from xfs_dir2_shrink_inode().
+ */
+int                                             /* error */
+xfs_dir2_block_to_sf(
+        xfs_da_args_t           *args,          /* operation arguments */
+        struct xfs_buf          *bp,
+        int                     size,           /* shortform directory size */
+        xfs_dir2_sf_hdr_t       *sfhp)          /* shortform directory hdr */
+{
+        xfs_dir2_data_hdr_t     *hdr;           /* block header */
+        xfs_dir2_block_tail_t   *btp;           /* block tail pointer */
+        xfs_dir2_data_entry_t   *dep;           /* data entry pointer */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        xfs_dir2_data_unused_t  *dup;           /* unused data pointer */
+        char                    *cur;           /* current data pointer */
+        char                    *end;           /* end of data entries */
+        int                     error;          /* error return value */
+        int                     flags;          /* inode logging flags */
+        int                     is_dot;         /* entry is "." */
+        int                     is_dotdot;      /* entry is ".." */
+        xfs_mount_t             *mp;            /* filesystem mount point */
+        xfs_dir2_sf_entry_t     *sfep;          /* shortform entry */
+        xfs_dir2_sf_hdr_t       *sfp;           /* shortform directory header */
+        xfs_dir2_sf_hdr_t       *sfbuf;         /* temporary data buffer */
+        trace_xfs_dir2_block_to_sf(args);
+        dp = args->dp;
+        mp = dp->i_mount;
+        /*
+         * Stage the shortform data in a scratch buffer the size of an
+         * inode.  Only once it is fully formatted do we free the block
+         * and copy the result into the inode literal area.
+         */
+        sfbuf = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
+        hdr = bp->b_addr;
+        /*
+         * Seed the scratch buffer with the caller-built header.
+         */
+        sfp = (xfs_dir2_sf_hdr_t *)sfbuf;
+        memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
+        /*
+         * Set up the walk over the block's entries.
+         */
+        btp = xfs_dir2_block_tail_p(args->geo, hdr);
+        cur = (char *)dp->d_ops->data_entry_p(hdr);
+        end = (char *)xfs_dir2_block_leaf_p(btp);
+        sfep = xfs_dir2_sf_firstentry(sfp);
+        /*
+         * Visit every active and unused entry, stopping at the
+         * leaf/tail portion of the block.
+         */
+        while (cur < end) {
+                dup = (xfs_dir2_data_unused_t *)cur;
+                if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG) {
+                        dep = (xfs_dir2_data_entry_t *)cur;
+                        is_dot = dep->namelen == 1 && dep->name[0] == '.';
+                        is_dotdot = dep->namelen == 2 &&
+                                    dep->name[0] == '.' &&
+                                    dep->name[1] == '.';
+                        if (is_dot) {
+                                /* "." is not copied; just sanity check it */
+                                ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino);
+                        } else if (is_dotdot) {
+                                /* ".." lives in the header; verify it */
+                                ASSERT(be64_to_cpu(dep->inumber) ==
+                                       dp->d_ops->sf_get_parent_ino(sfp));
+                        } else {
+                                /* ordinary entry: emit it in sf form */
+                                sfep->namelen = dep->namelen;
+                                xfs_dir2_sf_put_offset(sfep,
+                                        (xfs_dir2_data_aoff_t)
+                                        ((char *)dep - (char *)hdr));
+                                memcpy(sfep->name, dep->name, dep->namelen);
+                                dp->d_ops->sf_put_ino(sfp, sfep,
+                                                      be64_to_cpu(dep->inumber));
+                                dp->d_ops->sf_put_ftype(sfep,
+                                                dp->d_ops->data_get_ftype(dep));
+                                sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+                        }
+                        cur += dp->d_ops->data_entsize(dep->namelen);
+                } else {
+                        /* free space: step over it */
+                        cur += be16_to_cpu(dup->length);
+                }
+        }
+        ASSERT((char *)sfep - (char *)sfp == size);
+        /* the block is no longer needed, so the inode can be shrunk */
+        flags = XFS_ILOG_CORE;
+        error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
+        if (error) {
+                ASSERT(error != -ENOSPC);
+        } else {
+                /*
+                 * The block buffer is gone at this point.  Switch the
+                 * data fork to local format and copy the staged
+                 * shortform image in.
+                 */
+                dp->i_df.if_flags &= ~XFS_IFEXTENTS;
+                dp->i_df.if_flags |= XFS_IFINLINE;
+                dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+                ASSERT(dp->i_df.if_bytes == 0);
+                xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+                flags |= XFS_ILOG_DDATA;
+                memcpy(dp->i_df.if_u1.if_data, sfbuf, size);
+                dp->i_d.di_size = size;
+                xfs_dir2_sf_check(args);
+        }
+        /* log the inode and drop the scratch buffer on every path */
+        xfs_trans_log_inode(args->trans, dp, flags);
+        kmem_free(sfbuf);
+        return error;
+}
+/*
+ * Add a name to a shortform directory.
+ *
+ * Before anything is modified, one of two insertion algorithms ("easy"
+ * or "hard") is chosen.  If the new entry will not fit in shortform, the
+ * directory is converted to block form and the name is added there.
+ *
+ * Returns 0 on success, -EIO if the shortform header is truncated,
+ * -ENOSPC if the entry does not fit and the caller is only checking or
+ * has no space reservation, or the error from the block-form conversion
+ * or add.
+ */
+int                                             /* error */
+xfs_dir2_sf_addname(
+        xfs_da_args_t           *args)          /* operation arguments */
+{
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     error;          /* error return value */
+        int                     incr_isize;     /* total change in size */
+        int                     new_isize;      /* di_size after adding name */
+        int                     objchange = 0;  /* changing to 8-byte inodes */
+        xfs_dir2_data_aoff_t    offset = 0;     /* offset for new entry */
+        int                     pick = 0;       /* which algorithm to use */
+        xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+        xfs_dir2_sf_entry_t     *sfep = NULL;   /* shortform entry */
+        trace_xfs_dir2_sf_addname(args);
+        ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
+        dp = args->dp;
+        ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+        /*
+         * The shortform data must contain at least part of its header.
+         */
+        if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+                ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+                return -EIO;
+        }
+        ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+        ASSERT(dp->i_df.if_u1.if_data != NULL);
+        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+        /*
+         * Size of the new entry, which is also the base size change.
+         */
+        incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
+#if XFS_BIG_INUMS
+        /*
+         * If this is the first inode number needing 8 bytes, every
+         * stored inode number grows: old count + (parent + new).
+         */
+        if (sfp->i8count == 0 && args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+                incr_isize +=
+                        (sfp->count + 2) *
+                        ((uint)sizeof(xfs_dir2_ino8_t) -
+                         (uint)sizeof(xfs_dir2_ino4_t));
+                objchange = 1;
+        }
+#endif
+        new_isize = (int)dp->i_d.di_size + incr_isize;
+        /*
+         * Only consult the pick routine when the size fits; pick stays
+         * 0 ("does not fit") otherwise.  The pick routine may also
+         * return 0 when the offset values rule out shortform.
+         */
+        if (new_isize <= XFS_IFORK_DSIZE(dp))
+                pick = xfs_dir2_sf_addname_pick(args, objchange, &sfep,
+                                                &offset);
+        if (pick == 0) {
+                /*
+                 * Just checking or no space reservation, it doesn't fit.
+                 */
+                if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+                        return -ENOSPC;
+                /*
+                 * Switch to block form, then add the name there.
+                 */
+                error = xfs_dir2_sf_to_block(args);
+                if (error)
+                        return error;
+                return xfs_dir2_block_addname(args);
+        }
+        /*
+         * Just checking, it fits.
+         */
+        if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+                return 0;
+        if (pick == 1) {
+                /*
+                 * Easy way: append the entry at the end.
+                 */
+                xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize);
+        } else {
+                /*
+                 * Hard way: find a place to insert the new entry,
+                 * converting to 8 byte inode numbers first if needed.
+                 */
+                ASSERT(pick == 2);
+#if XFS_BIG_INUMS
+                if (objchange)
+                        xfs_dir2_sf_toino8(args);
+#endif
+                xfs_dir2_sf_addname_hard(args, objchange, new_isize);
+        }
+        xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+        return 0;
+}
+/*
+ * Add the new entry the "easy" way: grow the shortform data and write
+ * the new entry at the position the pick routine chose (sfep), using the
+ * offset it supplied.
+ *
+ * Because entries are sorted by "offset", this needs room after the last
+ * existing offset, plus room to convert to a block directory later.  The
+ * pick routine has already checked both.
+ */
+static void
+xfs_dir2_sf_addname_easy(
+        xfs_da_args_t           *args,          /* operation arguments */
+        xfs_dir2_sf_entry_t     *sfep,          /* pointer to new entry */
+        xfs_dir2_data_aoff_t    offset,         /* offset to use for new ent */
+        int                     new_isize)      /* new directory size */
+{
+        int                     entry_pos;      /* byte offset in sf dir */
+        int                     grow_by;        /* bytes the new entry needs */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+        dp = args->dp;
+        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        /*
+         * Remember where the new entry goes as an offset, since the
+         * realloc below may move the data.
+         */
+        entry_pos = (int)((char *)sfep - (char *)sfp);
+        /*
+         * Grow the in-inode space by the size of one entry.
+         */
+        grow_by = dp->d_ops->sf_entsize(sfp, args->namelen);
+        xfs_idata_realloc(dp, grow_by, XFS_DATA_FORK);
+        /*
+         * Re-derive both pointers from the (possibly moved) data.
+         */
+        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + entry_pos);
+        /*
+         * Write the new entry.
+         */
+        sfep->namelen = args->namelen;
+        xfs_dir2_sf_put_offset(sfep, offset);
+        memcpy(sfep->name, args->name, sfep->namelen);
+        dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+        dp->d_ops->sf_put_ftype(sfep, args->filetype);
+        /*
+         * Account for it in the header and the inode size.
+         */
+        sfp->count += 1;
+#if XFS_BIG_INUMS
+        if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
+                sfp->i8count += 1;
+#endif
+        dp->i_d.di_size = new_isize;
+        xfs_dir2_sf_check(args);
+}
+/*
+ * Add the new entry the "hard" way.
+ * The caller has already converted to 8 byte inode numbers if necessary
+ * (objchange set), in which case we need to leave the i8count at 1.
+ * Snapshot the old directory into a temporary buffer, find the first hole
+ * in the offset space that the new entry will fit into (or the end if
+ * there is none), then rebuild the inline fork from the snapshot: the
+ * first part of the entries, the new entry, and the last part of the
+ * entries.
+ */
+/* ARGSUSED */
+static void
+xfs_dir2_sf_addname_hard(
+        xfs_da_args_t           *args,          /* operation arguments */
+        int                     objchange,      /* changing inode number size */
+        int                     new_isize)      /* new directory size */
+{
+        int                     add_datasize;   /* data size need for new ent */
+        char                    *buf;           /* buffer for old */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     eof;            /* reached end of old dir */
+        int                     nbytes;         /* temp for byte copies */
+        xfs_dir2_data_aoff_t    new_offset;     /* next offset value */
+        xfs_dir2_data_aoff_t    offset;         /* current offset value */
+        int                     old_isize;      /* previous di_size */
+        xfs_dir2_sf_entry_t     *oldsfep;       /* entry in original dir */
+        xfs_dir2_sf_hdr_t       *oldsfp;        /* original shortform dir */
+        xfs_dir2_sf_entry_t     *sfep;          /* entry in new dir */
+        xfs_dir2_sf_hdr_t       *sfp;           /* new shortform dir */
+        /*
+         * Copy the old directory to a temporary kmem_alloc'd buffer.
+         */
+        dp = args->dp;
+        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        old_isize = (int)dp->i_d.di_size;
+        buf = kmem_alloc(old_isize, KM_SLEEP);
+        oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+        memcpy(oldsfp, sfp, old_isize);
+        /*
+         * Loop over the old directory finding the place we're going
+         * to insert the new entry.
+         * If it's going to end up at the end then oldsfep will point there.
+         */
+        for (offset = dp->d_ops->data_first_offset,
+              oldsfep = xfs_dir2_sf_firstentry(oldsfp),
+              add_datasize = dp->d_ops->data_entsize(args->namelen),
+              eof = (char *)oldsfep == &buf[old_isize];
+             !eof;
+             offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen),
+              oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep),
+              eof = (char *)oldsfep == &buf[old_isize]) {
+                new_offset = xfs_dir2_sf_get_offset(oldsfep);
+                if (offset + add_datasize <= new_offset)
+                        break;
+        }
+        /*
+         * Get rid of the old directory, then allocate space for
+         * the new one.  We do this so xfs_idata_realloc won't copy
+         * the data.
+         */
+        xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
+        xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
+        /*
+         * Reset the pointer since the buffer was reallocated.
+         */
+        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        /*
+         * Copy the first part of the directory, including the header.
+         */
+        nbytes = (int)((char *)oldsfep - (char *)oldsfp);
+        memcpy(sfp, oldsfp, nbytes);
+        sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes);
+        /*
+         * Fill in the new entry, and update the header counts.
+         */
+        sfep->namelen = args->namelen;
+        xfs_dir2_sf_put_offset(sfep, offset);
+        memcpy(sfep->name, args->name, sfep->namelen);
+        dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+        dp->d_ops->sf_put_ftype(sfep, args->filetype);
+        sfp->count++;
+#if XFS_BIG_INUMS
+        if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
+                sfp->i8count++;
+#endif
+        /*
+         * If there's more left to copy, do that.
+         */
+        if (!eof) {
+                sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+                memcpy(sfep, oldsfep, old_isize - nbytes);
+        }
+        kmem_free(buf);
+        dp->i_d.di_size = new_isize;
+        xfs_dir2_sf_check(args);
+}
+/*
+ * Decide if the new entry will fit at all.
+ * If it will fit, pick between adding the new entry to the end (easy)
+ * or somewhere else (hard).
+ * Return 0 (won't fit), 1 (easy), 2 (hard).
+ * *sfepp and *offsetp are only filled in for the easy case (return 1);
+ * they are left untouched when 0 or 2 is returned.
+ */
+/*ARGSUSED*/
+static int                                      /* pick result */
+xfs_dir2_sf_addname_pick(
+        xfs_da_args_t           *args,          /* operation arguments */
+        int                     objchange,      /* inode # size changes */
+        xfs_dir2_sf_entry_t     **sfepp,        /* out(1): new entry ptr */
+        xfs_dir2_data_aoff_t    *offsetp)       /* out(1): new offset */
+{
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     holefit;        /* found hole it will fit in */
+        int                     i;              /* entry number */
+        xfs_dir2_data_aoff_t    offset;         /* data block offset */
+        xfs_dir2_sf_entry_t     *sfep;          /* shortform entry */
+        xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+        int                     size;           /* entry's data size */
+        int                     used;           /* data bytes used */
+        dp = args->dp;
+        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        size = dp->d_ops->data_entsize(args->namelen);
+        offset = dp->d_ops->data_first_offset;
+        sfep = xfs_dir2_sf_firstentry(sfp);
+        holefit = 0;
+        /*
+         * Loop over sf entries.
+         * Keep track of data offset and whether we've seen a place
+         * to insert the new entry.
+         */
+        for (i = 0; i < sfp->count; i++) {
+                if (!holefit)
+                        holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
+                offset = xfs_dir2_sf_get_offset(sfep) +
+                         dp->d_ops->data_entsize(sfep->namelen);
+                sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+        }
+        /*
+         * Calculate data bytes used excluding the new entry, if this
+         * was a data block (block form directory).
+         */
+        used = offset +
+               (sfp->count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+               (uint)sizeof(xfs_dir2_block_tail_t);
+        /*
+         * If it won't fit in a block form then we can't insert it,
+         * we'll go back, convert to block, then try the insert and convert
+         * to leaf.
+         */
+        if (used + (holefit ? 0 : size) > args->geo->blksize)
+                return 0;
+        /*
+         * If changing the inode number size, do it the hard way.
+         */
+#if XFS_BIG_INUMS
+        if (objchange) {
+                return 2;
+        }
+#else
+        ASSERT(objchange == 0);
+#endif
+        /*
+         * If it won't fit at the end then do it the hard way (use the hole).
+         */
+        if (used + size > args->geo->blksize)
+                return 2;
+        /*
+         * Do it the easy way.
+         */
+        *sfepp = sfep;
+        *offsetp = offset;
+        return 1;
+}
+#ifdef DEBUG
+/*
+ * Check consistency of shortform directory, assert if bad.
+ * Checks entry offset ordering, file type range, that i8count matches the
+ * large inode numbers seen (parent included), and the overall size limits.
+ */
+static void
+xfs_dir2_sf_check(
+        xfs_da_args_t           *args)          /* operation arguments */
+{
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     i;              /* entry number */
+        int                     i8count;        /* number of big inode#s */
+        xfs_ino_t               ino;            /* entry inode number */
+        int                     offset;         /* data offset */
+        xfs_dir2_sf_entry_t     *sfep;          /* shortform dir entry */
+        xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+        dp = args->dp;
+        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        offset = dp->d_ops->data_first_offset;
+        ino = dp->d_ops->sf_get_parent_ino(sfp);
+        i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
+        for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
+             i < sfp->count;
+             i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+                ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
+                ino = dp->d_ops->sf_get_ino(sfp, sfep);
+                i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
+                offset =
+                        xfs_dir2_sf_get_offset(sfep) +
+                        dp->d_ops->data_entsize(sfep->namelen);
+                ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
+        }
+        ASSERT(i8count == sfp->i8count);
+        ASSERT(XFS_BIG_INUMS || i8count == 0);
+        ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
+        ASSERT(offset +
+               (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+               (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize);
+}
+#endif  /* DEBUG */
+/*
+ * Create a new (shortform) directory: a bare header holding the parent ino.
+ */
+int                                     /* error, always 0 */
+xfs_dir2_sf_create(
+        xfs_da_args_t   *args,          /* operation arguments */
+        xfs_ino_t       pino)           /* parent inode number */
+{
+        xfs_inode_t     *dp;            /* incore directory inode */
+        int             i8count;        /* parent inode is an 8-byte number */
+        xfs_dir2_sf_hdr_t *sfp;         /* shortform structure */
+        int             size;           /* directory size */
+        trace_xfs_dir2_sf_create(args);
+        dp = args->dp;
+        ASSERT(dp != NULL);
+        ASSERT(dp->i_d.di_size == 0);
+        /*
+         * If it's currently a zero-length extent file,
+         * convert it to local format.
+         */
+        if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
+                dp->i_df.if_flags &= ~XFS_IFEXTENTS;    /* just in case */
+                dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+                xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+                dp->i_df.if_flags |= XFS_IFINLINE;
+        }
+        ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+        ASSERT(dp->i_df.if_bytes == 0);
+        i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
+        size = xfs_dir2_sf_hdr_size(i8count);
+        /*
+         * Make a buffer for the data (header only, there are no entries).
+         */
+        xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+        /*
+         * Fill in the header, starting with i8count.
+         */
+        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        sfp->i8count = i8count;
+        /*
+         * Now can put in the inode number, since i8count is set.
+         */
+        dp->d_ops->sf_put_parent_ino(sfp, pino);
+        sfp->count = 0;                 /* no entries yet */
+        dp->i_d.di_size = size;
+        xfs_dir2_sf_check(args);
+        xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+        return 0;
+}
+/*
+ * Lookup an entry in a shortform directory.
+ * Returns -EEXIST on an exact match, -ENOENT if not found, -EIO if too short.
+ */
+int                                             /* error */
+xfs_dir2_sf_lookup(
+        xfs_da_args_t           *args)          /* operation arguments */
+{
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     i;              /* entry index */
+        int                     error;          /* CI lookup result */
+        xfs_dir2_sf_entry_t     *sfep;          /* shortform directory entry */
+        xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+        enum xfs_dacmp          cmp;            /* comparison result */
+        xfs_dir2_sf_entry_t     *ci_sfep;       /* case-insens. entry */
+        trace_xfs_dir2_sf_lookup(args);
+        xfs_dir2_sf_check(args);
+        dp = args->dp;
+        ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+        /*
+         * Bail out if the directory is way too short.
+         */
+        if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+                ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+                return -EIO;
+        }
+        ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+        ASSERT(dp->i_df.if_u1.if_data != NULL);
+        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+        /*
+         * Special case for . (answered with the directory's own inode)
+         */
+        if (args->namelen == 1 && args->name[0] == '.') {
+                args->inumber = dp->i_ino;
+                args->cmpresult = XFS_CMP_EXACT;
+                args->filetype = XFS_DIR3_FT_DIR;
+                return -EEXIST;
+        }
+        /*
+         * Special case for .. (answered from the parent ino in the header)
+         */
+        if (args->namelen == 2 &&
+            args->name[0] == '.' && args->name[1] == '.') {
+                args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
+                args->cmpresult = XFS_CMP_EXACT;
+                args->filetype = XFS_DIR3_FT_DIR;
+                return -EEXIST;
+        }
+        /*
+         * Loop over all the entries trying to match ours.
+         */
+        ci_sfep = NULL;
+        for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+             i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+                /*
+                 * Compare name and if it's an exact match, return the inode
+                 * number. If it's the first case-insensitive match, store the
+                 * inode number and continue looking for an exact match.
+                 */
+                cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name,
+                                                                sfep->namelen);
+                if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+                        args->cmpresult = cmp;
+                        args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
+                        args->filetype = dp->d_ops->sf_get_ftype(sfep);
+                        if (cmp == XFS_CMP_EXACT)
+                                return -EEXIST;
+                        ci_sfep = sfep;
+                }
+        }
+        ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+        /*
+         * Here, we can only be doing a lookup (not a rename or replace).
+         * If a case-insensitive match was not found, return -ENOENT.
+         */
+        if (!ci_sfep)
+                return -ENOENT;
+        /* otherwise process the CI match as required by the caller */
+        error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
+        return error;
+}
+/*
+ * Remove an entry from a shortform directory (-ENOENT if it is not there).
+ */
+int                                             /* error */
+xfs_dir2_sf_removename(
+        xfs_da_args_t           *args)          /* operation arguments */
+{
+        int                     byteoff;        /* offset of removed entry */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     entsize;        /* this entry's size */
+        int                     i;              /* shortform entry index */
+        int                     newsize;        /* new inode size */
+        int                     oldsize;        /* old inode size */
+        xfs_dir2_sf_entry_t     *sfep;          /* shortform directory entry */
+        xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+        trace_xfs_dir2_sf_removename(args);
+        dp = args->dp;
+        ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+        oldsize = (int)dp->i_d.di_size;
+        /*
+         * Bail out if the directory is way too short.
+         */
+        if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+                ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+                return -EIO;
+        }
+        ASSERT(dp->i_df.if_bytes == oldsize);
+        ASSERT(dp->i_df.if_u1.if_data != NULL);
+        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count));
+        /*
+         * Loop over the old directory entries.
+         * Find the one we're deleting.
+         */
+        for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+             i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+                if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+                                                                XFS_CMP_EXACT) {
+                        ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) ==
+                               args->inumber);
+                        break;
+                }
+        }
+        /*
+         * Didn't find it.
+         */
+        if (i == sfp->count)
+                return -ENOENT;
+        /*
+         * Calculate sizes: where the entry starts and how big it is.
+         */
+        byteoff = (int)((char *)sfep - (char *)sfp);
+        entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
+        newsize = oldsize - entsize;
+        /*
+         * Copy the part if any after the removed entry, sliding it down.
+         */
+        if (byteoff + entsize < oldsize)
+                memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize,
+                        oldsize - (byteoff + entsize));
+        /*
+         * Fix up the header and file size.
+         */
+        sfp->count--;
+        dp->i_d.di_size = newsize;
+        /*
+         * Reallocate, making it smaller; then pick up the data pointer again.
+         */
+        xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
+        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+#if XFS_BIG_INUMS
+        /*
+         * Are we changing inode number size?
+         */
+        if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+                if (sfp->i8count == 1)  /* that was the last large inode # */
+                        xfs_dir2_sf_toino4(args);
+                else
+                        sfp->i8count--;
+        }
+#endif
+        xfs_dir2_sf_check(args);
+        xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+        return 0;
+}
+/*
+ * Replace the inode number (and file type) of a shortform directory entry.
+ */
+int                                             /* error */
+xfs_dir2_sf_replace(
+        xfs_da_args_t           *args)          /* operation arguments */
+{
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     i;              /* entry index */
+#if XFS_BIG_INUMS || defined(DEBUG)
+        xfs_ino_t               ino=0;          /* entry old inode number */
+#endif
+#if XFS_BIG_INUMS
+        int                     i8elevated;     /* sf_toino8 set i8count=1 */
+#endif
+        xfs_dir2_sf_entry_t     *sfep;          /* shortform directory entry */
+        xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+        trace_xfs_dir2_sf_replace(args);
+        dp = args->dp;
+        ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+        /*
+         * Bail out if the shortform directory is way too small.
+         */
+        if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+                ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+                return -EIO;
+        }
+        ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+        ASSERT(dp->i_df.if_u1.if_data != NULL);
+        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+#if XFS_BIG_INUMS
+        /*
+         * New inode number is large, and need to convert to 8-byte inodes.
+         */
+        if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
+                int     error;                  /* error return value */
+                int     newsize;                /* new inode size */
+                newsize =
+                        dp->i_df.if_bytes +
+                        (sfp->count + 1) *
+                        ((uint)sizeof(xfs_dir2_ino8_t) -
+                         (uint)sizeof(xfs_dir2_ino4_t));
+                /*
+                 * Won't fit as shortform, convert to block then do replace.
+                 */
+                if (newsize > XFS_IFORK_DSIZE(dp)) {
+                        error = xfs_dir2_sf_to_block(args);
+                        if (error) {
+                                return error;
+                        }
+                        return xfs_dir2_block_replace(args);
+                }
+                /*
+                 * Still fits, convert to 8-byte now and re-read the header.
+                 */
+                xfs_dir2_sf_toino8(args);
+                i8elevated = 1;
+                sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        } else
+                i8elevated = 0;
+#endif
+        ASSERT(args->namelen != 1 || args->name[0] != '.');
+        /*
+         * Replace ..'s entry.
+         */
+        if (args->namelen == 2 &&
+            args->name[0] == '.' && args->name[1] == '.') {
+#if XFS_BIG_INUMS || defined(DEBUG)
+                ino = dp->d_ops->sf_get_parent_ino(sfp);
+                ASSERT(args->inumber != ino);
+#endif
+                dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
+        }
+        /*
+         * Normal entry, look for the name.
+         */
+        else {
+                for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+                     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+                        if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+                                                                XFS_CMP_EXACT) {
+#if XFS_BIG_INUMS || defined(DEBUG)
+                                ino = dp->d_ops->sf_get_ino(sfp, sfep);
+                                ASSERT(args->inumber != ino);
+#endif
+                                dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+                                dp->d_ops->sf_put_ftype(sfep, args->filetype);
+                                break;
+                        }
+                }
+                /*
+                 * Didn't find it; undo the 8-byte conversion if we did one.
+                 */
+                if (i == sfp->count) {
+                        ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+#if XFS_BIG_INUMS
+                        if (i8elevated)
+                                xfs_dir2_sf_toino4(args);
+#endif
+                        return -ENOENT;
+                }
+        }
+#if XFS_BIG_INUMS
+        /*
+         * See if the old number was large, the new number is small.
+         */
+        if (ino > XFS_DIR2_MAX_SHORT_INUM &&
+            args->inumber <= XFS_DIR2_MAX_SHORT_INUM) {
+                /*
+                 * And the old count was one, so need to convert to small.
+                 */
+                if (sfp->i8count == 1)
+                        xfs_dir2_sf_toino4(args);
+                else
+                        sfp->i8count--;
+        }
+        /*
+         * See if the old number was small, the new number is large.
+         */
+        if (ino <= XFS_DIR2_MAX_SHORT_INUM &&
+            args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+                /*
+                 * add to the i8count unless we just converted to 8-byte
+                 * inodes (which does an implied i8count = 1)
+                 */
+                ASSERT(sfp->i8count != 0);
+                if (!i8elevated)
+                        sfp->i8count++;
+        }
+#endif
+        xfs_dir2_sf_check(args);
+        xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
+        return 0;
+}
+#if XFS_BIG_INUMS
+/*
+ * Convert from 8-byte inode numbers to 4-byte inode numbers.
+ * The last 8-byte inode number is gone, but the count is still 1.
+ * The inline fork shrinks by the ino8/ino4 size difference for every entry
+ * plus the parent, so it is rebuilt entry by entry from a temporary copy.
+ */
+static void
+xfs_dir2_sf_toino4(
+        xfs_da_args_t           *args)          /* operation arguments */
+{
+        char                    *buf;           /* old dir's buffer */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     i;              /* entry index */
+        int                     newsize;        /* new inode size */
+        xfs_dir2_sf_entry_t     *oldsfep;       /* old sf entry */
+        xfs_dir2_sf_hdr_t       *oldsfp;        /* old sf directory */
+        int                     oldsize;        /* old inode size */
+        xfs_dir2_sf_entry_t     *sfep;          /* new sf entry */
+        xfs_dir2_sf_hdr_t       *sfp;           /* new sf directory */
+        trace_xfs_dir2_sf_toino4(args);
+        dp = args->dp;
+        /*
+         * Copy the old directory to the buffer.
+         * Then nuke it from the inode, and add the new buffer to the inode.
+         * Don't want xfs_idata_realloc copying the data here.
+         */
+        oldsize = dp->i_df.if_bytes;
+        buf = kmem_alloc(oldsize, KM_SLEEP);
+        oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        ASSERT(oldsfp->i8count == 1);
+        memcpy(buf, oldsfp, oldsize);
+        /*
+         * Compute the new inode size.
+         */
+        newsize =
+                oldsize -
+                (oldsfp->count + 1) *
+                ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+        xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+        xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+        /*
+         * Reset our pointers, the data has moved.
+         */
+        oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        /*
+         * Fill in the new header.
+         */
+        sfp->count = oldsfp->count;
+        sfp->i8count = 0;
+        dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
+        /*
+         * Copy the entries field by field.
+         */
+        for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+                    oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+             i < sfp->count;
+             i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+                  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
+                sfep->namelen = oldsfep->namelen;
+                sfep->offset = oldsfep->offset;
+                memcpy(sfep->name, oldsfep->name, sfep->namelen);
+                dp->d_ops->sf_put_ino(sfp, sfep,
+                                      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+                dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
+        }
+        /*
+         * Clean up the inode.
+         */
+        kmem_free(buf);
+        dp->i_d.di_size = newsize;
+        xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
+/*
+ * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers.
+ * The new entry w/ an 8-byte inode number is not there yet; we leave with
+ * i8count set to 1, but no corresponding 8-byte entry.
+ * The inline fork grows by the ino8/ino4 size difference for every entry
+ * plus the parent, so it is rebuilt entry by entry from a temporary copy.
+ */
+static void
+xfs_dir2_sf_toino8(
+        xfs_da_args_t           *args)          /* operation arguments */
+{
+        char                    *buf;           /* old dir's buffer */
+        xfs_inode_t             *dp;            /* incore directory inode */
+        int                     i;              /* entry index */
+        int                     newsize;        /* new inode size */
+        xfs_dir2_sf_entry_t     *oldsfep;       /* old sf entry */
+        xfs_dir2_sf_hdr_t       *oldsfp;        /* old sf directory */
+        int                     oldsize;        /* old inode size */
+        xfs_dir2_sf_entry_t     *sfep;          /* new sf entry */
+        xfs_dir2_sf_hdr_t       *sfp;           /* new sf directory */
+        trace_xfs_dir2_sf_toino8(args);
+        dp = args->dp;
+        /*
+         * Copy the old directory to the buffer.
+         * Then nuke it from the inode, and add the new buffer to the inode.
+         * Don't want xfs_idata_realloc copying the data here.
+         */
+        oldsize = dp->i_df.if_bytes;
+        buf = kmem_alloc(oldsize, KM_SLEEP);
+        oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        ASSERT(oldsfp->i8count == 0);
+        memcpy(buf, oldsfp, oldsize);
+        /*
+         * Compute the new inode size (nb: entry count + 1 for parent)
+         */
+        newsize =
+                oldsize +
+                (oldsfp->count + 1) *
+                ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+        xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+        xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+        /*
+         * Reset our pointers, the data has moved.
+         */
+        oldsfp = (xfs_dir2_sf_hdr_t *)buf;
+        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+        /*
+         * Fill in the new header.
+         */
+        sfp->count = oldsfp->count;
+        sfp->i8count = 1;
+        dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
+        /*
+         * Copy the entries field by field.
+         */
+        for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+                    oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+             i < sfp->count;
+             i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+                  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
+                sfep->namelen = oldsfep->namelen;
+                sfep->offset = oldsfep->offset;
+                memcpy(sfep->name, oldsfep->name, sfep->namelen);
+                dp->d_ops->sf_put_ino(sfp, sfep,
+                                      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+                dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
+        }
+        /*
+         * Clean up the inode.
+         */
+        kmem_free(buf);
+        dp->i_d.di_size = newsize;
+        xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
+#endif  /* XFS_BIG_INUMS */
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
new file mode 100644
index 000000000000..bb969337efc8
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+/*
+ * Return the number of on-disk dquot blocks (xfs_dqblk_t) that fit in a
+ * buffer that is @nbblks basic blocks long.
+ */
+int
+xfs_calc_dquots_per_chunk(
+        unsigned int            nbblks) /* basic block units */
+{
+        ASSERT(nbblks > 0);
+        /*
+         * The byte count is held in a 32-bit value, so use a plain division:
+         * do_div() is only defined for a 64-bit dividend.
+         */
+        return BBTOB(nbblks) / sizeof(xfs_dqblk_t);
+}
+/*
+ * Sanity-check an on-disk dquot and, when asked to, reinitialise a bad one.
+ *
+ * Every failed check bumps the error count and is reported through
+ * xfs_alert() if XFS_QMOPT_DOWARN is set in @flags.  When errors were found
+ * and XFS_QMOPT_DQREPAIR is set, the enclosing dquot block is zeroed and
+ * rebuilt from @id and @type.  The number of errors found is returned.
+ */
+int
+xfs_dqcheck(
+        struct xfs_mount *mp,
+        xfs_disk_dquot_t *ddq,
+        xfs_dqid_t       id,
+        uint             type,    /* used only when repairing */
+        uint             flags,
+        char             *str)
+{
+        xfs_dqblk_t      *dqb = (xfs_dqblk_t *)ddq;
+        bool             warn = (flags & XFS_QMOPT_DOWARN) != 0;
+        int              nerrs = 0;
+        /*
+         * An uninitialized dquot buffer can show up here in two ways:
+         * 1. We crashed while deleting the quotainode(s) and the blocks were
+         *    reused for user data. Quota inodes go through the regular file
+         *    deletion path, but their size field is never updated, so the
+         *    tricks played in itruncate_finish don't help.
+         *
+         * 2. Quota buffers are not replayed when there is a quotaoff logitem,
+         *    but the allocation is, which leaves an uninitialized quota
+         *    block behind.
+         *
+         * Neither case loses quota information or breaks consistency, so
+         * callers may choose not to complain about bad dquot blocks.
+         */
+        if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
+                if (warn)
+                        xfs_alert(mp,
+                        "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
+                        str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
+                nerrs++;
+        }
+        if (ddq->d_version != XFS_DQUOT_VERSION) {
+                if (warn)
+                        xfs_alert(mp,
+                        "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
+                        str, id, ddq->d_version, XFS_DQUOT_VERSION);
+                nerrs++;
+        }
+        /* d_flags must be exactly one of the three quota types. */
+        if (ddq->d_flags != XFS_DQ_USER &&
+            ddq->d_flags != XFS_DQ_PROJ &&
+            ddq->d_flags != XFS_DQ_GROUP) {
+                if (warn)
+                        xfs_alert(mp,
+                        "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
+                        str, id, ddq->d_flags);
+                nerrs++;
+        }
+        /* An id of -1 means the caller has no expected id to compare with. */
+        if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
+                if (warn)
+                        xfs_alert(mp,
+                        "%s : ondisk-dquot 0x%p, ID mismatch: "
+                        "0x%x expected, found id 0x%x",
+                        str, ddq, id, be32_to_cpu(ddq->d_id));
+                nerrs++;
+        }
+        /*
+         * Timer checks only run on an otherwise clean dquot with a non-zero
+         * id: a usage count above its soft limit must have its timer set.
+         */
+        if (!nerrs && ddq->d_id) {
+                if (ddq->d_blk_softlimit &&
+                    be64_to_cpu(ddq->d_bcount) >
+                                be64_to_cpu(ddq->d_blk_softlimit) &&
+                    !ddq->d_btimer) {
+                        if (warn)
+                                xfs_alert(mp,
+                        "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
+                                str, (int)be32_to_cpu(ddq->d_id), ddq);
+                        nerrs++;
+                }
+                if (ddq->d_ino_softlimit &&
+                    be64_to_cpu(ddq->d_icount) >
+                                be64_to_cpu(ddq->d_ino_softlimit) &&
+                    !ddq->d_itimer) {
+                        if (warn)
+                                xfs_alert(mp,
+                        "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
+                                str, (int)be32_to_cpu(ddq->d_id), ddq);
+                        nerrs++;
+                }
+                if (ddq->d_rtb_softlimit &&
+                    be64_to_cpu(ddq->d_rtbcount) >
+                                be64_to_cpu(ddq->d_rtb_softlimit) &&
+                    !ddq->d_rtbtimer) {
+                        if (warn)
+                                xfs_alert(mp,
+                        "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
+                                str, (int)be32_to_cpu(ddq->d_id), ddq);
+                        nerrs++;
+                }
+        }
+        /* Nothing wrong, or the caller did not ask for a repair. */
+        if (nerrs == 0)
+                return 0;
+        if (!(flags & XFS_QMOPT_DQREPAIR))
+                return nerrs;
+        if (warn)
+                xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
+        /*
+         * Typically, a repair is only requested by quotacheck.
+         */
+        ASSERT(id != -1);
+        ASSERT(flags & XFS_QMOPT_DQREPAIR);
+        /* Wipe the whole dquot block and rebuild the minimal header. */
+        memset(dqb, 0, sizeof(xfs_dqblk_t));
+        dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+        dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+        dqb->dd_diskdq.d_flags = type;
+        dqb->dd_diskdq.d_id = cpu_to_be32(id);
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                uuid_copy(&dqb->dd_uuid, &mp->m_sb.sb_uuid);
+                xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
+                                 XFS_DQUOT_CRC_OFF);
+        }
+        return nerrs;
+}
+/*
+ * Verify the checksum and filesystem UUID of every dquot in the buffer.
+ *
+ * Returns true if the filesystem does not have CRCs enabled, or if every
+ * dquot in the buffer passes both checks; returns false at the first dquot
+ * that fails either one.
+ */
+STATIC bool
+xfs_dquot_buf_verify_crc(
+        struct xfs_mount        *mp,
+        struct xfs_buf          *bp)
+{
+        struct xfs_dqblk        *d = (struct xfs_dqblk *)bp->b_addr;
+        int                     ndquots;
+        int                     i;
+        if (!xfs_sb_version_hascrc(&mp->m_sb))
+                return true;
+        /*
+         * if we are in log recovery, the quota subsystem has not been
+         * initialised so we have no quotainfo structure. In that case, we need
+         * to manually calculate the number of dquots in the buffer.
+         *
+         * xfs_calc_dquots_per_chunk() takes a length in basic blocks, so
+         * bp->b_length is passed unconverted, exactly as
+         * xfs_dquot_buf_verify() does. Converting it to filesystem blocks
+         * first would undercount the dquots and leave most of the buffer
+         * unchecked.
+         */
+        if (mp->m_quotainfo)
+                ndquots = mp->m_quotainfo->qi_dqperchunk;
+        else
+                ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
+        for (i = 0; i < ndquots; i++, d++) {
+                if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
+                                 XFS_DQUOT_CRC_OFF))
+                        return false;
+                if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
+                        return false;
+        }
+        return true;
+}
+/*
+ * Run xfs_dqcheck() over every dquot in the buffer, with warnings enabled.
+ * Returns true only if all of them pass.
+ */
+STATIC bool
+xfs_dquot_buf_verify(
+        struct xfs_mount        *mp,
+        struct xfs_buf          *bp)
+{
+        struct xfs_dqblk        *dqb = (struct xfs_dqblk *)bp->b_addr;
+        xfs_dqid_t              first_id = 0;
+        int                     count;
+        int                     idx;
+        /*
+         * During log recovery the quota subsystem is not set up yet, so
+         * there is no quotainfo structure to consult; work out the number
+         * of dquots from the buffer length instead.
+         */
+        if (!mp->m_quotainfo)
+                count = xfs_calc_dquots_per_chunk(bp->b_length);
+        else
+                count = mp->m_quotainfo->qi_dqperchunk;
+        /*
+         * We do not know which id each dquot should carry, only that ids
+         * increase by one from dquot to dquot within the buffer. The first
+         * dquot's id is therefore taken as the base. If that id is the
+         * corrupt one, the failure is reported against the second dquot.
+         */
+        for (idx = 0; idx < count; idx++) {
+                struct xfs_disk_dquot   *ddq = &dqb[idx].dd_diskdq;
+                if (idx == 0)
+                        first_id = be32_to_cpu(ddq->d_id);
+                if (xfs_dqcheck(mp, ddq, first_id + idx, 0, XFS_QMOPT_DOWARN,
+                                "xfs_dquot_buf_verify"))
+                        return false;
+        }
+        return true;
+}
+/*
+ * Read verifier: check CRCs first, then the dquot contents, and flag the
+ * buffer with -EFSBADCRC or -EFSCORRUPTED respectively on failure.
+ */
+static void
+xfs_dquot_buf_read_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        int                     error = 0;
+        /* The content check is skipped once the CRC check has failed. */
+        if (!xfs_dquot_buf_verify_crc(mp, bp))
+                error = -EFSBADCRC;
+        else if (!xfs_dquot_buf_verify(mp, bp))
+                error = -EFSCORRUPTED;
+        if (error)
+                xfs_buf_ioerror(bp, error);
+        if (bp->b_error)
+                xfs_verifier_error(bp);
+}
+/*
+ * Write verifier: only the dquot contents are checked here. The CRC is not
+ * recalculated because that is done when the dquot is flushed to the buffer
+ * after an update, so the buffer copy always carries an up-to-date CRC.
+ */
+static void
+xfs_dquot_buf_write_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        if (xfs_dquot_buf_verify(mp, bp))
+                return;
+        xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        xfs_verifier_error(bp);
+}
+/*
+ * Buffer operations for dquot buffers: pairs the read and write verifiers
+ * defined above.
+ */
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+        .verify_read = xfs_dquot_buf_read_verify,
+        .verify_write = xfs_dquot_buf_write_verify,
+};
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
new file mode 100644
index 000000000000..34d85aca3058
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_FORMAT_H__
+#define __XFS_FORMAT_H__
+/*
+ * XFS On Disk Format Definitions
+ *
+ * This header file defines all the on-disk format definitions for
+ * general XFS objects. Directory and attribute related objects are defined in
+ * xfs_da_format.h, while log and log item formats are defined in
+ * xfs_log_format.h. Everything else goes here.
+ */
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_buf;
+struct xfs_ifork;
+/*
+ * RealTime Device format definitions
+ */
+/* Min, default and max rt extent sizes, specified in bytes */
+#define XFS_MAX_RTEXTSIZE       (1024 * 1024 * 1024)    /* 1GB */
+#define XFS_DFL_RTEXTSIZE       (64 * 1024)             /* 64kB */
+#define XFS_MIN_RTEXTSIZE       (4 * 1024)              /* 4kB */
+/*
+ * Accessors for block geometry values: the block size comes from the
+ * superblock copy in the mount structure, the others from fields of the
+ * mount structure itself. NOTE(review): the "W" variants presumably count in
+ * words rather than bytes; confirm against the xfs_mount definition.
+ */
+#define XFS_BLOCKSIZE(mp)       ((mp)->m_sb.sb_blocksize)
+#define XFS_BLOCKMASK(mp)       ((mp)->m_blockmask)
+#define XFS_BLOCKWSIZE(mp)      ((mp)->m_blockwsize)
+#define XFS_BLOCKWMASK(mp)      ((mp)->m_blockwmask)
+/*
+ * RT Summary and bit manipulation macros.
+ */
+/* Summary entry index: ls * sb_rbmblocks + bb, as an int. */
+#define XFS_SUMOFFS(mp,ls,bb)   ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
+/* Byte offset of summary entry s, shifted right by sb_blocklog. */
+#define XFS_SUMOFFSTOBLOCK(mp,s)        \
+        (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
+/*
+ * Pointer to summary entry so inside buffer bp: the entry's byte offset is
+ * masked with XFS_BLOCKMASK and added to the buffer address.
+ */
+#define XFS_SUMPTR(mp,bp,so)    \
+        ((xfs_suminfo_t *)((bp)->b_addr + \
+                (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
+/* Convert between a bit number and a block number using m_blkbit_log. */
+#define XFS_BITTOBLOCK(mp,bi)   ((bi) >> (mp)->m_blkbit_log)
+#define XFS_BLOCKTOBIT(mp,bb)   ((bb) << (mp)->m_blkbit_log)
+/* Bit number shifted right by XFS_NBWORDLOG, masked with XFS_BLOCKWMASK. */
+#define XFS_BITTOWORD(mp,bi)    \
+        ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
+/* Min/max helpers; each argument is evaluated twice, so avoid side effects. */
+#define XFS_RTMIN(a,b)  ((a) < (b) ? (a) : (b))
+#define XFS_RTMAX(a,b)  ((a) > (b) ? (a) : (b))
+/* Lowest/highest set bit helpers for 32-bit words. */
+#define XFS_RTLOBIT(w)  xfs_lowbit32(w)
+#define XFS_RTHIBIT(w)  xfs_highbit32(w)
+/* Highest set bit of b; the 64- or 32-bit helper is picked by XFS_BIG_BLKNOS. */
+#if XFS_BIG_BLKNOS
+#define XFS_RTBLOCKLOG(b)       xfs_highbit64(b)
+#else
+#define XFS_RTBLOCKLOG(b)       xfs_highbit32(b)
+#endif
+/*
+ * Dquot and dquot block format definitions
+ */
+#define XFS_DQUOT_MAGIC         0x4451          /* 'DQ' */
+#define XFS_DQUOT_VERSION       (u_int8_t)0x01  /* latest version number */
+/*
+ * This is the main portion of the on-disk representation of quota
+ * information for a user. This is the q_core of the xfs_dquot_t that
+ * is kept in kernel memory. We pad this with some more expansion room
+ * to construct the on disk structure.
+ *
+ * All multi-byte fields are declared big-endian (__be16/__be32/__be64) and
+ * must be converted with the be*_to_cpu()/cpu_to_be*() helpers on access.
+ */
+typedef struct  xfs_disk_dquot {
+        __be16          d_magic;        /* dquot magic = XFS_DQUOT_MAGIC */
+        __u8            d_version;      /* dquot version */
+        __u8            d_flags;        /* XFS_DQ_USER/PROJ/GROUP */
+        __be32          d_id;           /* user,project,group id */
+        __be64          d_blk_hardlimit;/* absolute limit on disk blks */
+        __be64          d_blk_softlimit;/* preferred limit on disk blks */
+        __be64          d_ino_hardlimit;/* maximum # allocated inodes */
+        __be64          d_ino_softlimit;/* preferred inode limit */
+        __be64          d_bcount;       /* disk blocks owned by the user */
+        __be64          d_icount;       /* inodes owned by the user */
+        __be32          d_itimer;       /* zero if within inode limits; if not,
+                                           this is when we refuse service */
+        __be32          d_btimer;       /* similar to above; for disk blocks */
+        __be16          d_iwarns;       /* warnings issued wrt num inodes */
+        __be16          d_bwarns;       /* warnings issued wrt disk blocks */
+        __be32          d_pad0;         /* 64 bit align */
+        __be64          d_rtb_hardlimit;/* absolute limit on realtime blks */
+        __be64          d_rtb_softlimit;/* preferred limit on RT disk blks */
+        __be64          d_rtbcount;     /* realtime blocks owned */
+        __be32          d_rtbtimer;     /* similar to above; for RT disk blocks */
+        __be16          d_rtbwarns;     /* warnings issued wrt RT disk blocks */
+        __be16          d_pad;          /* trailing padding */
+} xfs_disk_dquot_t;
+/*
+ * This is what goes on disk. This is separated from the xfs_disk_dquot because
+ * carrying the unnecessary padding would be a waste of memory.
+ */
+typedef struct xfs_dqblk {
+        xfs_disk_dquot_t  dd_diskdq;    /* portion that lives incore as well */
+        char              dd_fill[4];   /* filling for posterity */
+        /*
+         * These two are only present on filesystems with the CRC bits set.
+         * NOTE(review): three fields follow, not two; confirm which of them
+         * the sentence above is meant to cover.
+         */
+        __be32            dd_crc;       /* checksum */
+        __be64            dd_lsn;       /* last modification in log */
+        uuid_t            dd_uuid;      /* location information */
+} xfs_dqblk_t;
+/* Byte offset of the checksum field within struct xfs_dqblk. */
+#define XFS_DQUOT_CRC_OFF       offsetof(struct xfs_dqblk, dd_crc)
+/*
+ * Remote symlink format and access functions.
+ */
+#define XFS_SYMLINK_MAGIC       0x58534c4d      /* XSLM */
+/*
+ * Header of a remote symlink block. XFS_SYMLINK_BUF_SPACE() below only
+ * reserves room for it when the filesystem has CRCs enabled.
+ */
+struct xfs_dsymlink_hdr {
+        __be32  sl_magic;
+        __be32  sl_offset;
+        __be32  sl_bytes;
+        __be32  sl_crc;
+        uuid_t  sl_uuid;
+        __be64  sl_owner;
+        __be64  sl_blkno;
+        __be64  sl_lsn;
+};
+/* Byte offset of the checksum field within struct xfs_dsymlink_hdr. */
+#define XFS_SYMLINK_CRC_OFF     offsetof(struct xfs_dsymlink_hdr, sl_crc)
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 3 extents back from
+ * bmapi when crc headers are taken into account.
+ */
+#define XFS_SYMLINK_MAPS 3
+/*
+ * Space left in a buffer of bufsize bytes once the symlink header (present
+ * only when the superblock has the CRC feature) has been subtracted.
+ */
+#define XFS_SYMLINK_BUF_SPACE(mp, bufsize)      \
+        ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+                        sizeof(struct xfs_dsymlink_hdr) : 0))
+/*
+ * Allocation Btree format definitions
+ *
+ * There are two on-disk btrees, one sorted by blockno and one sorted
+ * by blockcount and blockno.  All blocks look the same to make the code
+ * simpler; if we have time later, we'll make the optimizations.
+ */
+#define XFS_ABTB_MAGIC          0x41425442      /* 'ABTB' for bno tree */
+#define XFS_ABTB_CRC_MAGIC      0x41423342      /* 'AB3B' */
+#define XFS_ABTC_MAGIC          0x41425443      /* 'ABTC' for cnt tree */
+#define XFS_ABTC_CRC_MAGIC      0x41423343      /* 'AB3C' */
+/*
+ * Data record/key structure
+ *
+ * The same layout serves as both record and key type; fields are big-endian.
+ */
+typedef struct xfs_alloc_rec {
+        __be32          ar_startblock;  /* starting block number */
+        __be32          ar_blockcount;  /* count of free blocks */
+} xfs_alloc_rec_t, xfs_alloc_key_t;
+/* Same fields as xfs_alloc_rec, held in native types. */
+typedef struct xfs_alloc_rec_incore {
+        xfs_agblock_t   ar_startblock;  /* starting block number */
+        xfs_extlen_t    ar_blockcount;  /* count of free blocks */
+} xfs_alloc_rec_incore_t;
+/* btree pointer type */
+typedef __be32 xfs_alloc_ptr_t;
+/*
+ * Block numbers in the AG:
+ * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
+ *
+ * The bno btree block is the block after XFS_AGFL_BLOCK, and the cnt btree
+ * block is the block after that.
+ */
+#define XFS_BNO_BLOCK(mp)       ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
+#define XFS_CNT_BLOCK(mp)       ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
+/*
+ * Inode Allocation Btree format definitions
+ *
+ * There is a btree for the inode map per allocation group.
+ */
+#define XFS_IBT_MAGIC           0x49414254      /* 'IABT' */
+#define XFS_IBT_CRC_MAGIC       0x49414233      /* 'IAB3' */
+#define XFS_FIBT_MAGIC          0x46494254      /* 'FIBT' */
+#define XFS_FIBT_CRC_MAGIC      0x46494233      /* 'FIB3' */
+/* Free-inode bitmask type: one bit per inode in a chunk. */
+typedef __uint64_t      xfs_inofree_t;
+/* Number of inodes per chunk = number of bits in xfs_inofree_t. */
+#define XFS_INODES_PER_CHUNK            (NBBY * sizeof(xfs_inofree_t))
+#define XFS_INODES_PER_CHUNK_LOG        (XFS_NBBYLOG + 3)
+/* Mask with every bit set. */
+#define XFS_INOBT_ALL_FREE              ((xfs_inofree_t)-1)
+/* Mask with only bit i set. */
+#define XFS_INOBT_MASK(i)               ((xfs_inofree_t)1 << (i))
+/*
+ * Build a mask of n consecutive set bits starting at bit i. An n of
+ * XFS_INODES_PER_CHUNK or more yields all bits set before the shift.
+ */
+static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
+{
+        xfs_inofree_t   top;
+        if (n >= XFS_INODES_PER_CHUNK)
+                top = 0;        /* 0 - 1 wraps to all ones below */
+        else
+                top = XFS_INOBT_MASK(n);
+        return (top - 1) << i;
+}
+/*
+ * Data record structure
+ *
+ * On-disk (big-endian) inode btree record.
+ */
+typedef struct xfs_inobt_rec {
+        __be32          ir_startino;    /* starting inode number */
+        __be32          ir_freecount;   /* count of free inodes (set bits) */
+        __be64          ir_free;        /* free inode mask */
+} xfs_inobt_rec_t;
+/* Same fields as xfs_inobt_rec, held in native types. */
+typedef struct xfs_inobt_rec_incore {
+        xfs_agino_t     ir_startino;    /* starting inode number */
+        __int32_t       ir_freecount;   /* count of free inodes (set bits) */
+        xfs_inofree_t   ir_free;        /* free inode mask */
+} xfs_inobt_rec_incore_t;
+/*
+ * Key structure
+ */
+typedef struct xfs_inobt_key {
+        __be32          ir_startino;    /* starting inode number */
+} xfs_inobt_key_t;
+/* btree pointer type */
+typedef __be32 xfs_inobt_ptr_t;
+/*
+ * block numbers in the AG.
+ *
+ * The inode btree block follows XFS_CNT_BLOCK, and the free inode btree
+ * block follows the inode btree block.
+ */
+#define XFS_IBT_BLOCK(mp)               ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
+#define XFS_FIBT_BLOCK(mp)              ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+/*
+ * The first data block of an AG depends on whether the filesystem was formatted
+ * with the finobt feature. If so, account for the finobt reserved root btree
+ * block.
+ */
+#define XFS_PREALLOC_BLOCKS(mp) \
+        (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
+         XFS_FIBT_BLOCK(mp) + 1 : \
+         XFS_IBT_BLOCK(mp) + 1)
+/*
+ * BMAP Btree format definitions
+ *
+ * This includes both the root block definition that sits inside an inode fork
+ * and the record/pointer formats for the leaf/node in the blocks.
+ */
+#define XFS_BMAP_MAGIC          0x424d4150      /* 'BMAP' */
+#define XFS_BMAP_CRC_MAGIC      0x424d4133      /* 'BMA3' */
+/*
+ * Bmap root header, on-disk form only.
+ */
+typedef struct xfs_bmdr_block {
+        __be16          bb_level;       /* 0 is a leaf */
+        __be16          bb_numrecs;     /* current # of data records */
+} xfs_bmdr_block_t;
+/*
+ * Bmap btree record and extent descriptor.
+ *  l0:63 is an extent flag (value 1 indicates non-normal).
+ *  l0:9-62 are startoff.
+ *  l0:0-8 and l1:21-63 are startblock.
+ *  l1:0-20 are blockcount.
+ *
+ * The bit lengths below match that layout: 1 + 54 + 52 + 21 = 128 bits.
+ */
+#define BMBT_EXNTFLAG_BITLEN    1
+#define BMBT_STARTOFF_BITLEN    54
+#define BMBT_STARTBLOCK_BITLEN  52
+#define BMBT_BLOCKCOUNT_BITLEN  21
+/* On-disk (big-endian) packed extent record. */
+typedef struct xfs_bmbt_rec {
+        __be64                  l0, l1;
+} xfs_bmbt_rec_t;
+typedef __uint64_t      xfs_bmbt_rec_base_t;    /* use this for casts */
+typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
+/* Same two words as xfs_bmbt_rec, declared with native 64-bit types. */
+typedef struct xfs_bmbt_rec_host {
+        __uint64_t              l0, l1;
+} xfs_bmbt_rec_host_t;
+/*
+ * Values and macros for delayed-allocation startblock fields.
+ *
+ * The low STARTBLOCKVALBITS bits carry a value; the mask covers the
+ * STARTBLOCKMASKBITS (or DSTARTBLOCKMASKBITS) bits directly above them.
+ * The width of STARTBLOCKMASK depends on XFS_BIG_BLKNOS, while the "D"
+ * variant always uses the full 15 + 20 bits.
+ */
+#define STARTBLOCKVALBITS       17
+#define STARTBLOCKMASKBITS      (15 + XFS_BIG_BLKNOS * 20)
+#define DSTARTBLOCKMASKBITS     (15 + 20)
+#define STARTBLOCKMASK          \
+        (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+#define DSTARTBLOCKMASK         \
+        (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+/* Return nonzero if every STARTBLOCKMASK bit is set in x. */
+static inline int isnullstartblock(xfs_fsblock_t x)
+{
+        xfs_fsblock_t   masked = x & STARTBLOCKMASK;
+        return masked == STARTBLOCKMASK;
+}
+/* Return nonzero if every DSTARTBLOCKMASK bit is set in x. */
+static inline int isnulldstartblock(xfs_dfsbno_t x)
+{
+        xfs_dfsbno_t    masked = x & DSTARTBLOCKMASK;
+        return masked == DSTARTBLOCKMASK;
+}
+/*
+ * Build a startblock with all STARTBLOCKMASK bits set and k in the low bits.
+ * k is asserted to be below 1 << STARTBLOCKVALBITS.
+ */
+static inline xfs_fsblock_t nullstartblock(int k)
+{
+        xfs_fsblock_t   blk;
+        ASSERT(k < (1 << STARTBLOCKVALBITS));
+        blk = STARTBLOCKMASK | k;
+        return blk;
+}
+/* Return x with the STARTBLOCKMASK bits cleared. */
+static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
+{
+        xfs_fsblock_t   val = x & ~STARTBLOCKMASK;
+        return (xfs_filblks_t)val;
+}
+/*
+ * Possible extent formats.
+ */
+typedef enum {
+        XFS_EXTFMT_NOSTATE = 0,
+        XFS_EXTFMT_HASSTATE
+} xfs_exntfmt_t;
+/*
+ * Possible extent states.
+ */
+typedef enum {
+        XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+        XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
+} xfs_exntst_t;
+/*
+ * Incore version of above.
+ * NOTE(review): "above" appears to refer to the packed bmap btree record
+ * (xfs_bmbt_rec) defined earlier in this header, not the enums - confirm.
+ */
+typedef struct xfs_bmbt_irec
+{
+        xfs_fileoff_t   br_startoff;    /* starting file offset */
+        xfs_fsblock_t   br_startblock;  /* starting block number */
+        xfs_filblks_t   br_blockcount;  /* number of blocks */
+        xfs_exntst_t    br_state;       /* extent state */
+} xfs_bmbt_irec_t;
+/*
+ * Key structure for non-leaf levels of the tree.
+ */
+typedef struct xfs_bmbt_key {
+        __be64          br_startoff;    /* starting file offset */
+} xfs_bmbt_key_t, xfs_bmdr_key_t;
+/* btree pointer type */
+typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
+/*
+ * Generic Btree block format definitions
+ *
+ * This is a combination of the actual format used on disk for short and long
+ * format btrees.  The first three fields are shared by both format, but the
+ * pointers are different and should be used with care.
+ *
+ * To get the size of the actual short or long form headers please use the size
+ * macros below.  Never use sizeof(xfs_btree_block).
+ *
+ * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
+ * with the crc feature bit, and all accesses to them must be conditional on
+ * that flag.
+ */
+struct xfs_btree_block {
+        __be32          bb_magic;       /* magic number for block type */
+        __be16          bb_level;       /* 0 is a leaf */
+        __be16          bb_numrecs;     /* current # of data records */
+        union {
+                struct {
+                        __be32          bb_leftsib;
+                        __be32          bb_rightsib;
+                        __be64          bb_blkno;
+                        __be64          bb_lsn;
+                        uuid_t          bb_uuid;
+                        __be32          bb_owner;
+                        __le32          bb_crc; /* declared little-endian, unlike the rest */
+                } s;                    /* short form pointers */
+                struct  {
+                        __be64          bb_leftsib;
+                        __be64          bb_rightsib;
+                        __be64          bb_blkno;
+                        __be64          bb_lsn;
+                        uuid_t          bb_uuid;
+                        __be64          bb_owner;
+                        __le32          bb_crc; /* declared little-endian, unlike the rest */
+                        __be32          bb_pad; /* padding for alignment */
+                } l;                    /* long form pointers */
+        } bb_u;                         /* rest */
+};
+/* Header sizes without the CRC-only fields: 8 bytes + two sibling pointers. */
+#define XFS_BTREE_SBLOCK_LEN    16      /* size of a short form block */
+#define XFS_BTREE_LBLOCK_LEN    24      /* size of a long form block */
+/* sizes of CRC enabled btree blocks */
+#define XFS_BTREE_SBLOCK_CRC_LEN        (XFS_BTREE_SBLOCK_LEN + 40)
+#define XFS_BTREE_LBLOCK_CRC_LEN        (XFS_BTREE_LBLOCK_LEN + 48)
+/* Byte offsets of the checksum field in the short and long form headers. */
+#define XFS_BTREE_SBLOCK_CRC_OFF \
+        offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
+#define XFS_BTREE_LBLOCK_CRC_OFF \
+        offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
+#endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
new file mode 100644
index 000000000000..b62771f1f4b5
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -0,0 +1,2189 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_icreate_item.h"
+#include "xfs_icache.h"
+#include "xfs_dinode.h"
+#include "xfs_trace.h"
+/*
+ * Allocation group level functions.
+ */
+/*
+ * Return the superblock inode alignment when the alignment feature is
+ * enabled and that alignment is at least one inode cluster (measured in
+ * filesystem blocks); otherwise return 1.
+ */
+static inline int
+xfs_ialloc_cluster_alignment(
+        xfs_alloc_arg_t *args)
+{
+        if (!xfs_sb_version_hasalign(&args->mp->m_sb))
+                return 1;
+        if (args->mp->m_sb.sb_inoalignmt <
+             XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
+                return 1;
+        return args->mp->m_sb.sb_inoalignmt;
+}
+/*
+ * Search the btree behind cur for the record keyed by ino. The cursor's
+ * scratch record is primed with ino and zeroed free counts before the
+ * generic lookup runs.
+ */
+int                                     /* error */
+xfs_inobt_lookup(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_agino_t             ino,    /* starting inode of chunk */
+        xfs_lookup_t            dir,    /* <=, >=, == */
+        int                     *stat)  /* success/failure */
+{
+        int                     error;
+        cur->bc_rec.i.ir_free = 0;
+        cur->bc_rec.i.ir_freecount = 0;
+        cur->bc_rec.i.ir_startino = ino;
+        error = xfs_btree_lookup(cur, dir, stat);
+        return error;
+}
+/*
+ * Overwrite the record the cursor points at with the values in irec,
+ * converted to on-disk byte order.
+ * Returns 0 on success or an EFSCORRUPTED error.
+ */
+STATIC int                              /* error */
+xfs_inobt_update(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_inobt_rec_incore_t  *irec)  /* btree record */
+{
+        union xfs_btree_rec     disk_rec;
+        int                     error;
+        disk_rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
+        disk_rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+        disk_rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+        error = xfs_btree_update(cur, &disk_rec);
+        return error;
+}
+/*
+ * Fetch the record at the cursor and decode it into irec. irec is only
+ * written when the lookup succeeds and *stat is 1.
+ */
+int                                     /* error */
+xfs_inobt_get_rec(
+        struct xfs_btree_cur    *cur,   /* btree cursor */
+        xfs_inobt_rec_incore_t  *irec,  /* btree record */
+        int                     *stat)  /* output: success/failure */
+{
+        union xfs_btree_rec     *disk_rec;
+        int                     error;
+        error = xfs_btree_get_rec(cur, &disk_rec, stat);
+        if (error || *stat != 1)
+                return error;
+        /* Convert from on-disk big-endian to native byte order. */
+        irec->ir_startino = be32_to_cpu(disk_rec->inobt.ir_startino);
+        irec->ir_freecount = be32_to_cpu(disk_rec->inobt.ir_freecount);
+        irec->ir_free = be64_to_cpu(disk_rec->inobt.ir_free);
+        return 0;
+}
+/*
+ * Insert one inobt record at the cursor's current position, which the
+ * caller must already have set up. Only the free count and free mask of the
+ * cursor's scratch record are filled in here.
+ */
+STATIC int
+xfs_inobt_insert_rec(
+        struct xfs_btree_cur    *cur,
+        __int32_t               freecount,
+        xfs_inofree_t           free,
+        int                     *stat)
+{
+        int                     error;
+        cur->bc_rec.i.ir_free = free;
+        cur->bc_rec.i.ir_freecount = freecount;
+        error = xfs_btree_insert(cur, stat);
+        return error;
+}
+/*
+ * Add records for a freshly allocated run of inodes to the btree selected by
+ * btnum: one all-free record per XFS_INODES_PER_CHUNK inodes, covering
+ * [newino, newino + newlen).
+ */
+STATIC int
+xfs_inobt_insert(
+        struct xfs_mount        *mp,
+        struct xfs_trans        *tp,
+        struct xfs_buf          *agbp,
+        xfs_agino_t             newino,
+        xfs_agino_t             newlen,
+        xfs_btnum_t             btnum)
+{
+        struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
+        xfs_agnumber_t          agno = be32_to_cpu(agi->agi_seqno);
+        struct xfs_btree_cur    *cur;
+        xfs_agino_t             ino;
+        int                     error;
+        int                     stat;
+        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+        ino = newino;
+        while (ino < newino + newlen) {
+                /* The chunk must not be present yet... */
+                error = xfs_inobt_lookup(cur, ino, XFS_LOOKUP_EQ, &stat);
+                if (error)
+                        goto out_error;
+                ASSERT(stat == 0);
+                /* ...and the insert of its record must succeed. */
+                error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+                                             XFS_INOBT_ALL_FREE, &stat);
+                if (error)
+                        goto out_error;
+                ASSERT(stat == 1);
+                ino += XFS_INODES_PER_CHUNK;
+        }
+        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+        return 0;
+out_error:
+        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+        return error;
+}
+/*
+ * Debug-only check that the free inode count in the AGI matches the sum of
+ * the free counts of all records in the btree. Only single-level trees are
+ * walked; non-debug builds compile this down to 0.
+ */
+#ifdef DEBUG
+STATIC int
+xfs_check_agi_freecount(
+        struct xfs_btree_cur    *cur,
+        struct xfs_agi          *agi)
+{
+        xfs_inobt_rec_incore_t  rec;
+        int                     total = 0;
+        int                     error;
+        int                     stat;
+        if (cur->bc_nlevels != 1)
+                return 0;
+        error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
+        if (error)
+                return error;
+        for (;;) {
+                error = xfs_inobt_get_rec(cur, &rec, &stat);
+                if (error)
+                        return error;
+                if (!stat)
+                        break;
+                total += rec.ir_freecount;
+                error = xfs_btree_increment(cur, 0, &stat);
+                if (error)
+                        return error;
+                if (stat != 1)
+                        break;
+        }
+        /* Counts are not expected to match after a forced shutdown. */
+        if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
+                ASSERT(total == be32_to_cpu(agi->agi_freecount));
+        return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur, agi)       0
+#endif
+/*
+ * Initialise a new set of inodes. When called without a transaction context
+ * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
+ * than logging them (which in a transaction context puts them into the AIL
+ * for writeback rather than the xfsbufd queue).
+ *
+ * The extent of @length blocks starting at @agbno in AG @agno is processed in
+ * cluster sized buffers (length / blocks-per-cluster of them). Each buffer is
+ * zeroed and every inode slot in it is stamped with the dinode magic number,
+ * the on-disk version, @gen as the generation number and a NULLAGINO unlinked
+ * pointer. @buffer_list is only used when @tp is NULL: the buffers are then
+ * queued on it for delayed write and released.
+ *
+ * Returns 0 on success or -ENOMEM if a cluster buffer could not be obtained.
+ */
+int
+xfs_ialloc_inode_init(
+        struct xfs_mount        *mp,
+        struct xfs_trans        *tp,
+        struct list_head        *buffer_list,
+        xfs_agnumber_t          agno,
+        xfs_agblock_t           agbno,
+        xfs_agblock_t           length,
+        unsigned int            gen)
+{
+        struct xfs_buf          *fbuf;
+        struct xfs_dinode       *free;
+        int                     nbufs, blks_per_cluster, inodes_per_cluster;
+        int                     version;
+        int                     i, j;
+        xfs_daddr_t             d;
+        xfs_ino_t               ino = 0;
+        /*
+         * Loop over the new block(s), filling in the inodes.  For small block
+         * sizes, manipulate the inodes in buffers which are multiples of the
+         * block size.
+         */
+        blks_per_cluster = xfs_icluster_size_fsb(mp);
+        inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+        nbufs = length / blks_per_cluster;
+        /*
+         * Figure out what version number to use in the inodes we create.  If
+         * the superblock version has caught up to the one that supports the new
+         * inode format, then use the new inode version.  Otherwise use the old
+         * version so that old kernels will continue to be able to use the file
+         * system.
+         *
+         * For v3 inodes, we also need to write the inode number into the inode,
+         * so calculate the first inode number of the chunk here as
+         * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
+         * across multiple filesystem blocks (such as a cluster) and so cannot
+         * be used in the cluster buffer loop below.
+         *
+         * Further, because we are writing the inode directly into the buffer
+         * and calculating a CRC on the entire inode, we have to log the entire
+         * inode so that the entire range the CRC covers is present in the log.
+         * That means for v3 inodes we log the entire buffer rather than just
+         * the inode cores.
+         */
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                version = 3;
+                ino = XFS_AGINO_TO_INO(mp, agno,
+                                       XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
+                /*
+                 * Log the initialisation that is about to take place as a
+                 * logical operation. This means the transaction does not
+                 * need to log the physical changes to the inode buffers as log
+                 * recovery will know what initialisation is actually needed.
+                 * Hence we only need to log the buffers as "ordered" buffers so
+                 * they track in the AIL as if they were physically logged.
+                 */
+                if (tp)
+                        xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
+                                        mp->m_sb.sb_inodesize, length, gen);
+        } else
+                version = 2;
+        for (j = 0; j < nbufs; j++) {
+                /*
+                 * Get the buffer covering cluster j of the extent.
+                 */
+                d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+                fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+                                         mp->m_bsize * blks_per_cluster,
+                                         XBF_UNMAPPED);
+                if (!fbuf)
+                        return -ENOMEM;
+                /* Initialize the inode buffers and log them appropriately. */
+                fbuf->b_ops = &xfs_inode_buf_ops;
+                xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
+                for (i = 0; i < inodes_per_cluster; i++) {
+                        int     ioffset = i << mp->m_sb.sb_inodelog;
+                        uint    isize = xfs_dinode_size(version);
+                        free = xfs_make_iptr(mp, fbuf, i);
+                        free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+                        free->di_version = version;
+                        free->di_gen = cpu_to_be32(gen);
+                        free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+                        if (version == 3) {
+                                free->di_ino = cpu_to_be64(ino);
+                                ino++;
+                                uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
+                                xfs_dinode_calc_crc(mp, free);
+                        } else if (tp) {
+                                /*
+                                 * just log the inode core: isize bytes
+                                 * starting at this inode's offset in the
+                                 * buffer
+                                 */
+                                xfs_trans_log_buf(tp, fbuf, ioffset,
+                                                  ioffset + isize - 1);
+                        }
+                }
+                if (tp) {
+                        /*
+                         * Mark the buffer as an inode allocation buffer so it
+                         * sticks in AIL at the point of this allocation
+                         * transaction. This ensures they are on disk before
+                         * the tail of the log can be moved past this
+                         * transaction (i.e. by preventing relogging from moving
+                         * it forward in the log).
+                         */
+                        xfs_trans_inode_alloc_buf(tp, fbuf);
+                        if (version == 3) {
+                                /*
+                                 * Mark the buffer as ordered so that they are
+                                 * not physically logged in the transaction but
+                                 * still tracked in the AIL as part of the
+                                 * transaction and pin the log appropriately.
+                                 */
+                                xfs_trans_ordered_buf(tp, fbuf);
+                                xfs_trans_log_buf(tp, fbuf, 0,
+                                                  BBTOB(fbuf->b_length) - 1);
+                        }
+                } else {
+                        fbuf->b_flags |= XBF_DONE;
+                        xfs_buf_delwri_queue(fbuf, buffer_list);
+                        xfs_buf_relse(fbuf);
+                }
+        }
+        return 0;
+}
+/*
+ * Allocate new inodes in the allocation group specified by agbp.
+ * Return 0 for success, else error code.
+ *
+ * On a 0 return, *alloc reports whether a chunk was actually added: 1 if an
+ * extent was found and the new inodes were initialised and recorded, 0 if no
+ * suitable extent could be allocated. *alloc is not written on the error
+ * returns, including the -ENOSPC returned when adding m_ialloc_inos inodes
+ * would push sb_icount past a non-zero m_maxicount.
+ *
+ * Up to three extent allocations are attempted, in this order: an exact
+ * allocation directly after the most recently allocated chunk (agi_newino),
+ * a near allocation around agi_root using stripe unit or cluster alignment,
+ * and, if the stripe aligned attempt found nothing, a near allocation with
+ * cluster alignment.
+ */
+STATIC int                              /* error code or 0 */
+xfs_ialloc_ag_alloc(
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_buf_t       *agbp,          /* alloc group buffer */
+        int             *alloc)         /* out: 1 if inodes were allocated */
+{
+        xfs_agi_t       *agi;           /* allocation group header */
+        xfs_alloc_arg_t args;           /* allocation argument structure */
+        xfs_agnumber_t  agno;
+        int             error;
+        xfs_agino_t     newino;         /* new first inode's number */
+        xfs_agino_t     newlen;         /* new number of inodes */
+        int             isaligned = 0;  /* inode allocation at stripe unit */
+                                        /* boundary */
+        struct xfs_perag *pag;
+        memset(&args, 0, sizeof(args));
+        args.tp = tp;
+        args.mp = tp->t_mountp;
+        /*
+         * Locking will ensure that we don't have two callers in here
+         * at one time.
+         */
+        newlen = args.mp->m_ialloc_inos;
+        if (args.mp->m_maxicount &&
+            args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
+                return -ENOSPC;
+        args.minlen = args.maxlen = args.mp->m_ialloc_blks;
+        /*
+         * First try to allocate inodes contiguous with the last-allocated
+         * chunk of inodes.  If the filesystem is striped, this will fill
+         * an entire stripe unit with inodes.
+         */
+        agi = XFS_BUF_TO_AGI(agbp);
+        newino = be32_to_cpu(agi->agi_newino);
+        agno = be32_to_cpu(agi->agi_seqno);
+        args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
+                     args.mp->m_ialloc_blks;
+        if (likely(newino != NULLAGINO &&
+                  (args.agbno < be32_to_cpu(agi->agi_length)))) {
+                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+                args.type = XFS_ALLOCTYPE_THIS_BNO;
+                args.prod = 1;
+                /*
+                 * We need to take into account alignment here to ensure that
+                 * we don't modify the free list if we fail to have an exact
+                 * block. If we don't have an exact match, and every other
+                 * allocation attempt fails, we'll end up cancelling
+                 * a dirty transaction and shutting down.
+                 *
+                 * For an exact allocation, alignment must be 1,
+                 * however we need to take cluster alignment into account when
+                 * fixing up the freelist. Use the minalignslop field to
+                 * indicate that extra blocks might be required for alignment,
+                 * but not to use them in the actual exact allocation.
+                 */
+                args.alignment = 1;
+                args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
+                /* Allow space for the inode btree to split. */
+                args.minleft = args.mp->m_in_maxlevels - 1;
+                if ((error = xfs_alloc_vextent(&args)))
+                        return error;
+                /*
+                 * This request might have dirtied the transaction if the AG can
+                 * satisfy the request, but the exact block was not available.
+                 * If the allocation did fail, subsequent requests will relax
+                 * the exact agbno requirement and increase the alignment
+                 * instead. It is critical that the total size of the request
+                 * (len + alignment + slop) does not increase from this point
+                 * on, so reset minalignslop to ensure it is not included in
+                 * subsequent requests.
+                 */
+                args.minalignslop = 0;
+        } else
+                args.fsbno = NULLFSBLOCK;
+        if (unlikely(args.fsbno == NULLFSBLOCK)) {
+                /*
+                 * Set the alignment for the allocation.
+                 * If stripe alignment is turned on then align at stripe unit
+                 * boundary.
+                 * If the cluster size is smaller than a filesystem block
+                 * then we're doing I/O for inodes in filesystem block size
+                 * pieces, so don't need alignment anyway.
+                 */
+                isaligned = 0;
+                if (args.mp->m_sinoalign) {
+                        ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
+                        args.alignment = args.mp->m_dalign;
+                        isaligned = 1;
+                } else
+                        args.alignment = xfs_ialloc_cluster_alignment(&args);
+                /*
+                 * Need to figure out where to allocate the inode blocks.
+                 * Ideally they should be spaced out through the a.g.
+                 * For now, just allocate blocks up front.
+                 */
+                args.agbno = be32_to_cpu(agi->agi_root);
+                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+                /*
+                 * Allocate a fixed-size extent of inodes.
+                 */
+                args.type = XFS_ALLOCTYPE_NEAR_BNO;
+                args.prod = 1;
+                /*
+                 * Allow space for the inode btree to split.
+                 */
+                args.minleft = args.mp->m_in_maxlevels - 1;
+                if ((error = xfs_alloc_vextent(&args)))
+                        return error;
+        }
+        /*
+         * If stripe alignment is turned on, then try again with cluster
+         * alignment.
+         */
+        if (isaligned && args.fsbno == NULLFSBLOCK) {
+                args.type = XFS_ALLOCTYPE_NEAR_BNO;
+                args.agbno = be32_to_cpu(agi->agi_root);
+                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+                args.alignment = xfs_ialloc_cluster_alignment(&args);
+                if ((error = xfs_alloc_vextent(&args)))
+                        return error;
+        }
+        if (args.fsbno == NULLFSBLOCK) {
+                *alloc = 0;
+                return 0;
+        }
+        ASSERT(args.len == args.minlen);
+        /*
+         * Stamp and write the inode buffers.
+         *
+         * Seed the new inode cluster with a random generation number. This
+         * prevents short-term reuse of generation numbers if a chunk is
+         * freed and then immediately reallocated. We use random numbers
+         * rather than a linear progression to prevent the next generation
+         * number from being easily guessable.
+         */
+        error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
+                        args.len, prandom_u32());
+        if (error)
+                return error;
+        /*
+         * Convert the results: the new chunk starts at the first inode of
+         * the allocated extent, and both the on-disk AGI counters and the
+         * in-core per-AG free count grow by newlen.
+         */
+        newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+        be32_add_cpu(&agi->agi_count, newlen);
+        be32_add_cpu(&agi->agi_freecount, newlen);
+        pag = xfs_perag_get(args.mp, agno);
+        pag->pagi_freecount += newlen;
+        xfs_perag_put(pag);
+        agi->agi_newino = cpu_to_be32(newino);
+        /*
+         * Insert records describing the new inode chunk into the btrees.
+         * The free inode btree is only updated when the superblock says the
+         * filesystem has one.
+         */
+        error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+                                 XFS_BTNUM_INO);
+        if (error)
+                return error;
+        if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+                error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+                                         XFS_BTNUM_FINO);
+                if (error)
+                        return error;
+        }
+        /*
+         * Log allocation group header fields
+         */
+        xfs_ialloc_log_agi(tp, agbp,
+                XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
+        /*
+         * Modify/log superblock values for inode count and inode free count.
+         */
+        xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
+        xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
+        *alloc = 1;
+        return 0;
+}
+/*
+ * Hand out the current value of the mount's AG rotor and advance the rotor by
+ * one, wrapping back to zero once it reaches m_maxagi. The read and the
+ * update are done under m_agirotor_lock.
+ */
+STATIC xfs_agnumber_t
+xfs_ialloc_next_ag(
+        xfs_mount_t     *mp)
+{
+        xfs_agnumber_t  picked;
+        spin_lock(&mp->m_agirotor_lock);
+        picked = mp->m_agirotor;
+        mp->m_agirotor++;
+        if (mp->m_agirotor >= mp->m_maxagi)
+                mp->m_agirotor = 0;
+        spin_unlock(&mp->m_agirotor_lock);
+        return picked;
+}
+/*
+ * Select an allocation group to look for a free inode in, based on the parent
+ * inode and the mode.  Return the number of the selected allocation group, or
+ * NULLAGNUMBER if the filesystem is shutting down or no group qualified.
+ *
+ * Directories start the search at the AG handed out by the rotor
+ * (xfs_ialloc_next_ag()); everything else starts at the parent's AG. A group
+ * is chosen if it already has free inodes, or - when okalloc is set - if its
+ * free space looks sufficient for a new inode chunk. The groups are scanned
+ * at most twice: first with XFS_ALLOC_FLAG_TRYLOCK passed to
+ * xfs_alloc_pagf_init(), then with no flags.
+ */
+STATIC xfs_agnumber_t
+xfs_ialloc_ag_select(
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_ino_t       parent,         /* parent directory inode number */
+        umode_t         mode,           /* bits set to indicate file type */
+        int             okalloc)        /* ok to allocate more space */
+{
+        xfs_agnumber_t  agcount;        /* number of ag's in the filesystem */
+        xfs_agnumber_t  agno;           /* current ag number */
+        int             flags;          /* alloc buffer locking flags */
+        xfs_extlen_t    ineed;          /* blocks needed for inode allocation */
+        xfs_extlen_t    longest = 0;    /* longest extent available */
+        xfs_mount_t     *mp;            /* mount point structure */
+        int             needspace;      /* file mode implies space allocated */
+        xfs_perag_t     *pag;           /* per allocation group data */
+        xfs_agnumber_t  pagno;          /* parent (starting) ag number */
+        int             error;
+        /*
+         * Files of these types need at least one block if length > 0
+         * (and they won't fit in the inode, but that's hard to figure out).
+         */
+        needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
+        mp = tp->t_mountp;
+        agcount = mp->m_maxagi;
+        if (S_ISDIR(mode))
+                pagno = xfs_ialloc_next_ag(mp);
+        else {
+                pagno = XFS_INO_TO_AGNO(mp, parent);
+                if (pagno >= agcount)
+                        pagno = 0;
+        }
+        ASSERT(pagno < agcount);
+        /*
+         * Loop through allocation groups, looking for one with a little
+         * free space in it.  Note we don't look for free inodes, exactly.
+         * Instead, we include whether there is a need to allocate inodes
+         * to mean that blocks must be allocated for them,
+         * if none are currently free.
+         */
+        agno = pagno;
+        flags = XFS_ALLOC_FLAG_TRYLOCK;
+        for (;;) {
+                pag = xfs_perag_get(mp, agno);
+                if (!pag->pagi_inodeok) {
+                        xfs_ialloc_next_ag(mp);
+                        goto nextag;
+                }
+                if (!pag->pagi_init) {
+                        error = xfs_ialloc_pagi_init(mp, tp, agno);
+                        if (error)
+                                goto nextag;
+                }
+                if (pag->pagi_freecount) {
+                        xfs_perag_put(pag);
+                        return agno;
+                }
+                if (!okalloc)
+                        goto nextag;
+                if (!pag->pagf_init) {
+                        error = xfs_alloc_pagf_init(mp, tp, agno, flags);
+                        if (error)
+                                goto nextag;
+                }
+                /*
+                 * Is there enough free space for the file plus a block of
+                 * inodes? (if we need to allocate some)?
+                 *
+                 * If the AG reports no longest free extent, fall back to
+                 * treating a non-empty free list (pagf_flcount > 0) as a
+                 * longest extent of one block.
+                 */
+                ineed = mp->m_ialloc_blks;
+                longest = pag->pagf_longest;
+                if (!longest)
+                        longest = pag->pagf_flcount > 0;
+                if (pag->pagf_freeblks >= needspace + ineed &&
+                    longest >= ineed) {
+                        xfs_perag_put(pag);
+                        return agno;
+                }
+nextag:
+                xfs_perag_put(pag);
+                /*
+                 * No point in iterating over the rest, if we're shutting
+                 * down.
+                 */
+                if (XFS_FORCED_SHUTDOWN(mp))
+                        return NULLAGNUMBER;
+                agno++;
+                if (agno >= agcount)
+                        agno = 0;
+                if (agno == pagno) {
+                        if (flags == 0)
+                                return NULLAGNUMBER;
+                        flags = 0;
+                }
+        }
+}
+/*
+ * Step the cursor one record to the left (@left non-zero) or to the right
+ * (@left zero) and, if a record exists there, read it into @rec.
+ *
+ * *done is set to 1 when the step ran off the tree and to 0 otherwise.
+ * Returns 0 on success or the error from the btree operation.
+ */
+STATIC int
+xfs_ialloc_next_rec(
+        struct xfs_btree_cur    *cur,
+        xfs_inobt_rec_incore_t  *rec,
+        int                     *done,
+        int                     left)
+{
+        int                     error;
+        int                     i;
+        error = left ? xfs_btree_decrement(cur, 0, &i) :
+                       xfs_btree_increment(cur, 0, &i);
+        if (error)
+                return error;
+        if (!i) {
+                /* nothing in that direction */
+                *done = 1;
+                return 0;
+        }
+        *done = 0;
+        error = xfs_inobt_get_rec(cur, rec, &i);
+        if (error)
+                return error;
+        XFS_WANT_CORRUPTED_RETURN(i == 1);
+        return 0;
+}
+/*
+ * Look up the record that starts exactly at @agino and, if it exists, read it
+ * into @rec.
+ *
+ * *done is set to 1 when no such record was found and to 0 otherwise.
+ * Returns 0 on success or the error from the btree operation.
+ */
+STATIC int
+xfs_ialloc_get_rec(
+        struct xfs_btree_cur    *cur,
+        xfs_agino_t             agino,
+        xfs_inobt_rec_incore_t  *rec,
+        int                     *done)
+{
+        int                     error;
+        int                     i;
+        error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
+        if (error)
+                return error;
+        if (!i) {
+                /* no record at that inode number */
+                *done = 1;
+                return 0;
+        }
+        *done = 0;
+        error = xfs_inobt_get_rec(cur, rec, &i);
+        if (error)
+                return error;
+        XFS_WANT_CORRUPTED_RETURN(i == 1);
+        return 0;
+}
+/*
+ * Allocate an inode using the inobt-only algorithm.
+ *
+ * tp     - transaction the AGI and inobt updates are made in
+ * agbp   - buffer holding the AGI of the allocation group to allocate from
+ * parent - inode number of the parent; when it lives in this AG the search
+ *          starts at the parent's chunk and spreads out left and right
+ * inop   - out: the inode number allocated, written only on success
+ *
+ * The chunk is chosen in this order: the parent's own chunk, the nearest
+ * chunk with a free inode within a bounded search distance of the parent,
+ * the most recently allocated chunk (agi_newino), and finally the first
+ * chunk with a free inode found by scanning the AG from the start. The
+ * positions reached by a bounded search are cached in the perag
+ * (pagl_leftrec, pagl_rightrec, pagl_pagino) so that the next allocation
+ * for the same parent can resume from there.
+ *
+ * Returns 0 on success or an error from the btree operations. The perag
+ * reference and all cursors are dropped before returning.
+ */
+STATIC int
+xfs_dialloc_ag_inobt(
+        struct xfs_trans        *tp,
+        struct xfs_buf          *agbp,
+        xfs_ino_t               parent,
+        xfs_ino_t               *inop)
+{
+        struct xfs_mount        *mp = tp->t_mountp;
+        struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
+        xfs_agnumber_t          agno = be32_to_cpu(agi->agi_seqno);
+        xfs_agnumber_t          pagno = XFS_INO_TO_AGNO(mp, parent);
+        xfs_agino_t             pagino = XFS_INO_TO_AGINO(mp, parent);
+        struct xfs_perag        *pag;
+        struct xfs_btree_cur    *cur, *tcur;
+        struct xfs_inobt_rec_incore rec, trec;
+        xfs_ino_t               ino;
+        int                     error;
+        int                     offset;
+        int                     i, j;
+        pag = xfs_perag_get(mp, agno);
+        ASSERT(pag->pagi_init);
+        ASSERT(pag->pagi_inodeok);
+        ASSERT(pag->pagi_freecount > 0);
+ restart_pagno:
+        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+        /*
+         * If pagino is 0 (this is the root inode allocation) use newino.
+         * This must work because we've just allocated some.
+         */
+        if (!pagino)
+                pagino = be32_to_cpu(agi->agi_newino);
+        error = xfs_check_agi_freecount(cur, agi);
+        if (error)
+                goto error0;
+        /*
+         * If in the same AG as the parent, try to get near the parent.
+         */
+        if (pagno == agno) {
+                int             doneleft;       /* done, to the left */
+                int             doneright;      /* done, to the right */
+                int             searchdistance = 10;
+                error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+                if (error)
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                error = xfs_inobt_get_rec(cur, &rec, &j);
+                if (error)
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
+                if (rec.ir_freecount > 0) {
+                        /*
+                         * Found a free inode in the same chunk
+                         * as the parent, done.
+                         */
+                        goto alloc_inode;
+                }
+                /*
+                 * In the same AG as parent, but parent's chunk is full.
+                 */
+                /* duplicate the cursor, search left & right simultaneously */
+                error = xfs_btree_dup_cursor(cur, &tcur);
+                if (error)
+                        goto error0;
+                /*
+                 * Skip to last blocks looked up if same parent inode.
+                 */
+                if (pagino != NULLAGINO &&
+                    pag->pagl_pagino == pagino &&
+                    pag->pagl_leftrec != NULLAGINO &&
+                    pag->pagl_rightrec != NULLAGINO) {
+                        error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+                                                   &trec, &doneleft);
+                        if (error)
+                                goto error1;
+                        error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+                                                   &rec, &doneright);
+                        if (error)
+                                goto error1;
+                } else {
+                        /* search left with tcur, back up 1 record */
+                        error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+                        if (error)
+                                goto error1;
+                        /* search right with cur, go forward 1 record. */
+                        error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+                        if (error)
+                                goto error1;
+                }
+                /*
+                 * Loop until we find an inode chunk with a free inode.
+                 */
+                while (!doneleft || !doneright) {
+                        int     useleft;  /* using left inode chunk this time */
+                        if (!--searchdistance) {
+                                /*
+                                 * Not in range - save last search
+                                 * location and allocate a new inode
+                                 */
+                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+                                pag->pagl_leftrec = trec.ir_startino;
+                                pag->pagl_rightrec = rec.ir_startino;
+                                pag->pagl_pagino = pagino;
+                                goto newino;
+                        }
+                        /* figure out the closer block if both are valid. */
+                        if (!doneleft && !doneright) {
+                                useleft = pagino -
+                                 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+                                  rec.ir_startino - pagino;
+                        } else {
+                                useleft = !doneleft;
+                        }
+                        /* free inodes to the left? */
+                        if (useleft && trec.ir_freecount) {
+                                xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+                                cur = tcur;
+                                /*
+                                 * Save the search position while rec still
+                                 * holds the right hand record; only then
+                                 * switch rec over to the left hand record
+                                 * we are about to allocate from. Copying
+                                 * first would cache the left record's start
+                                 * inode as the right hand position.
+                                 */
+                                pag->pagl_leftrec = trec.ir_startino;
+                                pag->pagl_rightrec = rec.ir_startino;
+                                pag->pagl_pagino = pagino;
+                                rec = trec;
+                                goto alloc_inode;
+                        }
+                        /* free inodes to the right? */
+                        if (!useleft && rec.ir_freecount) {
+                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+                                pag->pagl_leftrec = trec.ir_startino;
+                                pag->pagl_rightrec = rec.ir_startino;
+                                pag->pagl_pagino = pagino;
+                                goto alloc_inode;
+                        }
+                        /* get next record to check */
+                        if (useleft) {
+                                error = xfs_ialloc_next_rec(tcur, &trec,
+                                                                 &doneleft, 1);
+                        } else {
+                                error = xfs_ialloc_next_rec(cur, &rec,
+                                                                 &doneright, 0);
+                        }
+                        if (error)
+                                goto error1;
+                }
+                /*
+                 * We've reached the end of the btree. Because
+                 * we are only searching a small chunk of the
+                 * btree each search, there are obviously free
+                 * inodes closer to the parent inode than we
+                 * are now. Forget the cached search position
+                 * and restart the search.
+                 */
+                pag->pagl_pagino = NULLAGINO;
+                pag->pagl_leftrec = NULLAGINO;
+                pag->pagl_rightrec = NULLAGINO;
+                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+                xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+                goto restart_pagno;
+        }
+        /*
+         * In a different AG from the parent.
+         * See if the most recently allocated block has any free.
+         */
+newino:
+        if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+                error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+                                         XFS_LOOKUP_EQ, &i);
+                if (error)
+                        goto error0;
+                if (i == 1) {
+                        error = xfs_inobt_get_rec(cur, &rec, &j);
+                        if (error)
+                                goto error0;
+                        if (j == 1 && rec.ir_freecount > 0) {
+                                /*
+                                 * The last chunk allocated in the group
+                                 * still has a free inode.
+                                 */
+                                goto alloc_inode;
+                        }
+                }
+        }
+        /*
+         * None left in the last group, search the whole AG
+         */
+        error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+        if (error)
+                goto error0;
+        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+        for (;;) {
+                error = xfs_inobt_get_rec(cur, &rec, &i);
+                if (error)
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                if (rec.ir_freecount > 0)
+                        break;
+                error = xfs_btree_increment(cur, 0, &i);
+                if (error)
+                        goto error0;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+        }
+alloc_inode:
+        /*
+         * Take the lowest numbered free inode of the chosen chunk: clear its
+         * bit in the free mask, drop the record's free count and write the
+         * record back through the cursor.
+         */
+        offset = xfs_lowbit64(rec.ir_free);
+        ASSERT(offset >= 0);
+        ASSERT(offset < XFS_INODES_PER_CHUNK);
+        ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+                                   XFS_INODES_PER_CHUNK) == 0);
+        ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+        rec.ir_free &= ~XFS_INOBT_MASK(offset);
+        rec.ir_freecount--;
+        error = xfs_inobt_update(cur, &rec);
+        if (error)
+                goto error0;
+        /* keep the AGI, the in-core perag and the superblock counts in step */
+        be32_add_cpu(&agi->agi_freecount, -1);
+        xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+        pag->pagi_freecount--;
+        error = xfs_check_agi_freecount(cur, agi);
+        if (error)
+                goto error0;
+        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+        xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+        xfs_perag_put(pag);
+        *inop = ino;
+        return 0;
+error1:
+        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+error0:
+        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+        xfs_perag_put(pag);
+        return error;
+}
+/*
+ * Use the free inode btree to allocate an inode based on distance from the
+ * parent. Note that the provided cursor may be deleted and replaced.
+ *
+ * pagino is the AG-relative inode number to search near. *ocur is the caller's
+ * cursor; it is used for a LE lookup of pagino and, unless the record found
+ * that way already covers pagino, a duplicate of it is used for a GE lookup.
+ * The selected record is left in *rec and the cursor positioned at it is left
+ * in *ocur; the other cursor is deleted here.
+ *
+ * Returns 0 on success. On failure the error is returned, only the duplicate
+ * cursor (if one was created) is deleted and *ocur is not modified, so the
+ * caller still owns the cursor it passed in.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_near(
+        xfs_agino_t                     pagino,
+        struct xfs_btree_cur            **ocur,
+        struct xfs_inobt_rec_incore     *rec)
+{
+        struct xfs_btree_cur            *lcur = *ocur;  /* left search cursor */
+        struct xfs_btree_cur            *rcur;  /* right search cursor */
+        struct xfs_inobt_rec_incore     rrec;   /* record at the right cursor */
+        int                             error;
+        int                             i, j;   /* 1 if left/right found a rec */
+        error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
+        if (error)
+                return error;
+        if (i == 1) {
+                error = xfs_inobt_get_rec(lcur, rec, &i);
+                if (error)
+                        return error;
+                XFS_WANT_CORRUPTED_RETURN(i == 1);
+                /*
+                 * See if we've landed in the parent inode record. The finobt
+                 * only tracks chunks with at least one free inode, so record
+                 * existence is enough.
+                 */
+                if (pagino >= rec->ir_startino &&
+                    pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
+                        return 0;
+        }
+        error = xfs_btree_dup_cursor(lcur, &rcur);
+        if (error)
+                return error;
+        error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
+        if (error)
+                goto error_rcur;
+        if (j == 1) {
+                error = xfs_inobt_get_rec(rcur, &rrec, &j);
+                if (error)
+                        goto error_rcur;
+                XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur);
+        }
+        XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur); /* need a record */
+        if (i == 1 && j == 1) {
+                /*
+                 * Both the left and right records are valid. Choose the closer
+                 * inode chunk to the target.
+                 *
+                 * NOTE(review): the left-hand distance is computed as
+                 * (pagino - ir_startino) plus XFS_INODES_PER_CHUNK - 1, which
+                 * biases the choice towards the right record; confirm that
+                 * adding (rather than subtracting) the chunk size is intended.
+                 */
+                if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
+                    (rrec.ir_startino - pagino)) {
+                        *rec = rrec;
+                        xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+                        *ocur = rcur;   /* caller now owns the right cursor */
+                } else {
+                        xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+                }
+        } else if (j == 1) {
+                /* only the right record is valid */
+                *rec = rrec;
+                xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+                *ocur = rcur;   /* caller now owns the right cursor */
+        } else if (i == 1) {
+                /* only the left record is valid */
+                xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+        }
+        return 0;
+error_rcur:
+        xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
+        return error;
+}
+/*
+ * Use the free inode btree to find a free inode based on a newino hint. If
+ * the hint is NULL, find the first free inode in the AG.
+ *
+ * agi is the AGI header whose agi_newino field supplies the hint, cur is the
+ * finobt cursor to search with and rec receives the record that was found.
+ * If the hint is set but has no exact match in the tree, the search falls
+ * back to the first record in the AG as well.
+ *
+ * Returns 0 with *rec filled in, or the error from the failing lookup or
+ * record read. The XFS_WANT_CORRUPTED_RETURN checks bail out if a lookup that
+ * must succeed does not yield a record.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_newino(
+        struct xfs_agi                  *agi,
+        struct xfs_btree_cur            *cur,
+        struct xfs_inobt_rec_incore     *rec)
+{
+        int error;
+        int i;
+        if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+                /*
+                 * agi_newino is stored in disk (big-endian) byte order; the
+                 * lookup key must be in host order or the hint is searched
+                 * for at a byte-swapped inode number on little-endian hosts.
+                 */
+                error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+                                         XFS_LOOKUP_EQ, &i);
+                if (error)
+                        return error;
+                if (i == 1) {
+                        error = xfs_inobt_get_rec(cur, rec, &i);
+                        if (error)
+                                return error;
+                        XFS_WANT_CORRUPTED_RETURN(i == 1);
+                        return 0;
+                }
+        }
+        /*
+         * Find the first inode available in the AG.
+         */
+        error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+        if (error)
+                return error;
+        XFS_WANT_CORRUPTED_RETURN(i == 1);
+        error = xfs_inobt_get_rec(cur, rec, &i);
+        if (error)
+                return error;
+        XFS_WANT_CORRUPTED_RETURN(i == 1);
+        return 0;
+}
+/*
+ * Mirror a finobt allocation into the inobt.
+ *
+ * The inobt record starting at frec->ir_startino is looked up, the inode at
+ * the given offset is marked allocated in it, and the result is required to
+ * match the (already modified) finobt record before it is written back.
+ *
+ * Returns 0 on success or the error from the lookup, record read or update.
+ */
+STATIC int
+xfs_dialloc_ag_update_inobt(
+        struct xfs_btree_cur            *cur,   /* inobt cursor */
+        struct xfs_inobt_rec_incore     *frec,  /* finobt record */
+        int                             offset) /* inode offset */
+{
+        struct xfs_inobt_rec_incore     rec;    /* inobt view of the chunk */
+        int                             error, i;
+        /* The chunk must already be present in the inobt. */
+        error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
+        if (error)
+                return error;
+        XFS_WANT_CORRUPTED_RETURN(i == 1);
+        error = xfs_inobt_get_rec(cur, &rec, &i);
+        if (error)
+                return error;
+        XFS_WANT_CORRUPTED_RETURN(i == 1);
+        ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
+                                   XFS_INODES_PER_CHUNK) == 0);
+        /* Apply the same change that was made to the finobt record. */
+        rec.ir_freecount--;
+        rec.ir_free &= ~XFS_INOBT_MASK(offset);
+        /* Both trees must now describe the chunk identically. */
+        XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
+                                  (rec.ir_freecount == frec->ir_freecount));
+        return xfs_inobt_update(cur, &rec);
+}
+/*
+ * Allocate an inode using the free inode btree, if available. Otherwise, fall
+ * back to the inobt search algorithm.
+ *
+ * The caller selected an AG for us, and made sure that free inodes are
+ * available.
+ *
+ * tp is the transaction, agbp the AGI buffer of the selected AG, parent the
+ * inode number of the parent and inop the location that receives the number
+ * of the allocated inode.
+ *
+ * Returns 0 with *inop set on success. On failure the error is returned,
+ * *inop is not written, and the cursors created so far and the perag
+ * reference are released.
+ */
+STATIC int
+xfs_dialloc_ag(
+        struct xfs_trans        *tp,
+        struct xfs_buf          *agbp,
+        xfs_ino_t               parent,
+        xfs_ino_t               *inop)
+{
+        struct xfs_mount                *mp = tp->t_mountp;
+        struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
+        xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
+        xfs_agnumber_t                  pagno = XFS_INO_TO_AGNO(mp, parent);
+        xfs_agino_t                     pagino = XFS_INO_TO_AGINO(mp, parent);
+        struct xfs_perag                *pag;
+        struct xfs_btree_cur            *cur;   /* finobt cursor */
+        struct xfs_btree_cur            *icur;  /* inobt cursor */
+        struct xfs_inobt_rec_incore     rec;    /* finobt record being used */
+        xfs_ino_t                       ino;    /* inode number handed out */
+        int                             error;
+        int                             offset; /* inode index within chunk */
+        int                             i;
+        if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+                return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
+        pag = xfs_perag_get(mp, agno);
+        /*
+         * If pagino is 0 (this is the root inode allocation) use newino.
+         * This must work because we've just allocated some.
+         */
+        if (!pagino)
+                pagino = be32_to_cpu(agi->agi_newino);
+        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+        error = xfs_check_agi_freecount(cur, agi);
+        if (error)
+                goto error_cur;
+        /*
+         * The search algorithm depends on whether we're in the same AG as the
+         * parent. If so, find the closest available inode to the parent. If
+         * not, consider the agi hint or find the first free inode in the AG.
+         *
+         * The near search takes &cur because it may replace the cursor with
+         * a duplicate positioned at the record it selected.
+         */
+        if (agno == pagno)
+                error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
+        else
+                error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
+        if (error)
+                goto error_cur;
+        offset = xfs_lowbit64(rec.ir_free);
+        ASSERT(offset >= 0);
+        ASSERT(offset < XFS_INODES_PER_CHUNK);
+        ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+                                   XFS_INODES_PER_CHUNK) == 0);
+        ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+        /*
+         * Modify or remove the finobt record.
+         *
+         * The record is deleted once its free count drops to zero and is
+         * updated in place otherwise.
+         */
+        rec.ir_free &= ~XFS_INOBT_MASK(offset);
+        rec.ir_freecount--;
+        if (rec.ir_freecount)
+                error = xfs_inobt_update(cur, &rec);
+        else
+                error = xfs_btree_delete(cur, &i);
+        if (error)
+                goto error_cur;
+        /*
+         * The finobt has now been updated appropriately. We haven't updated the
+         * agi and superblock yet, so we can create an inobt cursor and validate
+         * the original freecount. If all is well, make the equivalent update to
+         * the inobt using the finobt record and offset information.
+         */
+        icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+        error = xfs_check_agi_freecount(icur, agi);
+        if (error)
+                goto error_icur;
+        error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
+        if (error)
+                goto error_icur;
+        /*
+         * Both trees have now been updated. We must update the perag and
+         * superblock before we can check the freecount for each btree.
+         */
+        be32_add_cpu(&agi->agi_freecount, -1);
+        xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+        pag->pagi_freecount--;
+        xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+        error = xfs_check_agi_freecount(icur, agi);
+        if (error)
+                goto error_icur;
+        error = xfs_check_agi_freecount(cur, agi);
+        if (error)
+                goto error_icur;        /* falls through to error_cur too */
+        xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
+        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+        xfs_perag_put(pag);
+        *inop = ino;
+        return 0;
+error_icur:
+        xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
+error_cur:
+        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+        xfs_perag_put(pag);
+        return error;
+}
+/*
+ * Allocate an inode on disk.
+ *
+ * Mode is used to tell whether the new inode will need space, and whether it
+ * is a directory.
+ *
+ * This function is designed to be called twice if it has to do an allocation
+ * to make more free inodes.  On the first call, *IO_agbp should be set to NULL.
+ * If an inode is available without having to perform an allocation, an inode
+ * number is returned.  In this case, *IO_agbp is set to NULL.  If an allocation
+ * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
+ * The caller should then commit the current transaction, allocate a
+ * new transaction, and call xfs_dialloc() again, passing in the previous value
+ * of *IO_agbp.  IO_agbp should be held across the transactions. Since the AGI
+ * buffer is locked across the two calls, the second call is guaranteed to have
+ * a free inode available.
+ *
+ * Once we successfully pick an inode its number is returned and the on-disk
+ * data structures are updated.  The inode itself is not read in, since doing so
+ * would break ordering constraints with xfs_reclaim.
+ *
+ * okalloc says whether new inodes may be allocated when an AG has no free
+ * ones; it is cleared internally once the inode count ceiling is reached.
+ *
+ * Return values, as implemented below:
+ *  - 0 with *inop set by xfs_dialloc_ag() when an AG with free inodes is found
+ *    or *IO_agbp was passed in;
+ *  - 0 with *inop == NULLFSINO when no AG could be selected, when
+ *    xfs_ialloc_ag_alloc() returned -ENOSPC, when new inodes were allocated
+ *    (*IO_agbp is then set and the caller must call again), or when every AG
+ *    was scanned without success and the ceiling was not hit;
+ *  - -ENOSPC with *inop == NULLFSINO when every AG was scanned without success
+ *    and the ceiling was hit;
+ *  - any other error from the helpers called here, in which case *inop is not
+ *    written by this function.
+ */
+int
+xfs_dialloc(
+        struct xfs_trans        *tp,
+        xfs_ino_t               parent,
+        umode_t                 mode,
+        int                     okalloc,
+        struct xfs_buf          **IO_agbp,
+        xfs_ino_t               *inop)
+{
+        struct xfs_mount        *mp = tp->t_mountp;
+        struct xfs_buf          *agbp;
+        xfs_agnumber_t          agno;
+        int                     error;
+        int                     ialloced;       /* set by xfs_ialloc_ag_alloc */
+        int                     noroom = 0;     /* inode count ceiling hit */
+        xfs_agnumber_t          start_agno;     /* AG the scan started at */
+        struct xfs_perag        *pag;
+        if (*IO_agbp) {
+                /*
+                 * If the caller passes in a pointer to the AGI buffer,
+                 * continue where we left off before.  In this case, we
+                 * know that the allocation group has free inodes.
+                 */
+                agbp = *IO_agbp;
+                goto out_alloc;
+        }
+        /*
+         * We do not have an agbp, so select an initial allocation
+         * group for inode allocation.
+         */
+        start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
+        if (start_agno == NULLAGNUMBER) {
+                *inop = NULLFSINO;
+                return 0;
+        }
+        /*
+         * If we have already hit the ceiling of inode blocks then clear
+         * okalloc so we scan all available agi structures for a free
+         * inode.
+         */
+        if (mp->m_maxicount &&
+            mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
+                noroom = 1;
+                okalloc = 0;
+        }
+        /*
+         * Loop until we find an allocation group that either has free inodes
+         * or in which we can allocate some inodes.  Iterate through the
+         * allocation groups upward, wrapping at the end.
+         *
+         * Each iteration takes a perag reference that is dropped on every
+         * way out of the iteration (nextag, out_error or an explicit put).
+         */
+        agno = start_agno;
+        for (;;) {
+                pag = xfs_perag_get(mp, agno);
+                if (!pag->pagi_inodeok) {
+                        xfs_ialloc_next_ag(mp);
+                        goto nextag;
+                }
+                if (!pag->pagi_init) {
+                        error = xfs_ialloc_pagi_init(mp, tp, agno);
+                        if (error)
+                                goto out_error;
+                }
+                /*
+                 * Do a first racy fast path check if this AG is usable.
+                 */
+                if (!pag->pagi_freecount && !okalloc)
+                        goto nextag;
+                /*
+                 * Then read in the AGI buffer and recheck with the AGI buffer
+                 * lock held.
+                 */
+                error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+                if (error)
+                        goto out_error;
+                if (pag->pagi_freecount) {
+                        xfs_perag_put(pag);
+                        goto out_alloc;
+                }
+                if (!okalloc)
+                        goto nextag_relse_buffer;
+                error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
+                if (error) {
+                        xfs_trans_brelse(tp, agbp);
+                        if (error != -ENOSPC)
+                                goto out_error;
+                        xfs_perag_put(pag);     /* -ENOSPC: report "no inode" */
+                        *inop = NULLFSINO;
+                        return 0;
+                }
+                if (ialloced) {
+                        /*
+                         * We successfully allocated some inodes, return
+                         * the current context to the caller so that it
+                         * can commit the current transaction and call
+                         * us again where we left off.
+                         */
+                        ASSERT(pag->pagi_freecount > 0);
+                        xfs_perag_put(pag);
+                        *IO_agbp = agbp;
+                        *inop = NULLFSINO;
+                        return 0;
+                }
+nextag_relse_buffer:
+                xfs_trans_brelse(tp, agbp);
+nextag:
+                xfs_perag_put(pag);
+                if (++agno == mp->m_sb.sb_agcount)
+                        agno = 0;       /* wrap around to the first AG */
+                if (agno == start_agno) {       /* every AG has been tried */
+                        *inop = NULLFSINO;
+                        return noroom ? -ENOSPC : 0;
+                }
+        }
+out_alloc:
+        *IO_agbp = NULL;
+        return xfs_dialloc_ag(tp, agbp, parent, inop);
+out_error:
+        xfs_perag_put(pag);
+        return error;
+}
+STATIC int      /*
+ * Free the inode agino in the inode allocation btree (inobt).
+ *
+ * The record covering agino is looked up with a LE search, the inode's bit is
+ * set in its free mask and its free count is incremented. If that leaves
+ * mp->m_ialloc_inos inodes free in the record and XFS_MOUNT_IKEEP is not set,
+ * the record is deleted from the btree and the chunk's blocks are queued on
+ * flist via xfs_bmap_add_free(); otherwise the record is updated in place.
+ * The AGI, perag and superblock counters are adjusted to match.
+ *
+ * agbp is the AGI buffer. *deleted is set to 1 when the record was removed
+ * and to 0 otherwise; *first_ino is written only in the removal case. On
+ * success *orec receives a copy of the record as modified here.
+ *
+ * Returns 0 on success or an error; the cursor is deleted in both cases and
+ * *orec is not written on error.
+ */
+xfs_difree_inobt(
+        struct xfs_mount                *mp,
+        struct xfs_trans                *tp,
+        struct xfs_buf                  *agbp,
+        xfs_agino_t                     agino,
+        struct xfs_bmap_free            *flist,
+        int                             *deleted,
+        xfs_ino_t                       *first_ino,
+        struct xfs_inobt_rec_incore     *orec)
+{
+        struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
+        xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
+        struct xfs_perag                *pag;
+        struct xfs_btree_cur            *cur;
+        struct xfs_inobt_rec_incore     rec;
+        int                             ilen;   /* inodes per allocation */
+        int                             error;
+        int                             i;
+        int                             off;    /* index of agino in record */
+        ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+        ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
+        /*
+         * Initialize the cursor.
+         */
+        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+        error = xfs_check_agi_freecount(cur, agi);
+        if (error)
+                goto error0;
+        /*
+         * Look for the entry describing this inode.
+         */
+        if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
+                xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
+                        __func__, error);
+                goto error0;
+        }
+        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+        error = xfs_inobt_get_rec(cur, &rec, &i);
+        if (error) {
+                xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
+                        __func__, error);
+                goto error0;
+        }
+        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+        /*
+         * Get the offset in the inode chunk.
+         */
+        off = agino - rec.ir_startino;
+        ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
+        ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
+        /*
+         * Mark the inode free & increment the count.
+         */
+        rec.ir_free |= XFS_INOBT_MASK(off);
+        rec.ir_freecount++;
+        /*
+         * When an inode cluster is free, it becomes eligible for removal
+         * (unless XFS_MOUNT_IKEEP is set in mp->m_flags).
+         */
+        if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
+            (rec.ir_freecount == mp->m_ialloc_inos)) {
+                *deleted = 1;
+                *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+                /*
+                 * Remove the inode cluster from the AGI B+Tree, adjust the
+                 * AGI and Superblock inode counts, and mark the disk space
+                 * to be freed when the transaction is committed.
+                 *
+                 * The free counts drop by ilen - 1 rather than ilen because
+                 * the inode being freed by this call was never added to them.
+                 */
+                ilen = mp->m_ialloc_inos;
+                be32_add_cpu(&agi->agi_count, -ilen);
+                be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
+                xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
+                pag = xfs_perag_get(mp, agno);
+                pag->pagi_freecount -= ilen - 1;
+                xfs_perag_put(pag);
+                xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
+                xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
+                if ((error = xfs_btree_delete(cur, &i))) {
+                        xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
+                                __func__, error);
+                        goto error0;
+                }
+                xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+                                  XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
+                                  mp->m_ialloc_blks, flist, mp);
+        } else {
+                *deleted = 0;
+                error = xfs_inobt_update(cur, &rec);
+                if (error) {
+                        xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
+                                __func__, error);
+                        goto error0;
+                }
+                /*
+                 * Change the inode free counts and log the ag/sb changes.
+                 */
+                be32_add_cpu(&agi->agi_freecount, 1);
+                xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+                pag = xfs_perag_get(mp, agno);
+                pag->pagi_freecount++;
+                xfs_perag_put(pag);
+                xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
+        }
+        error = xfs_check_agi_freecount(cur, agi);
+        if (error)
+                goto error0;
+        *orec = rec;    /* hand the modified record back to the caller */
+        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+        return 0;
+error0:
+        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+        return error;
+}
+/*
+ * Free an inode in the free inode btree.
+ *
+ * agbp is the AGI buffer, agino the AG-relative number of the inode being
+ * freed and ibtrec the inobt record for its chunk. ibtrec is compared against
+ * the finobt record after the free has been applied to the latter, so it is
+ * expected to already reflect the free (xfs_difree() passes the record
+ * returned by xfs_difree_inobt()).
+ *
+ * If the chunk has no finobt record yet, one is inserted from ibtrec.
+ * Otherwise the existing record is modified independently, cross-checked
+ * against ibtrec and then either updated or deleted.
+ *
+ * Returns 0 on success or an error; the cursor is deleted in both cases.
+ */
+STATIC int
+xfs_difree_finobt(
+        struct xfs_mount                *mp,
+        struct xfs_trans                *tp,
+        struct xfs_buf                  *agbp,
+        xfs_agino_t                     agino,
+        struct xfs_inobt_rec_incore     *ibtrec) /* inobt record */
+{
+        struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
+        xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
+        struct xfs_btree_cur            *cur;   /* finobt cursor */
+        struct xfs_inobt_rec_incore     rec;    /* finobt record */
+        int                             offset = agino - ibtrec->ir_startino;
+        int                             error;
+        int                             i;
+        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+        error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
+        if (error)
+                goto error;
+        if (i == 0) {
+                /*
+                 * If the record does not exist in the finobt, we must have just
+                 * freed an inode in a previously fully allocated chunk. If not,
+                 * something is out of sync.
+                 */
+                XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
+                error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+                                             ibtrec->ir_free, &i);
+                if (error)
+                        goto error;
+                ASSERT(i == 1);
+                goto out;
+        }
+        /*
+         * Read and update the existing record. We could just copy the ibtrec
+         * across here, but that would defeat the purpose of having redundant
+         * metadata. By making the modifications independently, we can catch
+         * corruptions that we wouldn't see if we just copied from one record
+         * to another.
+         */
+        error = xfs_inobt_get_rec(cur, &rec, &i);
+        if (error)
+                goto error;
+        XFS_WANT_CORRUPTED_GOTO(i == 1, error);
+        rec.ir_free |= XFS_INOBT_MASK(offset);
+        rec.ir_freecount++;
+        XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) &&
+                                (rec.ir_freecount == ibtrec->ir_freecount),
+                                error);
+        /*
+         * The content of inobt records should always match between the inobt
+         * and finobt. The lifecycle of records in the finobt is different from
+         * the inobt in that the finobt only tracks records with at least one
+         * free inode. Hence, if all of the inodes are free and we aren't
+         * keeping inode chunks permanently on disk, remove the record.
+         * Otherwise, update the record with the new information.
+         */
+        if (rec.ir_freecount == mp->m_ialloc_inos &&
+            !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+                error = xfs_btree_delete(cur, &i);
+                if (error)
+                        goto error;
+                ASSERT(i == 1);
+        } else {
+                error = xfs_inobt_update(cur, &rec);
+                if (error)
+                        goto error;
+        }
+out:
+        error = xfs_check_agi_freecount(cur, agi);
+        if (error)
+                goto error;
+        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+        return 0;
+error:
+        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+        return error;
+}
+/*
+ * Free an on-disk inode.
+ *
+ * Only the inode btrees are touched: the inode is marked free in the free
+ * mask of its chunk record. The on-disk inode itself is left unchanged and
+ * the incore inode is never looked at; any incore work is up to the caller.
+ *
+ * Returns 0 on success, -EINVAL if the inode number does not decompose into
+ * a valid AG number / AG inode / AG block, or the error from reading the AGI
+ * or from updating either btree.
+ */
+int
+xfs_difree(
+        struct xfs_trans        *tp,            /* transaction pointer */
+        xfs_ino_t               inode,          /* inode to be freed */
+        struct xfs_bmap_free    *flist,         /* extents to free */
+        int                     *deleted,/* set if inode cluster was deleted */
+        xfs_ino_t               *first_ino)/* first inode in deleted cluster */
+{
+        struct xfs_mount        *mp = tp->t_mountp;
+        struct xfs_inobt_rec_incore rec;        /* updated inobt record */
+        struct xfs_buf          *agbp;          /* AGI buffer */
+        xfs_agnumber_t          agno;           /* allocation group number */
+        xfs_agino_t             agino;          /* inode number within the AG */
+        xfs_agblock_t           agbno;          /* AG block holding the inode */
+        int                     error;
+        /* The AG number must name an existing allocation group. */
+        agno = XFS_INO_TO_AGNO(mp, inode);
+        if (agno >= mp->m_sb.sb_agcount)  {
+                xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
+                        __func__, agno, mp->m_sb.sb_agcount);
+                ASSERT(0);
+                return -EINVAL;
+        }
+        /* Splitting and recombining the number must give it back unchanged. */
+        agino = XFS_INO_TO_AGINO(mp, inode);
+        if (inode != XFS_AGINO_TO_INO(mp, agno, agino))  {
+                xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+                        __func__, (unsigned long long)inode,
+                        (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
+                ASSERT(0);
+                return -EINVAL;
+        }
+        /* The block holding the inode must lie inside the AG. */
+        agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+        if (agbno >= mp->m_sb.sb_agblocks)  {
+                xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
+                        __func__, agbno, mp->m_sb.sb_agblocks);
+                ASSERT(0);
+                return -EINVAL;
+        }
+        /* Read the AGI header of that allocation group. */
+        error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+        if (error) {
+                xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
+                        __func__, error);
+                return error;
+        }
+        /* Update the inobt first; it hands back the modified record. */
+        error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
+                                 &rec);
+        if (error)
+                return error;
+        /* Without a free inode btree there is nothing more to do. */
+        if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+                return 0;
+        /* Bring the finobt in line with the record from the inobt. */
+        return xfs_difree_finobt(mp, tp, agbp, agino, &rec);
+}
+/*
+ * Look up the inobt record covering agino and report where its chunk starts.
+ *
+ * The AGI for agno is read, the inobt is searched (LE) for agino, and the
+ * record found must span agino (ir_startino <= agino < ir_startino +
+ * mp->m_ialloc_inos). When XFS_IGET_UNTRUSTED is set in flags the inode must
+ * also not be marked free in the record.
+ *
+ * On success *chunk_agbno is the AG block of the record's first inode and
+ * *offset_agbno is agbno minus that block, and 0 is returned. -EINVAL is
+ * returned when no suitable record exists or the untrusted inode is free;
+ * errors from reading the AGI or from the btree calls are passed through.
+ */
+STATIC int
+xfs_imap_lookup(
+        struct xfs_mount        *mp,
+        struct xfs_trans        *tp,
+        xfs_agnumber_t          agno,
+        xfs_agino_t             agino,
+        xfs_agblock_t           agbno,
+        xfs_agblock_t           *chunk_agbno,
+        xfs_agblock_t           *offset_agbno,
+        int                     flags)
+{
+        struct xfs_inobt_rec_incore rec;
+        struct xfs_btree_cur    *cur;
+        struct xfs_buf          *agbp;
+        int                     error;
+        int                     i;
+        error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+        if (error) {
+                xfs_alert(mp,
+                        "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
+                        __func__, error, agno);
+                return error;
+        }
+        /*
+         * Lookup the inode record for the given agino. If the record cannot be
+         * found, then it's an invalid inode number and we should abort. Once
+         * we have a record, we need to ensure it contains the inode number
+         * we are looking up.
+         */
+        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+        error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
+        if (!error) {
+                if (i)
+                        error = xfs_inobt_get_rec(cur, &rec, &i);
+                if (!error && i == 0)
+                        error = -EINVAL;
+        }
+        xfs_trans_brelse(tp, agbp);
+        /*
+         * Tear the cursor down in error mode when the lookup failed, as every
+         * other error path in this file does, instead of always claiming
+         * success.
+         */
+        xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+        if (error)
+                return error;
+        /* check that the returned record contains the required inode */
+        if (rec.ir_startino > agino ||
+            rec.ir_startino + mp->m_ialloc_inos <= agino)
+                return -EINVAL;
+        /* for untrusted inodes check it is allocated first */
+        if ((flags & XFS_IGET_UNTRUSTED) &&
+            (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
+                return -EINVAL;
+        *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
+        *offset_agbno = agbno - *chunk_agbno;
+        return 0;
+}
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ *
+ * On success im_blkno, im_len and im_boffset of *imap are filled in and 0 is
+ * returned.  -EINVAL is returned when the inode number fails the range and
+ * round-trip checks below, or when the computed buffer extends beyond
+ * sb_dblocks.  An error from xfs_imap_lookup() is passed back unchanged.
+ */
+int
+xfs_imap(
+        xfs_mount_t      *mp,   /* file system mount structure */
+        xfs_trans_t      *tp,   /* transaction pointer */
+        xfs_ino_t       ino,    /* inode to locate */
+        struct xfs_imap *imap,  /* location map structure */
+        uint            flags)  /* flags for inode btree lookup */
+{
+        xfs_agblock_t   agbno;  /* block number of inode in the alloc group */
+        xfs_agino_t     agino;  /* inode number within alloc group */
+        xfs_agnumber_t  agno;   /* allocation group number */
+        int             blks_per_cluster; /* num blocks per inode cluster */
+        xfs_agblock_t   chunk_agbno;    /* first block in inode chunk */
+        xfs_agblock_t   cluster_agbno;  /* first block in inode cluster */
+        int             error;  /* error code */
+        int             offset; /* index of inode in its buffer */
+        xfs_agblock_t   offset_agbno;   /* blks from chunk start to inode */
+        ASSERT(ino != NULLFSINO);
+        /*
+         * Split up the inode number into its parts.
+         */
+        agno = XFS_INO_TO_AGNO(mp, ino);
+        agino = XFS_INO_TO_AGINO(mp, ino);
+        agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+        /*
+         * Reject inode numbers whose AG or AG block is out of range, or that
+         * do not survive being rebuilt from their (agno, agino) parts.
+         */
+        if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
+            ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+#ifdef DEBUG
+                /*
+                 * Don't output diagnostic information for untrusted inodes
+                 * as they can be invalid without implying corruption.
+                 */
+                if (flags & XFS_IGET_UNTRUSTED)
+                        return -EINVAL;
+                if (agno >= mp->m_sb.sb_agcount) {
+                        xfs_alert(mp,
+                                "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
+                                __func__, agno, mp->m_sb.sb_agcount);
+                }
+                if (agbno >= mp->m_sb.sb_agblocks) {
+                        xfs_alert(mp,
+                "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
+                                __func__, (unsigned long long)agbno,
+                                (unsigned long)mp->m_sb.sb_agblocks);
+                }
+                if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+                        xfs_alert(mp,
+                "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
+                                __func__, ino,
+                                XFS_AGINO_TO_INO(mp, agno, agino));
+                }
+                xfs_stack_trace();
+#endif /* DEBUG */
+                /* Non-DEBUG builds fail the lookup without any diagnostics. */
+                return -EINVAL;
+        }
+        blks_per_cluster = xfs_icluster_size_fsb(mp);
+        /*
+         * For bulkstat and handle lookups, we have an untrusted inode number
+         * that we have to verify is valid. We cannot do this just by reading
+         * the inode buffer as it may have been unlinked and removed leaving
+         * inodes in stale state on disk. Hence we have to do a btree lookup
+         * in all cases where an untrusted inode number is passed.
+         */
+        if (flags & XFS_IGET_UNTRUSTED) {
+                error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+                                        &chunk_agbno, &offset_agbno, flags);
+                if (error)
+                        return error;
+                goto out_map;
+        }
+        /*
+         * If the inode cluster size is the same as the blocksize or
+         * smaller we get to the buffer by simple arithmetic.
+         */
+        if (blks_per_cluster == 1) {
+                offset = XFS_INO_TO_OFFSET(mp, ino);
+                ASSERT(offset < mp->m_sb.sb_inopblock);
+                imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+                imap->im_len = XFS_FSB_TO_BB(mp, 1);
+                imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+                return 0;
+        }
+        /*
+         * If the inode chunks are aligned then use simple maths to
+         * find the location. Otherwise we have to do a btree
+         * lookup to find the location.
+         */
+        if (mp->m_inoalign_mask) {
+                offset_agbno = agbno & mp->m_inoalign_mask;
+                chunk_agbno = agbno - offset_agbno;
+        } else {
+                error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+                                        &chunk_agbno, &offset_agbno, flags);
+                if (error)
+                        return error;
+        }
+out_map:
+        ASSERT(agbno >= chunk_agbno);
+        /*
+         * Round the block offset within the chunk down to a cluster boundary
+         * to find the first block of the cluster, then work out the index of
+         * the inode within that multi-block cluster buffer.
+         */
+        cluster_agbno = chunk_agbno +
+                ((offset_agbno / blks_per_cluster) * blks_per_cluster);
+        offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
+                XFS_INO_TO_OFFSET(mp, ino);
+        imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
+        imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+        imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+        /*
+         * If the inode number maps to a block outside the bounds
+         * of the file system then return -EINVAL rather than calling
+         * read_buf and panicking when we get an error from the
+         * driver.
+         */
+        if ((imap->im_blkno + imap->im_len) >
+            XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+                xfs_alert(mp,
+        "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
+                        __func__, (unsigned long long) imap->im_blkno,
+                        (unsigned long long) imap->im_len,
+                        XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+                return -EINVAL;
+        }
+        return 0;
+}
+/*
+ * Compute and fill in value of m_in_maxlevels.
+ *
+ * The worst case tree holds one leaf record per inode chunk addressable by
+ * an AG-relative inode number, with every block filled only to its minimum.
+ * The height is found by repeatedly dividing the block count (rounding up)
+ * by the minimum records per node block until one root block remains.
+ *
+ * The minimum record counts come from m_inobt_mnr, the same table that
+ * xfs_inobt_get_minrecs() reports for this btree, rather than from the free
+ * space btree's m_alloc_mnr.
+ */
+void
+xfs_ialloc_compute_maxlevels(
+        xfs_mount_t     *mp)            /* file system mount structure */
+{
+        int             level;          /* tree height computed so far */
+        uint            maxblocks;      /* blocks at the current level */
+        uint            maxleafents;    /* max number of leaf records */
+        int             minleafrecs;    /* min records in a leaf block */
+        int             minnoderecs;    /* min records in a node block */
+        /* One leaf record covers one chunk of inodes. */
+        maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
+                XFS_INODES_PER_CHUNK_LOG;
+        minleafrecs = mp->m_inobt_mnr[0];
+        minnoderecs = mp->m_inobt_mnr[1];
+        maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+        for (level = 1; maxblocks > 1; level++)
+                maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+        mp->m_in_maxlevels = level;
+}
+/*
+ * Log specified fields for the ag hdr (inode section).
+ *
+ * The AGI has grown over time, and the unlinked hash table sits in the middle
+ * of it.  Logging one field before agi_unlinked together with one field after
+ * it as a single byte range would drag the whole hash table into the log and
+ * use an excessive amount of log space.  The buffer is therefore treated as
+ * two logical regions: everything up through agi_unlinked, and everything
+ * after it.  Each region is logged with its own call.
+ */
+void
+xfs_ialloc_log_agi(
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_buf_t       *bp,            /* allocation group header buffer */
+        int             fields)         /* bitmask of fields to log */
+{
+        /* Field starting offsets; keep in sync with the bit definitions. */
+        static const short      offsets[] = {
+                offsetof(xfs_agi_t, agi_magicnum),
+                offsetof(xfs_agi_t, agi_versionnum),
+                offsetof(xfs_agi_t, agi_seqno),
+                offsetof(xfs_agi_t, agi_length),
+                offsetof(xfs_agi_t, agi_count),
+                offsetof(xfs_agi_t, agi_root),
+                offsetof(xfs_agi_t, agi_level),
+                offsetof(xfs_agi_t, agi_freecount),
+                offsetof(xfs_agi_t, agi_newino),
+                offsetof(xfs_agi_t, agi_dirino),
+                offsetof(xfs_agi_t, agi_unlinked),
+                offsetof(xfs_agi_t, agi_free_root),
+                offsetof(xfs_agi_t, agi_free_level),
+                sizeof(xfs_agi_t)
+        };
+        int                     region2;        /* bits past the first region */
+        int                     first;          /* first byte number */
+        int                     last;           /* last byte number */
+#ifdef DEBUG
+        xfs_agi_t               *agi = XFS_BUF_TO_AGI(bp);
+        ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+#endif
+        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
+        /*
+         * First region: find the byte range spanned by the requested fields
+         * and log it.  This only logs up through agi_unlinked.
+         */
+        if (fields & XFS_AGI_ALL_BITS_R1) {
+                xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
+                                  &first, &last);
+                xfs_trans_log_buf(tp, bp, first, last);
+        }
+        /*
+         * Second region: drop the first-region bits and, if anything is left,
+         * log the byte range covering the remaining fields.
+         */
+        region2 = fields & ~XFS_AGI_ALL_BITS_R1;
+        if (!region2)
+                return;
+        xfs_btree_offsets(region2, offsets, XFS_AGI_NUM_BITS_R2,
+                          &first, &last);
+        xfs_trans_log_buf(tp, bp, first, last);
+}
+#ifdef DEBUG
+/*
+ * Debug-only sanity check: assert that every bucket of the AGI unlinked
+ * table holds a non-zero value.  Compiles away to nothing without DEBUG.
+ */
+STATIC void
+xfs_check_agi_unlinked(
+        struct xfs_agi          *agi)
+{
+        int                     i = 0;
+        while (i < XFS_AGI_UNLINKED_BUCKETS) {
+                ASSERT(agi->agi_unlinked[i]);
+                i++;
+        }
+}
+#else
+#define xfs_check_agi_unlinked(agi)
+#endif
+/*
+ * Structural checks on an AGI buffer, shared by the read and write
+ * verifiers.  Returns true when the header looks sane, false otherwise.
+ */
+static bool
+xfs_agi_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_agi  *agi = XFS_BUF_TO_AGI(bp);
+        struct xfs_mount *mp = bp->b_target->bt_mount;
+        /* The uuid is only checked on CRC-enabled filesystems. */
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
+                        return false;
+        }
+        /* Validate the magic number and version of the agi block. */
+        if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC) ||
+            !XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
+                return false;
+        /*
+         * During growfs operations, the perag is not fully initialised,
+         * so we can't use it for any useful checking. growfs ensures we can't
+         * use it by using uncached buffers that don't have the perag attached
+         * so we can detect and avoid this problem.
+         */
+        if (bp->b_pag) {
+                if (be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
+                        return false;
+        }
+        xfs_check_agi_unlinked(agi);
+        return true;
+}
+/*
+ * Read verifier for AGI buffers.  On CRC filesystems a checksum mismatch
+ * sets -EFSBADCRC; otherwise a failed structural check (or an injected
+ * error via XFS_TEST_ERROR) sets -EFSCORRUPTED.  Any error present on the
+ * buffer afterwards is reported through xfs_verifier_error().
+ */
+static void
+xfs_agi_read_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount *mp = bp->b_target->bt_mount;
+        bool            bad_crc;
+        bad_crc = xfs_sb_version_hascrc(&mp->m_sb) &&
+                  !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF);
+        if (bad_crc) {
+                xfs_buf_ioerror(bp, -EFSBADCRC);
+        } else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
+                                  XFS_ERRTAG_IALLOC_READ_AGI,
+                                  XFS_RANDOM_IALLOC_READ_AGI)) {
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        }
+        if (!bp->b_error)
+                return;
+        xfs_verifier_error(bp);
+}
+/*
+ * Write verifier for AGI buffers.  A buffer that fails the structural
+ * checks is marked -EFSCORRUPTED and reported.  On CRC filesystems the LSN
+ * of the attached buffer log item (if any) is stamped into agi_lsn and the
+ * checksum is recomputed.
+ */
+static void
+xfs_agi_write_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_buf_log_item *bip = bp->b_fspriv;
+        struct xfs_mount *mp = bp->b_target->bt_mount;
+        if (!xfs_agi_verify(bp)) {
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                xfs_verifier_error(bp);
+                return;
+        }
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                if (bip)
+                        XFS_BUF_TO_AGI(bp)->agi_lsn =
+                                cpu_to_be64(bip->bli_item.li_lsn);
+                xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
+        }
+}
+/*
+ * Verifier hooks for AGI buffers; xfs_read_agi() passes this to
+ * xfs_trans_read_buf() when reading the AGI.
+ */
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+        .verify_read = xfs_agi_read_verify,
+        .verify_write = xfs_agi_write_verify,
+};
+/*
+ * Read in the allocation group header (inode allocation section).
+ *
+ * The AGI of group agno is read through xfs_trans_read_buf() with the AGI
+ * verifiers attached.  On success the buffer is returned in *bpp with its
+ * reference hint set to XFS_AGI_REF and 0 is returned; otherwise the read
+ * error is returned.
+ */
+int
+xfs_read_agi(
+        struct xfs_mount        *mp,    /* file system mount structure */
+        struct xfs_trans        *tp,    /* transaction pointer */
+        xfs_agnumber_t          agno,   /* allocation group number */
+        struct xfs_buf          **bpp)  /* allocation group hdr buf */
+{
+        int                     error;
+        trace_xfs_read_agi(mp, agno);
+        ASSERT(agno != NULLAGNUMBER);
+        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+                        XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+                        XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
+        if (!error)
+                xfs_buf_set_ref(*bpp, XFS_AGI_REF);
+        return error;
+}
+/*
+ * Read the AGI for group agno and, on first use, seed the in-core per-AG
+ * inode counters (pagi_count, pagi_freecount) from it.  Returns 0 with the
+ * buffer in *bpp, or the error from xfs_read_agi().
+ */
+int
+xfs_ialloc_read_agi(
+        struct xfs_mount        *mp,    /* file system mount structure */
+        struct xfs_trans        *tp,    /* transaction pointer */
+        xfs_agnumber_t          agno,   /* allocation group number */
+        struct xfs_buf          **bpp)  /* allocation group hdr buf */
+{
+        struct xfs_perag        *pag;   /* per allocation group data */
+        struct xfs_agi          *agi;   /* allocation group header */
+        int                     error;
+        trace_xfs_ialloc_read_agi(mp, agno);
+        error = xfs_read_agi(mp, tp, agno, bpp);
+        if (error)
+                return error;
+        pag = xfs_perag_get(mp, agno);
+        agi = XFS_BUF_TO_AGI(*bpp);
+        if (pag->pagi_init == 0) {
+                /* pagi_init is set last, after both counters are filled in. */
+                pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
+                pag->pagi_count = be32_to_cpu(agi->agi_count);
+                pag->pagi_init = 1;
+        }
+        /*
+         * The in-core and on-disk free counts can legitimately disagree
+         * while a forced shutdown is in progress.
+         */
+        ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
+                XFS_FORCED_SHUTDOWN(mp));
+        xfs_perag_put(pag);
+        return 0;
+}
+/*
+ * Read in the agi to initialise the per-ag data in the mount structure.
+ * The AGI buffer is only needed for its side effect on the perag, so it is
+ * released again before returning.
+ */
+int
+xfs_ialloc_pagi_init(
+        xfs_mount_t     *mp,            /* file system mount structure */
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_agnumber_t  agno)           /* allocation group number */
+{
+        xfs_buf_t       *agibp = NULL;
+        int             error;
+        error = xfs_ialloc_read_agi(mp, tp, agno, &agibp);
+        if (!error && agibp)
+                xfs_trans_brelse(tp, agibp);
+        return error;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
new file mode 100644
index 000000000000..95ad1c002d60
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_IALLOC_H__
+#define __XFS_IALLOC_H__
+struct xfs_buf;
+struct xfs_dinode;
+struct xfs_imap;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_btree_cur;
+/* Move inodes in clusters of this size */
+#define XFS_INODE_BIG_CLUSTER_SIZE      8192
+/*
+ * Calculate and return the number of filesystem blocks per inode cluster.
+ * A cluster that is no larger than one filesystem block occupies one block.
+ */
+static inline int
+xfs_icluster_size_fsb(
+        struct xfs_mount        *mp)
+{
+        return (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) ?
+                1 : mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
+}
+/*
+ * Make an inode pointer out of the buffer/offset: o is an inode index
+ * within buffer b, scaled to a byte offset by the on-disk inode size
+ * (1 << sb_inodelog).
+ */
+static inline struct xfs_dinode *
+xfs_make_iptr(
+        struct xfs_mount        *mp,
+        struct xfs_buf          *b,
+        int                     o)
+{
+        return (struct xfs_dinode *)xfs_buf_offset(b,
+                                        o << mp->m_sb.sb_inodelog);
+}
+/*
+ * Allocate an inode on disk.
+ * Mode is used to tell whether the new inode will need space, and whether
+ * it is a directory.
+ *
+ * To work within the constraint of one allocation per transaction,
+ * xfs_dialloc() is designed to be called twice if it has to do an
+ * allocation to make more free inodes.  If an inode is
+ * available without an allocation, agbp would be set to the current
+ * agbp and alloc_done set to false.
+ * If an allocation needed to be done, agbp would be set to the
+ * inode header of the allocation group and alloc_done set to true.
+ * The caller should then commit the current transaction and allocate a new
+ * transaction.  xfs_dialloc() should then be called again with
+ * the agbp value returned from the previous call.
+ *
+ * Once we successfully pick an inode its number is returned and the
+ * on-disk data structures are updated.  The inode itself is not read
+ * in, since doing so would break ordering constraints with xfs_reclaim.
+ *
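+ * *agbp should be set to NULL on the first call, *alloc_done set to FALSE.
+ *
+ * NOTE: the prototype below takes no alloc_done parameter, so the references
+ * to alloc_done in this comment appear to describe an older interface.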
+ */
+int                                     /* error */
+xfs_dialloc(
+        struct xfs_trans *tp,           /* transaction pointer */
+        xfs_ino_t       parent,         /* parent inode (directory) */
+        umode_t         mode,           /* mode bits for new inode */
+        int             okalloc,        /* ok to allocate more space */
+        struct xfs_buf  **agbp,         /* buf for a.g. inode header */
+        xfs_ino_t       *inop);         /* inode number allocated */
+/*
+ * Free disk inode.  Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int                                     /* error */
+xfs_difree(
+        struct xfs_trans *tp,           /* transaction pointer */
+        xfs_ino_t       inode,          /* inode to be freed */
+        struct xfs_bmap_free *flist,    /* extents to free */
+        int             *deleted,       /* set if inode cluster was deleted */
+        xfs_ino_t       *first_ino);    /* first inode in deleted cluster */
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ */
+int
+xfs_imap(
+        struct xfs_mount *mp,           /* file system mount structure */
+        struct xfs_trans *tp,           /* transaction pointer */
+        xfs_ino_t       ino,            /* inode to locate */
+        struct xfs_imap *imap,          /* location map structure */
+        uint            flags);         /* flags for inode btree lookup */
+/*
+ * Compute and fill in value of m_in_maxlevels.
+ */
+void
+xfs_ialloc_compute_maxlevels(
+        struct xfs_mount *mp);          /* file system mount structure */
+/*
+ * Log specified fields for the ag hdr (inode section)
+ */
+void
+xfs_ialloc_log_agi(
+        struct xfs_trans *tp,           /* transaction pointer */
+        struct xfs_buf  *bp,            /* allocation group header buffer */
+        int             fields);        /* bitmask of fields to log */
+/*
+ * Read in the allocation group header (inode allocation section)
+ */
+int                                     /* error */
+xfs_ialloc_read_agi(
+        struct xfs_mount *mp,           /* file system mount structure */
+        struct xfs_trans *tp,           /* transaction pointer */
+        xfs_agnumber_t  agno,           /* allocation group number */
+        struct xfs_buf  **bpp);         /* allocation group hdr buf */
+/*
+ * Read in the allocation group header to initialise the per-ag data
+ * in the mount structure
+ */
+int
+xfs_ialloc_pagi_init(
+        struct xfs_mount *mp,           /* file system mount structure */
+        struct xfs_trans *tp,           /* transaction pointer */
+        xfs_agnumber_t  agno);          /* allocation group number */
+/*
+ * Lookup a record by ino in the btree given by cur.
+ */
+int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
+                xfs_lookup_t dir, int *stat);
+/*
+ * Get the data from the pointed-to record.
+ */
+int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+                xfs_inobt_rec_incore_t *rec, int *stat);
+/*
+ * Inode chunk initialisation routine
+ */
+int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
+                          struct list_head *buffer_list,
+                          xfs_agnumber_t agno, xfs_agblock_t agbno,
+                          xfs_agblock_t length, unsigned int gen);
+#endif  /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
new file mode 100644
index 000000000000..c9b06f30fe86
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+/*
+ * Minimum number of records in a block at the given level: entry 0 of
+ * m_inobt_mnr applies to level 0, entry 1 to every other level.
+ */
+STATIC int
+xfs_inobt_get_minrecs(
+        struct xfs_btree_cur    *cur,
+        int                     level)
+{
+        struct xfs_mount        *mp = cur->bc_mp;
+        if (level == 0)
+                return mp->m_inobt_mnr[0];
+        return mp->m_inobt_mnr[1];
+}
+/*
+ * Create a new cursor over the same btree (same mount, transaction, AGI
+ * buffer, AG number and btree type) as an existing one.
+ */
+STATIC struct xfs_btree_cur *
+xfs_inobt_dup_cursor(
+        struct xfs_btree_cur    *cur)
+{
+        struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+        xfs_agnumber_t          agno = cur->bc_private.a.agno;
+        return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, agbp, agno,
+                                     cur->bc_btnum);
+}
+/*
+ * Point the AGI at a new inode btree root: store the new root block in
+ * agi_root, adjust agi_level by inc, and log both fields.
+ */
+STATIC void
+xfs_inobt_set_root(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *nptr,
+        int                     inc)    /* level change */
+{
+        struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+        struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
+        int                     logged = XFS_AGI_ROOT | XFS_AGI_LEVEL;
+        agi->agi_root = nptr->s;
+        be32_add_cpu(&agi->agi_level, inc);
+        xfs_ialloc_log_agi(cur->bc_tp, agbp, logged);
+}
+/*
+ * Point the AGI at a new free inode btree root: store the new root block in
+ * agi_free_root, adjust agi_free_level by inc, and log both fields.
+ */
+STATIC void
+xfs_finobt_set_root(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *nptr,
+        int                     inc)    /* level change */
+{
+        struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+        struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
+        int                     logged = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL;
+        agi->agi_free_root = nptr->s;
+        be32_add_cpu(&agi->agi_free_level, inc);
+        xfs_ialloc_log_agi(cur->bc_tp, agbp, logged);
+}
+/*
+ * Allocate one block for the btree, as close as possible to the block named
+ * by *start.  On success *stat is 1 and *new holds the AG block number of
+ * the new block; if no block could be found *stat is 0.  A non-zero return
+ * is an error from xfs_alloc_vextent().
+ */
+STATIC int
+xfs_inobt_alloc_block(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *start,
+        union xfs_btree_ptr     *new,
+        int                     *stat)
+{
+        xfs_alloc_arg_t         args;           /* block allocation args */
+        xfs_agblock_t           sbno = be32_to_cpu(start->s);
+        int                     error;          /* error return value */
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+        memset(&args, 0, sizeof(args));
+        args.mp = cur->bc_mp;
+        args.tp = cur->bc_tp;
+        args.type = XFS_ALLOCTYPE_NEAR_BNO;
+        args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
+        /* Exactly one block is wanted. */
+        args.minlen = 1;
+        args.maxlen = 1;
+        args.prod = 1;
+        error = xfs_alloc_vextent(&args);
+        if (error) {
+                XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+                return error;
+        }
+        if (args.fsbno != NULLFSBLOCK) {
+                ASSERT(args.len == 1);
+                new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
+                *stat = 1;
+        } else {
+                *stat = 0;
+        }
+        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+        return 0;
+}
+/*
+ * Free the single block backing bp and, if that succeeds, invalidate the
+ * buffer in the transaction.  Returns 0 or the xfs_free_extent() error.
+ */
+STATIC int
+xfs_inobt_free_block(
+        struct xfs_btree_cur    *cur,
+        struct xfs_buf          *bp)
+{
+        xfs_fsblock_t           fsbno;
+        int                     error;
+        fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
+        error = xfs_free_extent(cur->bc_tp, fsbno, 1);
+        if (!error)
+                xfs_trans_binval(cur->bc_tp, bp);
+        return error;
+}
+/*
+ * Maximum number of records in a block at the given level: entry 0 of
+ * m_inobt_mxr applies to level 0, entry 1 to every other level.
+ */
+STATIC int
+xfs_inobt_get_maxrecs(
+        struct xfs_btree_cur    *cur,
+        int                     level)
+{
+        struct xfs_mount        *mp = cur->bc_mp;
+        if (level == 0)
+                return mp->m_inobt_mxr[0];
+        return mp->m_inobt_mxr[1];
+}
+/*
+ * Build a key from a record: the key is the record's ir_startino, copied
+ * as stored (no byte swapping is done here).
+ */
+STATIC void
+xfs_inobt_init_key_from_rec(
+        union xfs_btree_key     *key,
+        union xfs_btree_rec     *rec)
+{
+        key->inobt.ir_startino = rec->inobt.ir_startino;
+}
+/*
+ * Build a record from a key: only ir_startino is filled in, copied as
+ * stored; the other record fields are left untouched.
+ */
+STATIC void
+xfs_inobt_init_rec_from_key(
+        union xfs_btree_key     *key,
+        union xfs_btree_rec     *rec)
+{
+        rec->inobt.ir_startino = key->inobt.ir_startino;
+}
+/*
+ * Fill in a big-endian btree record from the CPU-order record held in the
+ * cursor (bc_rec.i).
+ */
+STATIC void
+xfs_inobt_init_rec_from_cur(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_rec     *rec)
+{
+        /* 64-bit free mask first, then the two 32-bit fields. */
+        rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
+        rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+        rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
+}
+/*
+ * Initial value of ptr for lookup: the inode btree root block recorded in
+ * the AGI attached to the cursor.
+ */
+STATIC void
+xfs_inobt_init_ptr_from_cur(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *ptr)
+{
+        struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+        struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
+        ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+        ptr->s = agi->agi_root;
+}
+/*
+ * Initial value of ptr for lookup: the free inode btree root block
+ * recorded in the AGI attached to the cursor.
+ */
+STATIC void
+xfs_finobt_init_ptr_from_cur(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_ptr     *ptr)
+{
+        struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+        struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
+        ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+        ptr->s = agi->agi_free_root;
+}
+/*
+ * Signed difference between the on-disk key's starting inode and the
+ * starting inode in the cursor's record: negative when the key sorts before
+ * the cursor value, zero on a match, positive otherwise.
+ */
+STATIC __int64_t
+xfs_inobt_key_diff(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_key     *key)
+{
+        __int64_t               key_ino;
+        key_ino = (__int64_t)be32_to_cpu(key->inobt.ir_startino);
+        return key_ino - cur->bc_rec.i.ir_startino;
+}
+/*
+ * Structural checks on an inode btree or free inode btree block, shared by
+ * the read and write verifiers.  Returns true when the block looks sane and
+ * false otherwise.
+ */
+static bool
+xfs_inobt_verify(
+        struct xfs_buf          *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+        struct xfs_perag        *pag = bp->b_pag;
+        unsigned int            level;
+        /*
+         * During growfs operations, we can't verify the exact owner as the
+         * perag is not fully initialised and hence not attached to the buffer.
+         *
+         * Similarly, during log recovery we will have a perag structure
+         * attached, but the agi information will not yet have been initialised
+         * from the on disk AGI. We don't currently use any of this information,
+         * but beware of the landmine (i.e. need to check pag->pagi_init) if we
+         * ever do.
+         */
+        switch (block->bb_magic) {
+        case cpu_to_be32(XFS_IBT_CRC_MAGIC):
+        case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+                /* CRC-format blocks are only valid on CRC filesystems. */
+                if (!xfs_sb_version_hascrc(&mp->m_sb))
+                        return false;
+                if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+                        return false;
+                /* The block must record the address it was read from. */
+                if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+                        return false;
+                if (pag &&
+                    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+                        return false;
+                /* fall through */
+        case cpu_to_be32(XFS_IBT_MAGIC):
+        case cpu_to_be32(XFS_FIBT_MAGIC):
+                break;
+        default:
+                return false;
+        }
+        /* numrecs and level verification */
+        level = be16_to_cpu(block->bb_level);
+        if (level >= mp->m_in_maxlevels)
+                return false;
+        if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
+                return false;
+        /*
+         * sibling pointer verification: a sibling must be non-zero, and
+         * either lie inside the AG or be the NULLAGBLOCK marker.
+         */
+        if (!block->bb_u.s.bb_leftsib ||
+            (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+             block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+                return false;
+        if (!block->bb_u.s.bb_rightsib ||
+            (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+             block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+                return false;
+        return true;
+}
+/*
+ * Read verifier for inode btree blocks.  A checksum failure sets
+ * -EFSBADCRC; otherwise a failed structural check sets -EFSCORRUPTED.  Any
+ * error present on the buffer afterwards is traced and reported.
+ */
+static void
+xfs_inobt_read_verify(
+        struct xfs_buf  *bp)
+{
+        if (xfs_btree_sblock_verify_crc(bp)) {
+                if (!xfs_inobt_verify(bp))
+                        xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        } else {
+                xfs_buf_ioerror(bp, -EFSBADCRC);
+        }
+        if (!bp->b_error)
+                return;
+        trace_xfs_btree_corrupt(bp, _RET_IP_);
+        xfs_verifier_error(bp);
+}
+/*
+ * Write verifier for inode btree blocks.  A block that passes the
+ * structural checks has its checksum recalculated; one that fails is
+ * traced, marked -EFSCORRUPTED and reported.
+ */
+static void
+xfs_inobt_write_verify(
+        struct xfs_buf  *bp)
+{
+        if (xfs_inobt_verify(bp)) {
+                xfs_btree_sblock_calc_crc(bp);
+                return;
+        }
+        trace_xfs_btree_corrupt(bp, _RET_IP_);
+        xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        xfs_verifier_error(bp);
+}
+/*
+ * Verifier hooks for inode btree blocks; referenced as .buf_ops by both the
+ * inobt and finobt operations tables.
+ */
+const struct xfs_buf_ops xfs_inobt_buf_ops = {
+        .verify_read = xfs_inobt_read_verify,
+        .verify_write = xfs_inobt_write_verify,
+};
+#if defined(DEBUG) || defined(XFS_WARN)
+/*
+ * Ordering check for keys: true when k1's starting inode is strictly below
+ * k2's.
+ */
+STATIC int
+xfs_inobt_keys_inorder(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_key     *k1,
+        union xfs_btree_key     *k2)
+{
+        return be32_to_cpu(k2->inobt.ir_startino) >
+                be32_to_cpu(k1->inobt.ir_startino);
+}
+/*
+ * Ordering check for records: true when r2 starts at or beyond the end of
+ * the XFS_INODES_PER_CHUNK inodes that begin at r1's starting inode.
+ */
+STATIC int
+xfs_inobt_recs_inorder(
+        struct xfs_btree_cur    *cur,
+        union xfs_btree_rec     *r1,
+        union xfs_btree_rec     *r2)
+{
+        return be32_to_cpu(r2->inobt.ir_startino) >=
+                be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK;
+}
+#endif  /* DEBUG || XFS_WARN */
+/*
+ * Operations table for the inode allocation btree.  The root is tracked
+ * through agi_root/agi_level (see xfs_inobt_set_root and
+ * xfs_inobt_init_ptr_from_cur).  The ordering checks are only built with
+ * DEBUG or XFS_WARN.
+ */
+static const struct xfs_btree_ops xfs_inobt_ops = {
+        .rec_len                = sizeof(xfs_inobt_rec_t),
+        .key_len                = sizeof(xfs_inobt_key_t),
+        .dup_cursor             = xfs_inobt_dup_cursor,
+        .set_root               = xfs_inobt_set_root,
+        .alloc_block            = xfs_inobt_alloc_block,
+        .free_block             = xfs_inobt_free_block,
+        .get_minrecs            = xfs_inobt_get_minrecs,
+        .get_maxrecs            = xfs_inobt_get_maxrecs,
+        .init_key_from_rec      = xfs_inobt_init_key_from_rec,
+        .init_rec_from_key      = xfs_inobt_init_rec_from_key,
+        .init_rec_from_cur      = xfs_inobt_init_rec_from_cur,
+        .init_ptr_from_cur      = xfs_inobt_init_ptr_from_cur,
+        .key_diff               = xfs_inobt_key_diff,
+        .buf_ops                = &xfs_inobt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+        .keys_inorder           = xfs_inobt_keys_inorder,
+        .recs_inorder           = xfs_inobt_recs_inorder,
+#endif
+};
+/*
+ * Operations table for the free inode btree.  It shares every callback with
+ * the inode allocation btree except .set_root and .init_ptr_from_cur, which
+ * use agi_free_root/agi_free_level instead of agi_root/agi_level.
+ */
+static const struct xfs_btree_ops xfs_finobt_ops = {
+        .rec_len                = sizeof(xfs_inobt_rec_t),
+        .key_len                = sizeof(xfs_inobt_key_t),
+        .dup_cursor             = xfs_inobt_dup_cursor,
+        .set_root               = xfs_finobt_set_root,
+        .alloc_block            = xfs_inobt_alloc_block,
+        .free_block             = xfs_inobt_free_block,
+        .get_minrecs            = xfs_inobt_get_minrecs,
+        .get_maxrecs            = xfs_inobt_get_maxrecs,
+        .init_key_from_rec      = xfs_inobt_init_key_from_rec,
+        .init_rec_from_key      = xfs_inobt_init_rec_from_key,
+        .init_rec_from_cur      = xfs_inobt_init_rec_from_cur,
+        .init_ptr_from_cur      = xfs_finobt_init_ptr_from_cur,
+        .key_diff               = xfs_inobt_key_diff,
+        .buf_ops                = &xfs_inobt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+        .keys_inorder           = xfs_inobt_keys_inorder,
+        .recs_inorder           = xfs_inobt_recs_inorder,
+#endif
+};
+/*
+ * Allocate a new inode btree cursor.
+ *
+ * btnum selects the tree: XFS_BTNUM_INO gives the inode allocation btree,
+ * any other value gives the free inode btree.  The number of levels is
+ * taken from the matching field of the AGI held in agbp.
+ */
+struct xfs_btree_cur *                          /* new inode btree cursor */
+xfs_inobt_init_cursor(
+        struct xfs_mount        *mp,            /* file system mount point */
+        struct xfs_trans        *tp,            /* transaction pointer */
+        struct xfs_buf          *agbp,          /* buffer for agi structure */
+        xfs_agnumber_t          agno,           /* allocation group number */
+        xfs_btnum_t             btnum)          /* ialloc or free ino btree */
+{
+        struct xfs_btree_cur    *cur;
+        struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
+        cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+        cur->bc_mp = mp;
+        cur->bc_tp = tp;
+        cur->bc_btnum = btnum;
+        cur->bc_blocklog = mp->m_sb.sb_blocklog;
+        cur->bc_private.a.agbp = agbp;
+        cur->bc_private.a.agno = agno;
+        if (btnum != XFS_BTNUM_INO) {
+                cur->bc_ops = &xfs_finobt_ops;
+                cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+        } else {
+                cur->bc_ops = &xfs_inobt_ops;
+                cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+        }
+        if (xfs_sb_version_hascrc(&mp->m_sb))
+                cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+        return cur;
+}
+/*
+ * Calculate number of records in an inobt btree block.
+ *
+ * The block header is subtracted from blocklen; what remains is divided by
+ * the record size for a leaf block, or by the combined key and pointer size
+ * otherwise.
+ */
+int
+xfs_inobt_maxrecs(
+        struct xfs_mount        *mp,
+        int                     blocklen,
+        int                     leaf)
+{
+        int                     space = blocklen - XFS_INOBT_BLOCK_LEN(mp);
+        return space / (leaf ? sizeof(xfs_inobt_rec_t) :
+                        sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
new file mode 100644
index 000000000000..d7ebea72c2d0
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_IALLOC_BTREE_H__
+#define __XFS_IALLOC_BTREE_H__
+/*
+ * Inode map on-disk structures
+ */
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+/*
+ * Btree block header size depends on a superblock flag: when
+ * xfs_sb_version_hascrc() is true for the mount's superblock the CRC-enabled
+ * header length is used, otherwise the plain short-form header length.
+ */
+#define XFS_INOBT_BLOCK_LEN(mp) \
+        (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+                XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * All three take a 1-based index: entry 1 starts immediately after the block
+ * header.  In the pointer macro the pointer array is placed after room for
+ * maxrecs keys, so maxrecs must be the key capacity of the block.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_INOBT_REC_ADDR(mp, block, index) \
+        ((xfs_inobt_rec_t *) \
+                ((char *)(block) + \
+                 XFS_INOBT_BLOCK_LEN(mp) + \
+                 (((index) - 1) * sizeof(xfs_inobt_rec_t))))
+#define XFS_INOBT_KEY_ADDR(mp, block, index) \
+        ((xfs_inobt_key_t *) \
+                ((char *)(block) + \
+                 XFS_INOBT_BLOCK_LEN(mp) + \
+                 ((index) - 1) * sizeof(xfs_inobt_key_t)))
+#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
+        ((xfs_inobt_ptr_t *) \
+                ((char *)(block) + \
+                 XFS_INOBT_BLOCK_LEN(mp) + \
+                 (maxrecs) * sizeof(xfs_inobt_key_t) + \
+                 ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
+/*
+ * Allocate a cursor for the inode btree selected by the xfs_btnum_t argument,
+ * for the AG whose AGI buffer and AG number are passed in.
+ */
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
+                struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t,
+                xfs_btnum_t);
+/*
+ * Number of entries that fit in a block of the given length (second argument);
+ * the third argument is non-zero for a leaf block.
+ */
+extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+#endif  /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
new file mode 100644
index 000000000000..f18fd2da49f7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -0,0 +1,479 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_icache.h"
+#include "xfs_trans.h"
+#include "xfs_ialloc.h"
+#include "xfs_dinode.h"
+/*
+ * Check that none of the inodes in the buffer have a next unlinked field
+ * of 0.
+ *
+ * Debug-only check: (m_inode_cluster_size >> sb_inodelog) inodes are examined,
+ * one every sb_inodesize bytes from the start of the buffer, and an alert is
+ * logged for each one whose di_next_unlinked is zero.  The buffer itself is
+ * not changed and nothing is returned.
+ */
+#if defined(DEBUG)
+void
+xfs_inobp_check(
+        xfs_mount_t     *mp,
+        xfs_buf_t       *bp)
+{
+        xfs_dinode_t    *dip;
+        int             nr_inodes;
+        int             ino;
+        nr_inodes = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+        for (ino = 0; ino < nr_inodes; ino++) {
+                dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+                                        ino * mp->m_sb.sb_inodesize);
+                if (dip->di_next_unlinked)
+                        continue;
+                xfs_alert(mp,
+        "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
+                        ino, (long long)bp->b_bn);
+        }
+}
+#endif
+/*
+ * Verify the magic number and version of every inode in an inode buffer.
+ *
+ * If we are doing readahead on an inode buffer, we might be in log recovery
+ * reading an inode allocation buffer that hasn't yet been replayed, and hence
+ * has not had the inode cores stamped into it. Hence for readahead, the buffer
+ * may be potentially invalid.
+ *
+ * If the readahead buffer is invalid, we don't want to mark it with an error,
+ * but we do want to clear the DONE status of the buffer so that a followup read
+ * will re-read it from disk. This will ensure that we don't get unnecessary
+ * warnings during log recovery and we don't get unnecessary panics on debug
+ * kernels.
+ *
+ * For a normal (non-readahead) verification the first bad inode marks the
+ * buffer -EFSCORRUPTED, reports a verifier error and ends the scan: one bad
+ * inode is enough to reject the buffer, and carrying on would only repeat the
+ * same report for every further bad inode.
+ */
+static void
+xfs_inode_buf_verify(
+        struct xfs_buf  *bp,
+        bool            readahead)
+{
+        struct xfs_mount *mp = bp->b_target->bt_mount;
+        int             i;
+        int             ni;
+        /*
+         * Validate the magic number and version of every inode in the buffer
+         */
+        ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+        for (i = 0; i < ni; i++) {
+                int             di_ok;
+                xfs_dinode_t    *dip;
+                dip = (struct xfs_dinode *)xfs_buf_offset(bp,
+                                        (i << mp->m_sb.sb_inodelog));
+                di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+                            XFS_DINODE_GOOD_VERSION(dip->di_version);
+                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+                                                XFS_ERRTAG_ITOBP_INOTOBP,
+                                                XFS_RANDOM_ITOBP_INOTOBP))) {
+                        if (readahead) {
+                                /* force a later real read to hit the disk */
+                                bp->b_flags &= ~XBF_DONE;
+                                return;
+                        }
+                        xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                        xfs_verifier_error(bp);
+#ifdef DEBUG
+                        xfs_alert(mp,
+                                "bad inode magic/vsn daddr %lld #%d (magic=%x)",
+                                (long long)bp->b_bn, i,
+                                be16_to_cpu(dip->di_magic));
+#endif
+                        /* the buffer is already rejected; stop scanning */
+                        return;
+                }
+        }
+        xfs_inobp_check(mp, bp);
+}
+/*
+ * Read verifier for inode buffers: runs xfs_inode_buf_verify() in
+ * non-readahead mode, so a bad inode marks the buffer with an error.
+ */
+static void
+xfs_inode_buf_read_verify(
+        struct xfs_buf  *bp)
+{
+        xfs_inode_buf_verify(bp, false);
+}
+/*
+ * Readahead verifier for inode buffers: runs xfs_inode_buf_verify() in
+ * readahead mode, where a bad inode only clears XBF_DONE on the buffer
+ * instead of flagging an error.
+ */
+static void
+xfs_inode_buf_readahead_verify(
+        struct xfs_buf  *bp)
+{
+        xfs_inode_buf_verify(bp, true);
+}
+/*
+ * Write verifier for inode buffers: performs the same non-readahead check as
+ * the read verifier.
+ */
+static void
+xfs_inode_buf_write_verify(
+        struct xfs_buf  *bp)
+{
+        xfs_inode_buf_verify(bp, false);
+}
+/*
+ * Buffer operations for ordinary inode buffer reads and writes; this is the
+ * ops table xfs_imap_to_bp() hands to xfs_trans_read_buf().
+ */
+const struct xfs_buf_ops xfs_inode_buf_ops = {
+        .verify_read = xfs_inode_buf_read_verify,
+        .verify_write = xfs_inode_buf_write_verify,
+};
+/*
+ * Buffer operations for inode buffer readahead: identical to
+ * xfs_inode_buf_ops except that the read side uses the readahead verifier.
+ */
+const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+        .verify_read = xfs_inode_buf_readahead_verify,
+        .verify_write = xfs_inode_buf_write_verify,
+};
+/*
+ * Read the buffer backing an inode and locate the on-disk inode inside it.
+ *
+ * imap says where the inode lives: the buffer starts at im_blkno, is im_len
+ * long, and the inode sits im_boffset bytes into it.  XBF_UNMAPPED is always
+ * added to the caller's buf_flags.  On success the buffer is handed back
+ * through bpp and a pointer to the on-disk inode through dipp.
+ *
+ * Returns 0 on success.  -EAGAIN is passed straight back (the caller is
+ * expected to have asked for XBF_TRYLOCK).  -EFSCORRUPTED during an
+ * XFS_IGET_UNTRUSTED lookup is turned into -EINVAL.  Any other failure is
+ * logged and returned unchanged.  On a non-zero return the contents of bpp
+ * and dipp are undefined.
+ */
+int
+xfs_imap_to_bp(
+        struct xfs_mount        *mp,
+        struct xfs_trans        *tp,
+        struct xfs_imap         *imap,
+        struct xfs_dinode       **dipp,
+        struct xfs_buf          **bpp,
+        uint                    buf_flags,
+        uint                    iget_flags)
+{
+        struct xfs_buf          *ibp;
+        int                     error;
+        buf_flags |= XBF_UNMAPPED;
+        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+                                   (int)imap->im_len, buf_flags, &ibp,
+                                   &xfs_inode_buf_ops);
+        if (!error) {
+                *bpp = ibp;
+                *dipp = (struct xfs_dinode *)xfs_buf_offset(ibp,
+                                                        imap->im_boffset);
+                return 0;
+        }
+        /* a trylock that lost the race is not worth a warning */
+        if (error == -EAGAIN) {
+                ASSERT(buf_flags & XBF_TRYLOCK);
+                return error;
+        }
+        /* untrusted inode numbers may legitimately point at garbage */
+        if ((iget_flags & XFS_IGET_UNTRUSTED) && error == -EFSCORRUPTED)
+                return -EINVAL;
+        xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
+                __func__, error);
+        return error;
+}
+/*
+ * Convert an on-disk inode core into its in-core form.
+ *
+ * Multi-byte fields are converted from big endian to CPU byte order; the
+ * remaining fields and the padding are copied unchanged.  The fields that
+ * belong to version 3 inodes are converted only when the inode is version 3.
+ */
+void
+xfs_dinode_from_disk(
+        xfs_icdinode_t          *to,
+        xfs_dinode_t            *from)
+{
+        /* identity, format and flags */
+        to->di_magic = be16_to_cpu(from->di_magic);
+        to->di_mode = be16_to_cpu(from->di_mode);
+        to->di_version = from->di_version;
+        to->di_format = from->di_format;
+        to->di_aformat = from->di_aformat;
+        to->di_forkoff = from->di_forkoff;
+        to->di_gen = be32_to_cpu(from->di_gen);
+        to->di_flags = be16_to_cpu(from->di_flags);
+        /* ownership and link counts */
+        to->di_onlink = be16_to_cpu(from->di_onlink);
+        to->di_nlink = be32_to_cpu(from->di_nlink);
+        to->di_uid = be32_to_cpu(from->di_uid);
+        to->di_gid = be32_to_cpu(from->di_gid);
+        to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+        to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
+        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+        to->di_flushiter = be16_to_cpu(from->di_flushiter);
+        /* timestamps */
+        to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
+        to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
+        to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
+        to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
+        to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
+        to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
+        /* size and extent accounting */
+        to->di_size = be64_to_cpu(from->di_size);
+        to->di_nblocks = be64_to_cpu(from->di_nblocks);
+        to->di_extsize = be32_to_cpu(from->di_extsize);
+        to->di_nextents = be32_to_cpu(from->di_nextents);
+        to->di_anextents = be16_to_cpu(from->di_anextents);
+        /* remaining core state */
+        to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
+        to->di_dmstate = be16_to_cpu(from->di_dmstate);
+        /* everything below only applies to version 3 inodes */
+        if (from->di_version != 3)
+                return;
+        to->di_changecount = be64_to_cpu(from->di_changecount);
+        to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
+        to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+        to->di_flags2 = be64_to_cpu(from->di_flags2);
+        to->di_ino = be64_to_cpu(from->di_ino);
+        to->di_lsn = be64_to_cpu(from->di_lsn);
+        memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+        uuid_copy(&to->di_uuid, &from->di_uuid);
+}
+/*
+ * Convert an in-core inode core into its on-disk form.
+ *
+ * Multi-byte fields are converted from CPU byte order to big endian; the
+ * remaining fields and the padding are copied unchanged.  Version 3 inodes
+ * additionally get their version 3 fields written and always have
+ * di_flushiter written as zero; other versions write the in-core
+ * di_flushiter value.
+ */
+void
+xfs_dinode_to_disk(
+        xfs_dinode_t            *to,
+        xfs_icdinode_t          *from)
+{
+        /* identity, format and flags */
+        to->di_magic = cpu_to_be16(from->di_magic);
+        to->di_mode = cpu_to_be16(from->di_mode);
+        to->di_version = from->di_version;
+        to->di_format = from->di_format;
+        to->di_aformat = from->di_aformat;
+        to->di_forkoff = from->di_forkoff;
+        to->di_gen = cpu_to_be32(from->di_gen);
+        to->di_flags = cpu_to_be16(from->di_flags);
+        /* ownership and link counts */
+        to->di_onlink = cpu_to_be16(from->di_onlink);
+        to->di_nlink = cpu_to_be32(from->di_nlink);
+        to->di_uid = cpu_to_be32(from->di_uid);
+        to->di_gid = cpu_to_be32(from->di_gid);
+        to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+        to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+        /* timestamps */
+        to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
+        to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
+        to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
+        to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
+        to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
+        to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
+        /* size and extent accounting */
+        to->di_size = cpu_to_be64(from->di_size);
+        to->di_nblocks = cpu_to_be64(from->di_nblocks);
+        to->di_extsize = cpu_to_be32(from->di_extsize);
+        to->di_nextents = cpu_to_be32(from->di_nextents);
+        to->di_anextents = cpu_to_be16(from->di_anextents);
+        /* remaining core state */
+        to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+        to->di_dmstate = cpu_to_be16(from->di_dmstate);
+        /* only inodes that are not version 3 put the flush iteration on disk */
+        if (from->di_version != 3) {
+                to->di_flushiter = cpu_to_be16(from->di_flushiter);
+                return;
+        }
+        to->di_flushiter = 0;
+        to->di_changecount = cpu_to_be64(from->di_changecount);
+        to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
+        to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+        to->di_flags2 = cpu_to_be64(from->di_flags2);
+        to->di_ino = cpu_to_be64(from->di_ino);
+        to->di_lsn = cpu_to_be64(from->di_lsn);
+        memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+        uuid_copy(&to->di_uuid, &from->di_uuid);
+}
+/*
+ * Sanity check an on-disk inode that was read on behalf of in-core inode ip.
+ *
+ * Every inode must carry the dinode magic number.  Inodes older than version
+ * 3 get no further checking.  A version 3 (or later) inode is accepted only
+ * if the superblock has the CRC feature, the inode checksum verifies, the
+ * inode number stored on disk matches ip->i_ino and the stored UUID matches
+ * the superblock UUID.
+ *
+ * Returns true if the inode passes, false otherwise.
+ */
+static bool
+xfs_dinode_verify(
+        struct xfs_mount        *mp,
+        struct xfs_inode        *ip,
+        struct xfs_dinode       *dip)
+{
+        if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
+                return false;
+        /* only version 3 or greater inodes are extensively verified here */
+        if (dip->di_version < 3)
+                return true;
+        /* checks run in this order and stop at the first failure */
+        return xfs_sb_version_hascrc(&mp->m_sb) &&
+               xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+                                XFS_DINODE_CRC_OFF) &&
+               be64_to_cpu(dip->di_ino) == ip->i_ino &&
+               uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid);
+}
+/*
+ * Compute the checksum of an on-disk inode and store it in di_crc.
+ *
+ * Inodes older than version 3 are left untouched.  For version 3 and later
+ * the checksum is started with xfs_start_cksum() over sb_inodesize bytes at
+ * dip (passing XFS_DINODE_CRC_OFF) and finished with xfs_end_cksum().  The
+ * superblock is asserted to have the CRC feature.
+ */
+void
+xfs_dinode_calc_crc(
+        struct xfs_mount        *mp,
+        struct xfs_dinode       *dip)
+{
+        __uint32_t              cksum;
+        if (dip->di_version >= 3) {
+                ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
+                cksum = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
+                                        XFS_DINODE_CRC_OFF);
+                dip->di_crc = xfs_end_cksum(cksum);
+        }
+}
+/*
+ * Read the disk inode attributes into the in-core inode structure.
+ *
+ * For version 5 superblocks, if we are initialising a new inode and we are not
+ * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simply build the new
+ * inode core with a random generation number. If we are keeping inodes around,
+ * we need to read the inode cluster to get the existing generation number off
+ * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
+ * format) then log recovery is dependent on the di_flushiter field being
+ * initialised from the current on-disk value and hence we must also read the
+ * inode off disk.
+ *
+ * Returns 0 on success, or a negative errno: the error reported by xfs_imap(),
+ * xfs_imap_to_bp() or xfs_iformat_fork(), or -EFSCORRUPTED if the on-disk
+ * inode fails verification.
+ */
+int
+xfs_iread(
+        xfs_mount_t     *mp,
+        xfs_trans_t     *tp,
+        xfs_inode_t     *ip,
+        uint            iget_flags)
+{
+        xfs_buf_t       *bp;
+        xfs_dinode_t    *dip;
+        int             error;
+        /*
+         * Fill in the location information in the in-core inode.
+         */
+        error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
+        if (error)
+                return error;
+        /*
+         * Shortcut IO on inode allocation if possible.  The CRC feature is
+         * part of the condition, so a core built here is always version 3.
+         */
+        if ((iget_flags & XFS_IGET_CREATE) &&
+            xfs_sb_version_hascrc(&mp->m_sb) &&
+            !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+                /* initialise the on-disk inode core */
+                memset(&ip->i_d, 0, sizeof(ip->i_d));
+                ip->i_d.di_magic = XFS_DINODE_MAGIC;
+                ip->i_d.di_gen = prandom_u32();
+                ip->i_d.di_version = 3;
+                ip->i_d.di_ino = ip->i_ino;
+                uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+                return 0;
+        }
+        /*
+         * Get pointers to the on-disk inode and the buffer containing it.
+         */
+        error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
+        if (error)
+                return error;
+        /* even unallocated inodes are verified */
+        if (!xfs_dinode_verify(mp, ip, dip)) {
+                xfs_alert(mp, "%s: validation failed for inode %llu",
+                                __func__, (unsigned long long)ip->i_ino);
+                XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
+                error = -EFSCORRUPTED;
+                goto out_brelse;
+        }
+        /*
+         * If the on-disk inode is already linked to a directory
+         * entry, copy all of the inode into the in-core inode.
+         * xfs_iformat_fork() handles copying in the inode format
+         * specific information.
+         * Otherwise, just get the truly permanent information.
+         */
+        if (dip->di_mode) {
+                xfs_dinode_from_disk(&ip->i_d, dip);
+                error = xfs_iformat_fork(ip, dip);
+                if (error)  {
+#ifdef DEBUG
+                        xfs_alert(mp, "%s: xfs_iformat() returned error %d",
+                                __func__, error);
+#endif /* DEBUG */
+                        goto out_brelse;
+                }
+        } else {
+                /*
+                 * Partial initialisation of the in-core inode. Just the bits
+                 * that xfs_ialloc won't overwrite or relies on being correct.
+                 */
+                ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
+                ip->i_d.di_version = dip->di_version;
+                ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
+                ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
+                if (dip->di_version == 3) {
+                        ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
+                        uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
+                }
+                /*
+                 * Make sure to pull in the mode here as well in
+                 * case the inode is released without being used.
+                 * This ensures that xfs_inactive() will see that
+                 * the inode is already free and not try to mess
+                 * with the uninitialized part of it.
+                 */
+                ip->i_d.di_mode = 0;
+        }
+        /*
+         * Automatically convert version 1 inode formats in memory to version 2
+         * inode format. If the inode is modified, it will get logged and
+         * rewritten as a version 2 inode. We can do this because we set the
+         * superblock feature bit for v2 inodes unconditionally during mount
+         * and it means the rest of the code can assume the inode version is 2
+         * or higher.
+         */
+        if (ip->i_d.di_version == 1) {
+                ip->i_d.di_version = 2;
+                memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
+                ip->i_d.di_nlink = ip->i_d.di_onlink;
+                ip->i_d.di_onlink = 0;
+                xfs_set_projid(ip, 0);
+        }
+        ip->i_delayed_blks = 0;
+        /*
+         * Mark the buffer containing the inode as something to keep
+         * around for a while.  This helps to keep recently accessed
+         * meta-data in-core longer.
+         */
+        xfs_buf_set_ref(bp, XFS_INO_REF);
+        /*
+         * Use xfs_trans_brelse() to release the buffer containing the on-disk
+         * inode, because it was acquired with xfs_trans_read_buf() in
+         * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
+         * brelse().  If we're within a transaction, then xfs_trans_brelse()
+         * will only release the buffer if it is not dirty within the
+         * transaction.  It will be OK to release the buffer in this case,
+         * because inodes on disk are never destroyed and we will be locking the
+         * new in-core inode before putting it in the cache where other
+         * processes can find it.  Thus we don't have to worry about the inode
+         * being changed just because we released the buffer.
+         */
+ out_brelse:
+        xfs_trans_brelse(tp, bp);
+        return error;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
new file mode 100644
index 000000000000..9308c47f2a52
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_INODE_BUF_H__
+#define __XFS_INODE_BUF_H__
+struct xfs_inode;
+struct xfs_dinode;
+struct xfs_icdinode;
+/*
+ * Inode location information.  Stored in the inode and passed to
+ * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
+ *
+ * xfs_imap_to_bp() reads im_len blocks starting at im_blkno and then uses
+ * im_boffset as the byte offset of the inode within the buffer it read.
+ */
+struct xfs_imap {
+        xfs_daddr_t     im_blkno;       /* starting BB of inode chunk */
+        ushort          im_len;         /* length in BBs of inode chunk */
+        ushort          im_boffset;     /* inode offset in block in bytes */
+};
+/*
+ * Read the buffer described by an xfs_imap; returns the buffer and a pointer
+ * to the on-disk inode within it.  The two uint arguments are the buffer
+ * flags and the iget flags, in that order.
+ */
+int     xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
+                       struct xfs_imap *, struct xfs_dinode **,
+                       struct xfs_buf **, uint, uint);
+/* Fill in an in-core inode from disk; the uint argument is the iget flags. */
+int     xfs_iread(struct xfs_mount *, struct xfs_trans *,
+                  struct xfs_inode *, uint);
+/* Compute and store the CRC of an on-disk inode (version 3 and later only). */
+void    xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
+/* Convert the inode core between in-core and on-disk representations. */
+void    xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
+void    xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);
+/*
+ * Debug-only inode buffer check; expands to nothing when DEBUG is not
+ * defined.
+ */
+#if defined(DEBUG)
+void    xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
+#else
+#define xfs_inobp_check(mp, bp)
+#endif /* DEBUG */
+#endif  /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
new file mode 100644
index 000000000000..8ac9411bcf2a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -0,0 +1,1906 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <linux/log2.h>
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+/* Allocation zone that xfs_iformat_fork() uses for the attribute fork. */
+kmem_zone_t *xfs_ifork_zone;
+/*
+ * Per-format helpers called by xfs_iformat_fork() for local, extents and
+ * btree format forks.  The last int argument of each is the fork selector
+ * (XFS_DATA_FORK or XFS_ATTR_FORK); xfs_iformat_local() takes the fork size
+ * after it.
+ */
+STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
+STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
+STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+#ifdef DEBUG
+/*
+ * Debug-only check of the first nrecs extent records held in an in-core
+ * fork.  Each record is copied with unaligned-safe loads and unpacked with
+ * xfs_bmbt_get_all(); when fmt is XFS_EXTFMT_NOSTATE every extent is
+ * asserted to be in the XFS_EXT_NORM state.  Without DEBUG this compiles
+ * away to nothing.
+ */
+void
+xfs_validate_extents(
+        xfs_ifork_t             *ifp,
+        int                     nrecs,
+        xfs_exntfmt_t           fmt)
+{
+        xfs_bmbt_rec_host_t     *ep;
+        xfs_bmbt_rec_host_t     rec;
+        xfs_bmbt_irec_t         irec;
+        int                     idx;
+        for (idx = 0; idx < nrecs; idx++) {
+                ep = xfs_iext_get_ext(ifp, idx);
+                /* take a local copy using unaligned-safe accessors */
+                rec.l0 = get_unaligned(&ep->l0);
+                rec.l1 = get_unaligned(&ep->l1);
+                xfs_bmbt_get_all(&rec, &irec);
+                if (fmt == XFS_EXTFMT_NOSTATE)
+                        ASSERT(irec.br_state == XFS_EXT_NORM);
+        }
+}
+#else /* DEBUG */
+#define xfs_validate_extents(ifp, nrecs, fmt)
+#endif /* DEBUG */
+/*
+ * Move inode type and inode format specific information from the
+ * on-disk inode to the in-core inode.  For fifos, devs, and sockets
+ * this means set if_rdev to the proper value.  For files, directories,
+ * and symlinks this means to bring in the in-line data or extent
+ * pointers.  For a file in B-tree format, only the root is immediately
+ * brought in-core.  The rest will be in-lined in if_extents when it
+ * is first referenced (see xfs_iread_extents()).
+ *
+ * The data fork is set up first.  If the on-disk inode has an attribute
+ * fork (XFS_DFORK_Q), an in-core attribute fork is then allocated and set
+ * up the same way.
+ *
+ * Returns 0 on success, -EFSCORRUPTED if the on-disk inode is inconsistent,
+ * or the error returned by the per-format helper.  If setting up the
+ * attribute fork fails, the attribute fork is freed again and the data fork
+ * is torn down before returning.
+ */
+int
+xfs_iformat_fork(
+        xfs_inode_t             *ip,
+        xfs_dinode_t            *dip)
+{
+        xfs_attr_shortform_t    *atp;
+        int                     size;
+        int                     error = 0;
+        xfs_fsize_t             di_size;
+        /* the extent counts of both forks cannot exceed the block count */
+        if (unlikely(be32_to_cpu(dip->di_nextents) +
+                     be16_to_cpu(dip->di_anextents) >
+                     be64_to_cpu(dip->di_nblocks))) {
+                xfs_warn(ip->i_mount,
+                        "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
+                        (unsigned long long)ip->i_ino,
+                        (int)(be32_to_cpu(dip->di_nextents) +
+                              be16_to_cpu(dip->di_anextents)),
+                        (unsigned long long)
+                                be64_to_cpu(dip->di_nblocks));
+                XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
+                                     ip->i_mount, dip);
+                return -EFSCORRUPTED;
+        }
+        if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
+                xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
+                        (unsigned long long)ip->i_ino,
+                        dip->di_forkoff);
+                XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
+                                     ip->i_mount, dip);
+                return -EFSCORRUPTED;
+        }
+        /* a realtime inode needs a realtime device to live on */
+        if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+                     !ip->i_mount->m_rtdev_targp)) {
+                xfs_warn(ip->i_mount,
+                        "corrupt dinode %Lu, has realtime flag set.",
+                        ip->i_ino);
+                XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+                                     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+                return -EFSCORRUPTED;
+        }
+        switch (ip->i_d.di_mode & S_IFMT) {
+        case S_IFIFO:
+        case S_IFCHR:
+        case S_IFBLK:
+        case S_IFSOCK:
+                if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
+                        XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
+                                              ip->i_mount, dip);
+                        return -EFSCORRUPTED;
+                }
+                ip->i_d.di_size = 0;
+                ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
+                break;
+        case S_IFREG:
+        case S_IFLNK:
+        case S_IFDIR:
+                switch (dip->di_format) {
+                case XFS_DINODE_FMT_LOCAL:
+                        /*
+                         * no local regular files yet
+                         */
+                        if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
+                                xfs_warn(ip->i_mount,
+                        "corrupt inode %Lu (local format for regular file).",
+                                        (unsigned long long) ip->i_ino);
+                                XFS_CORRUPTION_ERROR("xfs_iformat(4)",
+                                                     XFS_ERRLEVEL_LOW,
+                                                     ip->i_mount, dip);
+                                return -EFSCORRUPTED;
+                        }
+                        /* inline data must fit inside the data fork area */
+                        di_size = be64_to_cpu(dip->di_size);
+                        if (unlikely(di_size < 0 ||
+                                     di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
+                                xfs_warn(ip->i_mount,
+                        "corrupt inode %Lu (bad size %Ld for local inode).",
+                                        (unsigned long long) ip->i_ino,
+                                        (long long) di_size);
+                                XFS_CORRUPTION_ERROR("xfs_iformat(5)",
+                                                     XFS_ERRLEVEL_LOW,
+                                                     ip->i_mount, dip);
+                                return -EFSCORRUPTED;
+                        }
+                        size = (int)di_size;
+                        error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
+                        break;
+                case XFS_DINODE_FMT_EXTENTS:
+                        error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
+                        break;
+                case XFS_DINODE_FMT_BTREE:
+                        error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+                        break;
+                default:
+                        XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
+                                         ip->i_mount);
+                        return -EFSCORRUPTED;
+                }
+                break;
+        default:
+                XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
+                return -EFSCORRUPTED;
+        }
+        if (error) {
+                return error;
+        }
+        /* nothing more to do if the inode has no attribute fork */
+        if (!XFS_DFORK_Q(dip))
+                return 0;
+        ASSERT(ip->i_afp == NULL);
+        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
+        switch (dip->di_aformat) {
+        case XFS_DINODE_FMT_LOCAL:
+                atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
+                size = be16_to_cpu(atp->hdr.totsize);
+                if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
+                        xfs_warn(ip->i_mount,
+                                "corrupt inode %Lu (bad attr fork size %Ld).",
+                                (unsigned long long) ip->i_ino,
+                                (long long) size);
+                        XFS_CORRUPTION_ERROR("xfs_iformat(8)",
+                                             XFS_ERRLEVEL_LOW,
+                                             ip->i_mount, dip);
+                        /*
+                         * Go through the common error handling below rather
+                         * than returning directly, so the attr fork allocated
+                         * above is released like every other attr fork
+                         * failure.
+                         */
+                        error = -EFSCORRUPTED;
+                        break;
+                }
+                error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
+                break;
+        case XFS_DINODE_FMT_EXTENTS:
+                error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
+                break;
+        case XFS_DINODE_FMT_BTREE:
+                error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
+                break;
+        default:
+                error = -EFSCORRUPTED;
+                break;
+        }
+        if (error) {
+                /* undo the attr fork allocation and the data fork setup */
+                kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+                ip->i_afp = NULL;
+                xfs_idestroy_fork(ip, XFS_DATA_FORK);
+        }
+        return error;
+}
+/*
+ * Build the in-core copy of a local-format (inline) fork.
+ *
+ * The fork contents live directly in the on-disk inode.  If they fit
+ * into if_inline_data they are copied there, otherwise a buffer is
+ * allocated for them; either way if_u1.if_data ends up pointing at the
+ * copy (or is NULL for an empty fork).  An allocated buffer is rounded
+ * up to a multiple of 4 bytes and that size is recorded in
+ * if_real_bytes; if_bytes always holds the unrounded size.
+ *
+ * ip        -- in-core inode whose fork is being initialised
+ * dip       -- on-disk inode the data is copied from
+ * whichfork -- XFS_DATA_FORK or XFS_ATTR_FORK
+ * size      -- number of bytes of inline data in the fork
+ *
+ * Returns 0 on success, or -EFSCORRUPTED if size is negative or larger
+ * than the on-disk fork.
+ */
+STATIC int
+xfs_iformat_local(
+        xfs_inode_t     *ip,
+        xfs_dinode_t    *dip,
+        int             whichfork,
+        int             size)
+{
+        xfs_ifork_t     *ifp;
+        int             real_size;
+        /*
+         * If the size is unreasonable, then something
+         * is wrong and we just bail out rather than crash in
+         * kmem_alloc() or memcpy() below.  A negative size must be
+         * rejected explicitly: it would pass the signed upper-bound
+         * test and then turn into a huge unsigned value in the
+         * sizeof() comparison and in memcpy().
+         */
+        if (unlikely(size < 0 ||
+                     size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+                xfs_warn(ip->i_mount,
+        "corrupt inode %Lu (bad size %d for local fork, size = %d).",
+                        (unsigned long long) ip->i_ino, size,
+                        XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
+                XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
+                                     ip->i_mount, dip);
+                return -EFSCORRUPTED;
+        }
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        real_size = 0;
+        if (size == 0)
+                ifp->if_u1.if_data = NULL;
+        else if (size <= sizeof(ifp->if_u2.if_inline_data))
+                ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+        else {
+                /* Too big for the inline buffer: use a 4-byte-rounded one. */
+                real_size = roundup(size, 4);
+                ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+        }
+        ifp->if_bytes = size;
+        ifp->if_real_bytes = real_size;
+        if (size)
+                memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
+        ifp->if_flags &= ~XFS_IFEXTENTS;
+        ifp->if_flags |= XFS_IFINLINE;
+        return 0;
+}
+/*
+ * Build the in-core copy of an extents-format fork.
+ *
+ * The file consists of a set of extents all of which fit into the
+ * on-disk inode.  If there are few enough extents to fit into
+ * if_inline_ext, they are copied there; otherwise a buffer is set up
+ * for them with xfs_iext_add().  Either way if_u1.if_extents ends up
+ * pointing at the extents (or is NULL when there are none), and each
+ * record is converted from big-endian disk order to host order.
+ *
+ * ip        -- in-core inode whose fork is being initialised
+ * dip       -- on-disk inode the records are copied from
+ * whichfork -- XFS_DATA_FORK or XFS_ATTR_FORK
+ *
+ * Returns 0 on success or -EFSCORRUPTED if the extent count does not
+ * fit in the fork or the records fail the no-state check.  On failure
+ * no extent storage is left attached to the fork.
+ */
+STATIC int
+xfs_iformat_extents(
+        xfs_inode_t     *ip,
+        xfs_dinode_t    *dip,
+        int             whichfork)
+{
+        xfs_bmbt_rec_t  *dp;
+        xfs_ifork_t     *ifp;
+        int             fork_size;
+        int             nex;
+        int             size;
+        int             i;
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+        fork_size = XFS_DFORK_SIZE(dip, ip->i_mount, whichfork);
+        /*
+         * If the number of extents is unreasonable, then something
+         * is wrong and we just bail out rather than crash in
+         * kmem_alloc() or memcpy() below.  The record count itself is
+         * checked, not the byte size derived from it: multiplying a
+         * corrupt count by the record size can wrap around to a small
+         * positive value that would pass a size-only test while the
+         * copy loop below still iterates nex times.
+         */
+        if (unlikely(nex < 0 || fork_size < 0 ||
+                     nex > fork_size / (int)sizeof(xfs_bmbt_rec_t))) {
+                xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
+                        (unsigned long long) ip->i_ino, nex);
+                XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
+                                     ip->i_mount, dip);
+                return -EFSCORRUPTED;
+        }
+        /* Cannot overflow: nex is bounded by the fork size above. */
+        size = nex * (int)sizeof(xfs_bmbt_rec_t);
+        ifp->if_real_bytes = 0;
+        if (nex == 0)
+                ifp->if_u1.if_extents = NULL;
+        else if (nex <= XFS_INLINE_EXTS)
+                ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+        else
+                xfs_iext_add(ifp, 0, nex);
+        ifp->if_bytes = size;
+        if (size) {
+                dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
+                xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
+                /* On-disk records may be unaligned and are big-endian. */
+                for (i = 0; i < nex; i++, dp++) {
+                        xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+                        ep->l0 = get_unaligned_be64(&dp->l0);
+                        ep->l1 = get_unaligned_be64(&dp->l1);
+                }
+                XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
+                if (whichfork != XFS_DATA_FORK ||
+                    XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) {
+                        if (unlikely(xfs_check_nostate_extents(
+                            ifp, 0, nex))) {
+                                XFS_ERROR_REPORT("xfs_iformat_extents(2)",
+                                                 XFS_ERRLEVEL_LOW,
+                                                 ip->i_mount);
+                                /*
+                                 * XFS_IFEXTENTS is not set yet, so
+                                 * release the extent storage here
+                                 * instead of leaving it attached to a
+                                 * fork that is being abandoned.
+                                 */
+                                xfs_iext_destroy(ifp);
+                                return -EFSCORRUPTED;
+                        }
+                }
+        }
+        ifp->if_flags |= XFS_IFEXTENTS;
+        return 0;
+}
+/*
+ * Build the in-core copy of a btree-format fork.
+ *
+ * The file has too many extents to fit into the inode, so they are in
+ * B-tree format.  Allocate a buffer for the root of the B-tree and
+ * copy the root into it, converting from the on-disk root layout to
+ * the in-memory one.  XFS_IFEXTENTS is cleared here; the extent
+ * records themselves are not read by this function.
+ *
+ * ip        -- in-core inode whose fork is being initialised
+ * dip       -- on-disk inode holding the btree root
+ * whichfork -- XFS_DATA_FORK or XFS_ATTR_FORK
+ *
+ * Returns 0 on success or -EFSCORRUPTED if the root fails validation.
+ */
+STATIC int
+xfs_iformat_btree(
+        xfs_inode_t             *ip,
+        xfs_dinode_t            *dip,
+        int                     whichfork)
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        xfs_bmdr_block_t        *dfp;
+        xfs_ifork_t             *ifp;
+        int                     nrecs;
+        int                     size;
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
+        size = XFS_BMAP_BROOT_SPACE(mp, dfp);
+        nrecs = be16_to_cpu(dfp->bb_numrecs);
+        /*
+         * blow out if -- fork has less extents than can fit in
+         * fork (fork shouldn't be a btree format), root btree
+         * block has no records at all (a btree-format fork needs at
+         * least one key/pointer pair in its root), root btree
+         * block has more records than can fit into the fork,
+         * or the number of extents is greater than the number of
+         * blocks.
+         */
+        if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
+                                        XFS_IFORK_MAXEXT(ip, whichfork) ||
+                     nrecs == 0 ||
+                     XFS_BMDR_SPACE_CALC(nrecs) >
+                                        XFS_DFORK_SIZE(dip, mp, whichfork) ||
+                     XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+                xfs_warn(mp, "corrupt inode %Lu (btree).",
+                                        (unsigned long long) ip->i_ino);
+                XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
+                                         mp, dip);
+                return -EFSCORRUPTED;
+        }
+        ifp->if_broot_bytes = size;
+        ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
+        ASSERT(ifp->if_broot != NULL);
+        /*
+         * Copy and convert from the on-disk structure
+         * to the in-memory structure.
+         */
+        xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+                         ifp->if_broot, size);
+        ifp->if_flags &= ~XFS_IFEXTENTS;
+        ifp->if_flags |= XFS_IFBROOT;
+        return 0;
+}
+/*
+ * Pull the extent records of a btree-format fork into memory.
+ *
+ * The in-core extent list is sized from the fork's extent count and
+ * then filled in by xfs_bmap_read_extents().  The caller must hold the
+ * inode lock exclusively.
+ *
+ * Returns 0 on success, -EFSCORRUPTED if the fork is not in btree
+ * format, or the error from xfs_bmap_read_extents(); in the last case
+ * the extent list is torn down again and XFS_IFEXTENTS is cleared.
+ */
+int
+xfs_iread_extents(
+        xfs_trans_t     *tp,
+        xfs_inode_t     *ip,
+        int             whichfork)
+{
+        xfs_ifork_t     *fork;
+        xfs_extnum_t    count;
+        int             err;
+        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+        if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+                XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
+                                 ip->i_mount);
+                return -EFSCORRUPTED;
+        }
+        count = XFS_IFORK_NEXTENTS(ip, whichfork);
+        fork = XFS_IFORK_PTR(ip, whichfork);
+        /*
+         * Start from an empty list; the extent count was already
+         * sanity checked when the btree root was read in.
+         */
+        fork->if_real_bytes = 0;
+        fork->if_bytes = 0;
+        fork->if_flags |= XFS_IFEXTENTS;
+        xfs_iext_add(fork, 0, count);
+        err = xfs_bmap_read_extents(tp, ip, whichfork);
+        if (!err) {
+                xfs_validate_extents(fork, count, XFS_EXTFMT_INODE(ip));
+                return 0;
+        }
+        /* Reading failed: undo the allocation made above. */
+        xfs_iext_destroy(fork);
+        fork->if_flags &= ~XFS_IFEXTENTS;
+        return err;
+}
+/*
+ * Reallocate the space for if_broot based on the number of records
+ * being added or deleted as indicated in rec_diff.  Move the records
+ * and pointers in if_broot to fit the new size.  When shrinking this
+ * will eliminate holes between the records and pointers created by
+ * the caller.  When growing this will create holes to be filled in
+ * by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root.  If the if_broot is currently NULL, then
+ * if we are adding records, one will be allocated.  The caller must also
+ * not request that the number of records go below zero, although
+ * it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * rec_diff -- the change in the number of records, positive or negative,
+ *       requested for the if_broot array.
+ * whichfork -- selects the fork (via XFS_IFORK_PTR) whose root is resized
+ */
+void
+xfs_iroot_realloc(
+        xfs_inode_t             *ip,
+        int                     rec_diff,
+        int                     whichfork)
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        int                     cur_max;
+        xfs_ifork_t             *ifp;
+        struct xfs_btree_block  *new_broot;
+        int                     new_max;
+        size_t                  new_size;
+        char                    *np;
+        char                    *op;
+        /*
+         * Handle the degenerate case quietly.
+         */
+        if (rec_diff == 0) {
+                return;
+        }
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        if (rec_diff > 0) {
+                /*
+                 * If there wasn't any memory allocated before, just
+                 * allocate it now and get out.
+                 */
+                if (ifp->if_broot_bytes == 0) {
+                        new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
+                        ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+                        ifp->if_broot_bytes = (int)new_size;
+                        return;
+                }
+                /*
+                 * If there is already an existing if_broot, then we need
+                 * to realloc() it and shift the pointers to their new
+                 * location.  The records don't change location because
+                 * they are kept butted up against the btree block header.
+                 */
+                cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+                new_max = cur_max + rec_diff;
+                new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+                ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
+                                XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
+                                KM_SLEEP | KM_NOFS);
+                /*
+                 * op is where the first pointer sits for the old buffer
+                 * size, np where it belongs for the new size.  Both are
+                 * computed before if_broot_bytes is updated.
+                 */
+                op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                     ifp->if_broot_bytes);
+                np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                     (int)new_size);
+                ifp->if_broot_bytes = (int)new_size;
+                ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+                        XFS_IFORK_SIZE(ip, whichfork));
+                /* Old and new pointer areas share one buffer: memmove. */
+                memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
+                return;
+        }
+        /*
+         * rec_diff is less than 0.  In this case, we are shrinking the
+         * if_broot buffer.  It must already exist.  If we go to zero
+         * records, just get rid of the root and clear the status bit.
+         */
+        ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
+        cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+        new_max = cur_max + rec_diff;
+        ASSERT(new_max >= 0);
+        if (new_max > 0)
+                new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+        else
+                new_size = 0;
+        if (new_size > 0) {
+                new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+                /*
+                 * First copy over the btree block header.
+                 */
+                memcpy(new_broot, ifp->if_broot,
+                        XFS_BMBT_BLOCK_LEN(ip->i_mount));
+        } else {
+                new_broot = NULL;
+                ifp->if_flags &= ~XFS_IFBROOT;
+        }
+        /*
+         * Only copy the records and pointers if there are any.
+         */
+        if (new_max > 0) {
+                /*
+                 * First copy the records.
+                 */
+                op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
+                np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
+                memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+                /*
+                 * Then copy the pointers.
+                 */
+                op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                     ifp->if_broot_bytes);
+                np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
+                                                     (int)new_size);
+                memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
+        }
+        /* The old root is no longer needed; install the new one (or NULL). */
+        kmem_free(ifp->if_broot);
+        ifp->if_broot = new_broot;
+        ifp->if_broot_bytes = (int)new_size;
+        if (ifp->if_broot)
+                ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+                        XFS_IFORK_SIZE(ip, whichfork));
+        return;
+}
+/*
+ * Resize the if_data area of a local-format fork by byte_diff bytes
+ * (positive to grow, negative to shrink).
+ *
+ * Data that fits in the inline buffer is kept there; anything larger
+ * lives in a kmem buffer whose size is rounded up to a multiple of 4.
+ * Existing contents are carried over when the storage location
+ * changes, and a resulting size of zero leaves if_data NULL.
+ *
+ * ip -- the inode whose if_data area is changing
+ * byte_diff -- the change in the number of bytes, positive or negative,
+ *       requested for the if_data array.
+ * whichfork -- selects the fork (via XFS_IFORK_PTR) being resized
+ */
+void
+xfs_idata_realloc(
+        xfs_inode_t     *ip,
+        int             byte_diff,
+        int             whichfork)
+{
+        xfs_ifork_t     *ifp;
+        int             new_size;
+        int             real_size = 0;
+        int             on_heap;
+        if (byte_diff == 0)
+                return;
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        new_size = (int)ifp->if_bytes + byte_diff;
+        ASSERT(new_size >= 0);
+        /* True when if_data currently points at a separately allocated buffer. */
+        on_heap = (ifp->if_u1.if_data != NULL &&
+                   ifp->if_u1.if_data != ifp->if_u2.if_inline_data);
+        if (new_size == 0) {
+                /* Nothing left: drop any non-inline buffer. */
+                if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data)
+                        kmem_free(ifp->if_u1.if_data);
+                ifp->if_u1.if_data = NULL;
+        } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
+                /*
+                 * The data fits in the inline buffer.  If it currently
+                 * sits in an allocated buffer, bring it home and free
+                 * that buffer.
+                 */
+                if (on_heap) {
+                        ASSERT(ifp->if_real_bytes != 0);
+                        memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
+                              new_size);
+                        kmem_free(ifp->if_u1.if_data);
+                }
+                ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+        } else {
+                /*
+                 * Too large for the inline buffer.  The allocated
+                 * buffer must be a multiple of 4 bytes in size so that
+                 * it can be logged and stay on word boundaries.
+                 */
+                real_size = roundup(new_size, 4);
+                if (on_heap) {
+                        /* Only reallocate if the rounded size changes. */
+                        if (ifp->if_real_bytes != real_size) {
+                                ifp->if_u1.if_data =
+                                        kmem_realloc(ifp->if_u1.if_data,
+                                                        real_size,
+                                                        ifp->if_real_bytes,
+                                                        KM_SLEEP | KM_NOFS);
+                        }
+                } else {
+                        int     was_inline = (ifp->if_u1.if_data != NULL);
+                        ASSERT(ifp->if_real_bytes == 0);
+                        ifp->if_u1.if_data = kmem_alloc(real_size,
+                                                        KM_SLEEP | KM_NOFS);
+                        /* Moving out of the inline buffer: keep the data. */
+                        if (was_inline)
+                                memcpy(ifp->if_u1.if_data,
+                                        ifp->if_u2.if_inline_data,
+                                        ifp->if_bytes);
+                }
+        }
+        ifp->if_real_bytes = real_size;
+        ifp->if_bytes = new_size;
+        ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+}
+/*
+ * Release the memory hanging off an in-core inode fork: the btree
+ * root, any allocated inline-data buffer or extent list, and, for the
+ * attribute fork, the fork structure itself (ip->i_afp is set to NULL).
+ */
+void
+xfs_idestroy_fork(
+        xfs_inode_t     *ip,
+        int             whichfork)
+{
+        xfs_ifork_t     *ifp = XFS_IFORK_PTR(ip, whichfork);
+        if (ifp->if_broot) {
+                kmem_free(ifp->if_broot);
+                ifp->if_broot = NULL;
+        }
+        /*
+         * A local-format fork cannot have an extents array, so only an
+         * allocated data buffer needs freeing there.  Any other format
+         * may or may not have an extent list; free it if it has one
+         * that is not simply the inline extent buffer.
+         */
+        if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+                if (ifp->if_u1.if_data != NULL &&
+                    ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+                        ASSERT(ifp->if_real_bytes != 0);
+                        kmem_free(ifp->if_u1.if_data);
+                        ifp->if_u1.if_data = NULL;
+                        ifp->if_real_bytes = 0;
+                }
+        } else if (ifp->if_flags & XFS_IFEXTENTS) {
+                if ((ifp->if_flags & XFS_IFEXTIREC) ||
+                    (ifp->if_u1.if_extents != NULL &&
+                     ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)) {
+                        ASSERT(ifp->if_real_bytes != 0);
+                        xfs_iext_destroy(ifp);
+                }
+        }
+        ASSERT(ifp->if_u1.if_extents == NULL ||
+               ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
+        ASSERT(ifp->if_real_bytes == 0);
+        if (whichfork != XFS_ATTR_FORK)
+                return;
+        /* The attribute fork structure is itself zone-allocated. */
+        kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+        ip->i_afp = NULL;
+}
+/*
+ * Convert in-core extents to on-disk form
+ *
+ * Walks the in-core extent list of the given fork and writes every
+ * real extent to dp in big-endian disk order.  Delayed allocation
+ * extents (those with a null start block) exist only in memory and are
+ * skipped, so the in-core and on-disk sizes can differ; callers must
+ * size the destination buffer from the physical fork size.
+ *
+ * Returns the number of bytes actually written to dp.
+ */
+int
+xfs_iextents_copy(
+        xfs_inode_t             *ip,
+        xfs_bmbt_rec_t          *dp,
+        int                     whichfork)
+{
+        xfs_ifork_t             *ifp = XFS_IFORK_PTR(ip, whichfork);
+        xfs_fsblock_t           start_block;
+        int                     total;
+        int                     written = 0;
+        int                     n;
+        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+        ASSERT(ifp->if_bytes > 0);
+        total = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+        XFS_BMAP_TRACE_EXLIST(ip, total, whichfork);
+        ASSERT(total > 0);
+        /*
+         * Copy one record at a time so that delayed allocation
+         * extents can be left out.  At least one real extent is
+         * expected.
+         */
+        for (n = 0; n < total; n++) {
+                xfs_bmbt_rec_host_t *rec = xfs_iext_get_ext(ifp, n);
+                start_block = xfs_bmbt_get_startblock(rec);
+                if (!isnullstartblock(start_block)) {
+                        /* Translate to on disk format */
+                        put_unaligned_be64(rec->l0, &dp->l0);
+                        put_unaligned_be64(rec->l1, &dp->l1);
+                        dp++;
+                        written++;
+                }
+        }
+        ASSERT(written != 0);
+        xfs_validate_extents(ifp, written, XFS_EXTFMT_INODE(ip));
+        return (written * (uint)sizeof(xfs_bmbt_rec_t));
+}
+/*
+ * Copy the logged parts of an in-core fork into the on-disk inode.
+ *
+ * Every format stores its data in the same region of the on-disk
+ * inode, so only one of the cases below can apply at a time.  The log
+ * flags may disagree with the format (e.g. XFS_ILOG_?DATA set on a
+ * fork that is now in EXTENTS format) when the fork changed format
+ * after being modified but before being flushed; the format wins,
+ * because it describes the current state of the fork.
+ *
+ * Nothing is done when there is no log item, or when the fork does not
+ * exist.
+ */
+void
+xfs_iflush_fork(
+        xfs_inode_t             *ip,
+        xfs_dinode_t            *dip,
+        xfs_inode_log_item_t    *iip,
+        int                     whichfork)
+{
+        static const short      brootflag[2] =
+                { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
+        static const short      dataflag[2] =
+                { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
+        static const short      extflag[2] =
+                { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
+        xfs_mount_t             *mp;
+        xfs_ifork_t             *ifp;
+        char                    *cp;
+        if (!iip)
+                return;
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        if (!ifp) {
+                /*
+                 * This can happen if we gave up in iformat in an
+                 * error path, for the attribute fork.
+                 */
+                ASSERT(whichfork == XFS_ATTR_FORK);
+                return;
+        }
+        cp = XFS_DFORK_PTR(dip, whichfork);
+        mp = ip->i_mount;
+        switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+        case XFS_DINODE_FMT_LOCAL:
+                if (!(iip->ili_fields & dataflag[whichfork]) ||
+                    !(ifp->if_bytes > 0))
+                        break;
+                ASSERT(ifp->if_u1.if_data != NULL);
+                ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+                memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+                break;
+        case XFS_DINODE_FMT_EXTENTS:
+                ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
+                       !(iip->ili_fields & extflag[whichfork]));
+                if (!(iip->ili_fields & extflag[whichfork]) ||
+                    !(ifp->if_bytes > 0))
+                        break;
+                ASSERT(xfs_iext_get_ext(ifp, 0));
+                ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
+                (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
+                        whichfork);
+                break;
+        case XFS_DINODE_FMT_BTREE:
+                if (!(iip->ili_fields & brootflag[whichfork]) ||
+                    !(ifp->if_broot_bytes > 0))
+                        break;
+                ASSERT(ifp->if_broot != NULL);
+                ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+                        XFS_IFORK_SIZE(ip, whichfork));
+                xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
+                        (xfs_bmdr_block_t *)cp,
+                        XFS_DFORK_SIZE(dip, mp, whichfork));
+                break;
+        case XFS_DINODE_FMT_DEV:
+                if (!(iip->ili_fields & XFS_ILOG_DEV))
+                        break;
+                ASSERT(whichfork == XFS_DATA_FORK);
+                xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
+                break;
+        case XFS_DINODE_FMT_UUID:
+                if (!(iip->ili_fields & XFS_ILOG_UUID))
+                        break;
+                ASSERT(whichfork == XFS_DATA_FORK);
+                memcpy(XFS_DFORK_DPTR(dip),
+                       &ip->i_df.if_u2.if_uuid,
+                       sizeof(uuid_t));
+                break;
+        default:
+                ASSERT(0);
+                break;
+        }
+}
+/*
+ * Look up the in-core extent record at file index idx.
+ *
+ * Handles both the indirection-array layout (XFS_IFEXTIREC) and the
+ * flat inline/direct list.  Returns NULL when the fork uses the flat
+ * layout and holds no extent bytes.
+ */
+xfs_bmbt_rec_host_t *
+xfs_iext_get_ext(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        xfs_extnum_t    idx)            /* index of target extent */
+{
+        ASSERT(idx >= 0);
+        ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+        if (ifp->if_flags & XFS_IFEXTIREC) {
+                xfs_ext_irec_t  *irec;                  /* irec pointer */
+                int             irec_idx = 0;           /* irec index */
+                xfs_extnum_t    list_idx = idx;         /* index in list */
+                /* Index 0 is always the start of the first extent list. */
+                if (idx == 0)
+                        return ifp->if_u1.if_ext_irec->er_extbuf;
+                irec = xfs_iext_idx_to_irec(ifp, &list_idx, &irec_idx, 0);
+                return &irec->er_extbuf[list_idx];
+        }
+        if (!ifp->if_bytes)
+                return NULL;
+        return &ifp->if_u1.if_extents[idx];
+}
+/*
+ * Insert 'count' new extent records, taken from the array 'new', into
+ * an incore inode fork starting at index 'idx'.  The fork is the
+ * attribute fork when BMAP_ATTRFORK is set in 'state', otherwise the
+ * data fork.  Room is made with xfs_iext_add() before the records are
+ * written.
+ */
+void
+xfs_iext_insert(
+        xfs_inode_t     *ip,            /* incore inode pointer */
+        xfs_extnum_t    idx,            /* starting index of new items */
+        xfs_extnum_t    count,          /* number of inserted items */
+        xfs_bmbt_irec_t *new,           /* items to insert */
+        int             state)          /* type of extent conversion */
+{
+        xfs_ifork_t     *ifp;           /* fork receiving the records */
+        xfs_extnum_t    n;              /* offset of record being set */
+        if (state & BMAP_ATTRFORK)
+                ifp = ip->i_afp;
+        else
+                ifp = &ip->i_df;
+        trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
+        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+        xfs_iext_add(ifp, idx, count);
+        for (n = 0; n < count; n++)
+                xfs_bmbt_set_all(xfs_iext_get_ext(ifp, idx + n), &new[n]);
+}
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be increased. The ext_diff parameter stores the
+ * number of new extents being added and the idx parameter contains
+ * the extent index where the new extents will be added. If the new
+ * extents are being appended, then we just need to (re)allocate and
+ * initialize the space. Otherwise, if the new extents are being
+ * inserted into the middle of the existing entries, a bit more work
+ * is required to make room for the new extents to be inserted. The
+ * caller is responsible for filling in the new extent entries upon
+ * return.
+ *
+ * Storage is chosen by the resulting extent count: the inline buffer
+ * up to XFS_INLINE_EXTS, a single direct list up to XFS_LINEAR_EXTS,
+ * and an indirection array of extent lists beyond that.  On return
+ * if_bytes reflects the new total size.
+ */
+void
+xfs_iext_add(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        xfs_extnum_t    idx,            /* index to begin adding exts */
+        int             ext_diff)       /* number of extents to add */
+{
+        int             byte_diff;      /* new bytes being added */
+        int             new_size;       /* size of extents after adding */
+        xfs_extnum_t    nextents;       /* number of extents in file */
+        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+        ASSERT((idx >= 0) && (idx <= nextents));
+        byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
+        new_size = ifp->if_bytes + byte_diff;
+        /*
+         * If the new number of extents (nextents + ext_diff)
+         * fits inside the inode, then continue to use the inline
+         * extent buffer.
+         */
+        if (nextents + ext_diff <= XFS_INLINE_EXTS) {
+                if (idx < nextents) {
+                        /* Inserting mid-list: open a zeroed gap at idx. */
+                        memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
+                                &ifp->if_u2.if_inline_ext[idx],
+                                (nextents - idx) * sizeof(xfs_bmbt_rec_t));
+                        memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
+                }
+                ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+                ifp->if_real_bytes = 0;
+        }
+        /*
+         * Otherwise use a linear (direct) extent list.
+         * If the extents are currently inside the inode,
+         * xfs_iext_realloc_direct will switch us from
+         * inline to direct extent allocation mode.
+         */
+        else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
+                xfs_iext_realloc_direct(ifp, new_size);
+                if (idx < nextents) {
+                        /* Inserting mid-list: open a zeroed gap at idx. */
+                        memmove(&ifp->if_u1.if_extents[idx + ext_diff],
+                                &ifp->if_u1.if_extents[idx],
+                                (nextents - idx) * sizeof(xfs_bmbt_rec_t));
+                        memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
+                }
+        }
+        /* Indirection array */
+        else {
+                xfs_ext_irec_t  *erp;
+                int             erp_idx = 0;
+                int             page_idx = idx;
+                ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
+                if (ifp->if_flags & XFS_IFEXTIREC) {
+                        erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
+                } else {
+                        /* First use of the indirection array for this fork. */
+                        xfs_iext_irec_init(ifp);
+                        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+                        erp = ifp->if_u1.if_ext_irec;
+                }
+                /* Extents fit in target extent page */
+                if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
+                        if (page_idx < erp->er_extcount) {
+                                memmove(&erp->er_extbuf[page_idx + ext_diff],
+                                        &erp->er_extbuf[page_idx],
+                                        (erp->er_extcount - page_idx) *
+                                        sizeof(xfs_bmbt_rec_t));
+                                memset(&erp->er_extbuf[page_idx], 0, byte_diff);
+                        }
+                        erp->er_extcount += ext_diff;
+                        xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+                }
+                /* Insert a new extent page */
+                else if (erp) {
+                        xfs_iext_add_indirect_multi(ifp,
+                                erp_idx, page_idx, ext_diff);
+                }
+                /*
+                 * If extent(s) are being appended to the last page in
+                 * the indirection array and the new extent(s) don't fit
+                 * in the page, then erp is NULL and erp_idx is set to
+                 * the next index needed in the indirection array.
+                 */
+                else {
+                        uint    count = ext_diff;
+                        /* Add pages of at most XFS_LINEAR_EXTS until done. */
+                        while (count) {
+                                erp = xfs_iext_irec_new(ifp, erp_idx);
+                                erp->er_extcount = min(count, XFS_LINEAR_EXTS);
+                                count -= erp->er_extcount;
+                                if (count)
+                                        erp_idx++;
+                        }
+                }
+        }
+        ifp->if_bytes = new_size;
+}
+/*
+ * This is called when incore extents are being added to the indirection
+ * array and the new extents do not fit in the target extent list. The
+ * erp_idx parameter contains the irec index for the target extent list
+ * in the indirection array, and the idx parameter contains the extent
+ * index within the list. The number of extents being added is stored
+ * in the count parameter.
+ *
+ *    |-------|   |-------|
+ *    |       |   |       |    idx - number of extents before idx
+ *    |  idx  |   | count |
+ *    |       |   |       |    count - number of extents being inserted at idx
+ *    |-------|   |-------|
+ *    | count |   | nex2  |    nex2 - number of extents after idx + count
+ *    |-------|   |-------|
+ *
+ * The work is done in three steps:
+ *
+ *  1. The records from idx to the end of the target page (nex2 of them)
+ *     are copied to a temporary buffer and removed from the page.
+ *  2. Room for count records is reserved by growing er_extcount of the
+ *     target page up to XFS_LINEAR_EXTS and then inserting new pages
+ *     directly after it until all count records are accounted for.
+ *  3. The saved nex2 records are put back: at the end of the last page
+ *     touched in step 2 if they fit, else at the front of the following
+ *     page if one is known and has room, else in a newly inserted page.
+ *
+ * Only the space for the new records is set up here; the new records
+ * themselves are not written, and ifp->if_bytes is not updated by this
+ * function. The er_extoff values of the following pages are adjusted
+ * through xfs_iext_irec_update_extoffs() after every count change.
+ */
+void
+xfs_iext_add_indirect_multi(
+        xfs_ifork_t     *ifp,                   /* inode fork pointer */
+        int             erp_idx,                /* target extent irec index */
+        xfs_extnum_t    idx,                    /* index within target list */
+        int             count)                  /* new extents being added */
+{
+        int             byte_diff;              /* new bytes being added */
+        xfs_ext_irec_t  *erp;                   /* pointer to irec entry */
+        xfs_extnum_t    ext_diff;               /* number of extents to add */
+        xfs_extnum_t    ext_cnt;                /* new extents still needed */
+        xfs_extnum_t    nex2;                   /* extents after idx + count */
+        xfs_bmbt_rec_t  *nex2_ep = NULL;        /* temp list for nex2 extents */
+        int             nlists;                 /* number of irec's (lists) */
+        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+        erp = &ifp->if_u1.if_ext_irec[erp_idx];
+        nex2 = erp->er_extcount - idx;
+        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+        /*
+         * Save second part of target extent list
+         * (all extents at and past idx) in a temporary
+         * buffer and remove them from the target page.
+         */
+        if (nex2) {
+                byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+                nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
+                memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
+                erp->er_extcount -= nex2;
+                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
+                memset(&erp->er_extbuf[idx], 0, byte_diff);
+        }
+        /*
+         * Add the new extents to the end of the target
+         * list, then allocate new irec record(s) and
+         * extent buffer(s) as needed to store the rest
+         * of the new extents.
+         */
+        ext_cnt = count;
+        ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
+        if (ext_diff) {
+                erp->er_extcount += ext_diff;
+                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+                ext_cnt -= ext_diff;
+        }
+        while (ext_cnt) {
+                erp_idx++;
+                erp = xfs_iext_irec_new(ifp, erp_idx);
+                ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
+                erp->er_extcount = ext_diff;
+                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+                ext_cnt -= ext_diff;
+        }
+        /*
+         * Add nex2 extents back to indirection array.
+         * At this point erp/erp_idx refer to the last page
+         * that received new extents, and i is the slot in the
+         * destination page at which the saved extents are copied.
+         */
+        if (nex2) {
+                xfs_extnum_t    ext_avail;
+                int             i;
+                byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+                ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
+                i = 0;
+                /*
+                 * If nex2 extents fit in the current page, append
+                 * nex2_ep after the new extents.
+                 */
+                if (nex2 <= ext_avail) {
+                        i = erp->er_extcount;
+                }
+                /*
+                 * Otherwise, check if space is available in the
+                 * next page.
+                 *
+                 * NOTE(review): nlists was sampled on entry, before
+                 * any pages were inserted by the loop above, so this
+                 * test compares against the old page count. Confirm
+                 * that this is intended.
+                 */
+                else if ((erp_idx < nlists - 1) &&
+                         (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
+                          ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
+                        erp_idx++;
+                        erp++;
+                        /* Create a hole for nex2 extents */
+                        memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
+                                erp->er_extcount * sizeof(xfs_bmbt_rec_t));
+                }
+                /*
+                 * Final choice, create a new extent page for
+                 * nex2 extents.
+                 */
+                else {
+                        erp_idx++;
+                        erp = xfs_iext_irec_new(ifp, erp_idx);
+                }
+                memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
+                kmem_free(nex2_ep);
+                erp->er_extcount += nex2;
+                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
+        }
+}
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be decreased. The ext_diff parameter stores the
+ * number of extents to be removed and the idx parameter contains
+ * the extent index where the extents will be removed from.
+ *
+ * If the amount of space needed has decreased below the linear
+ * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
+ * extent array.  Otherwise, use kmem_realloc() to adjust the
+ * size to what is needed.
+ *
+ * The fork operated on is the attribute fork when BMAP_ATTRFORK is
+ * set in state, otherwise the data fork. The actual removal is
+ * delegated according to how the extents are currently stored:
+ *
+ *   - nothing left after removal:       xfs_iext_destroy()
+ *   - indirection array (XFS_IFEXTIREC): xfs_iext_remove_indirect()
+ *   - direct list (if_real_bytes != 0):  xfs_iext_remove_direct()
+ *   - inline buffer:                     xfs_iext_remove_inline()
+ *
+ * On return ifp->if_bytes holds the size in bytes of the remaining
+ * extent records.
+ */
+void
+xfs_iext_remove(
+        xfs_inode_t     *ip,            /* incore inode pointer */
+        xfs_extnum_t    idx,            /* index to begin removing exts */
+        int             ext_diff,       /* number of extents to remove */
+        int             state)          /* type of extent conversion */
+{
+        xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+        xfs_extnum_t    nextents;       /* number of extents in file */
+        int             new_size;       /* size of extents after removal */
+        trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
+        ASSERT(ext_diff > 0);
+        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+        new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
+        if (new_size == 0) {
+                xfs_iext_destroy(ifp);
+        } else if (ifp->if_flags & XFS_IFEXTIREC) {
+                xfs_iext_remove_indirect(ifp, idx, ext_diff);
+        } else if (ifp->if_real_bytes) {
+                xfs_iext_remove_direct(ifp, idx, ext_diff);
+        } else {
+                xfs_iext_remove_inline(ifp, idx, ext_diff);
+        }
+        ifp->if_bytes = new_size;
+}
+/*
+ * This removes ext_diff extents from the inline buffer, beginning
+ * at extent index idx.
+ *
+ * If records follow the removed range they are moved down to idx and
+ * the ext_diff slots vacated at the end of the list are zeroed;
+ * otherwise ext_diff slots starting at idx are zeroed. ifp->if_bytes
+ * is read to obtain the current extent count but is not updated here.
+ */
+void
+xfs_iext_remove_inline(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        xfs_extnum_t    idx,            /* index to begin removing exts */
+        int             ext_diff)       /* number of extents to remove */
+{
+        int             nextents;       /* number of extents in file */
+        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+        ASSERT(idx < XFS_INLINE_EXTS);
+        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+        ASSERT(((nextents - ext_diff) > 0) &&
+                (nextents - ext_diff) < XFS_INLINE_EXTS);
+        if (idx + ext_diff < nextents) {
+                memmove(&ifp->if_u2.if_inline_ext[idx],
+                        &ifp->if_u2.if_inline_ext[idx + ext_diff],
+                        (nextents - (idx + ext_diff)) *
+                         sizeof(xfs_bmbt_rec_t));
+                memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
+                        0, ext_diff * sizeof(xfs_bmbt_rec_t));
+        } else {
+                memset(&ifp->if_u2.if_inline_ext[idx], 0,
+                        ext_diff * sizeof(xfs_bmbt_rec_t));
+        }
+}
+/*
+ * This removes ext_diff extents from a linear (direct) extent list,
+ * beginning at extent index idx. If the extents are being removed
+ * from the end of the list (ie. truncate) then we just need to re-
+ * allocate the list to remove the extra space. Otherwise, if the
+ * extents are being removed from the middle of the existing extent
+ * entries, then we first need to move the extent records beginning
+ * at idx + ext_diff up in the list to overwrite the records being
+ * removed, then remove the extra space via kmem_realloc.
+ *
+ * If no extents remain the list is released with xfs_iext_destroy().
+ * Otherwise the last ext_diff slots of the old list are zeroed before
+ * the list is resized, and ifp->if_bytes is set to the new size.
+ */
+void
+xfs_iext_remove_direct(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        xfs_extnum_t    idx,            /* index to begin removing exts */
+        int             ext_diff)       /* number of extents to remove */
+{
+        xfs_extnum_t    nextents;       /* number of extents in file */
+        int             new_size;       /* size of extents after removal */
+        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+        new_size = ifp->if_bytes -
+                (ext_diff * sizeof(xfs_bmbt_rec_t));
+        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+        if (new_size == 0) {
+                xfs_iext_destroy(ifp);
+                return;
+        }
+        /* Move extents up in the list (if needed) */
+        if (idx + ext_diff < nextents) {
+                memmove(&ifp->if_u1.if_extents[idx],
+                        &ifp->if_u1.if_extents[idx + ext_diff],
+                        (nextents - (idx + ext_diff)) *
+                         sizeof(xfs_bmbt_rec_t));
+        }
+        memset(&ifp->if_u1.if_extents[nextents - ext_diff],
+                0, ext_diff * sizeof(xfs_bmbt_rec_t));
+        /*
+         * Reallocate the direct extent list. If the extents
+         * will fit inside the inode then xfs_iext_realloc_direct
+         * will switch from direct to inline extent allocation
+         * mode for us.
+         */
+        xfs_iext_realloc_direct(ifp, new_size);
+        ifp->if_bytes = new_size;
+}
+/*
+ * This is called when incore extents are being removed from the
+ * indirection array and the extents being removed span multiple extent
+ * buffers. The idx parameter contains the file extent index where we
+ * want to begin removing extents, and the count parameter contains
+ * how many extents need to be removed.
+ *
+ *    |-------|   |-------|
+ *    | nex1  |   |       |    nex1 - number of extents before idx
+ *    |-------|   | count |
+ *    |       |   |       |    count - number of extents being removed at idx
+ *    | count |   |-------|
+ *    |       |   | nex2  |    nex2 - number of extents after idx + count
+ *    |-------|   |-------|
+ *
+ * The removal walks the pages starting at the one containing idx.
+ * For each page, ext_diff is the number of records removed from that
+ * page and nex2 the number of records kept after the removed range.
+ * A page that loses all of its records is dropped from the indirection
+ * array; otherwise the kept tail is moved up and the rest of the page
+ * is zeroed. Only the first page can have records kept in front of
+ * the removed range, so nex1 is zero for every later page.
+ *
+ * On completion ifp->if_bytes is reduced by the size of the removed
+ * records and xfs_iext_irec_compact() is called on the fork.
+ */
+void
+xfs_iext_remove_indirect(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        xfs_extnum_t    idx,            /* index to begin removing extents */
+        int             count)          /* number of extents to remove */
+{
+        xfs_ext_irec_t  *erp;           /* indirection array pointer */
+        int             erp_idx = 0;    /* indirection array index */
+        xfs_extnum_t    ext_cnt;        /* extents left to remove */
+        xfs_extnum_t    ext_diff;       /* extents to remove in current list */
+        xfs_extnum_t    nex1;           /* number of extents before idx */
+        xfs_extnum_t    nex2;           /* extents after idx + count */
+        int             page_idx = idx; /* index in target extent list */
+        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+        erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
+        ASSERT(erp != NULL);
+        nex1 = page_idx;
+        ext_cnt = count;
+        while (ext_cnt) {
+                nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
+                ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
+                /*
+                 * Check for deletion of entire list;
+                 * xfs_iext_irec_remove() updates extent offsets.
+                 * It also shifts the following irecs down by one and
+                 * may reallocate the indirection array, so erp_idx
+                 * already names the next page and erp has to be
+                 * reloaded from the array before it is used again.
+                 */
+                if (ext_diff == erp->er_extcount) {
+                        xfs_iext_irec_remove(ifp, erp_idx);
+                        ext_cnt -= ext_diff;
+                        nex1 = 0;
+                        if (ext_cnt) {
+                                ASSERT(erp_idx < ifp->if_real_bytes /
+                                        XFS_IEXT_BUFSZ);
+                                erp = &ifp->if_u1.if_ext_irec[erp_idx];
+                                nex1 = 0;
+                                continue;
+                        } else {
+                                break;
+                        }
+                }
+                /* Move extents up (if needed) */
+                if (nex2) {
+                        memmove(&erp->er_extbuf[nex1],
+                                &erp->er_extbuf[nex1 + ext_diff],
+                                nex2 * sizeof(xfs_bmbt_rec_t));
+                }
+                /* Zero out rest of page */
+                memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
+                        ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
+                /* Update remaining counters */
+                erp->er_extcount -= ext_diff;
+                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
+                ext_cnt -= ext_diff;
+                nex1 = 0;
+                erp_idx++;
+                erp++;
+        }
+        ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
+        xfs_iext_irec_compact(ifp);
+}
+/*
+ * Create, destroy, or resize a linear (direct) block of extents.
+ *
+ * new_size is the number of bytes of extent records the fork must be
+ * able to hold. Three cases are handled:
+ *
+ *   - new_size == 0: all extent storage is released via
+ *     xfs_iext_destroy().
+ *   - a direct list already exists (if_real_bytes != 0): the records
+ *     are moved back into the inline buffer if they fit there,
+ *     otherwise the list is reallocated to new_size rounded up to a
+ *     power of two.
+ *   - no direct list exists yet: the inline buffer is converted to a
+ *     direct list of new_size rounded up to a power of two.
+ *
+ * On return if_bytes is new_size and if_real_bytes is the allocated
+ * size (0 when the inline buffer is in use).
+ */
+void
+xfs_iext_realloc_direct(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        int             new_size)       /* new size of extents after adding */
+{
+        int             rnew_size;      /* real new size of extents */
+        rnew_size = new_size;
+        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
+                ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
+                 (new_size != ifp->if_real_bytes)));
+        /* Free extent records */
+        if (new_size == 0) {
+                xfs_iext_destroy(ifp);
+        }
+        /*
+         * Resize direct extent list and zero any new bytes.
+         *
+         * NOTE(review): when the list grows, the memset below starts
+         * at the record index derived from the old if_bytes and spans
+         * (rnew_size - if_real_bytes) bytes. Confirm that this covers
+         * every newly allocated byte when if_bytes < if_real_bytes.
+         */
+        else if (ifp->if_real_bytes) {
+                /*
+                 * Check if extents will fit inside the inode.
+                 * xfs_iext_direct_to_inline() resets if_real_bytes
+                 * itself, hence the early return.
+                 */
+                if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
+                        xfs_iext_direct_to_inline(ifp, new_size /
+                                (uint)sizeof(xfs_bmbt_rec_t));
+                        ifp->if_bytes = new_size;
+                        return;
+                }
+                if (!is_power_of_2(new_size)){
+                        rnew_size = roundup_pow_of_two(new_size);
+                }
+                if (rnew_size != ifp->if_real_bytes) {
+                        ifp->if_u1.if_extents =
+                                kmem_realloc(ifp->if_u1.if_extents,
+                                                rnew_size,
+                                                ifp->if_real_bytes, KM_NOFS);
+                }
+                if (rnew_size > ifp->if_real_bytes) {
+                        memset(&ifp->if_u1.if_extents[ifp->if_bytes /
+                                (uint)sizeof(xfs_bmbt_rec_t)], 0,
+                                rnew_size - ifp->if_real_bytes);
+                }
+        }
+        /* Switch from the inline extent buffer to a direct extent list */
+        else {
+                if (!is_power_of_2(new_size)) {
+                        rnew_size = roundup_pow_of_two(new_size);
+                }
+                xfs_iext_inline_to_direct(ifp, rnew_size);
+        }
+        ifp->if_real_bytes = rnew_size;
+        ifp->if_bytes = new_size;
+}
+/*
+ * Switch from linear (direct) extent records to inline buffer.
+ *
+ * The first nextents records of the direct list are copied into the
+ * inline buffer, the direct list is freed, if_u1.if_extents is pointed
+ * at the inline buffer and if_real_bytes is reset to 0. if_bytes is
+ * not modified here.
+ */
+void
+xfs_iext_direct_to_inline(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        xfs_extnum_t    nextents)       /* number of extents in file */
+{
+        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+        ASSERT(nextents <= XFS_INLINE_EXTS);
+        /*
+         * The inline buffer was zeroed when we switched
+         * from inline to direct extent allocation mode,
+         * so we don't need to clear it here.
+         */
+        memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
+                nextents * sizeof(xfs_bmbt_rec_t));
+        kmem_free(ifp->if_u1.if_extents);
+        ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+        ifp->if_real_bytes = 0;
+}
+/*
+ * Switch from inline buffer to linear (direct) extent records.
+ * new_size should already be rounded up to the next power of 2
+ * by the caller (when appropriate), so use new_size as it is.
+ * However, since new_size may be rounded up, we can't update
+ * if_bytes here. It is the caller's responsibility to update
+ * if_bytes upon return.
+ *
+ * A zeroed buffer of new_size bytes is allocated and becomes
+ * if_u1.if_extents. If the fork currently holds any records
+ * (if_bytes != 0) they are copied over from the inline buffer, which
+ * is then cleared. if_real_bytes is set to new_size.
+ */
+void
+xfs_iext_inline_to_direct(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        int             new_size)       /* size in bytes of the direct list */
+{
+        ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
+        memset(ifp->if_u1.if_extents, 0, new_size);
+        if (ifp->if_bytes) {
+                memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
+                        ifp->if_bytes);
+                memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+                        sizeof(xfs_bmbt_rec_t));
+        }
+        ifp->if_real_bytes = new_size;
+}
+/*
+ * Resize an extent indirection array to new_size bytes.
+ *
+ * The current size of the array is derived from if_real_bytes (one
+ * xfs_ext_irec_t per XFS_IEXT_BUFSZ), so callers that change the
+ * number of pages must update if_real_bytes only after this call.
+ * A new_size of 0 releases all extent storage via xfs_iext_destroy().
+ * Neither if_real_bytes nor if_bytes is changed by the resize itself.
+ */
+STATIC void
+xfs_iext_realloc_indirect(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        int             new_size)       /* new indirection array size */
+{
+        int             nlists;         /* number of irec's (ex lists) */
+        int             size;           /* current indirection array size */
+        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+        size = nlists * sizeof(xfs_ext_irec_t);
+        ASSERT(ifp->if_real_bytes);
+        ASSERT((new_size >= 0) && (new_size != size));
+        if (new_size == 0) {
+                xfs_iext_destroy(ifp);
+        } else {
+                ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
+                        kmem_realloc(ifp->if_u1.if_ext_irec,
+                                new_size, size, KM_NOFS);
+        }
+}
+/*
+ * Switch from indirection array to linear (direct) extent allocations.
+ *
+ * Expects the fork to hold at most XFS_LINEAR_EXTS records. After
+ * xfs_iext_irec_compact_pages() the array is asserted to consist of a
+ * single page (presumably that call merges all records into the first
+ * page -- confirm against its definition). The buffer of that page is
+ * kept and becomes the direct extent list, the indirection array
+ * itself is freed and XFS_IFEXTIREC is cleared. If the page is not
+ * full, xfs_iext_realloc_direct() is then used to resize the list to
+ * the space actually needed.
+ */
+STATIC void
+xfs_iext_indirect_to_direct(
+         xfs_ifork_t    *ifp)           /* inode fork pointer */
+{
+        xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
+        xfs_extnum_t    nextents;       /* number of extents in file */
+        int             size;           /* size of file extents */
+        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+        ASSERT(nextents <= XFS_LINEAR_EXTS);
+        size = nextents * sizeof(xfs_bmbt_rec_t);
+        xfs_iext_irec_compact_pages(ifp);
+        ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
+        ep = ifp->if_u1.if_ext_irec->er_extbuf; /* save page before freeing irec */
+        kmem_free(ifp->if_u1.if_ext_irec);
+        ifp->if_flags &= ~XFS_IFEXTIREC;
+        ifp->if_u1.if_extents = ep;
+        ifp->if_bytes = size;
+        if (nextents < XFS_LINEAR_EXTS) {
+                xfs_iext_realloc_direct(ifp, size);
+        }
+}
+/*
+ * Free incore file extents.
+ *
+ * Handles all three storage modes: with an indirection array every
+ * irec is removed (last to first) and XFS_IFEXTIREC is cleared; a
+ * direct list is freed; an inline buffer that holds records is zeroed.
+ * In every case if_u1.if_extents is reset to NULL and both
+ * if_real_bytes and if_bytes are reset to 0.
+ */
+void
+xfs_iext_destroy(
+        xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+        if (ifp->if_flags & XFS_IFEXTIREC) {
+                int     erp_idx;
+                int     nlists;
+                nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+                for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
+                        xfs_iext_irec_remove(ifp, erp_idx);
+                }
+                ifp->if_flags &= ~XFS_IFEXTIREC;
+        } else if (ifp->if_real_bytes) {
+                kmem_free(ifp->if_u1.if_extents);
+        } else if (ifp->if_bytes) {
+                memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+                        sizeof(xfs_bmbt_rec_t));
+        }
+        ifp->if_u1.if_extents = NULL;
+        ifp->if_real_bytes = 0;
+        ifp->if_bytes = 0;
+}
+/*
+ * Return a pointer to the extent record for file system block bno.
+ *
+ * The records are binary searched by start offset (within the page
+ * selected by xfs_iext_bno_to_irec() when an indirection array is in
+ * use, otherwise over the whole list). The result is:
+ *
+ *   - bno lies inside an extent: that record, *idxp is its index.
+ *   - bno lies in front of the last record probed: that record is
+ *     returned, with *idxp set to its index.
+ *   - bno lies behind the last record probed: the record following
+ *     it is returned, or NULL with *idxp equal to the number of
+ *     extents if there is none.
+ *   - the fork has no extents: NULL, *idxp is 0.
+ *
+ * *idxp is always a file-based extent index, not a page index.
+ */
+xfs_bmbt_rec_host_t *                   /* pointer to found extent record */
+xfs_iext_bno_to_ext(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        xfs_fileoff_t   bno,            /* block number to search for */
+        xfs_extnum_t    *idxp)          /* index of target extent */
+{
+        xfs_bmbt_rec_host_t *base;      /* pointer to first extent */
+        xfs_filblks_t   blockcount = 0; /* number of blocks in extent */
+        xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
+        xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
+        int             high;           /* upper boundary in search */
+        xfs_extnum_t    idx = 0;        /* index of target extent */
+        int             low;            /* lower boundary in search */
+        xfs_extnum_t    nextents;       /* number of file extents */
+        xfs_fileoff_t   startoff = 0;   /* start offset of extent */
+        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+        if (nextents == 0) {
+                *idxp = 0;
+                return NULL;
+        }
+        low = 0;
+        if (ifp->if_flags & XFS_IFEXTIREC) {
+                /* Find target extent list */
+                int     erp_idx = 0;
+                erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
+                base = erp->er_extbuf;
+                high = erp->er_extcount - 1;
+        } else {
+                base = ifp->if_u1.if_extents;
+                high = nextents - 1;
+        }
+        /* Binary search extent records */
+        while (low <= high) {
+                idx = (low + high) >> 1;
+                ep = base + idx;
+                startoff = xfs_bmbt_get_startoff(ep);
+                blockcount = xfs_bmbt_get_blockcount(ep);
+                if (bno < startoff) {
+                        high = idx - 1;
+                } else if (bno >= startoff + blockcount) {
+                        low = idx + 1;
+                } else {
+                        /* Convert back to file-based extent index */
+                        if (ifp->if_flags & XFS_IFEXTIREC) {
+                                idx += erp->er_extoff;
+                        }
+                        *idxp = idx;
+                        return ep;
+                }
+        }
+        /*
+         * No extent contains bno; idx, startoff and blockcount still
+         * describe the last record probed.
+         * Convert back to file-based extent index
+         */
+        if (ifp->if_flags & XFS_IFEXTIREC) {
+                idx += erp->er_extoff;
+        }
+        if (bno >= startoff + blockcount) {
+                if (++idx == nextents) {
+                        ep = NULL;
+                } else {
+                        ep = xfs_iext_get_ext(ifp, idx);
+                }
+        }
+        *idxp = idx;
+        return ep;
+}
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record for filesystem block bno. Store the index of the
+ * target irec in *erp_idxp.
+ *
+ * The irecs are binary searched using the start offset of the first
+ * record of each page: the search stops at the page whose first
+ * record starts at or before bno while the next page (if any) starts
+ * after bno. If the search runs out without such a match, the last
+ * entry probed is returned. NULL is returned only if the array holds
+ * no pages.
+ */
+xfs_ext_irec_t *                        /* pointer to found irec */
+xfs_iext_bno_to_irec(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        xfs_fileoff_t   bno,            /* block number to search for */
+        int             *erp_idxp)      /* irec index of target ext list */
+{
+        xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
+        xfs_ext_irec_t  *erp_next;      /* next indirection array entry */
+        int             erp_idx;        /* indirection array index */
+        int             nlists;         /* number of extent irec's (lists) */
+        int             high;           /* binary search upper limit */
+        int             low;            /* binary search lower limit */
+        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+        erp_idx = 0;
+        low = 0;
+        high = nlists - 1;
+        while (low <= high) {
+                erp_idx = (low + high) >> 1;
+                erp = &ifp->if_u1.if_ext_irec[erp_idx];
+                erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
+                if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
+                        high = erp_idx - 1;
+                } else if (erp_next && bno >=
+                           xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
+                        low = erp_idx + 1;
+                } else {
+                        break;
+                }
+        }
+        *erp_idxp = erp_idx;
+        return erp;
+}
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record at file extent index *idxp. Store the index of the
+ * target irec in *erp_idxp and store the page index of the target
+ * extent record in *idxp.
+ *
+ * The realloc flag changes how an index on a page boundary is
+ * resolved:
+ *
+ *   - realloc == 0: the index must name an existing record, so an
+ *     index equal to the end of a page belongs to the next page.
+ *   - realloc != 0: an index equal to the end of a page stays in that
+ *     page if the page is not full, and an index equal to the start of
+ *     a page moves to the previous page if that page is not full.
+ *     If the index is the end of a full page, the result is slot 0 of
+ *     the following page; when no following page exists NULL is
+ *     returned and *erp_idxp is the index such a page would have.
+ */
+xfs_ext_irec_t *
+xfs_iext_idx_to_irec(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        xfs_extnum_t    *idxp,          /* extent index (file -> page) */
+        int             *erp_idxp,      /* pointer to target irec */
+        int             realloc)        /* new bytes were just added */
+{
+        xfs_ext_irec_t  *prev;          /* pointer to previous irec */
+        xfs_ext_irec_t  *erp = NULL;    /* pointer to current irec */
+        int             erp_idx;        /* indirection array index */
+        int             nlists;         /* number of irec's (ex lists) */
+        int             high;           /* binary search upper limit */
+        int             low;            /* binary search lower limit */
+        xfs_extnum_t    page_idx = *idxp; /* extent index in target list */
+        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+        ASSERT(page_idx >= 0);
+        ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+        ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+        erp_idx = 0;
+        low = 0;
+        high = nlists - 1;
+        /* Binary search extent irec's */
+        while (low <= high) {
+                erp_idx = (low + high) >> 1;
+                erp = &ifp->if_u1.if_ext_irec[erp_idx];
+                prev = erp_idx > 0 ? erp - 1 : NULL;
+                if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
+                     realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
+                        high = erp_idx - 1;
+                } else if (page_idx > erp->er_extoff + erp->er_extcount ||
+                           (page_idx == erp->er_extoff + erp->er_extcount &&
+                            !realloc)) {
+                        low = erp_idx + 1;
+                } else if (page_idx == erp->er_extoff + erp->er_extcount &&
+                           erp->er_extcount == XFS_LINEAR_EXTS) {
+                        ASSERT(realloc);
+                        page_idx = 0;
+                        erp_idx++;
+                        erp = erp_idx < nlists ? erp + 1 : NULL;
+                        break;
+                } else {
+                        page_idx -= erp->er_extoff;
+                        break;
+                }
+        }
+        *idxp = page_idx;
+        *erp_idxp = erp_idx;
+        return erp;
+}
+/*
+ * Allocate and initialize an indirection array once the space needed
+ * for incore extents increases above XFS_IEXT_BUFSZ.
+ *
+ * A single-entry indirection array is allocated and the existing
+ * extent storage becomes its first page. That storage is first
+ * brought to XFS_IEXT_BUFSZ bytes: allocated fresh when the fork has
+ * no extents, converted from the inline buffer, or grown from a
+ * smaller direct list. On return XFS_IFEXTIREC is set, if_real_bytes
+ * is XFS_IEXT_BUFSZ and if_bytes again reflects the number of extents
+ * held on entry (xfs_iext_realloc_direct() overwrites if_bytes with
+ * the size it was given).
+ */
+void
+xfs_iext_irec_init(
+        xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+        xfs_ext_irec_t  *erp;           /* indirection array pointer */
+        xfs_extnum_t    nextents;       /* number of extents in file */
+        ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+        ASSERT(nextents <= XFS_LINEAR_EXTS);
+        erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
+        if (nextents == 0) {
+                ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+        } else if (!ifp->if_real_bytes) {
+                xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
+        } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
+                xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
+        }
+        erp->er_extbuf = ifp->if_u1.if_extents;
+        erp->er_extcount = nextents;
+        erp->er_extoff = 0;
+        ifp->if_flags |= XFS_IFEXTIREC;
+        ifp->if_real_bytes = XFS_IEXT_BUFSZ;
+        ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
+        ifp->if_u1.if_ext_irec = erp;
+        return;
+}
+/*
+ * Allocate and initialize a new entry in the indirection array.
+ *
+ * The array is grown by one entry, the entries from erp_idx onwards
+ * are shifted up by one, and the entry at erp_idx is given a newly
+ * allocated, zeroed page of XFS_IEXT_BUFSZ bytes with an extent count
+ * of 0. Its er_extoff continues from the previous entry (or is 0 for
+ * the first entry). if_real_bytes is increased by one page; if_bytes
+ * is not changed. Returns a pointer to the new entry; since the array
+ * may have been reallocated, irec pointers obtained earlier must not
+ * be used afterwards.
+ */
+xfs_ext_irec_t *
+xfs_iext_irec_new(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        int             erp_idx)        /* index for new irec */
+{
+        xfs_ext_irec_t  *erp;           /* indirection array pointer */
+        int             i;              /* loop counter */
+        int             nlists;         /* number of irec's (ex lists) */
+        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+        /*
+         * Resize indirection array. if_real_bytes is left untouched
+         * until afterwards because xfs_iext_realloc_indirect()
+         * derives the old array size from it.
+         */
+        xfs_iext_realloc_indirect(ifp, ++nlists *
+                                  sizeof(xfs_ext_irec_t));
+        /*
+         * Move records down in the array so the
+         * new page can use erp_idx.
+         */
+        erp = ifp->if_u1.if_ext_irec;
+        for (i = nlists - 1; i > erp_idx; i--) {
+                memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
+        }
+        ASSERT(i == erp_idx);
+        /* Initialize new extent record */
+        erp = ifp->if_u1.if_ext_irec;
+        erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+        ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+        memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
+        erp[erp_idx].er_extcount = 0;
+        erp[erp_idx].er_extoff = erp_idx > 0 ?
+                erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
+        return (&erp[erp_idx]);
+}
+/*
+ * Remove a record from the indirection array.
+ *
+ * If the entry owns a page, the er_extoff of all following entries is
+ * reduced by the entry's extent count and the page is freed. The
+ * following entries are then shifted down by one and the array is
+ * shrunk, or freed altogether if this was the last entry.
+ * if_real_bytes is reduced by one page. if_bytes, XFS_IFEXTIREC and
+ * the if_u1 pointer are not reset here, even when the array is freed.
+ */
+void
+xfs_iext_irec_remove(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        int             erp_idx)        /* irec index to remove */
+{
+        xfs_ext_irec_t  *erp;           /* indirection array pointer */
+        int             i;              /* loop counter */
+        int             nlists;         /* number of irec's (ex lists) */
+        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+        erp = &ifp->if_u1.if_ext_irec[erp_idx];
+        if (erp->er_extbuf) {
+                xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
+                        -erp->er_extcount);
+                kmem_free(erp->er_extbuf);
+        }
+        /* Compact extent records */
+        erp = ifp->if_u1.if_ext_irec;
+        for (i = erp_idx; i < nlists - 1; i++) {
+                memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
+        }
+        /*
+         * Manually free the last extent record from the indirection
+         * array.  A call to xfs_iext_realloc_indirect() with a size
+         * of zero would result in a call to xfs_iext_destroy() which
+         * would in turn call this function again, creating a nasty
+         * infinite loop.
+         *
+         * if_real_bytes is only updated after the resize because
+         * xfs_iext_realloc_indirect() derives the old array size
+         * from it.
+         */
+        if (--nlists) {
+                xfs_iext_realloc_indirect(ifp,
+                        nlists * sizeof(xfs_ext_irec_t));
+        } else {
+                kmem_free(ifp->if_u1.if_ext_irec);
+        }
+        ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+}
+/*
+ * Reclaim unused memory held by the indirection array.  Before any pages
+ * are merged, check whether the indirection array is needed at all and
+ * fall back to the linear extent list (or the inline buffer) when the
+ * extents fit.  The policy, checked in this order:
+ *
+ *   no extents at all              -> tear the extent list down
+ *   fits in the inline buffer      -> convert to direct, then to inline
+ *   fits in a single page          -> convert to direct
+ *   under 50% of allocated space   -> merge neighboring pages
+ *   at least 50% of allocated space-> leave everything as it is
+ */
+void
+xfs_iext_irec_compact(
+        xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+        xfs_extnum_t    ext_count;      /* number of extents in file */
+        int             irec_count;     /* number of irec's (ex lists) */
+        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+        ext_count = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+        irec_count = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+        if (ext_count == 0) {
+                xfs_iext_destroy(ifp);
+                return;
+        }
+        if (ext_count <= XFS_INLINE_EXTS) {
+                xfs_iext_indirect_to_direct(ifp);
+                xfs_iext_direct_to_inline(ifp, ext_count);
+                return;
+        }
+        if (ext_count <= XFS_LINEAR_EXTS) {
+                xfs_iext_indirect_to_direct(ifp);
+                return;
+        }
+        /* Partial compaction only when less than half the slots are in use */
+        if (ext_count < (irec_count * XFS_LINEAR_EXTS) >> 1)
+                xfs_iext_irec_compact_pages(ifp);
+}
+/*
+ * Merge neighboring extent pages wherever all records of the following
+ * page fit into the free space left in the current one.
+ */
+void
+xfs_iext_irec_compact_pages(
+        xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+        xfs_ext_irec_t  *cur;           /* page receiving records */
+        xfs_ext_irec_t  *next;          /* page being emptied */
+        int             idx = 0;        /* index of cur in the array */
+        int             nlists;         /* number of irec's (ex lists) */
+        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+        while (idx < nlists - 1) {
+                cur = &ifp->if_u1.if_ext_irec[idx];
+                next = &ifp->if_u1.if_ext_irec[idx + 1];
+                /* Not enough room in cur: keep both pages and move on */
+                if (next->er_extcount >
+                    (XFS_LINEAR_EXTS - cur->er_extcount)) {
+                        idx++;
+                        continue;
+                }
+                /* Append all of next's records to the end of cur */
+                memcpy(&cur->er_extbuf[cur->er_extcount], next->er_extbuf,
+                        next->er_extcount * sizeof(xfs_bmbt_rec_t));
+                cur->er_extcount += next->er_extcount;
+                /*
+                 * Free the page and clear the pointer before removing
+                 * the entry, so that xfs_iext_irec_remove() does not
+                 * modify the er_extoffs of the entries that follow.
+                 */
+                kmem_free(next->er_extbuf);
+                next->er_extbuf = NULL;
+                xfs_iext_irec_remove(ifp, idx + 1);
+                /* Array shrank: stay on idx and look at its new neighbor */
+                nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+        }
+}
+/*
+ * Adjust the er_extoff field of every indirection array entry from
+ * erp_idx through the end of the array by ext_diff.  This is called
+ * after extents have been added to (positive ext_diff) or removed from
+ * (negative ext_diff) one of the extent lists.
+ */
+void
+xfs_iext_irec_update_extoffs(
+        xfs_ifork_t     *ifp,           /* inode fork pointer */
+        int             erp_idx,        /* irec index to update */
+        int             ext_diff)       /* number of new extents */
+{
+        int             idx;            /* entry being adjusted */
+        int             nlists;         /* number of irec's (ex lists) */
+        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+        idx = erp_idx;
+        while (idx < nlists) {
+                ifp->if_u1.if_ext_irec[idx].er_extoff += ext_diff;
+                idx++;
+        }
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
new file mode 100644
index 000000000000..7d3b1ed6dcbe
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_INODE_FORK_H__
+#define __XFS_INODE_FORK_H__
+struct xfs_inode_log_item;
+struct xfs_dinode;
+/*
+ * The following xfs_ext_irec_t struct introduces a second (top) level
+ * to the in-core extent allocation scheme. These structs are allocated
+ * in a contiguous block, creating an indirection array where each entry
+ * (irec) contains a pointer to a buffer of in-core extent records which
+ * it manages. Each extent buffer is 4k in size, since 4k is the system
+ * page size on Linux i386 and systems with larger page sizes don't seem
+ * to gain much, if anything, by using their native page size as the
+ * extent buffer size. Also, using 4k extent buffers everywhere provides
+ * a consistent interface for CXFS across different platforms.
+ *
+ * There is currently no limit on the number of irec's (extent lists)
+ * allowed, so heavily fragmented files may require an indirection array
+ * which spans multiple system pages of memory. The number of extents
+ * which would require this amount of contiguous memory is very large
+ * and should not cause problems in the foreseeable future. However,
+ * if the memory needed for the contiguous array ever becomes a problem,
+ * it is possible that a third level of indirection may be required.
+ *
+ * Field notes:
+ *   er_extbuf   - XFS_IEXT_BUFSZ-byte buffer holding this entry's records.
+ *   er_extoff   - index of the buffer's first extent within the fork; a
+ *                 new entry is given the previous entry's er_extoff plus
+ *                 its er_extcount (0 for the first entry).
+ *   er_extcount - number of records currently in use in er_extbuf.
+ */
+typedef struct xfs_ext_irec {
+        xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
+        xfs_extnum_t    er_extoff;      /* extent offset in file */
+        xfs_extnum_t    er_extcount;    /* number of extents in page/block */
+} xfs_ext_irec_t;
+/*
+ * File incore extent information, present for each of data & attr forks.
+ *
+ * XFS_IEXT_BUFSZ  - size in bytes of one extent buffer (page).
+ * XFS_LINEAR_EXTS - number of extent records that fit in one such buffer.
+ * XFS_INLINE_EXTS - number of extent records held inline in if_u2.
+ * XFS_INLINE_DATA - number of bytes of file data held inline in if_u2.
+ */
+#define XFS_IEXT_BUFSZ          4096
+#define XFS_LINEAR_EXTS         (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
+#define XFS_INLINE_EXTS         2
+#define XFS_INLINE_DATA         32
+/*
+ * In-core fork.  if_u1 selects how the fork contents are reached (linear
+ * extent array, indirection array, or raw inline data); if_u2 provides
+ * small embedded storage.  When XFS_IFEXTIREC is set in if_flags, the
+ * number of indirection array entries is if_real_bytes / XFS_IEXT_BUFSZ,
+ * and the number of extents is if_bytes / sizeof(xfs_bmbt_rec_t).
+ */
+typedef struct xfs_ifork {
+        int                     if_bytes;       /* bytes in if_u1 */
+        int                     if_real_bytes;  /* bytes allocated in if_u1 */
+        struct xfs_btree_block  *if_broot;      /* file's incore btree root */
+        short                   if_broot_bytes; /* bytes allocated for root */
+        unsigned char           if_flags;       /* per-fork flags */
+        union {
+                xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
+                xfs_ext_irec_t  *if_ext_irec;   /* irec map file exts */
+                char            *if_data;       /* inline file data */
+        } if_u1;
+        union {
+                xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
+                                                /* very small file extents */
+                char            if_inline_data[XFS_INLINE_DATA];
+                                                /* very small file data */
+                xfs_dev_t       if_rdev;        /* dev number if special */
+                uuid_t          if_uuid;        /* mount point value */
+        } if_u2;
+} xfs_ifork_t;
+/*
+ * Per-fork incore inode flags, stored in xfs_ifork.if_flags.
+ */
+#define XFS_IFINLINE    0x01    /* Inline data is read in */
+#define XFS_IFEXTENTS   0x02    /* All extent pointers are read in */
+#define XFS_IFBROOT     0x04    /* if_broot points to the bmap b-tree root */
+#define XFS_IFEXTIREC   0x08    /* Indirection array of extent blocks */
+/*
+ * Fork handling.
+ *
+ * In all of these, "ip" is the incore inode and "w" selects the fork:
+ * XFS_DATA_FORK picks the data fork, any other value the attribute fork.
+ */
+/* True when di_forkoff is non-zero (presumably: an attr fork exists). */
+#define XFS_IFORK_Q(ip)                 ((ip)->i_d.di_forkoff != 0)
+/* di_forkoff converted from 8-byte units to a byte offset. */
+#define XFS_IFORK_BOFF(ip)              ((int)((ip)->i_d.di_forkoff << 3))
+/* Pointer to the fork: embedded i_df for data, i_afp otherwise. */
+#define XFS_IFORK_PTR(ip,w)             \
+        ((w) == XFS_DATA_FORK ? \
+                &(ip)->i_df : \
+                (ip)->i_afp)
+/* Data fork size: up to the fork offset, or all of XFS_LITINO if none. */
+#define XFS_IFORK_DSIZE(ip) \
+        (XFS_IFORK_Q(ip) ? \
+                XFS_IFORK_BOFF(ip) : \
+                XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
+/* Attr fork size: what remains after the fork offset, or 0 if none. */
+#define XFS_IFORK_ASIZE(ip) \
+        (XFS_IFORK_Q(ip) ? \
+                XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
+                        XFS_IFORK_BOFF(ip) : \
+                0)
+/* Size of the selected fork. */
+#define XFS_IFORK_SIZE(ip,w) \
+        ((w) == XFS_DATA_FORK ? \
+                XFS_IFORK_DSIZE(ip) : \
+                XFS_IFORK_ASIZE(ip))
+/* Read the selected fork's format field (di_format / di_aformat). */
+#define XFS_IFORK_FORMAT(ip,w) \
+        ((w) == XFS_DATA_FORK ? \
+                (ip)->i_d.di_format : \
+                (ip)->i_d.di_aformat)
+/* Set the selected fork's format field to n. */
+#define XFS_IFORK_FMT_SET(ip,w,n) \
+        ((w) == XFS_DATA_FORK ? \
+                ((ip)->i_d.di_format = (n)) : \
+                ((ip)->i_d.di_aformat = (n)))
+/* Read the selected fork's extent count (di_nextents / di_anextents). */
+#define XFS_IFORK_NEXTENTS(ip,w) \
+        ((w) == XFS_DATA_FORK ? \
+                (ip)->i_d.di_nextents : \
+                (ip)->i_d.di_anextents)
+/* Set the selected fork's extent count to n. */
+#define XFS_IFORK_NEXT_SET(ip,w,n) \
+        ((w) == XFS_DATA_FORK ? \
+                ((ip)->i_d.di_nextents = (n)) : \
+                ((ip)->i_d.di_anextents = (n)))
+/* Number of xfs_bmbt_rec_t records that fit in the selected fork. */
+#define XFS_IFORK_MAXEXT(ip, w) \
+        (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
+int             xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
+void            xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
+                                struct xfs_inode_log_item *, int);
+void            xfs_idestroy_fork(struct xfs_inode *, int);
+void            xfs_idata_realloc(struct xfs_inode *, int, int);
+void            xfs_iroot_realloc(struct xfs_inode *, int, int);
+int             xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
+int             xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
+                                  int);
+struct xfs_bmbt_rec_host *
+                xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
+void            xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
+                                struct xfs_bmbt_irec *, int);
+void            xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
+void            xfs_iext_add_indirect_multi(struct xfs_ifork *, int,
+                                            xfs_extnum_t, int);
+void            xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int);
+void            xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int);
+void            xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int);
+void            xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int);
+void            xfs_iext_realloc_direct(struct xfs_ifork *, int);
+void            xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t);
+void            xfs_iext_inline_to_direct(struct xfs_ifork *, int);
+void            xfs_iext_destroy(struct xfs_ifork *);
+struct xfs_bmbt_rec_host *
+                xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *);
+struct xfs_ext_irec *
+                xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *);
+struct xfs_ext_irec *
+                xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *,
+                                     int);
+void            xfs_iext_irec_init(struct xfs_ifork *);
+struct xfs_ext_irec *
+                xfs_iext_irec_new(struct xfs_ifork *, int);
+void            xfs_iext_irec_remove(struct xfs_ifork *, int);
+void            xfs_iext_irec_compact(struct xfs_ifork *);
+void            xfs_iext_irec_compact_pages(struct xfs_ifork *);
+void            xfs_iext_irec_compact_full(struct xfs_ifork *);
+void            xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
+extern struct kmem_zone *xfs_ifork_zone;
+#endif  /* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/libxfs/xfs_inum.h b/fs/xfs/libxfs/xfs_inum.h
new file mode 100644
index 000000000000..90efdaf1706f
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inum.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_INUM_H__
+#define __XFS_INUM_H__
+/*
+ * Inode number format:
+ * low inopblog bits - offset in block
+ * next agblklog bits - block number in ag
+ * next agno_log bits - ag number
+ * remaining high bits (above agno_log + agblklog + inopblog) - 0
+ */
+struct xfs_mount;
+/*
+ * Bit-width helpers for inode numbers.
+ *
+ * XFS_INO_MASK(k) yields a 32-bit mask with the low k bits set.  The
+ * *_BITS(mp) macros read the relevant field widths from the mount
+ * structure; XFS_INO_BITS(mp) is the AG-number width plus the AG-inode
+ * width.  Every expansion is fully parenthesized so that the macros can
+ * be used inside larger expressions without precedence surprises
+ * (e.g. 2 * XFS_INO_BITS(mp)).
+ */
+#define XFS_INO_MASK(k)                 ((__uint32_t)((1ULL << (k)) - 1))
+#define XFS_INO_OFFSET_BITS(mp)         ((mp)->m_sb.sb_inopblog)
+#define XFS_INO_AGBNO_BITS(mp)          ((mp)->m_sb.sb_agblklog)
+#define XFS_INO_AGINO_BITS(mp)          ((mp)->m_agino_log)
+#define XFS_INO_AGNO_BITS(mp)           ((mp)->m_agno_log)
+#define XFS_INO_BITS(mp)                \
+        (XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp))
+/* AG number: the bits of inode number i above the AG-inode field. */
+#define XFS_INO_TO_AGNO(mp,i)           \
+        ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
+/* AG-relative inode number: i cast to xfs_agino_t, then masked. */
+#define XFS_INO_TO_AGINO(mp,i)          \
+        ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
+/* AG-relative block number: i cast to xfs_agblock_t, shifted, masked. */
+#define XFS_INO_TO_AGBNO(mp,i)          \
+        (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
+                XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
+/* Offset within the block: the low XFS_INO_OFFSET_BITS bits of i. */
+#define XFS_INO_TO_OFFSET(mp,i)         \
+        ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+/* Filesystem block built from the AG number and AG block number of i. */
+#define XFS_INO_TO_FSB(mp,i)            \
+        XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
+/* Full inode number from AG number a and AG-relative inode number i. */
+#define XFS_AGINO_TO_INO(mp,a,i)        \
+        (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
+/* Split an AG-relative inode number into block number and offset. */
+#define XFS_AGINO_TO_AGBNO(mp,i)        ((i) >> XFS_INO_OFFSET_BITS(mp))
+#define XFS_AGINO_TO_OFFSET(mp,i)       \
+        ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+/* AG-relative inode number from AG block number b and offset o. */
+#define XFS_OFFBNO_TO_AGINO(mp,b,o)     \
+        ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
+/*
+ * Largest inode number: 56 bits when XFS_BIG_INUMS is enabled, otherwise
+ * 32 bits.  XFS_MAXINUMBER_32 is always the 32-bit limit.
+ */
+#if XFS_BIG_INUMS
+#define XFS_MAXINUMBER          ((xfs_ino_t)((1ULL << 56) - 1ULL))
+#else
+#define XFS_MAXINUMBER          ((xfs_ino_t)((1ULL << 32) - 1ULL))
+#endif
+#define XFS_MAXINUMBER_32       ((xfs_ino_t)((1ULL << 32) - 1ULL))
+#endif  /* __XFS_INUM_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
new file mode 100644
index 000000000000..f0969c77bdbe
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_LOG_FORMAT_H__
+#define __XFS_LOG_FORMAT_H__
+struct xfs_mount;
+struct xfs_trans_res;
+/*
+ * On-disk Log Format definitions.
+ *
+ * This file contains all the on-disk format definitions used within the log. It
+ * includes the physical log structure itself, as well as all the log item
+ * format structures that are written into the log and interpreted by log
+ * recovery. We start with the physical log format definitions, and then work
+ * through all the log items definitions and everything they encode into the
+ * log.
+ */
+/* Transaction id as carried in the log. */
+typedef __uint32_t xlog_tid_t;
+/* Lower and upper bound on the number of in-core logs. */
+#define XLOG_MIN_ICLOGS         2
+#define XLOG_MAX_ICLOGS         8
+#define XLOG_HEADER_MAGIC_NUM   0xFEEDbabe      /* Invalid cycle number */
+#define XLOG_VERSION_1          1
+#define XLOG_VERSION_2          2               /* Large IClogs, Log sunit */
+#define XLOG_VERSION_OKBITS     (XLOG_VERSION_1 | XLOG_VERSION_2)
+/* Log record buffer sizes in bytes; each *_BSHIFT below is its log2. */
+#define XLOG_MIN_RECORD_BSIZE   (16*1024)       /* eventually 32k */
+#define XLOG_BIG_RECORD_BSIZE   (32*1024)       /* 32k buffers */
+#define XLOG_MAX_RECORD_BSIZE   (256*1024)
+#define XLOG_HEADER_CYCLE_SIZE  (32*1024)       /* cycle data in header */
+#define XLOG_MIN_RECORD_BSHIFT  14              /* 16384 == 1 << 14 */
+#define XLOG_BIG_RECORD_BSHIFT  15              /* 32k == 1 << 15 */
+#define XLOG_MAX_RECORD_BSHIFT  18              /* 256k == 1 << 18 */
+/* Bytes -> log stripe units (sb_logsunit), rounding up. */
+#define XLOG_BTOLSUNIT(log, b)  (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
+                                 (log)->l_mp->m_sb.sb_logsunit)
+/* Log stripe units -> bytes. */
+#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
+/* Size in bytes of one on-disk log header sector (see xlog_in_core2). */
+#define XLOG_HEADER_SIZE        512
+/* Minimum number of transactions that must fit in the log (defined by mkfs) */
+#define XFS_MIN_LOG_FACTOR      3
+/*
+ * Size, in basic blocks (via BTOBB), of one maximally sized log record
+ * and of XLOG_MAX_ICLOGS such records.  The record size used is
+ * 1 << XLOG_MAX_RECORD_BSHIFT when the superblock reports a v2 log and
+ * 1 << XLOG_BIG_RECORD_BSHIFT otherwise.
+ *
+ * The "log" argument is parenthesized so that any pointer expression
+ * (for example *logp) can be passed safely.
+ */
+#define XLOG_REC_SHIFT(log) \
+        BTOBB(1 << (xfs_sb_version_haslogv2(&(log)->l_mp->m_sb) ? \
+         XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+#define XLOG_TOTAL_REC_SHIFT(log) \
+        BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&(log)->l_mp->m_sb) ? \
+         XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+/*
+ * get lsn fields: the cycle number is the upper 32 bits of an lsn and
+ * the block number the lower 32 bits (see xlog_assign_lsn()).
+ */
+#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
+#define BLOCK_LSN(lsn) ((uint)(lsn))
+/*
+ * this is used in a spot where we might otherwise double-endian-flip:
+ * it reads the first 32-bit word of the lsn in place, without any
+ * byte-order conversion.
+ */
+#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
+/* Build an lsn with "cycle" in the upper 32 bits and "block" in the lower. */
+static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
+{
+        xfs_lsn_t       lsn = (xfs_lsn_t)cycle;
+        lsn <<= 32;
+        return lsn | block;
+}
+/*
+ * Return the cycle number found at ptr.  If the first big-endian word is
+ * XLOG_HEADER_MAGIC_NUM, the cycle is taken from the second word;
+ * otherwise the first word itself is the cycle.
+ */
+static inline uint xlog_get_cycle(char *ptr)
+{
+        __be32  *words = (__be32 *)ptr;
+        if (be32_to_cpu(words[0]) != XLOG_HEADER_MAGIC_NUM)
+                return be32_to_cpu(words[0]);
+        return be32_to_cpu(words[1]);
+}
+/* Log Clients */
+#define XFS_TRANSACTION         0x69
+#define XFS_VOLUME              0x2
+#define XFS_LOG                 0xaa
+#define XLOG_UNMOUNT_TYPE       0x556e  /* Un for Unmount */
+/*
+ * Region types for iovec's i_type.
+ * XLOG_REG_TYPE_MAX equals the highest value defined here, so it has to
+ * be raised whenever a new region type is appended.
+ */
+#define XLOG_REG_TYPE_BFORMAT           1
+#define XLOG_REG_TYPE_BCHUNK            2
+#define XLOG_REG_TYPE_EFI_FORMAT        3
+#define XLOG_REG_TYPE_EFD_FORMAT        4
+#define XLOG_REG_TYPE_IFORMAT           5
+#define XLOG_REG_TYPE_ICORE             6
+#define XLOG_REG_TYPE_IEXT              7
+#define XLOG_REG_TYPE_IBROOT            8
+#define XLOG_REG_TYPE_ILOCAL            9
+#define XLOG_REG_TYPE_IATTR_EXT         10
+#define XLOG_REG_TYPE_IATTR_BROOT       11
+#define XLOG_REG_TYPE_IATTR_LOCAL       12
+#define XLOG_REG_TYPE_QFORMAT           13
+#define XLOG_REG_TYPE_DQUOT             14
+#define XLOG_REG_TYPE_QUOTAOFF          15
+#define XLOG_REG_TYPE_LRHEADER          16
+#define XLOG_REG_TYPE_UNMOUNT           17
+#define XLOG_REG_TYPE_COMMIT            18
+#define XLOG_REG_TYPE_TRANSHDR          19
+#define XLOG_REG_TYPE_ICREATE           20
+#define XLOG_REG_TYPE_MAX               20
+/*
+ * Flags to log operation header
+ *
+ * The first write of a new transaction will be preceded with a start
+ * record, XLOG_START_TRANS.  Once a transaction is committed, a commit
+ * record is written, XLOG_COMMIT_TRANS.  If a single region can not fit into
+ * the remainder of the current active in-core log, it is split up into
+ * multiple regions.  Each partial region will be marked with a
+ * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
+ *
+ * NOTE(review): XLOG_WAS_CONT_TRANS presumably marks a region that is the
+ * continuation of an earlier, split region - confirm against the log
+ * write and recovery code.
+ */
+#define XLOG_START_TRANS        0x01    /* Start a new transaction */
+#define XLOG_COMMIT_TRANS       0x02    /* Commit this transaction */
+#define XLOG_CONTINUE_TRANS     0x04    /* Cont this trans into new region */
+#define XLOG_WAS_CONT_TRANS     0x08    /* Region continues an earlier one (see note) */
+#define XLOG_END_TRANS          0x10    /* End a continued transaction */
+#define XLOG_UNMOUNT_TRANS      0x20    /* Unmount a filesystem transaction */
+/*
+ * Log operation header.  The per-field byte counts in the comments add
+ * up to 12 bytes.  oh_flags holds the XLOG_*_TRANS flags; oh_clientid is
+ * presumably one of the log client ids (XFS_TRANSACTION, XFS_VOLUME,
+ * XFS_LOG) - verify against the writers.
+ */
+typedef struct xlog_op_header {
+        __be32     oh_tid;      /* transaction id of operation  :  4 b */
+        __be32     oh_len;      /* bytes in data region         :  4 b */
+        __u8       oh_clientid; /* who sent me this             :  1 b */
+        __u8       oh_flags;    /* XLOG_*_TRANS flags           :  1 b */
+        __u16      oh_res2;     /* 32 bit align                 :  2 b */
+} xlog_op_header_t;
+/* valid values for h_fmt */
+#define XLOG_FMT_UNKNOWN  0
+#define XLOG_FMT_LINUX_LE 1
+#define XLOG_FMT_LINUX_BE 2
+#define XLOG_FMT_IRIX_BE  3
+/*
+ * our fmt: big-endian Linux when XFS_NATIVE_HOST is defined, otherwise
+ * little-endian Linux.
+ */
+#ifdef XFS_NATIVE_HOST
+#define XLOG_FMT XLOG_FMT_LINUX_BE
+#else
+#define XLOG_FMT XLOG_FMT_LINUX_LE
+#endif
+/*
+ * Log record header.  All multi-byte integer fields are declared
+ * big-endian except h_crc, which is little-endian.  h_cycle_data has one
+ * 32-bit slot per basic block (BBSIZE bytes) of XLOG_HEADER_CYCLE_SIZE.
+ * The fields after "new fields" were appended to the original layout.
+ */
+typedef struct xlog_rec_header {
+        __be32    h_magicno;    /* log record (LR) identifier           :  4 */
+        __be32    h_cycle;      /* write cycle of log                   :  4 */
+        __be32    h_version;    /* LR version                           :  4 */
+        __be32    h_len;        /* len in bytes; should be 64-bit aligned: 4 */
+        __be64    h_lsn;        /* lsn of this LR                       :  8 */
+        __be64    h_tail_lsn;   /* lsn of 1st LR w/ buffers not committed: 8 */
+        __le32    h_crc;        /* crc of log record                    :  4 */
+        __be32    h_prev_block; /* block number to previous LR          :  4 */
+        __be32    h_num_logops; /* number of log operations in this LR  :  4 */
+        __be32    h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
+        /* new fields */
+        __be32    h_fmt;        /* format of log record                 :  4 */
+        uuid_t    h_fs_uuid;    /* uuid of FS                           : 16 */
+        __be32    h_size;       /* iclog size                           :  4 */
+} xlog_rec_header_t;
+/*
+ * Extended log record header: a cycle number followed by another array
+ * of per-basic-block cycle data, sized like h_cycle_data in
+ * xlog_rec_header.
+ */
+typedef struct xlog_rec_ext_header {
+        __be32    xh_cycle;     /* write cycle of log                   : 4 */
+        __be32    xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /*    : 256 */
+} xlog_rec_ext_header_t;
+/*
+ * Quite misnamed, because this union lays out the actual on-disk log buffer.
+ * One header sector can be viewed as a record header, as an extended
+ * header, or as XLOG_HEADER_SIZE raw bytes; hic_sector guarantees the
+ * union is at least XLOG_HEADER_SIZE bytes large.
+ */
+typedef union xlog_in_core2 {
+        xlog_rec_header_t       hic_header;
+        xlog_rec_ext_header_t   hic_xheader;
+        char                    hic_sector[XLOG_HEADER_SIZE];
+} xlog_in_core_2_t;
+/*
+ * not an on-disk structure, but needed by log recovery in userspace.
+ * Describes one region of memory; i_type is presumably one of the
+ * XLOG_REG_TYPE_* values (they are documented as "region types for
+ * iovec's i_type").
+ */
+typedef struct xfs_log_iovec {
+        void            *i_addr;        /* beginning address of region */
+        int             i_len;          /* length in bytes of region */
+        uint            i_type;         /* type of region */
+} xfs_log_iovec_t;
+/*
+ * Transaction Header definitions.
+ *
+ * This is the structure written in the log at the head of every transaction. It
+ * identifies the type and id of the transaction, and contains the number of
+ * items logged by the transaction so we know how many to expect during
+ * recovery.
+ *
+ * The fields use plain host integer types (no endianness annotations).
+ *
+ * Do not change the below structure without redoing the code in
+ * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
+ */
+typedef struct xfs_trans_header {
+        uint            th_magic;               /* magic number */
+        uint            th_type;                /* transaction type */
+        __int32_t       th_tid;                 /* transaction id (unused) */
+        uint            th_num_items;           /* num items logged by trans */
+} xfs_trans_header_t;
+/* Expected value of th_magic (presumably; the ASCII bytes spell "TRAN"). */
+#define XFS_TRANS_HEADER_MAGIC  0x5452414e      /* TRAN */
+/*
+ * Log item types.
+ *
+ * XFS_LI_TYPE_DESC expands to a comma-separated list of { value, "name" }
+ * initializers, one per type defined here; a new type needs an entry in
+ * both places.
+ */
+#define XFS_LI_EFI              0x1236
+#define XFS_LI_EFD              0x1237
+#define XFS_LI_IUNLINK          0x1238
+#define XFS_LI_INODE            0x123b  /* aligned ino chunks, var-size ibufs */
+#define XFS_LI_BUF              0x123c  /* v2 bufs, variable sized inode bufs */
+#define XFS_LI_DQUOT            0x123d
+#define XFS_LI_QUOTAOFF         0x123e
+#define XFS_LI_ICREATE          0x123f
+#define XFS_LI_TYPE_DESC \
+        { XFS_LI_EFI,           "XFS_LI_EFI" }, \
+        { XFS_LI_EFD,           "XFS_LI_EFD" }, \
+        { XFS_LI_IUNLINK,       "XFS_LI_IUNLINK" }, \
+        { XFS_LI_INODE,         "XFS_LI_INODE" }, \
+        { XFS_LI_BUF,           "XFS_LI_BUF" }, \
+        { XFS_LI_DQUOT,         "XFS_LI_DQUOT" }, \
+        { XFS_LI_QUOTAOFF,      "XFS_LI_QUOTAOFF" }, \
+        { XFS_LI_ICREATE,       "XFS_LI_ICREATE" }
+/*
+ * Inode Log Item Format definitions.
+ *
+ * This is the structure used to lay out an inode log item in the
+ * log.  The size of the inline data/extents/b-tree root to be logged
+ * (if any) is indicated in the ilf_dsize field.  Changes to this structure
+ * must be added on to the end.
+ *
+ * This variant relies on the compiler's natural alignment for ilf_ino;
+ * the _32 and _64 variants that follow declare the same fields with the
+ * padding made explicit (packed, or with a pad field).
+ */
+typedef struct xfs_inode_log_format {
+        __uint16_t              ilf_type;       /* inode log item type */
+        __uint16_t              ilf_size;       /* size of this item */
+        __uint32_t              ilf_fields;     /* flags for fields logged */
+        __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+        __uint16_t              ilf_dsize;      /* size of data/ext/root */
+        __uint64_t              ilf_ino;        /* inode number */
+        union {
+                __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+                uuid_t          ilfu_uuid;      /* mount point value */
+        } ilf_u;
+        __int64_t               ilf_blkno;      /* blkno of inode buffer */
+        __int32_t               ilf_len;        /* len of inode buffer */
+        __int32_t               ilf_boffset;    /* off of inode in buffer */
+} xfs_inode_log_format_t;
+/*
+ * Same fields as xfs_inode_log_format, but packed: the compiler inserts
+ * no padding, so ilf_ino directly follows ilf_dsize.
+ */
+typedef struct xfs_inode_log_format_32 {
+        __uint16_t              ilf_type;       /* inode log item type */
+        __uint16_t              ilf_size;       /* size of this item */
+        __uint32_t              ilf_fields;     /* flags for fields logged */
+        __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+        __uint16_t              ilf_dsize;      /* size of data/ext/root */
+        __uint64_t              ilf_ino;        /* inode number */
+        union {
+                __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+                uuid_t          ilfu_uuid;      /* mount point value */
+        } ilf_u;
+        __int64_t               ilf_blkno;      /* blkno of inode buffer */
+        __int32_t               ilf_len;        /* len of inode buffer */
+        __int32_t               ilf_boffset;    /* off of inode in buffer */
+} __attribute__((packed)) xfs_inode_log_format_32_t;
+/*
+ * Same fields as xfs_inode_log_format, with an explicit 32-bit ilf_pad
+ * after ilf_dsize so that ilf_ino starts on a 64-bit boundary.
+ */
+typedef struct xfs_inode_log_format_64 {
+        __uint16_t              ilf_type;       /* inode log item type */
+        __uint16_t              ilf_size;       /* size of this item */
+        __uint32_t              ilf_fields;     /* flags for fields logged */
+        __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+        __uint16_t              ilf_dsize;      /* size of data/ext/root */
+        __uint32_t              ilf_pad;        /* pad for 64 bit boundary */
+        __uint64_t              ilf_ino;        /* inode number */
+        union {
+                __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+                uuid_t          ilfu_uuid;      /* mount point value */
+        } ilf_u;
+        __int64_t               ilf_blkno;      /* blkno of inode buffer */
+        __int32_t               ilf_len;        /* len of inode buffer */
+        __int32_t               ilf_boffset;    /* off of inode in buffer */
+} xfs_inode_log_format_64_t;
+/*
+ * Flags for xfs_trans_log_inode flags field.
+ */
+#define XFS_ILOG_CORE   0x001   /* log standard inode fields */
+#define XFS_ILOG_DDATA  0x002   /* log i_df.if_data */
+#define XFS_ILOG_DEXT   0x004   /* log i_df.if_extents */
+#define XFS_ILOG_DBROOT 0x008   /* log i_df.if_broot */
+#define XFS_ILOG_DEV    0x010   /* log the dev field */
+#define XFS_ILOG_UUID   0x020   /* log the uuid field */
+#define XFS_ILOG_ADATA  0x040   /* log i_afp->if_data */
+#define XFS_ILOG_AEXT   0x080   /* log i_afp->if_extents */
+#define XFS_ILOG_ABROOT 0x100   /* log i_afp->if_broot */
+#define XFS_ILOG_DOWNER 0x200   /* change the data fork owner on replay */
+#define XFS_ILOG_AOWNER 0x400   /* change the attr fork owner on replay */
+/*
+ * The timestamps are dirty, but not necessarily anything else in the inode
+ * core.  Unlike the other fields above this one must never make it to disk
+ * in the ilf_fields of the inode_log_format, but is purely stored in memory
+ * in ili_fields in the inode_log_item.
+ */
+#define XFS_ILOG_TIMESTAMP      0x4000
+/* Every flag above except XFS_ILOG_CORE and XFS_ILOG_TIMESTAMP. */
+#define XFS_ILOG_NONCORE        (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+                                 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
+                                 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
+                                 XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
+                                 XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
+/* The three data fork content flags. */
+#define XFS_ILOG_DFORK          (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+                                 XFS_ILOG_DBROOT)
+/* The three attr fork content flags. */
+#define XFS_ILOG_AFORK          (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+                                 XFS_ILOG_ABROOT)
+/* Every flag defined above, including the in-memory-only TIMESTAMP. */
+#define XFS_ILOG_ALL            (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
+                                 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
+                                 XFS_ILOG_DEV | XFS_ILOG_UUID | \
+                                 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+                                 XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
+                                 XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
+/* Flag for logging the btree root of fork w (data fork or attr fork). */
+static inline int xfs_ilog_fbroot(int w)
+{
+        if (w == XFS_DATA_FORK)
+                return XFS_ILOG_DBROOT;
+        return XFS_ILOG_ABROOT;
+}
+/* Flag for logging the extent list of fork w (data fork or attr fork). */
+static inline int xfs_ilog_fext(int w)
+{
+        if (w == XFS_DATA_FORK)
+                return XFS_ILOG_DEXT;
+        return XFS_ILOG_AEXT;
+}
+static inline int xfs_ilog_fdata(int w)
+{
+        /* Inline-data log flag: data fork for XFS_DATA_FORK, attr fork otherwise. */
+        if (w == XFS_DATA_FORK)
+                return XFS_ILOG_DDATA;
+        return XFS_ILOG_ADATA;
+}
+/*
+ * Incore version of the on-disk inode core structures. We log this directly
+ * into the journal in host CPU format (for better or worse) and as such
+ * directly mirrors the xfs_dinode structure as it must contain all the same
+ * information.
+ */
+/* Seconds/nanoseconds pair, both signed 32-bit, used for the inode times. */
+typedef struct xfs_ictimestamp {
+        __int32_t       t_sec;          /* timestamp seconds */
+        __int32_t       t_nsec;         /* timestamp nanoseconds */
+} xfs_ictimestamp_t;
+/*
+ * NOTE:  This structure must be kept identical to struct xfs_dinode
+ *        in xfs_dinode.h except for the endianness annotations.
+ */
+/*
+ * Field order and sizes are part of the layout: xfs_icdinode_size() derives
+ * the non-v3 size from the offset of di_next_unlinked and the v3 size from
+ * sizeof() the whole structure.
+ */
+typedef struct xfs_icdinode {
+        __uint16_t      di_magic;       /* inode magic # = XFS_DINODE_MAGIC */
+        __uint16_t      di_mode;        /* mode and type of file */
+        __int8_t        di_version;     /* inode version */
+        __int8_t        di_format;      /* format of di_c data */
+        __uint16_t      di_onlink;      /* old number of links to file */
+        __uint32_t      di_uid;         /* owner's user id */
+        __uint32_t      di_gid;         /* owner's group id */
+        __uint32_t      di_nlink;       /* number of links to file */
+        __uint16_t      di_projid_lo;   /* lower part of owner's project id */
+        __uint16_t      di_projid_hi;   /* higher part of owner's project id */
+        __uint8_t       di_pad[6];      /* unused, zeroed space */
+        __uint16_t      di_flushiter;   /* incremented on flush */
+        xfs_ictimestamp_t di_atime;     /* time last accessed */
+        xfs_ictimestamp_t di_mtime;     /* time last modified */
+        xfs_ictimestamp_t di_ctime;     /* time created/inode modified */
+        xfs_fsize_t     di_size;        /* number of bytes in file */
+        xfs_drfsbno_t   di_nblocks;     /* # of direct & btree blocks used */
+        xfs_extlen_t    di_extsize;     /* basic/minimum extent size for file */
+        xfs_extnum_t    di_nextents;    /* number of extents in data fork */
+        xfs_aextnum_t   di_anextents;   /* number of extents in attribute fork*/
+        __uint8_t       di_forkoff;     /* attr fork offs, <<3 for 64b align */
+        __int8_t        di_aformat;     /* format of attr fork's data */
+        __uint32_t      di_dmevmask;    /* DMIG event mask */
+        __uint16_t      di_dmstate;     /* DMIG state info */
+        __uint16_t      di_flags;       /* random flags, XFS_DIFLAG_... */
+        __uint32_t      di_gen;         /* generation number */
+        /* di_next_unlinked is the only non-core field in the old dinode */
+        /* its offset is what xfs_icdinode_size() returns for non-v3 inodes */
+        xfs_agino_t     di_next_unlinked;/* agi unlinked list ptr */
+        /* start of the extended dinode, writable fields */
+        __uint32_t      di_crc;         /* CRC of the inode */
+        __uint64_t      di_changecount; /* number of attribute changes */
+        xfs_lsn_t       di_lsn;         /* flush sequence */
+        __uint64_t      di_flags2;      /* more random flags */
+        __uint8_t       di_pad2[16];    /* more padding for future expansion */
+        /* fields only written to during inode creation */
+        xfs_ictimestamp_t di_crtime;    /* time created */
+        xfs_ino_t       di_ino;         /* inode number */
+        uuid_t          di_uuid;        /* UUID of the filesystem */
+        /* structure must be padded to 64 bit alignment */
+} xfs_icdinode_t;
+/*
+ * Size in bytes of the incore inode core for the given inode version:
+ * the whole structure for version 3, otherwise only the part that
+ * precedes di_next_unlinked.
+ */
+static inline uint xfs_icdinode_size(int version)
+{
+        return version == 3 ?
+                sizeof(struct xfs_icdinode) :
+                offsetof(struct xfs_icdinode, di_next_unlinked);
+}
+/*
+ * Buffer Log Format definitions
+ *
+ * These are the physical dirty bitmap definitions for the log format structure.
+ */
+/* Size in bytes of one chunk; XFS_BLF_SHIFT is log2 of this value (1 << 7 == 128). */
+#define XFS_BLF_CHUNK           128
+#define XFS_BLF_SHIFT           7
+/* 1 << 5 == 32; presumably the bit count of one bitmap word - confirm vs NBWORD. */
+#define BIT_TO_WORD_SHIFT       5
+/* NBBY times the byte size of unsigned int; NBBY is defined elsewhere. */
+#define NBWORD                  (NBBY * sizeof(unsigned int))
+/*
+ * This flag indicates that the buffer contains on disk inodes
+ * and requires special recovery handling.
+ */
+#define XFS_BLF_INODE_BUF       (1<<0)
+/*
+ * This flag indicates that the buffer should not be replayed
+ * during recovery because its blocks are being freed.
+ */
+#define XFS_BLF_CANCEL          (1<<1)
+/*
+ * These flags indicate that the buffer contains on disk user, project
+ * or group dquots and may require special recovery handling.
+ */
+#define XFS_BLF_UDQUOT_BUF      (1<<2)
+#define XFS_BLF_PDQUOT_BUF      (1<<3)
+#define XFS_BLF_GDQUOT_BUF      (1<<4)
+/*
+ * This is the structure used to lay out a buf log item in the
+ * log.  The data map describes which 128 byte chunks of the buffer
+ * have been logged.
+ */
+/*
+ * Number of unsigned int words in the dirty bitmap: one bit per
+ * XFS_BLF_CHUNK bytes of an XFS_MAX_BLOCKSIZE buffer, NBWORD bits per word.
+ */
+#define XFS_BLF_DATAMAP_SIZE    ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+typedef struct xfs_buf_log_format {
+        unsigned short  blf_type;       /* buf log item type indicator */
+        unsigned short  blf_size;       /* size of this item */
+        ushort          blf_flags;      /* misc state */
+        ushort          blf_len;        /* number of blocks in this buf */
+        __int64_t       blf_blkno;      /* starting blkno of this buf */
+        unsigned int    blf_map_size;   /* used size of data bitmap in words */
+        /* sized for the maximum; blf_map_size says how many words are in use */
+        unsigned int    blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
+} xfs_buf_log_format_t;
+/*
+ * All buffers now need to tell recovery where the magic number
+ * is so that it can verify and calculate the CRCs on the buffer correctly
+ * once the changes have been replayed into the buffer.
+ *
+ * The type value is held in the upper 5 bits of the blf_flags field, which is
+ * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
+ */
+#define XFS_BLFT_BITS   5
+#define XFS_BLFT_SHIFT  11
+/* With the values above this selects bits 11-15 (0xf800) of blf_flags. */
+#define XFS_BLFT_MASK   (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
+/*
+ * Buffer types stored in the XFS_BLFT_MASK bits of blf_flags.  Apart from the
+ * first and last entries the enumerators rely on implicit numbering.
+ * NOTE(review): these values are written into blf_flags, so presumably
+ * existing entries must not be renumbered - confirm before inserting a new
+ * type anywhere but immediately before XFS_BLFT_MAX_BUF.
+ */
+enum xfs_blft {
+        XFS_BLFT_UNKNOWN_BUF = 0,
+        XFS_BLFT_UDQUOT_BUF,
+        XFS_BLFT_PDQUOT_BUF,
+        XFS_BLFT_GDQUOT_BUF,
+        XFS_BLFT_BTREE_BUF,
+        XFS_BLFT_AGF_BUF,
+        XFS_BLFT_AGFL_BUF,
+        XFS_BLFT_AGI_BUF,
+        XFS_BLFT_DINO_BUF,
+        XFS_BLFT_SYMLINK_BUF,
+        XFS_BLFT_DIR_BLOCK_BUF,
+        XFS_BLFT_DIR_DATA_BUF,
+        XFS_BLFT_DIR_FREE_BUF,
+        XFS_BLFT_DIR_LEAF1_BUF,
+        XFS_BLFT_DIR_LEAFN_BUF,
+        XFS_BLFT_DA_NODE_BUF,
+        XFS_BLFT_ATTR_LEAF_BUF,
+        XFS_BLFT_ATTR_RMT_BUF,
+        XFS_BLFT_SB_BUF,
+        /* 32: one past the largest value that fits in XFS_BLFT_BITS bits */
+        XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
+};
+/*
+ * Store a buffer type in the XFS_BLFT_MASK bits of blf->blf_flags, leaving
+ * the remaining flag bits as they were.  The type must lie strictly between
+ * XFS_BLFT_UNKNOWN_BUF and XFS_BLFT_MAX_BUF.
+ */
+static inline void
+xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
+{
+        ushort  other_bits;
+        ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
+        other_bits = blf->blf_flags & ~XFS_BLFT_MASK;
+        blf->blf_flags = other_bits |
+                         ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
+}
+/* Extract the buffer type held in the XFS_BLFT_MASK bits of blf->blf_flags. */
+static inline __uint16_t
+xfs_blft_from_flags(struct xfs_buf_log_format *blf)
+{
+        __uint16_t      type_bits = blf->blf_flags & XFS_BLFT_MASK;
+        return type_bits >> XFS_BLFT_SHIFT;
+}
+/*
+ * EFI/EFD log format definitions
+ */
+/* Native extent record: a start block and a length. */
+typedef struct xfs_extent {
+        xfs_dfsbno_t    ext_start;
+        xfs_extlen_t    ext_len;
+} xfs_extent_t;
+/*
+ * Since an xfs_extent_t has types (start:64, len: 32)
+ * there are different alignments on 32 bit and 64 bit kernels.
+ * So we provide the different variants for use by a
+ * conversion routine.
+ */
+/* Packed variant: 8 + 4 bytes with no tail padding. */
+typedef struct xfs_extent_32 {
+        __uint64_t      ext_start;
+        __uint32_t      ext_len;
+} __attribute__((packed)) xfs_extent_32_t;
+/* Variant with the tail padding spelled out as ext_pad: 8 + 4 + 4 bytes. */
+typedef struct xfs_extent_64 {
+        __uint64_t      ext_start;
+        __uint32_t      ext_len;
+        __uint32_t      ext_pad;
+} xfs_extent_64_t;
+/*
+ * This is the structure used to lay out an efi log item in the
+ * log.  The efi_extents field is a variable size array whose
+ * size is given by efi_nextents.
+ */
+/* Native layout; efi_extents[1] stands in for efi_nextents entries. */
+typedef struct xfs_efi_log_format {
+        __uint16_t              efi_type;       /* efi log item type */
+        __uint16_t              efi_size;       /* size of this item */
+        __uint32_t              efi_nextents;   /* # extents to free */
+        __uint64_t              efi_id;         /* efi identifier */
+        xfs_extent_t            efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_t;
+/* Same header followed by packed xfs_extent_32_t records. */
+typedef struct xfs_efi_log_format_32 {
+        __uint16_t              efi_type;       /* efi log item type */
+        __uint16_t              efi_size;       /* size of this item */
+        __uint32_t              efi_nextents;   /* # extents to free */
+        __uint64_t              efi_id;         /* efi identifier */
+        xfs_extent_32_t         efi_extents[1]; /* array of extents to free */
+} __attribute__((packed)) xfs_efi_log_format_32_t;
+/* Same header followed by explicitly padded xfs_extent_64_t records. */
+typedef struct xfs_efi_log_format_64 {
+        __uint16_t              efi_type;       /* efi log item type */
+        __uint16_t              efi_size;       /* size of this item */
+        __uint32_t              efi_nextents;   /* # extents to free */
+        __uint64_t              efi_id;         /* efi identifier */
+        xfs_extent_64_t         efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_64_t;
+/*
+ * This is the structure used to lay out an efd log item in the
+ * log.  The efd_extents array is a variable size array whose
+ * size is given by efd_nextents.
+ */
+typedef struct xfs_efd_log_format {
+        __uint16_t              efd_type;       /* efd log item type */
+        __uint16_t              efd_size;       /* size of this item */
+        __uint32_t              efd_nextents;   /* # of extents freed */
+        __uint64_t              efd_efi_id;     /* id of corresponding efi */
+        xfs_extent_t            efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_t;
+/* Same header followed by packed xfs_extent_32_t records. */
+typedef struct xfs_efd_log_format_32 {
+        __uint16_t              efd_type;       /* efd log item type */
+        __uint16_t              efd_size;       /* size of this item */
+        __uint32_t              efd_nextents;   /* # of extents freed */
+        __uint64_t              efd_efi_id;     /* id of corresponding efi */
+        xfs_extent_32_t         efd_extents[1]; /* array of extents freed */
+} __attribute__((packed)) xfs_efd_log_format_32_t;
+/* Same header followed by explicitly padded xfs_extent_64_t records. */
+typedef struct xfs_efd_log_format_64 {
+        __uint16_t              efd_type;       /* efd log item type */
+        __uint16_t              efd_size;       /* size of this item */
+        __uint32_t              efd_nextents;   /* # of extents freed */
+        __uint64_t              efd_efi_id;     /* id of corresponding efi */
+        xfs_extent_64_t         efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_64_t;
+/*
+ * Dquot Log format definitions.
+ *
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ */
+/* qlf_type and qlf_size are 16 bits each, together filling the first 32 bits. */
+typedef struct xfs_dq_logformat {
+        __uint16_t              qlf_type;      /* dquot log item type */
+        __uint16_t              qlf_size;      /* size of this item */
+        xfs_dqid_t              qlf_id;        /* usr/grp/proj id : 32 bits */
+        __int64_t               qlf_blkno;     /* blkno of dquot buffer */
+        __int32_t               qlf_len;       /* len of dquot buffer */
+        __uint32_t              qlf_boffset;   /* off of dquot in buffer */
+} xfs_dq_logformat_t;
+/*
+ * log format struct for QUOTAOFF records.
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
+ * to the first and ensures that the first logitem is taken out of the AIL
+ * only when the last one is securely committed.
+ */
+/*
+ * NOTE(review): the type and size fields are declared unsigned short rather
+ * than __uint16_t as in the other log formats; presumably 16 bits each on all
+ * supported targets - confirm.
+ */
+typedef struct xfs_qoff_logformat {
+        unsigned short          qf_type;        /* quotaoff log item type */
+        unsigned short          qf_size;        /* size of this item */
+        unsigned int            qf_flags;       /* USR and/or GRP */
+        char                    qf_pad[12];     /* padding for future */
+} xfs_qoff_logformat_t;
+/*
+ * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
+ */
+#define XFS_UQUOTA_ACCT 0x0001  /* user quota accounting ON */
+#define XFS_UQUOTA_ENFD 0x0002  /* user quota limits enforced */
+#define XFS_UQUOTA_CHKD 0x0004  /* quotacheck run on usr quotas */
+#define XFS_PQUOTA_ACCT 0x0008  /* project quota accounting ON */
+#define XFS_OQUOTA_ENFD 0x0010  /* other (grp/prj) quota limits enforced */
+#define XFS_OQUOTA_CHKD 0x0020  /* quotacheck run on other (grp/prj) quotas */
+#define XFS_GQUOTA_ACCT 0x0040  /* group quota accounting ON */
+/*
+ * Conversion to and from the combined OQUOTA flag (if necessary)
+ * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
+ */
+/* Separate per-type group/project bits, distinct from the OQUOTA bits above. */
+#define XFS_GQUOTA_ENFD 0x0080  /* group quota limits enforced */
+#define XFS_GQUOTA_CHKD 0x0100  /* quotacheck run on group quotas */
+#define XFS_PQUOTA_ENFD 0x0200  /* project quota limits enforced */
+#define XFS_PQUOTA_CHKD 0x0400  /* quotacheck run on project quotas */
+/* Accounting, enforcement and quotacheck bits across user, group and project. */
+#define XFS_ALL_QUOTA_ACCT      (XFS_UQUOTA_ACCT | \
+                                 XFS_GQUOTA_ACCT | \
+                                 XFS_PQUOTA_ACCT)
+#define XFS_ALL_QUOTA_ENFD      (XFS_UQUOTA_ENFD | \
+                                 XFS_GQUOTA_ENFD | \
+                                 XFS_PQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD      (XFS_UQUOTA_CHKD | \
+                                 XFS_GQUOTA_CHKD | \
+                                 XFS_PQUOTA_CHKD)
+/* All nine per-type bits above; the combined OQUOTA bits are not included. */
+#define XFS_MOUNT_QUOTA_ALL     (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD | \
+                                 XFS_ALL_QUOTA_CHKD)
+/*
+ * Inode create log item structure
+ *
+ * Log recovery assumes the first two entries are the type and size and they fit
+ * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
+ * decoding can be done correctly.
+ */
+/*
+ * The two leading 16-bit fields are plain host-order integers; every
+ * remaining field is declared __be32.
+ */
+struct xfs_icreate_log {
+        __uint16_t      icl_type;       /* type of log format structure */
+        __uint16_t      icl_size;       /* size of log format structure */
+        __be32          icl_ag;         /* ag being allocated in */
+        __be32          icl_agbno;      /* start block of inode range */
+        __be32          icl_count;      /* number of inodes to initialise */
+        __be32          icl_isize;      /* size of inodes */
+        __be32          icl_length;     /* length of extent to initialise */
+        __be32          icl_gen;        /* inode generation number to use */
+};
+#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
new file mode 100644
index 000000000000..1c55ccbb379d
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_LOG_RECOVER_H__
+#define __XFS_LOG_RECOVER_H__
+/*
+ * Macros, structures, prototypes for internal log manager use.
+ */
+#define XLOG_RHASH_BITS  4
+#define XLOG_RHASH_SIZE 16
+#define XLOG_RHASH_SHIFT 2
+/*
+ * Map a transaction id to a bucket index in [0, XLOG_RHASH_SIZE): truncate
+ * to 32 bits, drop the low XLOG_RHASH_SHIFT bits and mask with
+ * XLOG_RHASH_SIZE - 1.  The argument is parenthesized so the cast applies to
+ * the whole expression when a caller passes something like (a + b).
+ */
+#define XLOG_RHASH(tid) \
+        ((((__uint32_t)(tid))>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
+/*
+ * Half the number of XFS_BLF_CHUNK-sized chunks in a maximum-size block,
+ * plus one.  NOTE(review): presumably every other chunk dirty is the worst
+ * case and the extra region is the item header - confirm.
+ */
+#define XLOG_MAX_REGIONS_IN_ITEM   (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
+/*
+ * item headers are in ri_buf[0].  Additional buffers follow.
+ */
+/* One item being recovered; ITEM_TYPE() reads its type from ri_buf[0]. */
+typedef struct xlog_recover_item {
+        struct list_head        ri_list;        /* list linkage; presumably on r_itemq - confirm */
+        int                     ri_type;
+        int                     ri_cnt; /* count of regions found */
+        int                     ri_total;       /* total regions */
+        xfs_log_iovec_t         *ri_buf;        /* ptr to regions buffer */
+} xlog_recover_item_t;
+/*
+ * NOTE(review): this forward-declares a struct tag, while the member below
+ * uses the xlog_tid_t typedef, which must come from another header - confirm
+ * the declaration is still needed.
+ */
+struct xlog_tid;
+/* Per-transaction recovery state, keyed by the log transaction id. */
+typedef struct xlog_recover {
+        struct hlist_node       r_list;
+        xlog_tid_t              r_log_tid;      /* log's transaction id */
+        xfs_trans_header_t      r_theader;      /* trans header for partial */
+        int                     r_state;        /* not needed */
+        xfs_lsn_t               r_lsn;          /* xact lsn */
+        struct list_head        r_itemq;        /* q for items */
+} xlog_recover_t;
+/* Read the leading ushort at the address of the item's first region, ri_buf[0]. */
+#define ITEM_TYPE(i)    (*(ushort *)(i)->ri_buf[0].i_addr)
+/*
+ * This is the number of entries in the l_buf_cancel_table used during
+ * recovery.
+ */
+#define XLOG_BC_TABLE_SIZE      64
+/* Identifiers for the two recovery passes. */
+#define XLOG_RECOVER_PASS1      1
+#define XLOG_RECOVER_PASS2      2
+#endif  /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
new file mode 100644
index 000000000000..ee7e0e80246b
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2013 Jie Liu.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_trans_space.h"
+#include "xfs_inode.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_bmap_btree.h"
+/*
+ * Work out the log reservation, in bytes, needed to set the largest
+ * attribute value that can be stored locally.  Larger values live out of
+ * line and are not logged, so they do not need to be considered here.
+ */
+STATIC int
+xfs_log_calc_max_attrsetm_res(
+        struct xfs_mount        *mp)
+{
+        int                     max_local;
+        int                     blocks;
+        /* Largest local entry for this block size, less the name and one byte. */
+        max_local = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize);
+        max_local -= MAXNAMELEN + 1;
+        /* Blocks for the da btree entry, the value itself and new extents. */
+        blocks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+        blocks += XFS_B_TO_FSB(mp, max_local);
+        blocks += XFS_NEXTENTADD_SPACE_RES(mp, max_local, XFS_ATTR_FORK);
+        return  M_RES(mp)->tr_attrsetrt.tr_logres * blocks +
+                M_RES(mp)->tr_attrsetm.tr_logres;
+}
+/*
+ * Walk the per-mount reservation table that was filled in at mount time and
+ * copy the entry with the largest total log space (tr_logres, multiplied by
+ * tr_logcount when that is greater than one) into *max_resp.  If the
+ * attribute-set worst case is larger still, it replaces the table entry.
+ */
+STATIC void
+xfs_log_get_max_trans_res(
+        struct xfs_mount        *mp,
+        struct xfs_trans_res    *max_resp)
+{
+        struct xfs_trans_res    *cur = (struct xfs_trans_res *)M_RES(mp);
+        struct xfs_trans_res    *limit =
+                                (struct xfs_trans_res *)(M_RES(mp) + 1);
+        int                     attr_total = xfs_log_calc_max_attrsetm_res(mp);
+        int                     best = 0;
+        /* The table is treated as a flat array of struct xfs_trans_res. */
+        while (cur < limit) {
+                int             total = cur->tr_logres;
+                if (cur->tr_logcount > 1)
+                        total = cur->tr_logres * cur->tr_logcount;
+                if (total > best) {
+                        best = total;
+                        *max_resp = *cur;               /* struct copy */
+                }
+                cur++;
+        }
+        if (attr_total <= best)
+                return;
+        *max_resp = M_RES(mp)->tr_attrsetm;             /* struct copy */
+        max_resp->tr_logres = attr_total;
+}
+/*
+ * Work out the smallest log that is valid for this superblock configuration.
+ * The result is used at mkfs time to size the log and at mount time to check
+ * that the existing log is large enough.  It is returned in units of
+ * filesystem blocks.
+ */
+int
+xfs_log_calc_minimum_size(
+        struct xfs_mount        *mp)
+{
+        struct xfs_trans_res    tres = {0};
+        int                     max_logres;
+        int                     min_logblks;
+        int                     lsunit;
+        xfs_log_get_max_trans_res(mp, &tres);
+        max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres);
+        if (tres.tr_logcount > 1)
+                max_logres *= tres.tr_logcount;
+        /* A log stripe unit only counts for v2 logs with sb_logsunit > 1. */
+        lsunit = 0;
+        if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
+                lsunit = BTOBB(mp->m_sb.sb_logsunit);
+        /*
+         * Two things drive the minimum log space:
+         *
+         * 1) No single transaction may be larger than half the log.  mkfs.xfs
+         *    covers this with XFS_MIN_LOG_FACTOR, which is set to 3, so two
+         *    maximally sized transactions are guaranteed to fit.  The same
+         *    factor is applied here.
+         *
+         * 2) With a log stripe unit, a transaction needs 2 LSU in its
+         *    reservation because the transaction data and the commit record
+         *    are written separately and each may need padding to the LSU.
+         *    An active CIL reservation can already be holding 2*LSU while the
+         *    CIL is below its push threshold.  If there is then not enough
+         *    log space for at least one new transaction, which carries another
+         *    2*LSU, the log space grant code loops forever, i.e. in
+         *    xlog_grant_head_wait().
+         *
+         *    So the log has to hold two maximally sized, padded transactions:
+         *    (2 * (2 * LSU + maxlres)).
+         *
+         * The log size should also be a multiple of the log stripe unit, so
+         * it is rounded up to an lsunit boundary when one is set.
+         *
+         * NOTE(review): the branch without a stripe unit adds 2 * BBSIZE to a
+         * BTOBB() result; kept as is to preserve behaviour - confirm the
+         * intended unit.
+         */
+        if (!lsunit)
+                min_logblks = BTOBB(max_logres) + 2 * BBSIZE;
+        else
+                min_logblks = roundup_64(BTOBB(max_logres), lsunit) +
+                              2 * lsunit;
+        min_logblks *= XFS_MIN_LOG_FACTOR;
+        return XFS_BB_TO_FSB(mp, min_logblks);
+}
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
new file mode 100644
index 000000000000..137e20937077
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_QUOTA_DEFS_H__
+#define __XFS_QUOTA_DEFS_H__
+/*
+ * Quota definitions shared between user and kernel source trees.
+ */
+/*
+ * Even though users may not have quota limits occupying all 64-bits,
+ * they may need 64-bit accounting. Hence, 64-bit quota-counters,
+ * and quota-limits. This is a waste in the common case, but hey ...
+ */
+/* 64-bit type for quota counters and limits. */
+typedef __uint64_t      xfs_qcnt_t;
+/* 16-bit type; presumably a count of quota warnings - confirm against users. */
+typedef __uint16_t      xfs_qwarncnt_t;
+/*
+ * flags for q_flags field in the dquot.
+ */
+#define XFS_DQ_USER             0x0001          /* a user quota */
+#define XFS_DQ_PROJ             0x0002          /* project quota */
+#define XFS_DQ_GROUP            0x0004          /* a group quota */
+#define XFS_DQ_DIRTY            0x0008          /* dquot is dirty */
+#define XFS_DQ_FREEING          0x0010          /* dquot is being torn down */
+/* The three quota type bits. */
+#define XFS_DQ_ALLTYPES         (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
+/*
+ * Flag-to-name initializer list covering every XFS_DQ_* bit above;
+ * presumably used to print q_flags symbolically - confirm against users.
+ */
+#define XFS_DQ_FLAGS \
+        { XFS_DQ_USER,          "USER" }, \
+        { XFS_DQ_PROJ,          "PROJ" }, \
+        { XFS_DQ_GROUP,         "GROUP" }, \
+        { XFS_DQ_DIRTY,         "DIRTY" }, \
+        { XFS_DQ_FREEING,       "FREEING" }
+/*
+ * We have the possibility of all three quota types being active at once, and
+ * hence free space modification requires modification of all three current
+ * dquots in a single transaction. For this case we need to have a reservation
+ * of at least 3 dquots.
+ *
+ * However, a chown operation can change both UID and GID in a single
+ * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
+ * modified. Hence for this case we need to reserve space for at least 4 dquots.
+ *
+ * And in the worst case, there's a rename operation that can be modifying up to
+ * 4 inodes with dquots attached to them. In reality, the only inodes that can
+ * have their dquots modified are the source and destination directory inodes
+ * due to directory name creation and removal. That can require space allocation
+ * and/or freeing on both directory inodes, and hence all three dquots on each
+ * inode can be modified. And if the directories are world writeable, all the
+ * dquots can be unique and so 6 dquots can be modified....
+ *
+ * And, of course, we also need to take into account the dquot log format item
+ * used to describe each dquot.
+ */
+/*
+ * Bytes for 6 dquots, each counted as one log format item plus one disk
+ * dquot.  The mp argument is not referenced by the expansion.
+ */
+#define XFS_DQUOT_LOGRES(mp)    \
+        ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
+/*
+ * Tests against mp->m_qflags.  Each expands to the masked bits, not to 0/1,
+ * so the result is nonzero rather than exactly 1 when the flag is set.
+ */
+/* "Running": the accounting (ACCT) bit for any type, or for one type. */
+#define XFS_IS_QUOTA_RUNNING(mp)        ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_UQUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_PQUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_PQUOTA_ACCT)
+#define XFS_IS_GQUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_GQUOTA_ACCT)
+/* "Enforced": the limit enforcement (ENFD) bit for one type. */
+#define XFS_IS_UQUOTA_ENFORCED(mp)      ((mp)->m_qflags & XFS_UQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp)      ((mp)->m_qflags & XFS_GQUOTA_ENFD)
+#define XFS_IS_PQUOTA_ENFORCED(mp)      ((mp)->m_qflags & XFS_PQUOTA_ENFD)
+/*
+ * Incore only flags for quotaoff - these bits get cleared when quota(s)
+ * are in the process of getting turned off. These flags are in m_qflags but
+ * never in sb_qflags.
+ */
+#define XFS_UQUOTA_ACTIVE       0x1000  /* cleared while uquotas are being turned off */
+#define XFS_GQUOTA_ACTIVE       0x2000  /* cleared while gquotas are being turned off */
+#define XFS_PQUOTA_ACTIVE       0x4000  /* cleared while pquotas are being turned off */
+/* All three ACTIVE bits. */
+#define XFS_ALL_QUOTA_ACTIVE    \
+        (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
+/*
+ * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
+ * quota will not be switched off as long as that inode lock is held.
+ */
+/* Nonzero when the ACTIVE bit of any quota type is set in m_qflags. */
+#define XFS_IS_QUOTA_ON(mp)     ((mp)->m_qflags & XFS_ALL_QUOTA_ACTIVE)
+/* "Other" quota: group or project. */
+#define XFS_IS_OQUOTA_ON(mp)    \
+        ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE))
+/* Per-type tests of the ACTIVE bits. */
+#define XFS_IS_UQUOTA_ON(mp)    ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
+#define XFS_IS_GQUOTA_ON(mp)    ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
+#define XFS_IS_PQUOTA_ON(mp)    ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
+/*
+ * Flags to tell various functions what to do. Not all of these are meaningful
+ * to a single function. None of these XFS_QMOPT_* flags are meant to have
+ * persistent values (ie. their values can and will change between versions)
+ */
+/* The bit assignments are sparse: not every bit below 0x0010000 is used here. */
+#define XFS_QMOPT_DQALLOC       0x0000002 /* alloc dquot ondisk if needed */
+#define XFS_QMOPT_UQUOTA        0x0000004 /* user dquot requested */
+#define XFS_QMOPT_PQUOTA        0x0000008 /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES     0x0000010 /* ignore quota limits */
+#define XFS_QMOPT_SBVERSION     0x0000040 /* change superblock version num */
+#define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
+#define XFS_QMOPT_DQREPAIR      0x0001000 /* repair dquot if damaged */
+#define XFS_QMOPT_GQUOTA        0x0002000 /* group dquot requested */
+#define XFS_QMOPT_ENOSPC        0x0004000 /* enospc instead of edquot (prj) */
+/*
+ * flags to xfs_trans_mod_dquot to indicate which field needs to be
+ * modified.
+ */
+/* Field selectors occupy bits 16-23 of the same flag word as the flags above. */
+#define XFS_QMOPT_RES_REGBLKS   0x0010000
+#define XFS_QMOPT_RES_RTBLKS    0x0020000
+#define XFS_QMOPT_BCOUNT        0x0040000
+#define XFS_QMOPT_ICOUNT        0x0080000
+#define XFS_QMOPT_RTBCOUNT      0x0100000
+#define XFS_QMOPT_DELBCOUNT     0x0200000
+#define XFS_QMOPT_DELRTBCOUNT   0x0400000
+#define XFS_QMOPT_RES_INOS      0x0800000
+/*
+ * flags for dqalloc.
+ */
+/* Bit 24, directly above the field selectors. */
+#define XFS_QMOPT_INHERIT       0x1000000
+/*
+ * flags to xfs_trans_mod_dquot.
+ */
+/* Each XFS_TRANS_DQ_* name is an alias for one XFS_QMOPT_* field selector. */
+#define XFS_TRANS_DQ_RES_BLKS   XFS_QMOPT_RES_REGBLKS
+#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
+#define XFS_TRANS_DQ_RES_INOS   XFS_QMOPT_RES_INOS
+#define XFS_TRANS_DQ_BCOUNT     XFS_QMOPT_BCOUNT
+#define XFS_TRANS_DQ_DELBCOUNT  XFS_QMOPT_DELBCOUNT
+#define XFS_TRANS_DQ_ICOUNT     XFS_QMOPT_ICOUNT
+#define XFS_TRANS_DQ_RTBCOUNT   XFS_QMOPT_RTBCOUNT
+#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
+/* The three "dquot requested" type bits: user, project and group. */
+#define XFS_QMOPT_QUOTALL       \
+                (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
+/* Both block reservation selectors: regular and realtime. */
+#define XFS_QMOPT_RESBLK_MASK   (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+/* Declarations only; both functions are defined outside this header. */
+extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
+                       xfs_dqid_t id, uint type, uint flags, char *str);
+extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
+#endif  /* __XFS_QUOTA_DEFS_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
new file mode 100644
index 000000000000..f4dd697cac08
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -0,0 +1,973 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_buf.h"
+#include "xfs_icache.h"
+#include "xfs_dinode.h"
+#include "xfs_rtalloc.h"
+/*
+ * Realtime allocator bitmap functions shared with userspace.
+ */
+/*
+ * Get a buffer for the bitmap or summary file block specified; issum selects
+ * the summary inode.  The buffer is returned read and locked through *bpp.
+ */
+int
+xfs_rtbuf_get(
+        xfs_mount_t     *mp,            /* file system mount structure */
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_rtblock_t   block,          /* block number in bitmap or summary */
+        int             issum,          /* is summary not bitmap */
+        xfs_buf_t       **bpp)          /* output: buffer for the block */
+{
+        xfs_buf_t       *bp;            /* block buffer, result */
+        xfs_inode_t     *ip;            /* bitmap or summary inode */
+        xfs_bmbt_irec_t map;            /* mapping of the file block */
+        int             nmap = 1;       /* mapping count for bmapi_read */
+        int             error;          /* error value */
+        ip = issum ? mp->m_rsumip : mp->m_rbmip;
+        error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
+        if (error)
+                return error;
+        ASSERT(map.br_startblock != NULLFSBLOCK);
+        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+                                   XFS_FSB_TO_DADDR(mp, map.br_startblock),
+                                   mp->m_bsize, 0, &bp, NULL);
+        if (error)
+                return error;
+        *bpp = bp;
+        return 0;
+}
+/*
+ * Searching backward from start to limit, find the start of the run of blocks
+ * sharing start's allocated/free state; *rtblock is limit if none differ.
+ */
+int
+xfs_rtfind_back(
+        xfs_mount_t     *mp,            /* file system mount point */
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_rtblock_t   start,          /* starting block to look at */
+        xfs_rtblock_t   limit,          /* last block to look at */
+        xfs_rtblock_t   *rtblock)       /* out: start block found */
+{
+        xfs_rtword_t    *b;             /* current word in buffer */
+        int             bit;            /* bit number in the word */
+        xfs_rtblock_t   block;          /* bitmap block number */
+        xfs_buf_t       *bp;            /* buf for the block */
+        xfs_rtword_t    *bufp;          /* starting word in buffer */
+        int             error;          /* error value */
+        xfs_rtblock_t   firstbit;       /* first useful bit in the word */
+        xfs_rtblock_t   i;              /* current bit number rel. to start */
+        xfs_rtblock_t   len;            /* length of inspected area */
+        xfs_rtword_t    mask;           /* mask of relevant bits for value */
+        xfs_rtword_t    want;           /* mask for "good" values */
+        xfs_rtword_t    wdiff;          /* difference from wanted value */
+        int             word;           /* word number in the buffer */
+        /*
+         * Compute and read in starting bitmap block for starting block.
+         */
+        block = XFS_BITTOBLOCK(mp, start);
+        error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+        if (error) {
+                return error;
+        }
+        bufp = bp->b_addr;
+        /*
+         * Get the first word's index & point to it.
+         */
+        word = XFS_BITTOWORD(mp, start);
+        b = &bufp[word];
+        bit = (int)(start & (XFS_NBWORD - 1));
+        len = start - limit + 1;
+        /*
+         * Compute match value, based on the bit at start: if 1 (free)
+         * then all-ones, else all-zeroes.
+         */
+        want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+        /*
+         * If start is not the highest bit of its word, deal with the
+         * partial word that ends at start.
+         */
+        if (bit < XFS_NBWORD - 1) {
+                /*
+                 * Calculate first (leftmost) bit number to look at,
+                 * and mask for all the relevant bits in this word.
+                 */
+                firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+                mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
+                        firstbit;
+                /*
+                 * Calculate the difference between the value there
+                 * and what we're looking for.
+                 */
+                if ((wdiff = (*b ^ want) & mask)) {
+                        /*
+                         * Different.  Mark where we are and return.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        i = bit - XFS_RTHIBIT(wdiff);
+                        *rtblock = start - i + 1;
+                        return 0;
+                }
+                i = bit - firstbit + 1;
+                /*
+                 * Go on to previous block if that's where the previous word is
+                 * and we need the previous word.
+                 */
+                if (--word == -1 && i < len) {
+                        /*
+                         * If done with this block, get the previous one.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+                        if (error) {
+                                return error;
+                        }
+                        bufp = bp->b_addr;
+                        word = XFS_BLOCKWMASK(mp);
+                        b = &bufp[word];
+                } else {
+                        /*
+                         * Go on to the previous word in the buffer.
+                         */
+                        b--;
+                }
+        } else {
+                /*
+                 * Start is the highest bit of its word, no partial word.
+                 */
+                i = 0;
+        }
+        /*
+         * Loop over whole words in buffers.  When we use up one buffer
+         * we move on to the previous one.
+         */
+        while (len - i >= XFS_NBWORD) {
+                /*
+                 * Compute difference between actual and desired value.
+                 */
+                if ((wdiff = *b ^ want)) {
+                        /*
+                         * Different, mark where we are and return.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+                        *rtblock = start - i + 1;
+                        return 0;
+                }
+                i += XFS_NBWORD;
+                /*
+                 * Go on to previous block if that's where the previous word is
+                 * and we need the previous word.
+                 */
+                if (--word == -1 && i < len) {
+                        /*
+                         * If done with this block, get the previous one.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+                        if (error) {
+                                return error;
+                        }
+                        bufp = bp->b_addr;
+                        word = XFS_BLOCKWMASK(mp);
+                        b = &bufp[word];
+                } else {
+                        /*
+                         * Go on to the previous word in the buffer.
+                         */
+                        b--;
+                }
+        }
+        /*
+         * If not ending on a word boundary, deal with the last
+         * (partial) word.
+         */
+        if (len - i) {
+                /*
+                 * Calculate first (leftmost) bit number to look at,
+                 * and mask for all the relevant bits in this word.
+                 */
+                firstbit = XFS_NBWORD - (len - i);
+                mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
+                /*
+                 * Compute difference between actual and desired value.
+                 */
+                if ((wdiff = (*b ^ want) & mask)) {
+                        /*
+                         * Different, mark where we are and return.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+                        *rtblock = start - i + 1;
+                        return 0;
+                } else
+                        i = len;
+        }
+        /*
+         * No mismatch found: i == len here, so the result is limit.
+         */
+        xfs_trans_brelse(tp, bp);
+        *rtblock = start - i + 1;
+        return 0;
+}
+/*
+ * Searching forward from start to limit, find the end of the run of blocks
+ * sharing start's allocated/free state; *rtblock is limit if none differ.
+ */
+int
+xfs_rtfind_forw(
+        xfs_mount_t     *mp,            /* file system mount point */
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_rtblock_t   start,          /* starting block to look at */
+        xfs_rtblock_t   limit,          /* last block to look at */
+        xfs_rtblock_t   *rtblock)       /* out: start block found */
+{
+        xfs_rtword_t    *b;             /* current word in buffer */
+        int             bit;            /* bit number in the word */
+        xfs_rtblock_t   block;          /* bitmap block number */
+        xfs_buf_t       *bp;            /* buf for the block */
+        xfs_rtword_t    *bufp;          /* starting word in buffer */
+        int             error;          /* error value */
+        xfs_rtblock_t   i;              /* current bit number rel. to start */
+        xfs_rtblock_t   lastbit;        /* last useful bit in the word */
+        xfs_rtblock_t   len;            /* length of inspected area */
+        xfs_rtword_t    mask;           /* mask of relevant bits for value */
+        xfs_rtword_t    want;           /* mask for "good" values */
+        xfs_rtword_t    wdiff;          /* difference from wanted value */
+        int             word;           /* word number in the buffer */
+        /*
+         * Compute and read in starting bitmap block for starting block.
+         */
+        block = XFS_BITTOBLOCK(mp, start);
+        error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+        if (error) {
+                return error;
+        }
+        bufp = bp->b_addr;
+        /*
+         * Get the first word's index & point to it.
+         */
+        word = XFS_BITTOWORD(mp, start);
+        b = &bufp[word];
+        bit = (int)(start & (XFS_NBWORD - 1));
+        len = limit - start + 1;
+        /*
+         * Compute match value, based on the bit at start: if 1 (free)
+         * then all-ones, else all-zeroes.
+         */
+        want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+        /*
+         * If the starting position is not word-aligned, deal with the
+         * partial word.
+         */
+        if (bit) {
+                /*
+                 * Calculate last (rightmost) bit number to look at,
+                 * and mask for all the relevant bits in this word.
+                 */
+                lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+                mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+                /*
+                 * Calculate the difference between the value there
+                 * and what we're looking for.
+                 */
+                if ((wdiff = (*b ^ want) & mask)) {
+                        /*
+                         * Different.  Mark where we are and return.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        i = XFS_RTLOBIT(wdiff) - bit;
+                        *rtblock = start + i - 1;
+                        return 0;
+                }
+                i = lastbit - bit;
+                /*
+                 * Go on to next block if that's where the next word is
+                 * and we need the next word.
+                 */
+                if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+                        /*
+                         * If done with this block, get the next one.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+                        if (error) {
+                                return error;
+                        }
+                        b = bufp = bp->b_addr;
+                        word = 0;
+                } else {
+                        /*
+                         * Go on to the next word in the buffer.
+                         */
+                        b++;
+                }
+        } else {
+                /*
+                 * Starting on a word boundary, no partial word.
+                 */
+                i = 0;
+        }
+        /*
+         * Loop over whole words in buffers.  When we use up one buffer
+         * we move on to the next one.
+         */
+        while (len - i >= XFS_NBWORD) {
+                /*
+                 * Compute difference between actual and desired value.
+                 */
+                if ((wdiff = *b ^ want)) {
+                        /*
+                         * Different, mark where we are and return.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        i += XFS_RTLOBIT(wdiff);
+                        *rtblock = start + i - 1;
+                        return 0;
+                }
+                i += XFS_NBWORD;
+                /*
+                 * Go on to next block if that's where the next word is
+                 * and we need the next word.
+                 */
+                if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+                        /*
+                         * If done with this block, get the next one.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+                        if (error) {
+                                return error;
+                        }
+                        b = bufp = bp->b_addr;
+                        word = 0;
+                } else {
+                        /*
+                         * Go on to the next word in the buffer.
+                         */
+                        b++;
+                }
+        }
+        /*
+         * If not ending on a word boundary, deal with the last
+         * (partial) word.
+         */
+        if ((lastbit = len - i)) {
+                /*
+                 * Calculate mask for all the relevant bits in this word.
+                 */
+                mask = ((xfs_rtword_t)1 << lastbit) - 1;
+                /*
+                 * Compute difference between actual and desired value.
+                 */
+                if ((wdiff = (*b ^ want) & mask)) {
+                        /*
+                         * Different, mark where we are and return.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        i += XFS_RTLOBIT(wdiff);
+                        *rtblock = start + i - 1;
+                        return 0;
+                } else
+                        i = len;
+        }
+        /*
+         * No mismatch found: i == len here, so the result is limit.
+         */
+        xfs_trans_brelse(tp, bp);
+        *rtblock = start + i - 1;
+        return 0;
+}
+/*
+ * Read and modify the summary information for a given extent size,
+ * bitmap block combination: delta is added to the entry and the change logged.
+ * If rbpp is non-NULL, *rbpp and *rsb cache the current summary buffer and
+ * block so a later call for the same block can skip xfs_rtbuf_get().
+ */
+int
+xfs_rtmodify_summary(
+        xfs_mount_t     *mp,            /* file system mount point */
+        xfs_trans_t     *tp,            /* transaction pointer */
+        int             log,            /* log2 of extent size */
+        xfs_rtblock_t   bbno,           /* bitmap block number */
+        int             delta,          /* change to make to summary info */
+        xfs_buf_t       **rbpp,         /* in/out: summary block buffer */
+        xfs_fsblock_t   *rsb)           /* in/out: summary block number */
+{
+        xfs_buf_t       *bp;            /* buffer for the summary block */
+        int             error;          /* error value */
+        xfs_fsblock_t   sb;             /* summary fsblock */
+        int             so;             /* index into the summary file */
+        xfs_suminfo_t   *sp;            /* pointer to returned data */
+        /*
+         * Compute entry number in the summary file.
+         */
+        so = XFS_SUMOFFS(mp, log, bbno);
+        /*
+         * Compute the block number in the summary file.
+         */
+        sb = XFS_SUMOFFSTOBLOCK(mp, so);
+        /*
+         * If we have an old buffer, and the block number matches, use that.
+         */
+        if (rbpp && *rbpp && *rsb == sb)
+                bp = *rbpp;
+        /*
+         * Otherwise we have to get the buffer.
+         */
+        else {
+                /*
+                 * If there was an old one, get rid of it first.
+                 */
+                if (rbpp && *rbpp)
+                        xfs_trans_brelse(tp, *rbpp);
+                error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+                if (error) {
+                        return error;
+                }
+                /*
+                 * Remember this buffer and block for the next call.
+                 */
+                if (rbpp) {
+                        *rbpp = bp;
+                        *rsb = sb;
+                }
+        }
+        /*
+         * Point to the summary entry, add delta, and log just its bytes.
+         */
+        sp = XFS_SUMPTR(mp, bp, so);
+        *sp += delta;
+        xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
+                (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
+        return 0;
+}
+/*
+ * Set len bitmap bits starting at start to val (1 = free, 0 = allocated).
+ * Do whatever I/O and logging is required.
+ */
+int
+xfs_rtmodify_range(
+        xfs_mount_t     *mp,            /* file system mount point */
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_rtblock_t   start,          /* starting block to modify */
+        xfs_extlen_t    len,            /* length of extent to modify */
+        int             val)            /* 1 for free, 0 for allocated */
+{
+        xfs_rtword_t    *b;             /* current word in buffer */
+        int             bit;            /* bit number in the word */
+        xfs_rtblock_t   block;          /* bitmap block number */
+        xfs_buf_t       *bp;            /* buf for the block */
+        xfs_rtword_t    *bufp;          /* starting word in buffer */
+        int             error;          /* error value */
+        xfs_rtword_t    *first;         /* first used word in the buffer */
+        int             i;              /* current bit number rel. to start */
+        int             lastbit;        /* last useful bit in word */
+        xfs_rtword_t    mask;           /* mask of relevant bits for value */
+        int             word;           /* word number in the buffer */
+        /*
+         * Compute starting bitmap block number.
+         */
+        block = XFS_BITTOBLOCK(mp, start);
+        /*
+         * Read the bitmap block, and point to its data.
+         */
+        error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+        if (error) {
+                return error;
+        }
+        bufp = bp->b_addr;
+        /*
+         * Compute the starting word's address, and starting bit.
+         */
+        word = XFS_BITTOWORD(mp, start);
+        first = b = &bufp[word];
+        bit = (int)(start & (XFS_NBWORD - 1));
+        /*
+         * 0 (allocated) => all zeroes; 1 (free) => all ones.
+         */
+        val = -val;
+        /*
+         * If not starting on a word boundary, deal with the first
+         * (partial) word.
+         */
+        if (bit) {
+                /*
+                 * Compute first bit not changed and mask of relevant bits.
+                 */
+                lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+                mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+                /*
+                 * Set/clear the active bits.
+                 */
+                if (val)
+                        *b |= mask;
+                else
+                        *b &= ~mask;
+                i = lastbit - bit;
+                /*
+                 * Go on to the next block if that's where the next word is
+                 * and we need the next word.
+                 */
+                if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+                        /*
+                         * Log the changed part of this block (through the
+                         * word at b), then get the next one.
+                         */
+                        xfs_trans_log_buf(tp, bp,
+                                (uint)((char *)first - (char *)bufp),
+                                (uint)((char *)b - (char *)bufp));
+                        error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+                        if (error) {
+                                return error;
+                        }
+                        first = b = bufp = bp->b_addr;
+                        word = 0;
+                } else {
+                        /*
+                         * Go on to the next word in the buffer
+                         */
+                        b++;
+                }
+        } else {
+                /*
+                 * Starting on a word boundary, no partial word.
+                 */
+                i = 0;
+        }
+        /*
+         * Loop over whole words in buffers.  When we use up one buffer
+         * we move on to the next one.
+         */
+        while (len - i >= XFS_NBWORD) {
+                /*
+                 * Set the word value correctly.
+                 */
+                *b = val;
+                i += XFS_NBWORD;
+                /*
+                 * Go on to the next block if that's where the next word is
+                 * and we need the next word.
+                 */
+                if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+                        /*
+                         * Log the changed part of this block (through the
+                         * word at b), then get the next one.
+                         */
+                        xfs_trans_log_buf(tp, bp,
+                                (uint)((char *)first - (char *)bufp),
+                                (uint)((char *)b - (char *)bufp));
+                        error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+                        if (error) {
+                                return error;
+                        }
+                        first = b = bufp = bp->b_addr;
+                        word = 0;
+                } else {
+                        /*
+                         * Go on to the next word in the buffer
+                         */
+                        b++;
+                }
+        }
+        /*
+         * If not ending on a word boundary, deal with the last
+         * (partial) word.
+         */
+        if ((lastbit = len - i)) {
+                /*
+                 * Compute a mask of relevant bits.
+                 */
+                bit = 0;        /* NOTE(review): not read again in this function */
+                mask = ((xfs_rtword_t)1 << lastbit) - 1;
+                /*
+                 * Set/clear the active bits.
+                 */
+                if (val)
+                        *b |= mask;
+                else
+                        *b &= ~mask;
+                b++;
+        }
+        /*
+         * Log any remaining changed bytes (b is one past the last word set).
+         */
+        if (b > first)
+                xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
+                        (uint)((char *)b - (char *)bufp - 1));
+        return 0;
+}
+/*
+ * Mark an extent specified by start and len freed.
+ * Updates all the summary information as well as the bitmap.
+ */
+int
+xfs_rtfree_range(
+        xfs_mount_t     *mp,            /* file system mount point */
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_rtblock_t   start,          /* starting block to free */
+        xfs_extlen_t    len,            /* length to free */
+        xfs_buf_t       **rbpp,         /* in/out: summary block buffer */
+        xfs_fsblock_t   *rsb)           /* in/out: summary block number */
+{
+        xfs_rtblock_t   end;            /* end of the freed extent */
+        int             error;          /* error value */
+        xfs_rtblock_t   postblock;      /* result of forward search from end */
+        xfs_rtblock_t   preblock;       /* result of backward search from start */
+        end = start + len - 1;
+        /*
+         * Modify the bitmap to mark this extent freed.
+         */
+        error = xfs_rtmodify_range(mp, tp, start, len, 1);
+        if (error) {
+                return error;
+        }
+        /*
+         * The range is now marked free in the bitmap, so searching outward
+         * from it finds the bounds of the free run that now contains it.
+         * Those bounds are needed to update the summary properly.
+         */
+        error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+        if (error) {
+                return error;
+        }
+        /*
+         * Search forward from end, up to block sb_rextents - 1.
+         */
+        error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+                &postblock);
+        if (error)
+                return error;
+        /*
+         * If the free run begins before start, decrement the summary count
+         * for the leading piece of length start - preblock.
+         */
+        if (preblock < start) {
+                error = xfs_rtmodify_summary(mp, tp,
+                        XFS_RTBLOCKLOG(start - preblock),
+                        XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+                if (error) {
+                        return error;
+                }
+        }
+        /*
+         * If the free run extends past end, decrement the summary count
+         * for the trailing piece of length postblock - end.
+         */
+        if (postblock > end) {
+                error = xfs_rtmodify_summary(mp, tp,
+                        XFS_RTBLOCKLOG(postblock - end),
+                        XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+                if (error) {
+                        return error;
+                }
+        }
+        /*
+         * Increment the summary information corresponding to the entire
+         * (new) free extent.
+         */
+        error = xfs_rtmodify_summary(mp, tp,
+                XFS_RTBLOCKLOG(postblock + 1 - preblock),
+                XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+        return error;
+}
+/*
+ * Check that the given range is either all allocated (val = 0) or
+ * all free (val = 1).
+ */
+int
+xfs_rtcheck_range(
+        xfs_mount_t     *mp,            /* file system mount point */
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_rtblock_t   start,          /* starting block number of extent */
+        xfs_extlen_t    len,            /* length of extent */
+        int             val,            /* 1 for free, 0 for allocated */
+        xfs_rtblock_t   *new,           /* out: first block not matching */
+        int             *stat)          /* out: 1 for matches, 0 for not */
+{
+        xfs_rtword_t    *b;             /* current word in buffer */
+        int             bit;            /* bit number in the word */
+        xfs_rtblock_t   block;          /* bitmap block number */
+        xfs_buf_t       *bp;            /* buf for the block */
+        xfs_rtword_t    *bufp;          /* starting word in buffer */
+        int             error;          /* error value */
+        xfs_rtblock_t   i;              /* current bit number rel. to start */
+        xfs_rtblock_t   lastbit;        /* last useful bit in word */
+        xfs_rtword_t    mask;           /* mask of relevant bits for value */
+        xfs_rtword_t    wdiff;          /* difference from wanted value */
+        int             word;           /* word number in the buffer */
+        /*
+         * Compute starting bitmap block number
+         */
+        block = XFS_BITTOBLOCK(mp, start);
+        /*
+         * Read the bitmap block.
+         */
+        error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+        if (error) {
+                return error;
+        }
+        bufp = bp->b_addr;
+        /*
+         * Compute the starting word's address, and starting bit.
+         */
+        word = XFS_BITTOWORD(mp, start);
+        b = &bufp[word];
+        bit = (int)(start & (XFS_NBWORD - 1));
+        /*
+         * 0 (allocated) => all zero's; 1 (free) => all one's.
+         */
+        val = -val;
+        /*
+         * If not starting on a word boundary, deal with the first
+         * (partial) word.
+         */
+        if (bit) {
+                /*
+                 * Compute first bit not examined.
+                 */
+                lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+                /*
+                 * Mask of relevant bits.
+                 */
+                mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+                /*
+                 * Compute difference between actual and desired value.
+                 */
+                if ((wdiff = (*b ^ val) & mask)) {
+                        /*
+                         * Different, compute first wrong bit and return.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        i = XFS_RTLOBIT(wdiff) - bit;
+                        *new = start + i;
+                        *stat = 0;
+                        return 0;
+                }
+                i = lastbit - bit;
+                /*
+                 * Go on to next block if that's where the next word is
+                 * and we need the next word.
+                 */
+                if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+                        /*
+                         * If done with this block, get the next one.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+                        if (error) {
+                                return error;
+                        }
+                        b = bufp = bp->b_addr;
+                        word = 0;
+                } else {
+                        /*
+                         * Go on to the next word in the buffer.
+                         */
+                        b++;
+                }
+        } else {
+                /*
+                 * Starting on a word boundary, no partial word.
+                 */
+                i = 0;
+        }
+        /*
+         * Loop over whole words in buffers.  When we use up one buffer
+         * we move on to the next one.
+         */
+        while (len - i >= XFS_NBWORD) {
+                /*
+                 * Compute difference between actual and desired value.
+                 */
+                if ((wdiff = *b ^ val)) {
+                        /*
+                         * Different, compute first wrong bit and return.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        i += XFS_RTLOBIT(wdiff);
+                        *new = start + i;
+                        *stat = 0;
+                        return 0;
+                }
+                i += XFS_NBWORD;
+                /*
+                 * Go on to next block if that's where the next word is
+                 * and we need the next word.
+                 */
+                if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+                        /*
+                         * If done with this block, get the next one.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+                        if (error) {
+                                return error;
+                        }
+                        b = bufp = bp->b_addr;
+                        word = 0;
+                } else {
+                        /*
+                         * Go on to the next word in the buffer.
+                         */
+                        b++;
+                }
+        }
+        /*
+         * If not ending on a word boundary, deal with the last
+         * (partial) word.
+         */
+        if ((lastbit = len - i)) {
+                /*
+                 * Mask of relevant bits.
+                 */
+                mask = ((xfs_rtword_t)1 << lastbit) - 1;
+                /*
+                 * Compute difference between actual and desired value.
+                 */
+                if ((wdiff = (*b ^ val) & mask)) {
+                        /*
+                         * Different, compute first wrong bit and return.
+                         */
+                        xfs_trans_brelse(tp, bp);
+                        i += XFS_RTLOBIT(wdiff);
+                        *new = start + i;
+                        *stat = 0;
+                        return 0;
+                } else
+                        i = len;
+        }
+        /*
+         * Successful, return.
+         */
+        xfs_trans_brelse(tp, bp);
+        *new = start + i;
+        *stat = 1;
+        return 0;
+}
+#ifdef DEBUG
+/*
+ * Debug-only sanity check that every block in the given extent (block
+ * range) is currently marked allocated.  xfs_rtcheck_range() is asked to
+ * compare the range against the value 0 (allocated); a mismatch trips an
+ * ASSERT rather than being reported to the caller.
+ *
+ * Returns 0, or the error returned by xfs_rtcheck_range().
+ */
+STATIC int                              /* error */
+xfs_rtcheck_alloc_range(
+        xfs_mount_t     *mp,            /* file system mount point */
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_rtblock_t   bno,            /* starting block number of extent */
+        xfs_extlen_t    len)            /* length of extent */
+{
+        int             error;
+        int             stat;           /* 1 if the whole range matched */
+        xfs_rtblock_t   ignored;        /* first mismatch; not used here */
+        error = xfs_rtcheck_range(mp, tp, bno, len, 0, &ignored, &stat);
+        if (!error) {
+                ASSERT(stat);
+        }
+        return error;
+}
+#else
+/* Non-DEBUG builds skip the check entirely and always report success. */
+#define xfs_rtcheck_alloc_range(m,t,b,l)        (0)
+#endif
+/*
+ * Free an extent in the realtime subvolume.  Both the starting block
+ * number and the length are expressed in realtime extents.
+ *
+ * The realtime bitmap inode must already have a log item attached and be
+ * locked with XFS_ILOCK_EXCL; both conditions are asserted.
+ *
+ * Returns 0 on success or the error from the range check / range free.
+ */
+int                                     /* error */
+xfs_rtfree_extent(
+        xfs_trans_t     *tp,            /* transaction pointer */
+        xfs_rtblock_t   bno,            /* starting block number to free */
+        xfs_extlen_t    len)            /* length of extent freed */
+{
+        xfs_mount_t     *mp = tp->t_mountp;     /* file system mount structure */
+        xfs_buf_t       *rsumbp = NULL; /* summary file block buffer */
+        xfs_fsblock_t   rsumblk;        /* summary file block number */
+        int             error;          /* error value */
+        ASSERT(mp->m_rbmip->i_itemp != NULL);
+        ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+        /*
+         * Check (DEBUG builds only) that the range is allocated, then
+         * free the range of realtime blocks.
+         */
+        error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
+        if (!error)
+                error = xfs_rtfree_range(mp, tp, bno, len, &rsumbp, &rsumblk);
+        if (error)
+                return error;
+        /*
+         * Mark more blocks free in the superblock.
+         */
+        xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+        /*
+         * Nothing else to do unless this free leaves every realtime
+         * extent free.
+         */
+        if (tp->t_frextents_delta + mp->m_sb.sb_frextents !=
+            mp->m_sb.sb_rextents)
+                return 0;
+        /*
+         * All blocks are now free: flag the bitmap inode, reset the file
+         * sequence number (kept in the 64 bits of di_atime) to 0 and log
+         * the inode core.
+         */
+        mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+        *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
+        xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+        return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
new file mode 100644
index 000000000000..6e93b5ef0a6b
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -0,0 +1,836 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_dinode.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+/*
+ * Physical superblock buffer manipulations. Shared with libxfs in userspace.
+ *
+ * xfs_sb_info[] describes each superblock field by its byte offset within
+ * xfs_sb_t and by how it is copied to disk.  xfs_sb_to_disk() indexes the
+ * table with the bit number of each field in its "fields" mask, so the
+ * entries must stay in the same order as those bit numbers.  A field's
+ * size is computed as the next entry's offset minus its own, which is why
+ * the table ends with a sentinel entry holding sizeof(xfs_sb_t).
+ */
+static const struct {
+        short offset;
+        short type;     /* 0 = integer
+                         * 1 = binary / string (no translation)
+                         */
+} xfs_sb_info[] = {
+        { offsetof(xfs_sb_t, sb_magicnum),      0 },
+        { offsetof(xfs_sb_t, sb_blocksize),     0 },
+        { offsetof(xfs_sb_t, sb_dblocks),       0 },
+        { offsetof(xfs_sb_t, sb_rblocks),       0 },
+        { offsetof(xfs_sb_t, sb_rextents),      0 },
+        { offsetof(xfs_sb_t, sb_uuid),          1 },
+        { offsetof(xfs_sb_t, sb_logstart),      0 },
+        { offsetof(xfs_sb_t, sb_rootino),       0 },
+        { offsetof(xfs_sb_t, sb_rbmino),        0 },
+        { offsetof(xfs_sb_t, sb_rsumino),       0 },
+        { offsetof(xfs_sb_t, sb_rextsize),      0 },
+        { offsetof(xfs_sb_t, sb_agblocks),      0 },
+        { offsetof(xfs_sb_t, sb_agcount),       0 },
+        { offsetof(xfs_sb_t, sb_rbmblocks),     0 },
+        { offsetof(xfs_sb_t, sb_logblocks),     0 },
+        { offsetof(xfs_sb_t, sb_versionnum),    0 },
+        { offsetof(xfs_sb_t, sb_sectsize),      0 },
+        { offsetof(xfs_sb_t, sb_inodesize),     0 },
+        { offsetof(xfs_sb_t, sb_inopblock),     0 },
+        { offsetof(xfs_sb_t, sb_fname[0]),      1 },
+        { offsetof(xfs_sb_t, sb_blocklog),      0 },
+        { offsetof(xfs_sb_t, sb_sectlog),       0 },
+        { offsetof(xfs_sb_t, sb_inodelog),      0 },
+        { offsetof(xfs_sb_t, sb_inopblog),      0 },
+        { offsetof(xfs_sb_t, sb_agblklog),      0 },
+        { offsetof(xfs_sb_t, sb_rextslog),      0 },
+        { offsetof(xfs_sb_t, sb_inprogress),    0 },
+        { offsetof(xfs_sb_t, sb_imax_pct),      0 },
+        { offsetof(xfs_sb_t, sb_icount),        0 },
+        { offsetof(xfs_sb_t, sb_ifree),         0 },
+        { offsetof(xfs_sb_t, sb_fdblocks),      0 },
+        { offsetof(xfs_sb_t, sb_frextents),     0 },
+        { offsetof(xfs_sb_t, sb_uquotino),      0 },
+        { offsetof(xfs_sb_t, sb_gquotino),      0 },
+        { offsetof(xfs_sb_t, sb_qflags),        0 },
+        { offsetof(xfs_sb_t, sb_flags),         0 },
+        { offsetof(xfs_sb_t, sb_shared_vn),     0 },
+        { offsetof(xfs_sb_t, sb_inoalignmt),    0 },
+        { offsetof(xfs_sb_t, sb_unit),          0 },
+        { offsetof(xfs_sb_t, sb_width),         0 },
+        { offsetof(xfs_sb_t, sb_dirblklog),     0 },
+        { offsetof(xfs_sb_t, sb_logsectlog),    0 },
+        { offsetof(xfs_sb_t, sb_logsectsize),   0 },
+        { offsetof(xfs_sb_t, sb_logsunit),      0 },
+        { offsetof(xfs_sb_t, sb_features2),     0 },
+        { offsetof(xfs_sb_t, sb_bad_features2), 0 },
+        { offsetof(xfs_sb_t, sb_features_compat),       0 },
+        { offsetof(xfs_sb_t, sb_features_ro_compat),    0 },
+        { offsetof(xfs_sb_t, sb_features_incompat),     0 },
+        { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
+        { offsetof(xfs_sb_t, sb_crc),           0 },
+        { offsetof(xfs_sb_t, sb_pad),           0 },
+        { offsetof(xfs_sb_t, sb_pquotino),      0 },
+        { offsetof(xfs_sb_t, sb_lsn),           0 },
+        /* sentinel: end offset used to size the last real field */
+        { sizeof(xfs_sb_t),                     0 }
+};
+/*
+ * Reference counting access wrappers to the perag structures.
+ * Because we never free per-ag structures, the only thing we
+ * have to protect against changes is the tree structure itself.
+ *
+ * Look up the perag for @agno under the RCU read lock and take a
+ * reference on it.  Returns NULL (with nothing referenced) when no
+ * perag is found; the tracepoint then records a reference count of 0.
+ */
+struct xfs_perag *
+xfs_perag_get(
+        struct xfs_mount        *mp,
+        xfs_agnumber_t          agno)
+{
+        int                     refcount = 0;
+        struct xfs_perag        *pag;
+        rcu_read_lock();
+        pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+        if (pag != NULL) {
+                ASSERT(atomic_read(&pag->pag_ref) >= 0);
+                refcount = atomic_inc_return(&pag->pag_ref);
+        }
+        rcu_read_unlock();
+        trace_xfs_perag_get(mp, agno, refcount, _RET_IP_);
+        return pag;
+}
+/*
+ * Search from @first to find the next perag with the given tag set.
+ *
+ * On a hit, a reference is taken on the perag while still under the RCU
+ * read lock and the perag is returned.  Returns NULL when the gang lookup
+ * finds nothing.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+        struct xfs_mount        *mp,
+        xfs_agnumber_t          first,
+        int                     tag)
+{
+        struct xfs_perag        *pag;
+        int                     nr_found;
+        int                     refcount = 0;
+        rcu_read_lock();
+        nr_found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+                                        (void **)&pag, first, 1, tag);
+        if (nr_found > 0)
+                refcount = atomic_inc_return(&pag->pag_ref);
+        rcu_read_unlock();
+        if (nr_found <= 0)
+                return NULL;
+        trace_xfs_perag_get_tag(mp, pag->pag_agno, refcount, _RET_IP_);
+        return pag;
+}
+/*
+ * Drop a reference previously taken on @pag.  The reference count must be
+ * positive on entry (asserted); the post-decrement count is only traced.
+ */
+void
+xfs_perag_put(
+        struct xfs_perag        *pag)
+{
+        int     remaining;
+        ASSERT(atomic_read(&pag->pag_ref) > 0);
+        remaining = atomic_dec_return(&pag->pag_ref);
+        trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, remaining,
+                            _RET_IP_);
+}
+/*
+ * Check the validity of the SB found.
+ *
+ * @mp:               mount the superblock belongs to (used for messages,
+ *                    mount flags and the log/data device targets)
+ * @sbp:              in-core (CPU-endian) copy of the superblock to check
+ * @check_inprogress: also reject a superblock with sb_inprogress set
+ * @check_version:    also validate the V5 feature masks
+ *
+ * Returns 0 if the superblock is acceptable, otherwise a negative errno:
+ * -EWRONGFS for a bad magic/version, -EINVAL for unsupported features or
+ * a log device mismatch, -EFSCORRUPTED for failed sanity checks, -ENOSYS
+ * for unsupported block/inode sizes and -EFBIG if the filesystem is too
+ * large for this system.
+ */
+STATIC int
+xfs_mount_validate_sb(
+        xfs_mount_t     *mp,
+        xfs_sb_t        *sbp,
+        bool            check_inprogress,
+        bool            check_version)
+{
+        if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+                xfs_warn(mp, "bad magic number");
+                return -EWRONGFS;
+        }
+        if (!xfs_sb_good_version(sbp)) {
+                xfs_warn(mp, "bad version");
+                return -EWRONGFS;
+        }
+        /*
+         * Version 5 superblock feature mask validation. Reject combinations the
+         * kernel cannot support up front before checking anything else. For
+         * write validation, we don't need to check feature masks.
+         */
+        if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
+                if (xfs_sb_has_compat_feature(sbp,
+                                        XFS_SB_FEAT_COMPAT_UNKNOWN)) {
+                        xfs_warn(mp,
+"Superblock has unknown compatible features (0x%x) enabled.\n"
+"Using a more recent kernel is recommended.",
+                                (sbp->sb_features_compat &
+                                                XFS_SB_FEAT_COMPAT_UNKNOWN));
+                }
+                if (xfs_sb_has_ro_compat_feature(sbp,
+                                        XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+                        xfs_alert(mp,
+"Superblock has unknown read-only compatible features (0x%x) enabled.",
+                                (sbp->sb_features_ro_compat &
+                                                XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+                        if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+                                xfs_warn(mp,
+"Attempted to mount read-only compatible filesystem read-write.\n"
+"Filesystem can only be safely mounted read only.");
+                                return -EINVAL;
+                        }
+                }
+                if (xfs_sb_has_incompat_feature(sbp,
+                                        XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
+                        xfs_warn(mp,
+"Superblock has unknown incompatible features (0x%x) enabled.\n"
+"Filesystem can not be safely mounted by this kernel.",
+                                (sbp->sb_features_incompat &
+                                                XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+                        return -EINVAL;
+                }
+        }
+        /*
+         * The old combined OQUOTA flag bits and the separate group/project
+         * quota flag bits must not appear in the wrong superblock flavour.
+         */
+        if (xfs_sb_version_has_pquotino(sbp)) {
+                if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
+                        xfs_notice(mp,
+                           "Version 5 of Super block has XFS_OQUOTA bits.");
+                        return -EFSCORRUPTED;
+                }
+        } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
+                                XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
+                        xfs_notice(mp,
+"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.");
+                        return -EFSCORRUPTED;
+        }
+        /*
+         * If the log device and data device have the
+         * same device number, the log is internal.
+         * Consequently, the sb_logstart should be non-zero.  If
+         * we have a zero sb_logstart in this case, we may be trying to mount
+         * a volume filesystem in a non-volume manner.
+         */
+        if (unlikely(
+            sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
+                xfs_warn(mp,
+                "filesystem is marked as having an external log; "
+                "specify logdev on the mount command line.");
+                return -EINVAL;
+        }
+        if (unlikely(
+            sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
+                xfs_warn(mp,
+                "filesystem is marked as having an internal log; "
+                "do not specify logdev on the mount command line.");
+                return -EINVAL;
+        }
+        /*
+         * More sanity checking.  Most of these were stolen directly from
+         * xfs_repair.
+         *
+         * The realtime extent size in bytes is computed in 64 bits so that
+         * a corrupt sb_rextsize cannot wrap the product back into the
+         * valid range and slip past the bounds checks.
+         */
+        if (unlikely(
+            sbp->sb_agcount <= 0                                        ||
+            sbp->sb_sectsize < XFS_MIN_SECTORSIZE                       ||
+            sbp->sb_sectsize > XFS_MAX_SECTORSIZE                       ||
+            sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG                    ||
+            sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG                    ||
+            sbp->sb_sectsize != (1 << sbp->sb_sectlog)                  ||
+            sbp->sb_blocksize < XFS_MIN_BLOCKSIZE                       ||
+            sbp->sb_blocksize > XFS_MAX_BLOCKSIZE                       ||
+            sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG                    ||
+            sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG                    ||
+            sbp->sb_blocksize != (1 << sbp->sb_blocklog)                ||
+            sbp->sb_inodesize < XFS_DINODE_MIN_SIZE                     ||
+            sbp->sb_inodesize > XFS_DINODE_MAX_SIZE                     ||
+            sbp->sb_inodelog < XFS_DINODE_MIN_LOG                       ||
+            sbp->sb_inodelog > XFS_DINODE_MAX_LOG                       ||
+            sbp->sb_inodesize != (1 << sbp->sb_inodelog)                ||
+            sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
+            (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)   ||
+            ((__uint64_t)sbp->sb_rextsize * sbp->sb_blocksize >
+                                                XFS_MAX_RTEXTSIZE)      ||
+            ((__uint64_t)sbp->sb_rextsize * sbp->sb_blocksize <
+                                                XFS_MIN_RTEXTSIZE)      ||
+            (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */)    ||
+            sbp->sb_dblocks == 0                                        ||
+            sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp)                      ||
+            sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp)                      ||
+            sbp->sb_shared_vn != 0)) {
+                xfs_notice(mp, "SB sanity check failed");
+                return -EFSCORRUPTED;
+        }
+        /*
+         * Until this is fixed only page-sized or smaller data blocks work.
+         */
+        if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
+                xfs_warn(mp,
+                "File system with blocksize %d bytes. "
+                "Only pagesize (%ld) or less will currently work.",
+                                sbp->sb_blocksize, PAGE_SIZE);
+                return -ENOSYS;
+        }
+        /*
+         * Currently only very few inode sizes are supported.
+         */
+        switch (sbp->sb_inodesize) {
+        case 256:
+        case 512:
+        case 1024:
+        case 2048:
+                break;
+        default:
+                xfs_warn(mp, "inode size of %d bytes not supported",
+                                sbp->sb_inodesize);
+                return -ENOSYS;
+        }
+        if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
+            xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
+                xfs_warn(mp,
+                "file system too large to be mounted on this system.");
+                return -EFBIG;
+        }
+        if (check_inprogress && sbp->sb_inprogress) {
+                xfs_warn(mp, "Offline file system operation in progress!");
+                return -EFSCORRUPTED;
+        }
+        return 0;
+}
+/*
+ * Normalise the quota fields of an in-core superblock that has just been
+ * read from disk: map zero quota inode numbers to NULLFSINO and, for
+ * superblocks without a separate project quota inode, translate the old
+ * combined OQUOTA flags and the shared sb_gquotino slot into their
+ * separate group/project in-core forms.
+ */
+void
+xfs_sb_quota_from_disk(struct xfs_sb *sbp)
+{
+        /*
+         * older mkfs doesn't initialize quota inodes to NULLFSINO. This
+         * leads to in-core values having two different values for a quota
+         * inode to be invalid: 0 and NULLFSINO. Change it to a single value
+         * NULLFSINO.
+         *
+         * Note that this change affects only the in-core values. These
+         * values are not written back to disk unless any quota information
+         * is written to the disk. Even in that case, sb_pquotino field is
+         * not written to disk unless the superblock supports pquotino.
+         */
+        if (!sbp->sb_uquotino)
+                sbp->sb_uquotino = NULLFSINO;
+        if (!sbp->sb_gquotino)
+                sbp->sb_gquotino = NULLFSINO;
+        if (!sbp->sb_pquotino)
+                sbp->sb_pquotino = NULLFSINO;
+        /*
+         * The remaining manipulations are needed only when working with
+         * an older version of the on-disk superblock.
+         */
+        if (xfs_sb_version_has_pquotino(sbp))
+                return;
+        /*
+         * Expand each OQUOTA flag into the project or group flavour,
+         * depending on which kind of accounting is enabled.
+         */
+        if (sbp->sb_qflags & XFS_OQUOTA_ENFD) {
+                if (sbp->sb_qflags & XFS_PQUOTA_ACCT)
+                        sbp->sb_qflags |= XFS_PQUOTA_ENFD;
+                else
+                        sbp->sb_qflags |= XFS_GQUOTA_ENFD;
+        }
+        if (sbp->sb_qflags & XFS_OQUOTA_CHKD) {
+                if (sbp->sb_qflags & XFS_PQUOTA_ACCT)
+                        sbp->sb_qflags |= XFS_PQUOTA_CHKD;
+                else
+                        sbp->sb_qflags |= XFS_GQUOTA_CHKD;
+        }
+        sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
+        if (!(sbp->sb_qflags & XFS_PQUOTA_ACCT))
+                return;
+        /*
+         * In older version of superblock, on-disk superblock only
+         * has sb_gquotino, and in-core superblock has both sb_gquotino
+         * and sb_pquotino. But, only one of them is supported at any
+         * point of time. So, if PQUOTA is set in disk superblock,
+         * copy over sb_gquotino to sb_pquotino.
+         */
+        sbp->sb_pquotino = sbp->sb_gquotino;
+        sbp->sb_gquotino = NULLFSINO;
+}
+/*
+ * Convert an on-disk (big-endian) superblock into its in-core form.
+ *
+ * @to:   in-core superblock to fill in; every member listed in
+ *        xfs_sb_info[] is written, so @to need not be pre-initialised
+ * @from: on-disk superblock to read
+ *
+ * Multi-byte integers are byte-swapped to CPU order, single-byte fields
+ * are copied directly, and the uuid and fname byte arrays are copied
+ * without translation.  No quota translation is done here; see
+ * xfs_sb_quota_from_disk().
+ */
+void
+xfs_sb_from_disk(
+        struct xfs_sb   *to,
+        xfs_dsb_t       *from)
+{
+        to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
+        to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
+        to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
+        to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
+        to->sb_rextents = be64_to_cpu(from->sb_rextents);
+        memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
+        to->sb_logstart = be64_to_cpu(from->sb_logstart);
+        to->sb_rootino = be64_to_cpu(from->sb_rootino);
+        to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
+        to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
+        to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
+        to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
+        to->sb_agcount = be32_to_cpu(from->sb_agcount);
+        to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
+        to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
+        to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
+        to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
+        to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
+        to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
+        memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
+        to->sb_blocklog = from->sb_blocklog;
+        to->sb_sectlog = from->sb_sectlog;
+        to->sb_inodelog = from->sb_inodelog;
+        to->sb_inopblog = from->sb_inopblog;
+        to->sb_agblklog = from->sb_agblklog;
+        to->sb_rextslog = from->sb_rextslog;
+        to->sb_inprogress = from->sb_inprogress;
+        to->sb_imax_pct = from->sb_imax_pct;
+        to->sb_icount = be64_to_cpu(from->sb_icount);
+        to->sb_ifree = be64_to_cpu(from->sb_ifree);
+        to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
+        to->sb_frextents = be64_to_cpu(from->sb_frextents);
+        to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
+        to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
+        to->sb_qflags = be16_to_cpu(from->sb_qflags);
+        to->sb_flags = from->sb_flags;
+        to->sb_shared_vn = from->sb_shared_vn;
+        to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
+        to->sb_unit = be32_to_cpu(from->sb_unit);
+        to->sb_width = be32_to_cpu(from->sb_width);
+        to->sb_dirblklog = from->sb_dirblklog;
+        to->sb_logsectlog = from->sb_logsectlog;
+        to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
+        to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
+        to->sb_features2 = be32_to_cpu(from->sb_features2);
+        to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
+        to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
+        to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
+        to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
+        to->sb_features_log_incompat =
+                                be32_to_cpu(from->sb_features_log_incompat);
+        /*
+         * The checksum is verified and regenerated on the buffer itself by
+         * the read/write verifiers, so the in-core copy carries no
+         * information.  Zero it so that a destination which was not
+         * pre-zeroed (e.g. an on-stack struct xfs_sb) is fully
+         * initialised.
+         */
+        to->sb_crc = 0;
+        to->sb_pad = 0;
+        to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
+        to->sb_lsn = be64_to_cpu(from->sb_lsn);
+}
+/*
+ * Translate the in-core quota flags and quota inode numbers into the
+ * on-disk layout used by superblocks that have no separate project quota
+ * inode.  Does nothing when the superblock has a pquotino field.
+ *
+ * @to:     on-disk superblock being updated
+ * @from:   in-core superblock to read from
+ * @fields: in/out mask of fields the caller wants written; the bits for
+ *          fields handled here (XFS_SB_QFLAGS when it was set, and always
+ *          XFS_SB_GQUOTINO and XFS_SB_PQUOTINO) are cleared so that the
+ *          caller does not copy them again
+ */
+static inline void
+xfs_sb_quota_to_disk(
+        xfs_dsb_t       *to,
+        xfs_sb_t        *from,
+        __int64_t       *fields)
+{
+        __uint16_t      qflags = from->sb_qflags;
+        /*
+         * We need to do these manipulations only if we are working
+         * with an older version of on-disk superblock.
+         */
+        if (xfs_sb_version_has_pquotino(from))
+                return;
+        if (*fields & XFS_SB_QFLAGS) {
+                /*
+                 * The in-core version of sb_qflags do not have
+                 * XFS_OQUOTA_* flags, whereas the on-disk version
+                 * does.  So, convert incore XFS_{PG}QUOTA_* flags
+                 * to on-disk XFS_OQUOTA_* flags.
+                 */
+                qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+                                XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
+                if (from->sb_qflags &
+                                (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+                        qflags |= XFS_OQUOTA_ENFD;
+                if (from->sb_qflags &
+                                (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+                        qflags |= XFS_OQUOTA_CHKD;
+                to->sb_qflags = cpu_to_be16(qflags);
+                *fields &= ~XFS_SB_QFLAGS;
+        }
+        /*
+         * GQUOTINO and PQUOTINO cannot be used together in versions of
+         * superblock that do not have pquotino. from->sb_qflags tells us which
+         * quota is active and should be copied to disk. If neither are active,
+         * make sure we write NULLFSINO to the sb_gquotino field as a quota
+         * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature
+         * bit is set.
+         *
+         * Note that we don't need to handle the sb_uquotino or sb_pquotino here
+         * as they do not require any translation. Hence the main sb field loop
+         * will write them appropriately from the in-core superblock.
+         */
+        if ((*fields & XFS_SB_GQUOTINO) &&
+                                (from->sb_qflags & XFS_GQUOTA_ACCT))
+                to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+        else if ((*fields & XFS_SB_PQUOTINO) &&
+                                (from->sb_qflags & XFS_PQUOTA_ACCT))
+                /* project quota inode shares the on-disk sb_gquotino slot */
+                to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
+        else {
+                /*
+                 * We can't rely on just the fields being logged to tell us
+                 * that it is safe to write NULLFSINO - we should only do that
+                 * if quotas are not actually enabled. Hence only write
+                 * NULLFSINO if both in-core quota inodes are NULL.
+                 */
+                if (from->sb_gquotino == NULLFSINO &&
+                    from->sb_pquotino == NULLFSINO)
+                        to->sb_gquotino = cpu_to_be64(NULLFSINO);
+        }
+        *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO);
+}
+/*
+ * Copy in core superblock to ondisk one.
+ *
+ * The fields argument is mask of superblock fields to copy.  Quota
+ * fields needing translation are handled first by xfs_sb_quota_to_disk(),
+ * which removes them from the mask.  Each remaining field is located via
+ * xfs_sb_info[]: single-byte and binary/string fields are copied
+ * verbatim, while 2, 4 and 8 byte integers are converted to big-endian.
+ */
+void
+xfs_sb_to_disk(
+        xfs_dsb_t       *to,
+        xfs_sb_t        *from,
+        __int64_t       fields)
+{
+        xfs_caddr_t     to_ptr = (xfs_caddr_t)to;
+        xfs_caddr_t     from_ptr = (xfs_caddr_t)from;
+        xfs_caddr_t     dst;
+        xfs_caddr_t     src;
+        xfs_sb_field_t  f;
+        int             size;
+        ASSERT(fields);
+        if (!fields)
+                return;
+        xfs_sb_quota_to_disk(to, from, &fields);
+        /*
+         * Walk the mask from the lowest set bit upwards, clearing each
+         * bit once its field has been copied.
+         */
+        for (; fields; fields &= ~(1LL << f)) {
+                f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+                /* field size is the distance to the next table entry */
+                size = xfs_sb_info[f + 1].offset - xfs_sb_info[f].offset;
+                dst = to_ptr + xfs_sb_info[f].offset;
+                src = from_ptr + xfs_sb_info[f].offset;
+                ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
+                if (size == 1 || xfs_sb_info[f].type == 1) {
+                        /* no byte order to fix up */
+                        memcpy(dst, src, size);
+                        continue;
+                }
+                switch (size) {
+                case 2:
+                        *(__be16 *)dst = cpu_to_be16(*(__u16 *)src);
+                        break;
+                case 4:
+                        *(__be32 *)dst = cpu_to_be32(*(__u32 *)src);
+                        break;
+                case 8:
+                        *(__be64 *)dst = cpu_to_be64(*(__u64 *)src);
+                        break;
+                default:
+                        ASSERT(0);
+                }
+        }
+}
+/*
+ * Convert the superblock held in @bp to its in-core form and validate it.
+ * @check_version is passed through to xfs_mount_validate_sb().
+ *
+ * Returns 0 or the negative errno from xfs_mount_validate_sb().
+ */
+static int
+xfs_sb_verify(
+        struct xfs_buf  *bp,
+        bool            check_version)
+{
+        struct xfs_sb   sb;
+        bool            is_primary;
+        xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+        /*
+         * Only check the in progress field for the primary superblock as
+         * mkfs.xfs doesn't clear it from secondary superblocks.
+         */
+        is_primary = (bp->b_bn == XFS_SB_DADDR);
+        return xfs_mount_validate_sb(bp->b_target->bt_mount, &sb, is_primary,
+                                     check_version);
+}
+/*
+ * If the superblock has the CRC feature bit set or the CRC field is non-null,
+ * check that the CRC is valid.  We check the CRC field is non-null because a
+ * single bit error could clear the feature bit and unused parts of the
+ * superblock are supposed to be zero. Hence a non-null crc field indicates that
+ * we've potentially lost a feature bit and we should check it anyway.
+ *
+ * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the
+ * last field in V4 secondary superblocks.  So for secondary superblocks,
+ * we are more forgiving, and ignore CRC failures if the primary doesn't
+ * indicate that the fs version is V5.
+ *
+ * Any failure is recorded on the buffer with xfs_buf_ioerror(); corruption
+ * and bad-CRC failures are additionally reported via xfs_verifier_error().
+ */
+static void
+xfs_sb_read_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount *mp = bp->b_target->bt_mount;
+        struct xfs_dsb  *dsb = XFS_BUF_TO_SBP(bp);
+        bool            want_crc;
+        int             error;
+        /*
+         * open code the version check to avoid needing to convert the entire
+         * superblock from disk order just to check the version number
+         */
+        want_crc = dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
+                   (((be16_to_cpu(dsb->sb_versionnum) &
+                      XFS_SB_VERSION_NUMBITS) == XFS_SB_VERSION_5) ||
+                    dsb->sb_crc != 0);
+        /* Only fail bad secondaries on a known V5 filesystem */
+        if (want_crc &&
+            !xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF) &&
+            (bp->b_bn == XFS_SB_DADDR || xfs_sb_version_hascrc(&mp->m_sb)))
+                error = -EFSBADCRC;
+        else
+                error = xfs_sb_verify(bp, true);
+        if (!error)
+                return;
+        xfs_buf_ioerror(bp, error);
+        if (error == -EFSCORRUPTED || error == -EFSBADCRC)
+                xfs_verifier_error(bp);
+}
+/*
+ * Quiet variant of the superblock read verifier.
+ *
+ * We may be probed for a filesystem match, so a buffer that does not carry
+ * the XFS superblock magic is failed with -EWRONGFS without running the
+ * normal verifier.  When the magic does match, the regular (noisy) verifier
+ * runs, because we are really going to mount the filesystem and want to
+ * know about errors.
+ */
+static void
+xfs_sb_quiet_read_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_dsb  *dsb = XFS_BUF_TO_SBP(bp);
+        if (dsb->sb_magicnum != cpu_to_be32(XFS_SB_MAGIC)) {
+                /* not XFS: quietly fail */
+                xfs_buf_ioerror(bp, -EWRONGFS);
+                return;
+        }
+        /* XFS filesystem, verify noisily! */
+        xfs_sb_read_verify(bp);
+}
+/*
+ * Write verifier for superblock buffers.
+ *
+ * The superblock is validated first (without the version check that the
+ * read side requests); a failure is recorded on the buffer and reported via
+ * xfs_verifier_error().  On a CRC-enabled filesystem the LSN of the attached
+ * buffer log item, if there is one, is stamped into sb_lsn and the checksum
+ * is then recomputed over the buffer.
+ */
+static void
+xfs_sb_write_verify(
+        struct xfs_buf          *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        struct xfs_buf_log_item *bip = bp->b_fspriv;
+        int                     error = xfs_sb_verify(bp, false);
+        if (error) {
+                xfs_buf_ioerror(bp, error);
+                xfs_verifier_error(bp);
+                return;
+        }
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                /* the LSN must be in place before the CRC is calculated */
+                if (bip != NULL)
+                        XFS_BUF_TO_SBP(bp)->sb_lsn =
+                                        cpu_to_be64(bip->bli_item.li_lsn);
+                xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
+        }
+}
+/*
+ * Standard superblock buffer operations: the read side uses the verifier
+ * that reports corruption and CRC errors via xfs_verifier_error().
+ */
+const struct xfs_buf_ops xfs_sb_buf_ops = {
+        .verify_read = xfs_sb_read_verify,
+        .verify_write = xfs_sb_write_verify,
+};
+/*
+ * Superblock buffer operations for probing: the read side fails quietly
+ * with -EWRONGFS when the buffer lacks the XFS superblock magic.  The write
+ * verifier is the same one used by xfs_sb_buf_ops.
+ */
+const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+        .verify_read = xfs_sb_quiet_read_verify,
+        .verify_write = xfs_sb_write_verify,
+};
+/*
+ * xfs_sb_mount_common
+ *
+ * Mount initialization code establishing various mount
+ * fields from the superblock associated with the given
+ * mount structure.
+ *
+ * @mp:  mount structure whose derived fields are filled in
+ * @sbp: superblock the derived values are computed from
+ */
+void
+xfs_sb_mount_common(
+        struct xfs_mount *mp,
+        struct xfs_sb   *sbp)
+{
+        mp->m_agfrotor = mp->m_agirotor = 0;
+        spin_lock_init(&mp->m_agirotor_lock);
+        /*
+         * NOTE(review): this reads the AG count from mp->m_sb while the
+         * superblock reads below go through sbp - confirm callers always
+         * pass &mp->m_sb.
+         */
+        mp->m_maxagi = mp->m_sb.sb_agcount;
+        /* shift counts derived from the log2 fields of the superblock */
+        mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
+        mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
+        mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
+        mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
+        mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
+        /* byte mask and word size/mask for one filesystem block */
+        mp->m_blockmask = sbp->sb_blocksize - 1;
+        mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
+        mp->m_blockwmask = mp->m_blockwsize - 1;
+        /*
+         * Btree record limits.  For each btree type, index 0 is computed
+         * with a last argument of 1 and index 1 with 0; every minimum is
+         * half of the corresponding maximum.
+         */
+        mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
+        mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+        mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
+        mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
+        mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+        mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+        mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
+        mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
+        mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
+        mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+        mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
+        mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
+        /* one filesystem block expressed in basic blocks (uses m_blkbb_log) */
+        mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
+        /*
+         * Inode allocation size: the larger of XFS_INODES_PER_CHUNK and the
+         * inodes-per-block count, then the same amount in filesystem blocks.
+         */
+        mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
+                                        sbp->sb_inopblock);
+        mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+}
+/*
+ * xfs_initialize_perag_data
+ *
+ * Walk the first @agcount allocation groups, read the AGF and AGI of each
+ * so the per-ag structures get populated, and total up the allocated
+ * inodes, free inodes and free blocks they report.  This information is no
+ * longer persistent in the superblock, so the totals are written into the
+ * in-core superblock and the per-cpu counters are reinitialised from it.
+ *
+ * Returns 0 on success or the first error from reading an AGF/AGI.
+ */
+int
+xfs_initialize_perag_data(
+        struct xfs_mount *mp,
+        xfs_agnumber_t  agcount)
+{
+        struct xfs_sb   *sb = &mp->m_sb;
+        xfs_perag_t     *pag;
+        xfs_agnumber_t  agno = 0;
+        uint64_t        free_inodes = 0;
+        uint64_t        alloc_inodes = 0;
+        uint64_t        free_blocks = 0;
+        int             error;
+        while (agno < agcount) {
+                /*
+                 * AGF first, then AGI: between them they populate every
+                 * per-ag field that is summed below.
+                 */
+                error = xfs_alloc_pagf_init(mp, NULL, agno, 0);
+                if (!error)
+                        error = xfs_ialloc_pagi_init(mp, NULL, agno);
+                if (error)
+                        return error;
+                pag = xfs_perag_get(mp, agno);
+                free_inodes += pag->pagi_freecount;
+                alloc_inodes += pag->pagi_count;
+                /*
+                 * Free space is the by-size free blocks plus the free list
+                 * plus the btree blocks; each term is added on its own so
+                 * the arithmetic is always done in 64 bits.
+                 */
+                free_blocks += pag->pagf_freeblks;
+                free_blocks += pag->pagf_flcount;
+                free_blocks += pag->pagf_btreeblks;
+                xfs_perag_put(pag);
+                agno++;
+        }
+        /* Replace the in-core superblock counters with the fresh totals */
+        spin_lock(&mp->m_sb_lock);
+        sb->sb_ifree = free_inodes;
+        sb->sb_icount = alloc_inodes;
+        sb->sb_fdblocks = free_blocks;
+        spin_unlock(&mp->m_sb_lock);
+        /* Fixup the per-cpu counters as well. */
+        xfs_icsb_reinit_counters(mp);
+        return 0;
+}
+/*
+ * xfs_mod_sb() copies the in-core superblock fields selected by @fields
+ * into the superblock buffer and logs the modified byte range in the
+ * transaction @tp.
+ *
+ * It does not provide the higher level of locking that is needed to
+ * protect the in-core superblock from concurrent access.
+ *
+ * @fields is a mask of XFS_SB_* bits and must be non-zero; a zero mask
+ * trips the ASSERT and is otherwise ignored.
+ */
+void
+xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
+{
+        xfs_mount_t     *mp;
+        xfs_buf_t       *bp;
+        xfs_sb_field_t  hi_field;
+        xfs_sb_field_t  lo_field;
+        int             first_byte;
+        int             last_byte;
+        ASSERT(fields);
+        if (fields == 0)
+                return;
+        mp = tp->t_mountp;
+        bp = xfs_trans_getsb(tp, mp, 0);
+        /* translate/copy the selected fields into the disk buffer */
+        xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
+        /*
+         * The logged range runs from the start of the lowest selected field
+         * to the byte just before the field following the highest one.
+         */
+        hi_field = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
+        ASSERT((1LL << hi_field) & XFS_SB_MOD_BITS);
+        last_byte = xfs_sb_info[hi_field + 1].offset - 1;
+        lo_field = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+        ASSERT((1LL << lo_field) & XFS_SB_MOD_BITS);
+        first_byte = xfs_sb_info[lo_field].offset;
+        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+        xfs_trans_log_buf(tp, bp, first_byte, last_byte);
+}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
new file mode 100644
index 000000000000..c43c2d609a24
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -0,0 +1,621 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SB_H__
+#define __XFS_SB_H__
+/*
+ * Super block
+ * Fits into a sector-sized buffer at address 0 of each allocation group.
+ * Only the first of these is ever updated except during growfs.
+ */
+struct xfs_buf;
+struct xfs_mount;
+struct xfs_trans;
+#define XFS_SB_MAGIC            0x58465342      /* 'XFSB' */
+#define XFS_SB_VERSION_1        1               /* 5.3, 6.0.1, 6.1 */
+#define XFS_SB_VERSION_2        2               /* 6.2 - attributes */
+#define XFS_SB_VERSION_3        3               /* 6.2 - new inode version */
+#define XFS_SB_VERSION_4        4               /* 6.2+ - bitmask version */
+#define XFS_SB_VERSION_5        5               /* CRC enabled filesystem */
+#define XFS_SB_VERSION_NUMBITS          0x000f
+#define XFS_SB_VERSION_ALLFBITS         0xfff0
+#define XFS_SB_VERSION_ATTRBIT          0x0010
+#define XFS_SB_VERSION_NLINKBIT         0x0020
+#define XFS_SB_VERSION_QUOTABIT         0x0040
+#define XFS_SB_VERSION_ALIGNBIT         0x0080
+#define XFS_SB_VERSION_DALIGNBIT        0x0100
+#define XFS_SB_VERSION_SHAREDBIT        0x0200
+#define XFS_SB_VERSION_LOGV2BIT         0x0400
+#define XFS_SB_VERSION_SECTORBIT        0x0800
+#define XFS_SB_VERSION_EXTFLGBIT        0x1000
+#define XFS_SB_VERSION_DIRV2BIT         0x2000
+#define XFS_SB_VERSION_BORGBIT          0x4000  /* ASCII only case-insens. */
+#define XFS_SB_VERSION_MOREBITSBIT      0x8000
+/*
+ * Supported feature bit list is just all bits in the versionnum field because
+ * we've used them all up and understand them all. Except, of course, for the
+ * shared superblock bit, which nobody knows what it does and so is unsupported.
+ */
+#define XFS_SB_VERSION_OKBITS           \
+        ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
+                ~XFS_SB_VERSION_SHAREDBIT)
+/*
+ * There are two words to hold XFS "feature" bits: the original
+ * word, sb_versionnum, and sb_features2.  Whenever a bit is set in
+ * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
+ *
+ * These defines represent bits in sb_features2.
+ */
+#define XFS_SB_VERSION2_RESERVED1BIT    0x00000001
+#define XFS_SB_VERSION2_LAZYSBCOUNTBIT  0x00000002      /* Superblk counters */
+#define XFS_SB_VERSION2_RESERVED4BIT    0x00000004
+#define XFS_SB_VERSION2_ATTR2BIT        0x00000008      /* Inline attr rework */
+#define XFS_SB_VERSION2_PARENTBIT       0x00000010      /* parent pointers */
+#define XFS_SB_VERSION2_PROJID32BIT     0x00000080      /* 32 bit project id */
+#define XFS_SB_VERSION2_CRCBIT          0x00000100      /* metadata CRCs */
+#define XFS_SB_VERSION2_FTYPE           0x00000200      /* inode type in dir */
+#define XFS_SB_VERSION2_OKBITS          \
+        (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
+         XFS_SB_VERSION2_ATTR2BIT       | \
+         XFS_SB_VERSION2_PROJID32BIT    | \
+         XFS_SB_VERSION2_FTYPE)
+/*
+ * Superblock - in core version.  Must match the ondisk version below.
+ * Must be padded to 64 bit alignment.
+ *
+ * The fields appear in the same order as in struct xfs_dsb.  XFS_SB_CRC_OFF
+ * is taken from this structure with offsetof() yet is handed to the buffer
+ * checksum helpers that operate on on-disk superblock buffers, so the two
+ * layouts have to stay in step.
+ */
+typedef struct xfs_sb {
+        __uint32_t      sb_magicnum;    /* magic number == XFS_SB_MAGIC */
+        __uint32_t      sb_blocksize;   /* logical block size, bytes */
+        xfs_drfsbno_t   sb_dblocks;     /* number of data blocks */
+        xfs_drfsbno_t   sb_rblocks;     /* number of realtime blocks */
+        xfs_drtbno_t    sb_rextents;    /* number of realtime extents */
+        uuid_t          sb_uuid;        /* file system unique id */
+        xfs_dfsbno_t    sb_logstart;    /* starting block of log if internal */
+        xfs_ino_t       sb_rootino;     /* root inode number */
+        xfs_ino_t       sb_rbmino;      /* bitmap inode for realtime extents */
+        xfs_ino_t       sb_rsumino;     /* summary inode for rt bitmap */
+        xfs_agblock_t   sb_rextsize;    /* realtime extent size, blocks */
+        xfs_agblock_t   sb_agblocks;    /* size of an allocation group */
+        xfs_agnumber_t  sb_agcount;     /* number of allocation groups */
+        xfs_extlen_t    sb_rbmblocks;   /* number of rt bitmap blocks */
+        xfs_extlen_t    sb_logblocks;   /* number of log blocks */
+        __uint16_t      sb_versionnum;  /* header version == XFS_SB_VERSION */
+        __uint16_t      sb_sectsize;    /* volume sector size, bytes */
+        __uint16_t      sb_inodesize;   /* inode size, bytes */
+        __uint16_t      sb_inopblock;   /* inodes per block */
+        char            sb_fname[12];   /* file system name */
+        __uint8_t       sb_blocklog;    /* log2 of sb_blocksize */
+        __uint8_t       sb_sectlog;     /* log2 of sb_sectsize */
+        __uint8_t       sb_inodelog;    /* log2 of sb_inodesize */
+        __uint8_t       sb_inopblog;    /* log2 of sb_inopblock */
+        __uint8_t       sb_agblklog;    /* log2 of sb_agblocks (rounded up) */
+        __uint8_t       sb_rextslog;    /* log2 of sb_rextents */
+        __uint8_t       sb_inprogress;  /* mkfs is in progress, don't mount */
+        __uint8_t       sb_imax_pct;    /* max % of fs for inode space */
+                                        /* statistics */
+        /*
+         * These fields must remain contiguous.  If you really
+         * want to change their layout, make sure you fix the
+         * code in xfs_trans_apply_sb_deltas().
+         */
+        __uint64_t      sb_icount;      /* allocated inodes */
+        __uint64_t      sb_ifree;       /* free inodes */
+        __uint64_t      sb_fdblocks;    /* free data blocks */
+        __uint64_t      sb_frextents;   /* free realtime extents */
+        /*
+         * End contiguous fields.
+         */
+        xfs_ino_t       sb_uquotino;    /* user quota inode */
+        xfs_ino_t       sb_gquotino;    /* group quota inode */
+        __uint16_t      sb_qflags;      /* quota flags */
+        __uint8_t       sb_flags;       /* misc. flags */
+        __uint8_t       sb_shared_vn;   /* shared version number */
+        xfs_extlen_t    sb_inoalignmt;  /* inode chunk alignment, fsblocks */
+        __uint32_t      sb_unit;        /* stripe or raid unit */
+        __uint32_t      sb_width;       /* stripe or raid width */
+        __uint8_t       sb_dirblklog;   /* log2 of dir block size (fsbs) */
+        __uint8_t       sb_logsectlog;  /* log2 of the log sector size */
+        __uint16_t      sb_logsectsize; /* sector size for the log, bytes */
+        __uint32_t      sb_logsunit;    /* stripe unit size for the log */
+        __uint32_t      sb_features2;   /* additional feature bits */
+        /*
+         * bad features2 field as a result of failing to pad the sb
+         * structure to 64 bits. Some machines will be using this field
+         * for features2 bits. Easiest just to mark it bad and not use
+         * it for anything else.
+         */
+        __uint32_t      sb_bad_features2;
+        /* version 5 superblock fields start here */
+        /* feature masks */
+        __uint32_t      sb_features_compat;
+        __uint32_t      sb_features_ro_compat;
+        __uint32_t      sb_features_incompat;
+        __uint32_t      sb_features_log_incompat;
+        __uint32_t      sb_crc;         /* superblock crc */
+        __uint32_t      sb_pad;
+        xfs_ino_t       sb_pquotino;    /* project quota inode */
+        xfs_lsn_t       sb_lsn;         /* last write sequence */
+        /* must be padded to 64 bit alignment */
+} xfs_sb_t;
+#define XFS_SB_CRC_OFF          offsetof(struct xfs_sb, sb_crc)
+/*
+ * Superblock - on disk version.  Must match the in core version above.
+ * Must be padded to 64 bit alignment.
+ *
+ * Multi-byte integer fields are declared big-endian (__be16/__be32/__be64),
+ * with the single exception of sb_crc, which is declared __le32.
+ */
+typedef struct xfs_dsb {
+        __be32          sb_magicnum;    /* magic number == XFS_SB_MAGIC */
+        __be32          sb_blocksize;   /* logical block size, bytes */
+        __be64          sb_dblocks;     /* number of data blocks */
+        __be64          sb_rblocks;     /* number of realtime blocks */
+        __be64          sb_rextents;    /* number of realtime extents */
+        uuid_t          sb_uuid;        /* file system unique id */
+        __be64          sb_logstart;    /* starting block of log if internal */
+        __be64          sb_rootino;     /* root inode number */
+        __be64          sb_rbmino;      /* bitmap inode for realtime extents */
+        __be64          sb_rsumino;     /* summary inode for rt bitmap */
+        __be32          sb_rextsize;    /* realtime extent size, blocks */
+        __be32          sb_agblocks;    /* size of an allocation group */
+        __be32          sb_agcount;     /* number of allocation groups */
+        __be32          sb_rbmblocks;   /* number of rt bitmap blocks */
+        __be32          sb_logblocks;   /* number of log blocks */
+        __be16          sb_versionnum;  /* header version == XFS_SB_VERSION */
+        __be16          sb_sectsize;    /* volume sector size, bytes */
+        __be16          sb_inodesize;   /* inode size, bytes */
+        __be16          sb_inopblock;   /* inodes per block */
+        char            sb_fname[12];   /* file system name */
+        __u8            sb_blocklog;    /* log2 of sb_blocksize */
+        __u8            sb_sectlog;     /* log2 of sb_sectsize */
+        __u8            sb_inodelog;    /* log2 of sb_inodesize */
+        __u8            sb_inopblog;    /* log2 of sb_inopblock */
+        __u8            sb_agblklog;    /* log2 of sb_agblocks (rounded up) */
+        __u8            sb_rextslog;    /* log2 of sb_rextents */
+        __u8            sb_inprogress;  /* mkfs is in progress, don't mount */
+        __u8            sb_imax_pct;    /* max % of fs for inode space */
+                                        /* statistics */
+        /*
+         * These fields must remain contiguous.  If you really
+         * want to change their layout, make sure you fix the
+         * code in xfs_trans_apply_sb_deltas().
+         */
+        __be64          sb_icount;      /* allocated inodes */
+        __be64          sb_ifree;       /* free inodes */
+        __be64          sb_fdblocks;    /* free data blocks */
+        __be64          sb_frextents;   /* free realtime extents */
+        /*
+         * End contiguous fields.
+         */
+        __be64          sb_uquotino;    /* user quota inode */
+        __be64          sb_gquotino;    /* group quota inode */
+        __be16          sb_qflags;      /* quota flags */
+        __u8            sb_flags;       /* misc. flags */
+        __u8            sb_shared_vn;   /* shared version number */
+        __be32          sb_inoalignmt;  /* inode chunk alignment, fsblocks */
+        __be32          sb_unit;        /* stripe or raid unit */
+        __be32          sb_width;       /* stripe or raid width */
+        __u8            sb_dirblklog;   /* log2 of dir block size (fsbs) */
+        __u8            sb_logsectlog;  /* log2 of the log sector size */
+        __be16          sb_logsectsize; /* sector size for the log, bytes */
+        __be32          sb_logsunit;    /* stripe unit size for the log */
+        __be32          sb_features2;   /* additional feature bits */
+        /*
+         * bad features2 field as a result of failing to pad the sb
+         * structure to 64 bits. Some machines will be using this field
+         * for features2 bits. Easiest just to mark it bad and not use
+         * it for anything else.
+         */
+        __be32          sb_bad_features2;
+        /* version 5 superblock fields start here */
+        /* feature masks */
+        __be32          sb_features_compat;
+        __be32          sb_features_ro_compat;
+        __be32          sb_features_incompat;
+        __be32          sb_features_log_incompat;
+        /* NOTE: little-endian, unlike every other integer field here */
+        __le32          sb_crc;         /* superblock crc */
+        __be32          sb_pad;
+        __be64          sb_pquotino;    /* project quota inode */
+        __be64          sb_lsn;         /* last write sequence */
+        /* must be padded to 64 bit alignment */
+} xfs_dsb_t;
+/*
+ * Sequence number values for the fields.
+ *
+ * One enumerator per superblock field, listed in the same order as the
+ * fields of struct xfs_sb.  The values are used as bit positions by
+ * XFS_SB_MVAL() and as indices into xfs_sb_info[] by xfs_mod_sb(), so new
+ * fields must be added in structure order.  XFS_SBS_FIELDCOUNT is the
+ * number of fields and must stay last.
+ */
+typedef enum {
+        XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
+        XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
+        XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
+        XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
+        XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
+        XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
+        XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
+        XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
+        XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
+        XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
+        XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
+        XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
+        XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
+        XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
+        XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
+        XFS_SBS_PQUOTINO, XFS_SBS_LSN,
+        XFS_SBS_FIELDCOUNT
+} xfs_sb_field_t;
+/*
+ * Mask values, defined based on the xfs_sb_field_t values.
+ * Only define the ones we're using.
+ */
+#define XFS_SB_MVAL(x)          (1LL << XFS_SBS_ ## x)
+#define XFS_SB_UUID             XFS_SB_MVAL(UUID)
+#define XFS_SB_FNAME            XFS_SB_MVAL(FNAME)
+#define XFS_SB_ROOTINO          XFS_SB_MVAL(ROOTINO)
+#define XFS_SB_RBMINO           XFS_SB_MVAL(RBMINO)
+#define XFS_SB_RSUMINO          XFS_SB_MVAL(RSUMINO)
+#define XFS_SB_VERSIONNUM       XFS_SB_MVAL(VERSIONNUM)
+#define XFS_SB_UQUOTINO         XFS_SB_MVAL(UQUOTINO)
+#define XFS_SB_GQUOTINO         XFS_SB_MVAL(GQUOTINO)
+#define XFS_SB_QFLAGS           XFS_SB_MVAL(QFLAGS)
+#define XFS_SB_SHARED_VN        XFS_SB_MVAL(SHARED_VN)
+#define XFS_SB_UNIT             XFS_SB_MVAL(UNIT)
+#define XFS_SB_WIDTH            XFS_SB_MVAL(WIDTH)
+#define XFS_SB_ICOUNT           XFS_SB_MVAL(ICOUNT)
+#define XFS_SB_IFREE            XFS_SB_MVAL(IFREE)
+#define XFS_SB_FDBLOCKS         XFS_SB_MVAL(FDBLOCKS)
+#define XFS_SB_FEATURES2        XFS_SB_MVAL(FEATURES2)
+#define XFS_SB_BAD_FEATURES2    XFS_SB_MVAL(BAD_FEATURES2)
+#define XFS_SB_FEATURES_COMPAT  XFS_SB_MVAL(FEATURES_COMPAT)
+#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
+#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
+#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
+#define XFS_SB_CRC              XFS_SB_MVAL(CRC)
+#define XFS_SB_PQUOTINO         XFS_SB_MVAL(PQUOTINO)
+#define XFS_SB_NUM_BITS         ((int)XFS_SBS_FIELDCOUNT)
+#define XFS_SB_ALL_BITS         ((1LL << XFS_SB_NUM_BITS) - 1)
+#define XFS_SB_MOD_BITS         \
+        (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
+         XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
+         XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
+         XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
+         XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
+         XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
+         XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
+/*
+ * Misc. Flags - warning - these will be cleared by xfs_repair unless
+ * a feature bit is set when the flag is used.
+ */
+#define XFS_SBF_NOFLAGS         0x00    /* no flags set */
+#define XFS_SBF_READONLY        0x01    /* only read-only mounts allowed */
+/*
+ * define max. shared version we can interoperate with
+ */
+#define XFS_SB_MAX_SHARED_VN    0
+#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
+/*
+ * Validate the feature bits of a v4 superblock.
+ *
+ * The first XFS version we support is a v4 superblock with V2 directories,
+ * so the DIRV2 bit is mandatory.  Beyond that, neither sb_versionnum nor
+ * (when MOREBITSBIT is set) sb_features2 may carry a bit outside the
+ * supported masks.
+ */
+static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
+{
+        if ((sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT) == 0)
+                return false;
+        /* unknown bits in the primary feature word */
+        if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS)
+                return false;
+        /* sb_features2 is only examined when MOREBITSBIT is set */
+        if (!(sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT))
+                return true;
+        return (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS) == 0;
+}
+/*
+ * Decide whether the superblock version is one we can work with: v5 is
+ * always accepted here, v4 is accepted if its feature bits check out, and
+ * every other version number is rejected.
+ */
+static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
+{
+        switch (XFS_SB_VERSION_NUM(sbp)) {
+        case XFS_SB_VERSION_5:
+                return true;
+        case XFS_SB_VERSION_4:
+                return xfs_sb_good_v4_features(sbp);
+        default:
+                return false;
+        }
+}
+/*
+ * Report whether sb_features2 and sb_bad_features2 disagree.  Older kernels
+ * read/wrote features2 into the wrong slot, so to be safe the two fields
+ * are kept in sync.
+ */
+static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
+{
+        if (sbp->sb_bad_features2 == sbp->sb_features2)
+                return false;
+        return true;
+}
+/* True when XFS_SB_VERSION_ATTRBIT is set in sb_versionnum. */
+static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
+{
+        return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT) != 0;
+}
+/* Set XFS_SB_VERSION_ATTRBIT in the in-core sb_versionnum. */
+static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
+{
+        sbp->sb_versionnum = sbp->sb_versionnum | XFS_SB_VERSION_ATTRBIT;
+}
+/* True when XFS_SB_VERSION_QUOTABIT is set in sb_versionnum. */
+static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
+{
+        return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT) != 0;
+}
+/* Set XFS_SB_VERSION_QUOTABIT in the in-core sb_versionnum. */
+static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
+{
+        sbp->sb_versionnum = sbp->sb_versionnum | XFS_SB_VERSION_QUOTABIT;
+}
+/*
+ * True on any v5 superblock, or when XFS_SB_VERSION_ALIGNBIT is set in
+ * sb_versionnum.
+ */
+static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
+{
+        if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+                return true;
+        return (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT) != 0;
+}
+/* True when XFS_SB_VERSION_DALIGNBIT is set in sb_versionnum. */
+static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
+{
+        return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT) != 0;
+}
+/*
+ * True on any v5 superblock, or when XFS_SB_VERSION_LOGV2BIT is set in
+ * sb_versionnum.
+ */
+static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
+{
+        if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+                return true;
+        return (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT) != 0;
+}
+/*
+ * True on any v5 superblock, or when XFS_SB_VERSION_EXTFLGBIT is set in
+ * sb_versionnum.
+ */
+static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
+{
+        if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+                return true;
+        return (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT) != 0;
+}
+/* True when XFS_SB_VERSION_SECTORBIT is set in sb_versionnum. */
+static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
+{
+        return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) != 0;
+}
+/*
+ * True when XFS_SB_VERSION_BORGBIT (ASCII only case-insensitive) is set in
+ * sb_versionnum.
+ */
+static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
+{
+        return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT) != 0;
+}
+/*
+ * True on any v5 superblock, or when XFS_SB_VERSION_MOREBITSBIT is set in
+ * sb_versionnum (i.e. sb_features2 is in use).
+ */
+static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
+{
+        if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+                return true;
+        return (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) != 0;
+}
+/*
+ * sb_features2 bit version helpers.
+ */
+/*
+ * True on any v5 superblock; otherwise true only when sb_features2 is in
+ * use and has XFS_SB_VERSION2_LAZYSBCOUNTBIT set.
+ */
+static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
+{
+        if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+                return true;
+        if (!xfs_sb_version_hasmorebits(sbp))
+                return false;
+        return (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT) != 0;
+}
+/*
+ * True on any v5 superblock; otherwise true only when sb_features2 is in
+ * use and has XFS_SB_VERSION2_ATTR2BIT set.
+ */
+static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
+{
+        if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+                return true;
+        if (!xfs_sb_version_hasmorebits(sbp))
+                return false;
+        return (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT) != 0;
+}
+/*
+ * Turn on the attr2 feature: flag sb_features2 as in use via MOREBITSBIT
+ * and set the ATTR2 bit in both sb_features2 and sb_bad_features2 so the
+ * two words stay in sync.
+ */
+static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
+{
+        sbp->sb_versionnum = sbp->sb_versionnum | XFS_SB_VERSION_MOREBITSBIT;
+        sbp->sb_features2 = sbp->sb_features2 | XFS_SB_VERSION2_ATTR2BIT;
+        sbp->sb_bad_features2 = sbp->sb_bad_features2 |
+                                XFS_SB_VERSION2_ATTR2BIT;
+}
+/*
+ * Turn off the attr2 feature in both sb_features2 and sb_bad_features2.
+ * If that leaves sb_features2 empty, MOREBITSBIT is cleared as well.
+ */
+static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
+{
+        sbp->sb_features2 = sbp->sb_features2 & ~XFS_SB_VERSION2_ATTR2BIT;
+        sbp->sb_bad_features2 = sbp->sb_bad_features2 &
+                                ~XFS_SB_VERSION2_ATTR2BIT;
+        if (sbp->sb_features2 == 0)
+                sbp->sb_versionnum = sbp->sb_versionnum &
+                                     ~XFS_SB_VERSION_MOREBITSBIT;
+}
+/*
+ * True on any v5 superblock; otherwise true only when sb_features2 is in
+ * use and has XFS_SB_VERSION2_PROJID32BIT set.
+ */
+static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
+{
+        if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+                return true;
+        if (!xfs_sb_version_hasmorebits(sbp))
+                return false;
+        return (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT) != 0;
+}
+/*
+ * Turn on 32 bit project ids: flag sb_features2 as in use via MOREBITSBIT
+ * and set the PROJID32 bit in both sb_features2 and sb_bad_features2.
+ */
+static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
+{
+        sbp->sb_versionnum = sbp->sb_versionnum | XFS_SB_VERSION_MOREBITSBIT;
+        sbp->sb_features2 = sbp->sb_features2 | XFS_SB_VERSION2_PROJID32BIT;
+        sbp->sb_bad_features2 = sbp->sb_bad_features2 |
+                                XFS_SB_VERSION2_PROJID32BIT;
+}
+/*
+ * Extended v5 superblock feature masks. These are to be used for new v5
+ * superblock features only.
+ *
+ * Compat features are new features that old kernels will not notice or affect
+ * and so can mount read-write without issues.
+ *
+ * RO-Compat (read only) are features that old kernels can read but will break
+ * if they write. Hence only read-only mounts of such filesystems are allowed on
+ * kernels that don't support the feature bit.
+ *
+ * InCompat features are features which old kernels will not understand and so
+ * must not mount.
+ *
+ * Log-InCompat features are for changes to log formats or new transactions that
+ * can't be replayed on older kernels. The fields are set when the filesystem is
+ * mounted, and a clean unmount clears the fields.
+ */
+#define XFS_SB_FEAT_COMPAT_ALL 0
+#define XFS_SB_FEAT_COMPAT_UNKNOWN      ~XFS_SB_FEAT_COMPAT_ALL
+/* True when any bit of @feature is set in sb_features_compat. */
+static inline bool
+xfs_sb_has_compat_feature(
+        struct xfs_sb   *sbp,
+        __uint32_t      feature)
+{
+        __uint32_t      present = sbp->sb_features_compat & feature;
+        return present != 0;
+}
+#define XFS_SB_FEAT_RO_COMPAT_FINOBT   (1 << 0)         /* free inode btree */
+#define XFS_SB_FEAT_RO_COMPAT_ALL \
+                (XFS_SB_FEAT_RO_COMPAT_FINOBT)
+#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN   ~XFS_SB_FEAT_RO_COMPAT_ALL
+/* True when any bit of @feature is set in sb_features_ro_compat. */
+static inline bool
+xfs_sb_has_ro_compat_feature(
+        struct xfs_sb   *sbp,
+        __uint32_t      feature)
+{
+        __uint32_t      present = sbp->sb_features_ro_compat & feature;
+        return present != 0;
+}
+#define XFS_SB_FEAT_INCOMPAT_FTYPE      (1 << 0)        /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_ALL \
+                (XFS_SB_FEAT_INCOMPAT_FTYPE)
+#define XFS_SB_FEAT_INCOMPAT_UNKNOWN    ~XFS_SB_FEAT_INCOMPAT_ALL
+/* True when any bit of @feature is set in sb_features_incompat. */
+static inline bool
+xfs_sb_has_incompat_feature(
+        struct xfs_sb   *sbp,
+        __uint32_t      feature)
+{
+        __uint32_t      present = sbp->sb_features_incompat & feature;
+        return present != 0;
+}
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN        ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
+/* True when any bit of @feature is set in sb_features_log_incompat. */
+static inline bool
+xfs_sb_has_incompat_log_feature(
+        struct xfs_sb   *sbp,
+        __uint32_t      feature)
+{
+        __uint32_t      present = sbp->sb_features_log_incompat & feature;
+        return present != 0;
+}
+/*
+ * V5 superblock specific feature checks
+ */
+/* Metadata CRCs are present exactly when the superblock is version 5. */
+static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
+{
+        if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_5)
+                return 0;
+        return 1;
+}
+/* Returns 1 exactly when the superblock is version 5, otherwise 0. */
+static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
+{
+        if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_5)
+                return 0;
+        return 1;
+}
+/*
+ * Returns 1 when the filetype-in-dirent feature is enabled: either a v5
+ * superblock with XFS_SB_FEAT_INCOMPAT_FTYPE set, or a superblock with
+ * sb_features2 in use and XFS_SB_VERSION2_FTYPE set there.  Returns 0
+ * otherwise.
+ */
+static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
+{
+        if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+            xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE))
+                return 1;
+        if (!xfs_sb_version_hasmorebits(sbp))
+                return 0;
+        return (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE) != 0;
+}
+/*
+ * Returns 1 when the superblock is version 5 and has the free inode btree
+ * bit (XFS_SB_FEAT_RO_COMPAT_FINOBT) set in sb_features_ro_compat,
+ * otherwise 0.
+ */
+static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
+{
+        if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_5)
+                return 0;
+        return (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT) != 0;
+}
+/*
+ * end of superblock version helpers
+ */
+/*
+ * True when @ino matches the user, group or project quota inode number
+ * recorded in the superblock.
+ */
+static inline bool
+xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+{
+        if (ino == sbp->sb_uquotino)
+                return true;
+        if (ino == sbp->sb_gquotino)
+                return true;
+        return ino == sbp->sb_pquotino;
+}
+#define XFS_SB_DADDR            ((xfs_daddr_t)0) /* daddr in filesystem/ag */
+#define XFS_SB_BLOCK(mp)        XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
+#define XFS_BUF_TO_SBP(bp)      ((xfs_dsb_t *)((bp)->b_addr))
+#define XFS_HDR_BLOCK(mp,d)     ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
+#define XFS_DADDR_TO_FSB(mp,d)  XFS_AGB_TO_FSB(mp, \
+                        xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
+#define XFS_FSB_TO_DADDR(mp,fsbno)      XFS_AGB_TO_DADDR(mp, \
+                        XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
+/*
+ * File system sector to basic block conversions.
+ */
+#define XFS_FSS_TO_BB(mp,sec)   ((sec) << (mp)->m_sectbb_log)
+/*
+ * File system block to basic block conversions.
+ */
+#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
+#define XFS_BB_TO_FSB(mp,bb)    \
+        (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
+#define XFS_BB_TO_FSBT(mp,bb)   ((bb) >> (mp)->m_blkbb_log)
+/*
+ * File system block to byte conversions.
+ */
+#define XFS_FSB_TO_B(mp,fsbno)  ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSB(mp,b)      \
+        ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSBT(mp,b)     (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_FSB_OFFSET(mp,b)  ((b) & (mp)->m_blockmask)
+/*
+ * perag get/put wrappers for ref counting
+ */
+extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
+extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
+                                           int tag);
+extern void     xfs_perag_put(struct xfs_perag *pag);
+extern int      xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
+extern void     xfs_sb_calc_crc(struct xfs_buf  *);
+extern void     xfs_mod_sb(struct xfs_trans *, __int64_t);
+extern void     xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *);
+extern void     xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
+extern void     xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
+extern void     xfs_sb_quota_from_disk(struct xfs_sb *sbp);
+#endif  /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
new file mode 100644
index 000000000000..82404da2ca67
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SHARED_H__
+#define __XFS_SHARED_H__
+/*
+ * Definitions shared between kernel and userspace that don't fit into any other
+ * header file that is shared with userspace.
+ */
+struct xfs_ifork;
+struct xfs_buf;
+struct xfs_buf_ops;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+/*
+ * Buffer verifier operations are widely used, including userspace tools.
+ * Each ops structure is declared exactly once; keep the list sorted by name
+ * so that duplicates are easy to spot.
+ */
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agfl_buf_ops;
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
+extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
+extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
+extern const struct xfs_buf_ops xfs_sb_buf_ops;
+extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+/*
+ * Transaction types.  Used to distinguish types of buffers. These never reach
+ * the log.
+ *
+ * Values are never renumbered: 17 is retired and 27/28 are placeholders (see
+ * the notes below), so new types are appended and XFS_TRANS_TYPE_MAX bumped.
+ */
+#define XFS_TRANS_SETATTR_NOT_SIZE      1
+#define XFS_TRANS_SETATTR_SIZE          2
+#define XFS_TRANS_INACTIVE              3
+#define XFS_TRANS_CREATE                4
+#define XFS_TRANS_CREATE_TRUNC          5
+#define XFS_TRANS_TRUNCATE_FILE         6
+#define XFS_TRANS_REMOVE                7
+#define XFS_TRANS_LINK                  8
+#define XFS_TRANS_RENAME                9
+#define XFS_TRANS_MKDIR                 10
+#define XFS_TRANS_RMDIR                 11
+#define XFS_TRANS_SYMLINK               12
+#define XFS_TRANS_SET_DMATTRS           13
+#define XFS_TRANS_GROWFS                14
+#define XFS_TRANS_STRAT_WRITE           15
+#define XFS_TRANS_DIOSTRAT              16
+/* 17 was XFS_TRANS_WRITE_SYNC */
+#define XFS_TRANS_WRITEID               18
+#define XFS_TRANS_ADDAFORK              19
+#define XFS_TRANS_ATTRINVAL             20
+#define XFS_TRANS_ATRUNCATE             21
+#define XFS_TRANS_ATTR_SET              22
+#define XFS_TRANS_ATTR_RM               23
+#define XFS_TRANS_ATTR_FLAG             24
+#define XFS_TRANS_CLEAR_AGI_BUCKET      25
+#define XFS_TRANS_QM_SBCHANGE           26
+/*
+ * Dummy entries since we use the transaction type to index into the
+ * trans_type[] in xlog_recover_print_trans_head()
+ */
+#define XFS_TRANS_DUMMY1                27
+#define XFS_TRANS_DUMMY2                28
+#define XFS_TRANS_QM_QUOTAOFF           29
+#define XFS_TRANS_QM_DQALLOC            30
+#define XFS_TRANS_QM_SETQLIM            31
+#define XFS_TRANS_QM_DQCLUSTER          32
+#define XFS_TRANS_QM_QINOCREATE         33
+#define XFS_TRANS_QM_QUOTAOFF_END       34
+#define XFS_TRANS_SB_UNIT               35
+#define XFS_TRANS_FSYNC_TS              36
+#define XFS_TRANS_GROWFSRT_ALLOC        37
+#define XFS_TRANS_GROWFSRT_ZERO         38
+#define XFS_TRANS_GROWFSRT_FREE         39
+#define XFS_TRANS_SWAPEXT               40
+#define XFS_TRANS_SB_COUNT              41
+#define XFS_TRANS_CHECKPOINT            42
+#define XFS_TRANS_ICREATE               43
+#define XFS_TRANS_CREATE_TMPFILE        44
+/* Equals the highest type value assigned above (XFS_TRANS_CREATE_TMPFILE). */
+#define XFS_TRANS_TYPE_MAX              44
+/*
+ * Value-to-name table for the XFS_TRANS_* transaction types defined above.
+ * Every defined type needs an entry here, otherwise it can only be reported
+ * by number; new transaction types also need to be reflected in
+ * xfs_logprint(8).
+ */
+#define XFS_TRANS_TYPES \
+        { XFS_TRANS_SETATTR_NOT_SIZE,   "SETATTR_NOT_SIZE" }, \
+        { XFS_TRANS_SETATTR_SIZE,       "SETATTR_SIZE" }, \
+        { XFS_TRANS_INACTIVE,           "INACTIVE" }, \
+        { XFS_TRANS_CREATE,             "CREATE" }, \
+        { XFS_TRANS_CREATE_TMPFILE,     "CREATE_TMPFILE" }, \
+        { XFS_TRANS_CREATE_TRUNC,       "CREATE_TRUNC" }, \
+        { XFS_TRANS_TRUNCATE_FILE,      "TRUNCATE_FILE" }, \
+        { XFS_TRANS_REMOVE,             "REMOVE" }, \
+        { XFS_TRANS_LINK,               "LINK" }, \
+        { XFS_TRANS_RENAME,             "RENAME" }, \
+        { XFS_TRANS_MKDIR,              "MKDIR" }, \
+        { XFS_TRANS_RMDIR,              "RMDIR" }, \
+        { XFS_TRANS_SYMLINK,            "SYMLINK" }, \
+        { XFS_TRANS_SET_DMATTRS,        "SET_DMATTRS" }, \
+        { XFS_TRANS_GROWFS,             "GROWFS" }, \
+        { XFS_TRANS_STRAT_WRITE,        "STRAT_WRITE" }, \
+        { XFS_TRANS_DIOSTRAT,           "DIOSTRAT" }, \
+        { XFS_TRANS_WRITEID,            "WRITEID" }, \
+        { XFS_TRANS_ADDAFORK,           "ADDAFORK" }, \
+        { XFS_TRANS_ATTRINVAL,          "ATTRINVAL" }, \
+        { XFS_TRANS_ATRUNCATE,          "ATRUNCATE" }, \
+        { XFS_TRANS_ATTR_SET,           "ATTR_SET" }, \
+        { XFS_TRANS_ATTR_RM,            "ATTR_RM" }, \
+        { XFS_TRANS_ATTR_FLAG,          "ATTR_FLAG" }, \
+        { XFS_TRANS_CLEAR_AGI_BUCKET,   "CLEAR_AGI_BUCKET" }, \
+        { XFS_TRANS_QM_SBCHANGE,        "QM_SBCHANGE" }, \
+        { XFS_TRANS_QM_QUOTAOFF,        "QM_QUOTAOFF" }, \
+        { XFS_TRANS_QM_DQALLOC,         "QM_DQALLOC" }, \
+        { XFS_TRANS_QM_SETQLIM,         "QM_SETQLIM" }, \
+        { XFS_TRANS_QM_DQCLUSTER,       "QM_DQCLUSTER" }, \
+        { XFS_TRANS_QM_QINOCREATE,      "QM_QINOCREATE" }, \
+        { XFS_TRANS_QM_QUOTAOFF_END,    "QM_QOFF_END" }, \
+        { XFS_TRANS_SB_UNIT,            "SB_UNIT" }, \
+        { XFS_TRANS_FSYNC_TS,           "FSYNC_TS" }, \
+        { XFS_TRANS_GROWFSRT_ALLOC,     "GROWFSRT_ALLOC" }, \
+        { XFS_TRANS_GROWFSRT_ZERO,      "GROWFSRT_ZERO" }, \
+        { XFS_TRANS_GROWFSRT_FREE,      "GROWFSRT_FREE" }, \
+        { XFS_TRANS_SWAPEXT,            "SWAPEXT" }, \
+        { XFS_TRANS_SB_COUNT,           "SB_COUNT" }, \
+        { XFS_TRANS_CHECKPOINT,         "CHECKPOINT" }, \
+        { XFS_TRANS_ICREATE,            "ICREATE" }, \
+        { XFS_TRANS_DUMMY1,             "DUMMY1" }, \
+        { XFS_TRANS_DUMMY2,             "DUMMY2" }, \
+        { XLOG_UNMOUNT_REC_TYPE,        "UNMOUNT" }
+/*
+ * This structure is used to track log items associated with
+ * a transaction.  It points to the log item and keeps some
+ * flags to track the state of the log item.
+ *
+ * NOTE(review): the comment used to say that the descriptor also tracks the
+ * amount of space needed to log the item, but the structure below has no
+ * size field.
+ */
+struct xfs_log_item_desc {
+        struct xfs_log_item     *lid_item;      /* the log item described */
+        struct list_head        lid_trans;      /* list linkage; presumably the
+                                                   owning transaction's item
+                                                   list -- confirm */
+        unsigned char           lid_flags;      /* state flags; presumably the
+                                                   XFS_LID_* values */
+};
+#define XFS_LID_DIRTY           0x1
+/* log size calculation functions */
+int     xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
+int     xfs_log_calc_minimum_size(struct xfs_mount *);
+/*
+ * Values for t_flags.
+ */
+#define XFS_TRANS_DIRTY         0x01    /* something needs to be logged */
+#define XFS_TRANS_SB_DIRTY      0x02    /* superblock is modified */
+#define XFS_TRANS_PERM_LOG_RES  0x04    /* xact took a permanent log res */
+#define XFS_TRANS_SYNC          0x08    /* make commit synchronous */
+#define XFS_TRANS_DQ_DIRTY      0x10    /* at least one dquot in trx dirty */
+#define XFS_TRANS_RESERVE       0x20    /* OK to use reserved data blocks */
+#define XFS_TRANS_FREEZE_PROT   0x40    /* Transaction has elevated writer
+                                           count in superblock */
+/*
+ * Values for call flags parameter.
+ *
+ * This is a separate flag set from t_flags: 0x4 and 0x8 numerically coincide
+ * with XFS_TRANS_PERM_LOG_RES and XFS_TRANS_SYNC above, so the two sets cannot
+ * share a field.
+ */
+#define XFS_TRANS_RELEASE_LOG_RES       0x4
+#define XFS_TRANS_ABORT                 0x8
+/*
+ * Field values for xfs_trans_mod_sb.
+ */
+#define XFS_TRANS_SB_ICOUNT             0x00000001
+#define XFS_TRANS_SB_IFREE              0x00000002
+#define XFS_TRANS_SB_FDBLOCKS           0x00000004
+#define XFS_TRANS_SB_RES_FDBLOCKS       0x00000008
+#define XFS_TRANS_SB_FREXTENTS          0x00000010
+#define XFS_TRANS_SB_RES_FREXTENTS      0x00000020
+#define XFS_TRANS_SB_DBLOCKS            0x00000040
+#define XFS_TRANS_SB_AGCOUNT            0x00000080
+#define XFS_TRANS_SB_IMAXPCT            0x00000100
+#define XFS_TRANS_SB_REXTSIZE           0x00000200
+#define XFS_TRANS_SB_RBMBLOCKS          0x00000400
+#define XFS_TRANS_SB_RBLOCKS            0x00000800
+#define XFS_TRANS_SB_REXTENTS           0x00001000
+#define XFS_TRANS_SB_REXTSLOG           0x00002000
+/*
+ * Here we centralize the specification of XFS meta-data buffer reference count
+ * values.  This determines how hard the buffer cache tries to hold onto the
+ * buffer.
+ */
+#define XFS_AGF_REF             4
+#define XFS_AGI_REF             4
+#define XFS_AGFL_REF            3
+#define XFS_INO_BTREE_REF       3
+#define XFS_ALLOC_BTREE_REF     2
+#define XFS_BMAP_BTREE_REF      2
+#define XFS_DIR_BTREE_REF       2
+#define XFS_INO_REF             2
+#define XFS_ATTR_BTREE_REF      1
+#define XFS_DQUOT_REF           1
+/*
+ * Flags for xfs_trans_ichgtime().
+ */
+#define XFS_ICHGTIME_MOD        0x1     /* data fork modification timestamp */
+#define XFS_ICHGTIME_CHG        0x2     /* inode field change timestamp */
+#define XFS_ICHGTIME_CREATE     0x4     /* inode create timestamp */
+/*
+ * Symlink decoding/encoding functions
+ */
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+                        uint32_t size, struct xfs_buf *bp);
+bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
+                        uint32_t size, struct xfs_buf *bp);
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+                                 struct xfs_inode *ip, struct xfs_ifork *ifp);
+#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
new file mode 100644
index 000000000000..5782f037eab4
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+/*
+ * Number of filesystem blocks needed to store a symlink target of pathlen
+ * bytes.  Each block has a header, so the space usable for the path in one
+ * block is XFS_SYMLINK_BUF_SPACE() rather than the full block size; the
+ * result is rounded up to a whole number of blocks.
+ */
+int
+xfs_symlink_blocks(
+        struct xfs_mount *mp,
+        int             pathlen)
+{
+        int             space_per_blk;
+        space_per_blk = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+        return (pathlen + space_per_blk - 1) / space_per_blk;
+}
+/*
+ * Write the remote symlink header at the start of the buffer.
+ *
+ * Without the CRC feature there is no header: the buffer is left untouched
+ * and 0 is returned.  Otherwise the header is stamped with the magic number,
+ * the given offset and size, the superblock UUID, the owning inode number and
+ * bp->b_bn, the symlink buffer ops are attached to the buffer, and the size
+ * of the header is returned.
+ */
+int
+xfs_symlink_hdr_set(
+        struct xfs_mount        *mp,
+        xfs_ino_t               ino,
+        uint32_t                offset,
+        uint32_t                size,
+        struct xfs_buf          *bp)
+{
+        struct xfs_dsymlink_hdr *hdr;
+        if (!xfs_sb_version_hascrc(&mp->m_sb))
+                return 0;
+        hdr = bp->b_addr;
+        hdr->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
+        hdr->sl_owner = cpu_to_be64(ino);
+        hdr->sl_blkno = cpu_to_be64(bp->b_bn);
+        hdr->sl_offset = cpu_to_be32(offset);
+        hdr->sl_bytes = cpu_to_be32(size);
+        uuid_copy(&hdr->sl_uuid, &mp->m_sb.sb_uuid);
+        bp->b_ops = &xfs_symlink_buf_ops;
+        return sizeof(*hdr);
+}
+/*
+ * Checking of the symlink header is split into two parts: the verifier does
+ * CRC, location and bounds checking, while this unpacking-time check compares
+ * the path parameters and owner recorded in the header against the values
+ * the caller expects.  Returns true only if offset, size and ino all match.
+ */
+bool
+xfs_symlink_hdr_ok(
+        xfs_ino_t               ino,
+        uint32_t                offset,
+        uint32_t                size,
+        struct xfs_buf          *bp)
+{
+        struct xfs_dsymlink_hdr *hdr = bp->b_addr;
+        return be32_to_cpu(hdr->sl_offset) == offset &&
+               be32_to_cpu(hdr->sl_bytes) == size &&
+               be64_to_cpu(hdr->sl_owner) == ino;
+}
+/*
+ * Structural check of a remote symlink block header.
+ *
+ * Returns false when the filesystem lacks the CRC feature, when the magic
+ * number, UUID or recorded block number do not match this buffer and mount,
+ * when offset + length reaches MAXPATHLEN, or when no owner is recorded.
+ * The checksum itself is checked by the caller.
+ */
+static bool
+xfs_symlink_verify(
+        struct xfs_buf          *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+        if (!xfs_sb_version_hascrc(&mp->m_sb))
+                return false;
+        if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+                return false;
+        if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
+                return false;
+        if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
+                return false;
+        /*
+         * Both fields are 32 bits wide on disk.  Widen before adding so that
+         * a corrupt offset/length pair cannot wrap around and slip under the
+         * limit.
+         */
+        if ((uint64_t)be32_to_cpu(dsl->sl_offset) +
+                                be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
+                return false;
+        if (dsl->sl_owner == 0)
+                return false;
+        return true;
+}
+/*
+ * Read verifier for remote symlink blocks.  Nothing is checked on non-CRC
+ * filesystems.  A checksum failure is flagged as -EFSBADCRC and a header
+ * failure as -EFSCORRUPTED; any error present on the buffer afterwards is
+ * reported through xfs_verifier_error().
+ */
+static void
+xfs_symlink_read_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount *mp = bp->b_target->bt_mount;
+        /* no verification of non-crc buffers */
+        if (!xfs_sb_version_hascrc(&mp->m_sb))
+                return;
+        if (xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) {
+                /* only look at the header once the checksum is good */
+                if (!xfs_symlink_verify(bp))
+                        xfs_buf_ioerror(bp, -EFSCORRUPTED);
+        } else {
+                xfs_buf_ioerror(bp, -EFSBADCRC);
+        }
+        if (bp->b_error)
+                xfs_verifier_error(bp);
+}
+/*
+ * Write verifier for remote symlink blocks.  Nothing is done on non-CRC
+ * filesystems.  A header that fails xfs_symlink_verify() marks the buffer
+ * -EFSCORRUPTED and is reported; otherwise the LSN of the attached buffer log
+ * item (if any) is stamped into the header and the checksum is recomputed.
+ */
+static void
+xfs_symlink_write_verify(
+        struct xfs_buf  *bp)
+{
+        struct xfs_mount        *mp = bp->b_target->bt_mount;
+        struct xfs_dsymlink_hdr *hdr = bp->b_addr;
+        struct xfs_buf_log_item *bip;
+        /* no verification of non-crc buffers */
+        if (!xfs_sb_version_hascrc(&mp->m_sb))
+                return;
+        if (!xfs_symlink_verify(bp)) {
+                xfs_buf_ioerror(bp, -EFSCORRUPTED);
+                xfs_verifier_error(bp);
+                return;
+        }
+        /* b_fspriv is NULL when no log item is attached to the buffer */
+        bip = bp->b_fspriv;
+        if (bip)
+                hdr->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+        xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
+}
+/*
+ * Buffer ops for remote symlink blocks: wires up the read and write verifiers
+ * defined above.  xfs_symlink_hdr_set() and xfs_symlink_local_to_remote()
+ * attach this to bp->b_ops on CRC-enabled filesystems.
+ */
+const struct xfs_buf_ops xfs_symlink_buf_ops = {
+        .verify_read = xfs_symlink_read_verify,
+        .verify_write = xfs_symlink_write_verify,
+};
+/*
+ * Copy the symlink target held in the inode fork (ifp) into the buffer bp.
+ *
+ * On non-CRC filesystems the data goes at the very start of the buffer and
+ * no buffer ops are attached.  On CRC filesystems a symlink header covering
+ * the whole target (offset 0, ifp->if_bytes bytes) is written first and the
+ * data follows it.  tp is not used here.
+ */
+void
+xfs_symlink_local_to_remote(
+        struct xfs_trans        *tp,
+        struct xfs_buf          *bp,
+        struct xfs_inode        *ip,
+        struct xfs_ifork        *ifp)
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        char                    *dst = bp->b_addr;
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                /*
+                 * As this symlink fits in an inode literal area, it must
+                 * also fit in the smallest buffer the filesystem supports.
+                 */
+                ASSERT(BBTOB(bp->b_length) >=
+                                ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
+                bp->b_ops = &xfs_symlink_buf_ops;
+                /* skip past the header that was just written */
+                dst += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
+        } else {
+                bp->b_ops = NULL;
+        }
+        memcpy(dst, ifp->if_u1.if_data, ifp->if_bytes);
+}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
new file mode 100644
index 000000000000..f2bda7c76b8a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -0,0 +1,894 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+/*
+ * Per-buffer overhead in the log.  Besides its data, a logged buffer needs a
+ * log op header and a buffer log format structure, so transaction
+ * reservations have to include them.  The sum is rounded up to a multiple of
+ * 128 bytes so that the historical reservation used for this overhead does
+ * not change.
+ */
+STATIC uint
+xfs_buf_log_overhead(void)
+{
+        const size_t    raw = sizeof(struct xlog_op_header) +
+                              sizeof(struct xfs_buf_log_format);
+        return round_up(raw, 128);
+}
+/*
+ * Calculate the transaction log reservation, in bytes, for a set of buffers.
+ *
+ * nbufs is the number of items that will be changed in the transaction and
+ * size is the number of bytes to reserve for each of them; every item is
+ * additionally charged the fixed xfs_buf_log_overhead().
+ */
+STATIC uint
+xfs_calc_buf_res(
+        uint            nbufs,
+        uint            size)
+{
+        uint            per_buf = size + xfs_buf_log_overhead();
+        return nbufs * per_buf;
+}
+/*
+ * Logging inodes is really tricksy. They are logged in memory format,
+ * which means that what we write into the log doesn't directly translate into
+ * the amount of space they use on disk.
+ *
+ * Case in point - btree format forks in memory format use more space than the
+ * on-disk format. In memory, the buffer contains a normal btree block header so
+ * the btree code can treat it as though it is just another generic buffer.
+ * However, when we write it to the inode fork, we don't write all of this
+ * header as it isn't needed. e.g. the root is only ever in the inode, so
+ * there's no need for sibling pointers which would waste 16 bytes of space.
+ *
+ * Hence when we have an inode with a maximally sized btree format fork, the
+ * amount of information we actually log is greater than the size of the inode
+ * on disk. Hence we need an inode reservation function that calculates all this
+ * correctly. So, we log:
+ *
+ * - 4 log op headers for object
+ *      - for the ilf, the inode core and 2 forks
+ * - inode log format object
+ * - the inode core
+ * - two inode forks containing bmap btree root blocks.
+ *      - the btree data contained by both forks will fit into the inode size,
+ *        hence when combined with the inode core above, we have a total of the
+ *        actual inode size.
+ *      - the BMBT headers need to be accounted separately, as they are
+ *        additional to the records and pointers that fit inside the inode
+ *        forks.
+ */
+STATIC uint
+xfs_calc_inode_res(
+        struct xfs_mount        *mp,
+        uint                    ninodes)
+{
+        /* log space for one inode; scaled by ninodes below */
+        size_t                  per_inode;
+        per_inode = 4 * sizeof(struct xlog_op_header) + /* ilf, core, 2 forks */
+                    sizeof(struct xfs_inode_log_format) +
+                    mp->m_sb.sb_inodesize +             /* core + fork data */
+                    2 * XFS_BMBT_BLOCK_LEN(mp);         /* BMBT header per fork */
+        return ninodes * per_inode;
+}
+/*
+ * The free inode btree is a conditional feature and the log reservation
+ * requirements differ slightly from that of the traditional inode allocation
+ * btree. The finobt tracks records for inode chunks with at least one free
+ * inode. A record can be removed from the tree for an inode allocation
+ * or free and thus the finobt reservation is unconditional across:
+ *
+ *      - inode allocation
+ *      - inode free
+ *      - inode chunk allocation
+ *
+ * The 'modify' param indicates to include the record modification scenario. The
+ * 'alloc' param indicates to include the reservation for free space btree
+ * modifications on behalf of finobt modifications. This is required only for
+ * transactions that do not already account for free space btree modifications.
+ *
+ * the free inode btree: max depth * block size
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the free inode btree entry: block size
+ */
+STATIC uint
+xfs_calc_finobt_res(
+        struct xfs_mount        *mp,
+        int                     alloc,
+        int                     modify)
+{
+        uint                    blocksize;
+        uint                    total;
+        /* nothing to reserve when the feature is not enabled */
+        if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+                return 0;
+        blocksize = (uint)XFS_FSB_TO_B(mp, 1);
+        /* the free inode btree itself: max depth * block size */
+        total = xfs_calc_buf_res(mp->m_in_maxlevels, blocksize);
+        /* free space btree updates made on behalf of the finobt */
+        if (alloc)
+                total += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                          blocksize);
+        /* the record modification: one block, no per-buffer overhead */
+        if (modify)
+                total += blocksize;
+        return total;
+}
+/*
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate.  Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note:  Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
+ * extents in only a single AG at a time.  This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction.  See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents.  This gives:
+ *    the inode getting the new extents: inode size
+ *    the inode's bmap btree: max depth * block size
+ *    the agfs of the ags from which the extents are allocated: 2 * sector
+ *    the superblock free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ *    the agfs of the ags containing the blocks: 2 * sector size
+ *    the agfls of the ags containing the blocks: 2 * sector size
+ *    the super block free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_write_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    blocksize = XFS_FSB_TO_B(mp, 1);
+        uint    sectsize = mp->m_sb.sb_sectsize;
+        uint    alloc_res;      /* allocating the new extents */
+        uint    free_res;       /* bmap_finish freeing bmap blocks */
+        alloc_res = xfs_calc_inode_res(mp, 1) +
+                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+                                     blocksize) +
+                    xfs_calc_buf_res(3, sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), blocksize);
+        free_res = xfs_calc_buf_res(5, sectsize) +
+                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), blocksize);
+        /* the larger of the two cases, on top of XFS_DQUOT_LOGRES */
+        return XFS_DQUOT_LOGRES(mp) + MAX(alloc_res, free_res);
+}
+/*
+ * In truncating a file we free up to two extents at once.  We can modify:
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *              4 exts * 2 trees * (2 * max depth - 1) * block size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_itruncate_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    blocksize = XFS_FSB_TO_B(mp, 1);
+        uint    trunc_res;      /* the inode and its bmap btree */
+        uint    free_res;       /* bmap_finish freeing the blocks */
+        trunc_res = xfs_calc_inode_res(mp, 1) +
+                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
+                                     blocksize);
+        free_res = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), blocksize) +
+                   xfs_calc_buf_res(5, 0) +
+                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), blocksize) +
+                   xfs_calc_buf_res(2 + mp->m_ialloc_blks +
+                                    mp->m_in_maxlevels, 0);
+        /* the larger of the two cases, on top of XFS_DQUOT_LOGRES */
+        return XFS_DQUOT_LOGRES(mp) + MAX(trunc_res, free_res);
+}
+/*
+ * In renaming a file we can modify:
+ *    the four inodes involved: 4 * inode size
+ *    the two directory btrees: 2 * (max depth + v2) * dir block size
+ *    the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ *      of bmap blocks) giving:
+ *    the agf for the ags in which the blocks live: 3 * sector size
+ *    the agfl for the ags in which the blocks live: 3 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_rename_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    blocksize = XFS_FSB_TO_B(mp, 1);
+        uint    dir_res;        /* four inodes plus both directories */
+        uint    free_res;       /* bmap_finish freeing dir/bmap blocks */
+        dir_res = xfs_calc_inode_res(mp, 4) +
+                  xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), blocksize);
+        free_res = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
+                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3), blocksize);
+        /* the larger of the two cases, on top of XFS_DQUOT_LOGRES */
+        return XFS_DQUOT_LOGRES(mp) + MAX(dir_res, free_res);
+}
+/*
+ * For removing an inode from unlinked list at first, we can modify:
+ *    the agi hash list and counters: sector size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ */
+STATIC uint
+xfs_calc_iunlink_remove_reservation(
+        struct xfs_mount        *mp)
+{
+        /* the agi: one sector-sized buffer */
+        uint    agi_res = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+        uint    blocksize = XFS_FSB_TO_B(mp, 1);
+        /*
+         * Plus the larger of a block and an inode cluster; no per-buffer
+         * overhead is added to this term.
+         */
+        return agi_res + max_t(uint, blocksize, mp->m_inode_cluster_size);
+}
+/*
+ * For creating a link to an inode:
+ *    the parent directory inode: inode size
+ *    the linked inode: inode size
+ *    the directory btree could split: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ *    the agf for the ag in which the blocks live: sector size
+ *    the agfl for the ag in which the blocks live: sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_link_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    blocksize = XFS_FSB_TO_B(mp, 1);
+        uint    dir_res;        /* both inodes plus the directory */
+        uint    free_res;       /* bmap_finish freeing bmap blocks */
+        dir_res = xfs_calc_inode_res(mp, 2) +
+                  xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), blocksize);
+        free_res = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), blocksize);
+        /* the unlinked-list removal is charged on top of the larger case */
+        return XFS_DQUOT_LOGRES(mp) +
+                xfs_calc_iunlink_remove_reservation(mp) +
+                MAX(dir_res, free_res);
+}
+/*
+ * For adding an inode to unlinked list we can modify:
+ *    the agi hash list: sector size
+ *    the unlinked inode: inode size
+ */
+STATIC uint
+xfs_calc_iunlink_add_reservation(
+        struct xfs_mount        *mp)
+{
+        /* one sector-sized buffer for the agi plus one logged inode */
+        return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+                xfs_calc_inode_res(mp, 1);
+}
+/*
+ * For removing a directory entry we can modify:
+ *    the parent directory inode: inode size
+ *    the removed inode: inode size
+ *    the directory btree could join: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_remove_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    blocksize = XFS_FSB_TO_B(mp, 1);
+        uint    dir_res;        /* one inode plus the directory */
+        uint    free_res;       /* bmap_finish freeing dir/bmap blocks */
+        dir_res = xfs_calc_inode_res(mp, 1) +
+                  xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), blocksize);
+        free_res = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
+                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), blocksize);
+        /* the unlinked-list addition is charged on top of the larger case */
+        return XFS_DQUOT_LOGRES(mp) +
+                xfs_calc_iunlink_add_reservation(mp) +
+                MAX(dir_res, free_res);
+}
+/*
+ * For create, break it into the two cases that the transaction
+ * covers. We start with the modify case - allocation done by modification
+ * of the state of existing inodes - and the allocation case.
+ */
+/*
+ * For create we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: block size
+ *    the superblock for the nlink flag: sector size
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ *    the finobt (record modification and allocation btrees)
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+        struct xfs_mount        *mp)
+{
+        uint    blocksize = XFS_FSB_TO_B(mp, 1);
+        uint    res;
+        res = xfs_calc_inode_res(mp, 2);        /* parent and new inode */
+        res += xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+        res += blocksize;       /* one block, no per-buffer overhead */
+        res += xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), blocksize);
+        res += xfs_calc_finobt_res(mp, 1, 1);
+        return res;
+}
+/*
+ * Create, allocation case - some inodes are allocated, giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode blocks allocated: mp->m_ialloc_blks * blocksize
+ *    the inode btree: mp->m_in_maxlevels * blocksize
+ *    the allocation btrees: XFS_ALLOCFREE_LOG_COUNT(mp, 1) * blocksize
+ */
+STATIC uint
+xfs_calc_create_resv_alloc(
+        struct xfs_mount        *mp)
+{
+        uint    sect_res;       /* sector sized items */
+        uint    blk_res;        /* filesystem block sized items */
+        sect_res = xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+                   mp->m_sb.sb_sectsize;
+        blk_res = xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
+                  xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+                  xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                   XFS_FSB_TO_B(mp, 1));
+        return sect_res + blk_res;
+}
+STATIC uint
+__xfs_calc_create_reservation(
+        struct xfs_mount        *mp)
+{
+        /* Dquot reservation plus the larger of the alloc and modify cases. */
+        uint    alloc_res = xfs_calc_create_resv_alloc(mp);
+        uint    modify_res = xfs_calc_create_resv_modify(mp);
+        return XFS_DQUOT_LOGRES(mp) + MAX(alloc_res, modify_res);
+}
+/*
+ * Icreate, allocation case - some inodes are allocated, giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode btree: mp->m_in_maxlevels * blocksize
+ *    the allocation btrees: XFS_ALLOCFREE_LOG_COUNT(mp, 1) * blocksize
+ *    the finobt (record insertion)
+ * Unlike the plain create case there is no term for the inode blocks
+ * themselves.
+ */
+STATIC uint
+xfs_calc_icreate_resv_alloc(
+        struct xfs_mount        *mp)
+{
+        uint    sect_res;       /* sector sized items */
+        uint    blk_res;        /* filesystem block sized items */
+        sect_res = xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+                   mp->m_sb.sb_sectsize;
+        blk_res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+                  xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                   XFS_FSB_TO_B(mp, 1));
+        return sect_res + blk_res + xfs_calc_finobt_res(mp, 0, 0);
+}
+STATIC uint
+xfs_calc_icreate_reservation(xfs_mount_t *mp)
+{
+        /* Dquot reservation plus the larger of the icreate alloc and modify cases. */
+        uint    alloc_res = xfs_calc_icreate_resv_alloc(mp);
+        uint    modify_res = xfs_calc_create_resv_modify(mp);
+        return XFS_DQUOT_LOGRES(mp) + MAX(alloc_res, modify_res);
+}
+STATIC uint
+xfs_calc_create_reservation(
+        struct xfs_mount        *mp)
+{
+        /* The icreate variant is used when the superblock has CRCs enabled. */
+        return xfs_sb_version_hascrc(&mp->m_sb) ?
+                xfs_calc_icreate_reservation(mp) :
+                __xfs_calc_create_reservation(mp);
+}
+STATIC uint
+xfs_calc_create_tmpfile_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    alloc_res;
+        /* Pick the allocation reservation matching the superblock format. */
+        alloc_res = xfs_sb_version_hascrc(&mp->m_sb) ?
+                        xfs_calc_icreate_resv_alloc(mp) :
+                        xfs_calc_create_resv_alloc(mp);
+        /* Dquots, inode allocation and the unlinked list insertion. */
+        return XFS_DQUOT_LOGRES(mp) + alloc_res +
+                xfs_calc_iunlink_add_reservation(mp);
+}
+/*
+ * mkdir uses exactly the same log reservation as creating a new file.
+ */
+STATIC uint
+xfs_calc_mkdir_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    create_res = xfs_calc_create_reservation(mp);
+        return create_res;
+}
+/*
+ * Making a new symlink is the same as creating a new file, plus one
+ * extra buffer of MAXPATHLEN bytes for the remote symlink data.
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    create_res = xfs_calc_create_reservation(mp);
+        uint    target_res = xfs_calc_buf_res(1, MAXPATHLEN);
+        return create_res + target_res;
+}
+/*
+ * In freeing an inode we can modify:
+ *    the inode being freed: inode size
+ *    the super block free inode counter: sector size
+ *    the inode btree entry: block size
+ *    the agi hash list, its counters and the on disk inode before ours in
+ *      that list: covered by xfs_calc_iunlink_remove_reservation()
+ *    zero-size buffer reservations for 1 buffer and for
+ *      (2 + mp->m_ialloc_blks + mp->m_in_maxlevels) buffers
+ *    the allocation btrees: XFS_ALLOCFREE_LOG_COUNT(mp, 1) * block size
+ *    the finobt (record insertion, removal or modification)
+ */
+STATIC uint
+xfs_calc_ifree_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    res;
+        res = xfs_calc_inode_res(mp, 1);
+        res += xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+        res += xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1));
+        res += xfs_calc_iunlink_remove_reservation(mp);
+        res += xfs_calc_buf_res(1, 0);
+        res += xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0);
+        res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+        res += xfs_calc_finobt_res(mp, 0, 1);
+        return XFS_DQUOT_LOGRES(mp) + res;
+}
+/*
+ * When only changing the inode we log the inode and possibly the
+ * superblock; the dquot log reservation is added on top.
+ */
+STATIC uint
+xfs_calc_ichange_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    inode_res = xfs_calc_inode_res(mp, 1);
+        uint    sb_res = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+        return XFS_DQUOT_LOGRES(mp) + inode_res + sb_res;
+}
+/*
+ * Growing the data section of the filesystem.
+ *      superblock, agi and agf: 3 * sector size
+ *      allocation btrees: XFS_ALLOCFREE_LOG_COUNT(mp, 1) * block size
+ */
+STATIC uint
+xfs_calc_growdata_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    hdr_res = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize);
+        uint    btree_res = xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                             XFS_FSB_TO_B(mp, 1));
+        return hdr_res + btree_res;
+}
+/*
+ * Growing the rt section of the filesystem, first set of transactions
+ * (ALLOC): space is allocated to the bitmap or summary files.
+ *      superblock: sector size
+ *      agf of the ag from which the extent is allocated: sector size
+ *      bmap btree for bitmap/summary inode: max depth * blocksize
+ *      bitmap/summary inode: inode size
+ *      allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
+STATIC uint
+xfs_calc_growrtalloc_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    res;
+        res = xfs_calc_buf_res(2, mp->m_sb.sb_sectsize);
+        res += xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+                                XFS_FSB_TO_B(mp, 1));
+        res += xfs_calc_inode_res(mp, 1);
+        res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+        return res;
+}
+/*
+ * Growing the rt section of the filesystem, second set of transactions
+ * (ZERO): the new metadata blocks are zeroed.
+ *      one bitmap/summary block: blocksize
+ */
+STATIC uint
+xfs_calc_growrtzero_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    blk_res = xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
+        return blk_res;
+}
+/*
+ * Growing the rt section of the filesystem, third set of transactions
+ * (FREE): metadata is updated without allocating any new blocks.
+ *      superblock: sector size
+ *      bitmap inode and summary inode: 2 * inode size
+ *      one bitmap block: blocksize
+ *      summary blocks: new summary size (mp->m_rsumsize)
+ */
+STATIC uint
+xfs_calc_growrtfree_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    res;
+        res = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+        res += xfs_calc_inode_res(mp, 2);
+        res += xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
+        res += xfs_calc_buf_res(1, mp->m_rsumsize);
+        return res;
+}
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ *      the inode: inode size
+ */
+STATIC uint
+xfs_calc_swrite_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    inode_res = xfs_calc_inode_res(mp, 1);
+        return inode_res;
+}
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file.
+ *      the inode: inode size
+ */
+STATIC uint
+xfs_calc_writeid_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    inode_res = xfs_calc_inode_res(mp, 1);
+        return inode_res;
+}
+/*
+ * Converting the inode from non-attributed to attributed.
+ *      the inode being converted: inode size
+ *      agf block and superblock (for block allocation): 2 * sector size
+ *      the new block (directory sized): mp->m_dir_geo->blksize
+ *      bmap blocks for the new directory block:
+ *              (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) * block size
+ *      allocation btrees: XFS_ALLOCFREE_LOG_COUNT(mp, 1) * block size
+ */
+STATIC uint
+xfs_calc_addafork_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    res;
+        res = xfs_calc_inode_res(mp, 1);
+        res += xfs_calc_buf_res(2, mp->m_sb.sb_sectsize);
+        res += xfs_calc_buf_res(1, mp->m_dir_geo->blksize);
+        res += xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
+                                XFS_FSB_TO_B(mp, 1));
+        res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+        return XFS_DQUOT_LOGRES(mp) + res;
+}
+/*
+ * Removing the attribute fork of a file. The reservation is the larger of
+ * two alternatives. The truncation itself:
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction, which can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *              4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrinval_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    trunc_res;
+        uint    free_res;
+        trunc_res = xfs_calc_inode_res(mp, 1) +
+                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+                                     XFS_FSB_TO_B(mp, 1));
+        free_res = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+                                    XFS_FSB_TO_B(mp, 1));
+        return MAX(trunc_res, free_res);
+}
+/*
+ * Setting an attribute, the part calculated at mount time.
+ *      the inode getting the attribute: inode size
+ *      the superblock for allocations: sector size
+ *      the attribute btree: XFS_DA_NODE_MAXDEPTH * block size
+ * Since attribute transaction space is dependent on the size of the
+ * attribute, the calculation is done partially at mount time (here) and
+ * partially at runtime (see xfs_calc_attrsetrt_reservation()).
+ */
+STATIC uint
+xfs_calc_attrsetm_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    inode_res = xfs_calc_inode_res(mp, 1);
+        uint    sb_res = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+        uint    btree_res = xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
+                                             XFS_FSB_TO_B(mp, 1));
+        return XFS_DQUOT_LOGRES(mp) + inode_res + sb_res + btree_res;
+}
+/*
+ * Setting an attribute at runtime, transaction space unit per block.
+ *      the superblock for allocations: sector size
+ *      the inode bmap btree could join or split: max depth * block size
+ * Since the runtime attribute transaction space is dependent on the total
+ * blocks needed for the 1st bmap, here we calculate the space unit for
+ * one block so that the caller can figure out the total space according
+ * to the attribute extent length in blocks by:
+ *      ext * M_RES(mp)->tr_attrsetrt.tr_logres
+ */
+STATIC uint
+xfs_calc_attrsetrt_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    sb_res = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+        uint    bmap_res = xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+                                            XFS_FSB_TO_B(mp, 1));
+        return sb_res + bmap_res;
+}
+/*
+ * Removing an attribute. The reservation is the dquot log reservation plus
+ * the larger of two alternatives. The removal itself:
+ *    the inode: inode size
+ *    the attribute btree could join: max depth * block size
+ *    the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction, which can free the attr blocks, giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrrm_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    remove_res;
+        uint    free_res;
+        remove_res = xfs_calc_inode_res(mp, 1) +
+                     xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
+                                      XFS_FSB_TO_B(mp, 1)) +
+                     (uint)XFS_FSB_TO_B(mp,
+                                        XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+                     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0);
+        free_res = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                                    XFS_FSB_TO_B(mp, 1));
+        return XFS_DQUOT_LOGRES(mp) + MAX(remove_res, free_res);
+}
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ *      one sector sized buffer
+ */
+STATIC uint
+xfs_calc_clear_agi_bucket_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    sect_res = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+        return sect_res;
+}
+/*
+ * Clearing the quota flags in the superblock.
+ *      the super block for changing quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_sbchange_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    sb_res = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+        return sb_res;
+}
+/*
+ * Adjusting quota limits.
+ *    the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
+ * The mount argument is not used by the calculation.
+ */
+STATIC uint
+xfs_calc_qm_setqlim_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    dquot_res = xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
+        return dquot_res;
+}
+/*
+ * Allocating quota on disk if needed.
+ *      the write transaction log space for quota file extent allocation
+ *      the unit of quota allocation: one buffer of
+ *              XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1 bytes
+ */
+STATIC uint
+xfs_calc_qm_dqalloc_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    write_res = xfs_calc_write_reservation(mp);
+        uint    cluster_res = xfs_calc_buf_res(1,
+                        XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
+        return write_res + cluster_res;
+}
+/*
+ * Turning off quotas.
+ *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ *    the superblock for the quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    sb_res = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+        return 2 * sizeof(struct xfs_qoff_logitem) + sb_res;
+}
+/*
+ * End of turning off quotas.
+ *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ * The mount argument is not used by the calculation.
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_end_reservation(
+        struct xfs_mount        *mp)
+{
+        return 2 * sizeof(struct xfs_qoff_logitem);
+}
+/*
+ * Syncing the incore super block changes to disk.
+ *     the super block to reflect the changes: sector size
+ */
+STATIC uint
+xfs_calc_sb_reservation(
+        struct xfs_mount        *mp)
+{
+        uint    sb_res = xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+        return sb_res;
+}
+/*
+ * Record a reservation that needs permanent log space: store the log
+ * space and log count and OR XFS_TRANS_PERM_LOG_RES into the flags
+ * (flag bits that are already set are kept).
+ */
+STATIC void
+xfs_trans_resv_set_perm(
+        struct xfs_trans_res    *res,
+        uint                    logres,
+        int                     logcount)
+{
+        res->tr_logres = logres;
+        res->tr_logcount = logcount;
+        res->tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+}
+/*
+ * Record a reservation that uses XFS_DEFAULT_LOG_COUNT; the log flags
+ * are left untouched.
+ */
+STATIC void
+xfs_trans_resv_set_default(
+        struct xfs_trans_res    *res,
+        uint                    logres)
+{
+        res->tr_logres = logres;
+        res->tr_logcount = XFS_DEFAULT_LOG_COUNT;
+}
+/*
+ * Fill in every pre-calculated transaction reservation in *resp from the
+ * geometry of mount mp.
+ */
+void
+xfs_trans_resv_calc(
+        struct xfs_mount        *mp,
+        struct xfs_trans_resv   *resp)
+{
+        /*
+         * The following transactions are logged in physical format and
+         * require a permanent reservation on space.
+         */
+        xfs_trans_resv_set_perm(&resp->tr_write,
+                        xfs_calc_write_reservation(mp),
+                        XFS_WRITE_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_itruncate,
+                        xfs_calc_itruncate_reservation(mp),
+                        XFS_ITRUNCATE_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_rename,
+                        xfs_calc_rename_reservation(mp),
+                        XFS_RENAME_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_link,
+                        xfs_calc_link_reservation(mp),
+                        XFS_LINK_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_remove,
+                        xfs_calc_remove_reservation(mp),
+                        XFS_REMOVE_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_symlink,
+                        xfs_calc_symlink_reservation(mp),
+                        XFS_SYMLINK_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_create,
+                        xfs_calc_create_reservation(mp),
+                        XFS_CREATE_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_create_tmpfile,
+                        xfs_calc_create_tmpfile_reservation(mp),
+                        XFS_CREATE_TMPFILE_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_mkdir,
+                        xfs_calc_mkdir_reservation(mp),
+                        XFS_MKDIR_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_ifree,
+                        xfs_calc_ifree_reservation(mp),
+                        XFS_INACTIVE_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_addafork,
+                        xfs_calc_addafork_reservation(mp),
+                        XFS_ADDAFORK_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_attrinval,
+                        xfs_calc_attrinval_reservation(mp),
+                        XFS_ATTRINVAL_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_attrsetm,
+                        xfs_calc_attrsetm_reservation(mp),
+                        XFS_ATTRSET_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_attrrm,
+                        xfs_calc_attrrm_reservation(mp),
+                        XFS_ATTRRM_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_growrtalloc,
+                        xfs_calc_growrtalloc_reservation(mp),
+                        XFS_DEFAULT_PERM_LOG_COUNT);
+        xfs_trans_resv_set_perm(&resp->tr_qm_dqalloc,
+                        xfs_calc_qm_dqalloc_reservation(mp),
+                        XFS_WRITE_LOG_COUNT);
+        /*
+         * The following transactions are logged in logical format with
+         * a default log count.
+         */
+        xfs_trans_resv_set_default(&resp->tr_qm_sbchange,
+                        xfs_calc_qm_sbchange_reservation(mp));
+        xfs_trans_resv_set_default(&resp->tr_qm_setqlim,
+                        xfs_calc_qm_setqlim_reservation(mp));
+        xfs_trans_resv_set_default(&resp->tr_qm_quotaoff,
+                        xfs_calc_qm_quotaoff_reservation(mp));
+        xfs_trans_resv_set_default(&resp->tr_qm_equotaoff,
+                        xfs_calc_qm_quotaoff_end_reservation(mp));
+        xfs_trans_resv_set_default(&resp->tr_sb,
+                        xfs_calc_sb_reservation(mp));
+        /*
+         * The following transactions are logged in logical format; only
+         * their log space is set here.
+         */
+        resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
+        resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
+        resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
+        resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
+        resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
+        resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
+        resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
+        resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
new file mode 100644
index 000000000000..1097d14cd583
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_TRANS_RESV_H__
+#define __XFS_TRANS_RESV_H__
+struct xfs_mount;
+/*
+ * Pre-calculated log reservation for a single transaction type.
+ * xfs_trans_resv_calc() fills in one of these for each member of
+ * struct xfs_trans_resv.
+ */
+struct xfs_trans_res {
+        uint    tr_logres;      /* log space unit in bytes per log ticket */
+        int     tr_logcount;    /* number of log operations per log ticket */
+        int     tr_logflags;    /* log flags; currently only used to indicate
+                                 * whether a reservation request is permanent
+                                 * (XFS_TRANS_PERM_LOG_RES) or not */
+};
+struct xfs_trans_resv {        /* one pre-calculated reservation per transaction type */
+        struct xfs_trans_res    tr_write;       /* extent alloc trans */
+        struct xfs_trans_res    tr_itruncate;   /* truncate trans */
+        struct xfs_trans_res    tr_rename;      /* rename trans */
+        struct xfs_trans_res    tr_link;        /* link trans */
+        struct xfs_trans_res    tr_remove;      /* unlink trans */
+        struct xfs_trans_res    tr_symlink;     /* symlink trans */
+        struct xfs_trans_res    tr_create;      /* create trans */
+        struct xfs_trans_res    tr_create_tmpfile; /* create O_TMPFILE trans */
+        struct xfs_trans_res    tr_mkdir;       /* mkdir trans */
+        struct xfs_trans_res    tr_ifree;       /* inode free trans */
+        struct xfs_trans_res    tr_ichange;     /* inode update trans */
+        struct xfs_trans_res    tr_growdata;    /* fs data section grow trans */
+        struct xfs_trans_res    tr_addafork;    /* add inode attr fork trans */
+        struct xfs_trans_res    tr_writeid;     /* write setuid/setgid file */
+        struct xfs_trans_res    tr_attrinval;   /* attr fork buffer
+                                                 * invalidation */
+        struct xfs_trans_res    tr_attrsetm;    /* set/create an attribute,
+                                                 * part calculated at mount
+                                                 * time */
+        struct xfs_trans_res    tr_attrsetrt;   /* set/create an attribute,
+                                                 * per-block unit used at
+                                                 * runtime */
+        struct xfs_trans_res    tr_attrrm;      /* remove an attribute */
+        struct xfs_trans_res    tr_clearagi;    /* clear agi unlinked bucket */
+        struct xfs_trans_res    tr_growrtalloc; /* grow realtime allocations */
+        struct xfs_trans_res    tr_growrtzero;  /* grow realtime zeroing */
+        struct xfs_trans_res    tr_growrtfree;  /* grow realtime freeing */
+        struct xfs_trans_res    tr_qm_sbchange; /* change quota flags */
+        struct xfs_trans_res    tr_qm_setqlim;  /* adjust quota limits */
+        struct xfs_trans_res    tr_qm_dqalloc;  /* allocate quota on disk */
+        struct xfs_trans_res    tr_qm_quotaoff; /* turn quota off */
+        struct xfs_trans_res    tr_qm_equotaoff;/* end of turn quota off */
+        struct xfs_trans_res    tr_sb;          /* modify superblock */
+        struct xfs_trans_res    tr_fsyncts;     /* update timestamps on fsync */
+};
+/* shorthand for the reservation structure: a pointer to (mp)->m_resv */
+#define M_RES(mp)       (&(mp)->m_resv)
+/*
+ * Per-extent log reservation for the allocation btree changes
+ * involved in freeing or allocating an extent.
+ * 2 trees * (2 blocks/level * max depth - 1) * block size
+ *
+ * nx is the number of extents. XFS_ALLOCFREE_LOG_RES converts the block
+ * count to bytes with XFS_FSB_TO_B; XFS_ALLOCFREE_LOG_COUNT is the same
+ * calculation left as a number of blocks.
+ */
+#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
+        ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
+        ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
+/*
+ * Per-directory log reservation for any directory change.
+ * dir blocks: (1 btree block per level + data block + free block) * dblock size
+ * bmap btree: (levels + 2) * max depth * block size
+ * v2 directory blocks can be fragmented below the dirblksize down to the fsb
+ * size, so account for that in the DAENTER macros.
+ *
+ * XFS_DIROP_LOG_RES converts XFS_DAENTER_BLOCKS and (XFS_DAENTER_BMAPS + 1)
+ * for the data fork to bytes with XFS_FSB_TO_B; XFS_DIROP_LOG_COUNT is the
+ * sum of the same two terms left as a number of blocks.
+ */
+#define XFS_DIROP_LOG_RES(mp)   \
+        (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
+         (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
+#define XFS_DIROP_LOG_COUNT(mp) \
+        (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
+         XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
+/*
+ * Various log count values.
+ * xfs_trans_resv_calc() stores these in the tr_logcount field of the
+ * matching reservation; XFS_INACTIVE_LOG_COUNT is used for tr_ifree and
+ * XFS_WRITE_LOG_COUNT for both tr_write and tr_qm_dqalloc.
+ */
+#define XFS_DEFAULT_LOG_COUNT           1
+#define XFS_DEFAULT_PERM_LOG_COUNT      2
+#define XFS_ITRUNCATE_LOG_COUNT         2
+#define XFS_INACTIVE_LOG_COUNT          2
+#define XFS_CREATE_LOG_COUNT            2
+#define XFS_CREATE_TMPFILE_LOG_COUNT    2
+#define XFS_MKDIR_LOG_COUNT             3
+#define XFS_SYMLINK_LOG_COUNT           3
+#define XFS_REMOVE_LOG_COUNT            2
+#define XFS_LINK_LOG_COUNT              2
+#define XFS_RENAME_LOG_COUNT            2
+#define XFS_WRITE_LOG_COUNT             2
+#define XFS_ADDAFORK_LOG_COUNT          2
+#define XFS_ATTRINVAL_LOG_COUNT         1
+#define XFS_ATTRSET_LOG_COUNT           3
+#define XFS_ATTRRM_LOG_COUNT            3
+void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
+#endif  /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
new file mode 100644
index 000000000000..bf9c4579334d
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_TRANS_SPACE_H__
+#define __XFS_TRANS_SPACE_H__
+/*
+ * Components of space reservations.
+ *
+ * XFS_MAX_CONTIG_EXTENTS_PER_BLOCK is the difference between the first
+ * entries of the mount's m_alloc_mxr and m_alloc_mnr arrays.
+ * XFS_EXTENTADD_SPACE_RES is XFS_BM_MAXLEVELS for fork w, minus one.
+ */
+#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)    \
+                (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
+#define XFS_EXTENTADD_SPACE_RES(mp,w)   (XFS_BM_MAXLEVELS(mp,w) - 1)
+/*
+ * ceil(b / XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) times
+ * XFS_EXTENTADD_SPACE_RES(mp, w). The b argument is parenthesized so
+ * that an expression argument (e.g. a conditional) expands correctly.
+ */
+#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\
+        ((((b) + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+          XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+          XFS_EXTENTADD_SPACE_RES(mp,w))
+#define XFS_DAENTER_1B(mp,w)    /* dir geometry fsbcount for the data fork, else 1 */ \
+        ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
+#define XFS_DAENTER_DBS(mp,w)   /* XFS_DA_NODE_MAXDEPTH, plus 2 for the data fork */ \
+        (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0))
+#define XFS_DAENTER_BLOCKS(mp,w)        /* XFS_DAENTER_1B * XFS_DAENTER_DBS */ \
+        (XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))
+#define XFS_DAENTER_BMAP1B(mp,w)        /* XFS_NEXTENTADD_SPACE_RES for XFS_DAENTER_1B blocks */ \
+        XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w)
+#define XFS_DAENTER_BMAPS(mp,w)         /* XFS_DAENTER_DBS * XFS_DAENTER_BMAP1B */ \
+        (XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w))
+#define XFS_DAENTER_SPACE_RES(mp,w)     /* XFS_DAENTER_BLOCKS + XFS_DAENTER_BMAPS */ \
+        (XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))
+#define XFS_DAREMOVE_SPACE_RES(mp,w)    XFS_DAENTER_BMAPS(mp,w)
+#define XFS_DIRENTER_MAX_SPLIT(mp,nl)   1       /* constant; mp and nl are ignored */
+#define XFS_DIRENTER_SPACE_RES(mp,nl)   /* data fork XFS_DAENTER_SPACE_RES * XFS_DIRENTER_MAX_SPLIT */ \
+        (XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
+         XFS_DIRENTER_MAX_SPLIT(mp,nl))
+#define XFS_DIRREMOVE_SPACE_RES(mp)     /* data fork XFS_DAREMOVE_SPACE_RES */ \
+        XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
+/*
+ * Space for an inode allocation: mp->m_ialloc_blks plus
+ * (mp->m_in_maxlevels - 1) multiplied by 2 when the superblock has the
+ * free inode btree (finobt) feature and by 1 otherwise.
+ *
+ * The conditional must be parenthesized: "c ? 2 : 1 * x" parses as
+ * "c ? 2 : (1 * x)", which made the finobt case a constant 2 instead of
+ * 2 * (mp->m_in_maxlevels - 1).
+ */
+#define XFS_IALLOC_SPACE_RES(mp)        \
+        ((mp)->m_ialloc_blks + \
+         ((xfs_sb_version_hasfinobt(&(mp)->m_sb) ? 2 : 1) * \
+          ((mp)->m_in_maxlevels - 1)))
+/*
+ * Space reservation values for various transactions.
+ *
+ * Each value is built from the component macros defined earlier in this
+ * file. They are presumably counts of filesystem blocks (note that
+ * XFS_ATTRSET_SPACE_RES converts its byte argument with XFS_B_TO_FSB)
+ * - confirm against the callers.
+ */
+#define XFS_ADDAFORK_SPACE_RES(mp)      \
+        ((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
+#define XFS_ATTRRM_SPACE_RES(mp)        \
+        XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
+/* This macro is not used - see inline code in xfs_attr_set */
+#define XFS_ATTRSET_SPACE_RES(mp, v)    \
+        (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v))
+#define XFS_CREATE_SPACE_RES(mp,nl)     \
+        (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_DIOSTRAT_SPACE_RES(mp, v)   \
+        (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
+#define XFS_GROWFS_SPACE_RES(mp)        \
+        (2 * XFS_AG_MAXLEVELS(mp))
+#define XFS_GROWFSRT_SPACE_RES(mp,b)    \
+        ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
+#define XFS_LINK_SPACE_RES(mp,nl)       \
+        XFS_DIRENTER_SPACE_RES(mp,nl)
+#define XFS_MKDIR_SPACE_RES(mp,nl)      \
+        (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_QM_DQALLOC_SPACE_RES(mp)    \
+        (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \
+         XFS_DQUOT_CLUSTER_SIZE_FSB)
+#define XFS_QM_QINOCREATE_SPACE_RES(mp) \
+        XFS_IALLOC_SPACE_RES(mp)
+#define XFS_REMOVE_SPACE_RES(mp)        \
+        XFS_DIRREMOVE_SPACE_RES(mp)
+#define XFS_RENAME_SPACE_RES(mp,nl)     \
+        (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_SYMLINK_SPACE_RES(mp,nl,b)  \
+        (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
+/*
+ * Space for freeing an inode: mp->m_in_maxlevels when the superblock has
+ * the free inode btree (finobt) feature, otherwise nothing. The mp
+ * argument is parenthesized everywhere so that expression arguments
+ * expand correctly.
+ */
+#define XFS_IFREE_SPACE_RES(mp)         \
+        (xfs_sb_version_hasfinobt(&(mp)->m_sb) ? (mp)->m_in_maxlevels : 0)
+#endif  /* __XFS_TRANS_SPACE_H__ */
author	Dave Chinner <david@fromorbit.com>	2014-07-14 17:37:18 -0400
committer	Dave Chinner <david@fromorbit.com>	2014-07-14 17:37:18 -0400
commit	7f8a058f6dc52219117bc2469b1fb816f7fa1a4b (patch)
tree	43ce8eed4d26beb6f2acff2279c43eae7f79f83a /fs/xfs/libxfs
parent	03e01349c654fbdea80d3d9b4ab599244eb55bb7 (diff)
parent	2451337dd043901b5270b7586942abe564443e3d (diff)